You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

548 lines
18 KiB

  1. // Copyright (c) 2014, David Kitchen <david@buro9.com>
  2. //
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice, this
  9. // list of conditions and the following disclaimer.
  10. //
  11. // * Redistributions in binary form must reproduce the above copyright notice,
  12. // this list of conditions and the following disclaimer in the documentation
  13. // and/or other materials provided with the distribution.
  14. //
  15. // * Neither the name of the organisation (Microcosm) nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. package bluemonday
  30. import (
  31. "net/url"
  32. "regexp"
  33. "strings"
  34. )
  35. // Policy encapsulates the whitelist of HTML elements and attributes that will
  36. // be applied to the sanitised HTML.
  37. //
  38. // You should use bluemonday.NewPolicy() to create a blank policy as the
  39. // unexported fields contain maps that need to be initialized.
  40. type Policy struct {
  41. // Declares whether the maps have been initialized, used as a cheap check to
  42. // ensure that those using Policy{} directly won't cause nil pointer
  43. // exceptions
  44. initialized bool
  45. // Allows the <!DOCTYPE > tag to exist in the sanitized document
  46. allowDocType bool
  47. // If true then we add spaces when stripping tags, specifically the closing
  48. // tag is replaced by a space character.
  49. addSpaces bool
  50. // When true, add rel="nofollow" to HTML anchors
  51. requireNoFollow bool
  52. // When true, add rel="nofollow" to HTML anchors
  53. // Will add for href="http://foo"
  54. // Will skip for href="/foo" or href="foo"
  55. requireNoFollowFullyQualifiedLinks bool
  56. // When true add target="_blank" to fully qualified links
  57. // Will add for href="http://foo"
  58. // Will skip for href="/foo" or href="foo"
  59. addTargetBlankToFullyQualifiedLinks bool
  60. // When true, URLs must be parseable by "net/url" url.Parse()
  61. requireParseableURLs bool
  62. // When true, u, _ := url.Parse("url"); !u.IsAbs() is permitted
  63. allowRelativeURLs bool
  64. // map[htmlElementName]map[htmlAttributeName]attrPolicy
  65. elsAndAttrs map[string]map[string]attrPolicy
  66. // map[htmlAttributeName]attrPolicy
  67. globalAttrs map[string]attrPolicy
  68. // If urlPolicy is nil, all URLs with matching schema are allowed.
  69. // Otherwise, only the URLs with matching schema and urlPolicy(url)
  70. // returning true are allowed.
  71. allowURLSchemes map[string]urlPolicy
  72. // If an element has had all attributes removed as a result of a policy
  73. // being applied, then the element would be removed from the output.
  74. //
  75. // However some elements are valid and have strong layout meaning without
  76. // any attributes, i.e. <table>. To prevent those being removed we maintain
  77. // a list of elements that are allowed to have no attributes and that will
  78. // be maintained in the output HTML.
  79. setOfElementsAllowedWithoutAttrs map[string]struct{}
  80. setOfElementsToSkipContent map[string]struct{}
  81. }
  // attrPolicy describes the constraint stored for a single whitelisted
  // attribute.
  type attrPolicy struct {
  	// regexp is an optional pattern. When it is not nil the attribute value
  	// needs to match it, otherwise the attribute is removed.
  	regexp *regexp.Regexp
  }
  87. type attrPolicyBuilder struct {
  88. p *Policy
  89. attrNames []string
  90. regexp *regexp.Regexp
  91. allowEmpty bool
  92. }
  // urlPolicy is a predicate over a parsed URL. It reports whether the URL is
  // allowed.
  type urlPolicy func(u *url.URL) (allowURL bool)
  94. // init initializes the maps if this has not been done already
  95. func (p *Policy) init() {
  96. if !p.initialized {
  97. p.elsAndAttrs = make(map[string]map[string]attrPolicy)
  98. p.globalAttrs = make(map[string]attrPolicy)
  99. p.allowURLSchemes = make(map[string]urlPolicy)
  100. p.setOfElementsAllowedWithoutAttrs = make(map[string]struct{})
  101. p.setOfElementsToSkipContent = make(map[string]struct{})
  102. p.initialized = true
  103. }
  104. }
  105. // NewPolicy returns a blank policy with nothing whitelisted or permitted. This
  106. // is the recommended way to start building a policy and you should now use
  107. // AllowAttrs() and/or AllowElements() to construct the whitelist of HTML
  108. // elements and attributes.
  109. func NewPolicy() *Policy {
  110. p := Policy{}
  111. p.addDefaultElementsWithoutAttrs()
  112. p.addDefaultSkipElementContent()
  113. return &p
  114. }
  115. // AllowAttrs takes a range of HTML attribute names and returns an
  116. // attribute policy builder that allows you to specify the pattern and scope of
  117. // the whitelisted attribute.
  118. //
  119. // The attribute policy is only added to the core policy when either Globally()
  120. // or OnElements(...) are called.
  121. func (p *Policy) AllowAttrs(attrNames ...string) *attrPolicyBuilder {
  122. p.init()
  123. abp := attrPolicyBuilder{
  124. p: p,
  125. allowEmpty: false,
  126. }
  127. for _, attrName := range attrNames {
  128. abp.attrNames = append(abp.attrNames, strings.ToLower(attrName))
  129. }
  130. return &abp
  131. }
  132. // AllowNoAttrs says that attributes on element are optional.
  133. //
  134. // The attribute policy is only added to the core policy when OnElements(...)
  135. // are called.
  136. func (p *Policy) AllowNoAttrs() *attrPolicyBuilder {
  137. p.init()
  138. abp := attrPolicyBuilder{
  139. p: p,
  140. allowEmpty: true,
  141. }
  142. return &abp
  143. }
  144. // AllowNoAttrs says that attributes on element are optional.
  145. //
  146. // The attribute policy is only added to the core policy when OnElements(...)
  147. // are called.
  148. func (abp *attrPolicyBuilder) AllowNoAttrs() *attrPolicyBuilder {
  149. abp.allowEmpty = true
  150. return abp
  151. }
  152. // Matching allows a regular expression to be applied to a nascent attribute
  153. // policy, and returns the attribute policy. Calling this more than once will
  154. // replace the existing regexp.
  155. func (abp *attrPolicyBuilder) Matching(regex *regexp.Regexp) *attrPolicyBuilder {
  156. abp.regexp = regex
  157. return abp
  158. }
  159. // OnElements will bind an attribute policy to a given range of HTML elements
  160. // and return the updated policy
  161. func (abp *attrPolicyBuilder) OnElements(elements ...string) *Policy {
  162. for _, element := range elements {
  163. element = strings.ToLower(element)
  164. for _, attr := range abp.attrNames {
  165. if _, ok := abp.p.elsAndAttrs[element]; !ok {
  166. abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
  167. }
  168. ap := attrPolicy{}
  169. if abp.regexp != nil {
  170. ap.regexp = abp.regexp
  171. }
  172. abp.p.elsAndAttrs[element][attr] = ap
  173. }
  174. if abp.allowEmpty {
  175. abp.p.setOfElementsAllowedWithoutAttrs[element] = struct{}{}
  176. if _, ok := abp.p.elsAndAttrs[element]; !ok {
  177. abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
  178. }
  179. }
  180. }
  181. return abp.p
  182. }
  183. // Globally will bind an attribute policy to all HTML elements and return the
  184. // updated policy
  185. func (abp *attrPolicyBuilder) Globally() *Policy {
  186. for _, attr := range abp.attrNames {
  187. if _, ok := abp.p.globalAttrs[attr]; !ok {
  188. abp.p.globalAttrs[attr] = attrPolicy{}
  189. }
  190. ap := attrPolicy{}
  191. if abp.regexp != nil {
  192. ap.regexp = abp.regexp
  193. }
  194. abp.p.globalAttrs[attr] = ap
  195. }
  196. return abp.p
  197. }
  198. // AllowElements will append HTML elements to the whitelist without applying an
  199. // attribute policy to those elements (the elements are permitted
  200. // sans-attributes)
  201. func (p *Policy) AllowElements(names ...string) *Policy {
  202. p.init()
  203. for _, element := range names {
  204. element = strings.ToLower(element)
  205. if _, ok := p.elsAndAttrs[element]; !ok {
  206. p.elsAndAttrs[element] = make(map[string]attrPolicy)
  207. }
  208. }
  209. return p
  210. }
  211. // RequireNoFollowOnLinks will result in all <a> tags having a rel="nofollow"
  212. // added to them if one does not already exist
  213. //
  214. // Note: This requires p.RequireParseableURLs(true) and will enable it.
  215. func (p *Policy) RequireNoFollowOnLinks(require bool) *Policy {
  216. p.requireNoFollow = require
  217. p.requireParseableURLs = true
  218. return p
  219. }
  220. // RequireNoFollowOnFullyQualifiedLinks will result in all <a> tags that point
  221. // to a non-local destination (i.e. starts with a protocol and has a host)
  222. // having a rel="nofollow" added to them if one does not already exist
  223. //
  224. // Note: This requires p.RequireParseableURLs(true) and will enable it.
  225. func (p *Policy) RequireNoFollowOnFullyQualifiedLinks(require bool) *Policy {
  226. p.requireNoFollowFullyQualifiedLinks = require
  227. p.requireParseableURLs = true
  228. return p
  229. }
  230. // AddTargetBlankToFullyQualifiedLinks will result in all <a> tags that point
  231. // to a non-local destination (i.e. starts with a protocol and has a host)
  232. // having a target="_blank" added to them if one does not already exist
  233. //
  234. // Note: This requires p.RequireParseableURLs(true) and will enable it.
  235. func (p *Policy) AddTargetBlankToFullyQualifiedLinks(require bool) *Policy {
  236. p.addTargetBlankToFullyQualifiedLinks = require
  237. p.requireParseableURLs = true
  238. return p
  239. }
  240. // RequireParseableURLs will result in all URLs requiring that they be parseable
  241. // by "net/url" url.Parse()
  242. // This applies to:
  243. // - a.href
  244. // - area.href
  245. // - blockquote.cite
  246. // - img.src
  247. // - link.href
  248. // - script.src
  249. func (p *Policy) RequireParseableURLs(require bool) *Policy {
  250. p.requireParseableURLs = require
  251. return p
  252. }
  253. // AllowRelativeURLs enables RequireParseableURLs and then permits URLs that
  254. // are parseable, have no schema information and url.IsAbs() returns false
  255. // This permits local URLs
  256. func (p *Policy) AllowRelativeURLs(require bool) *Policy {
  257. p.RequireParseableURLs(true)
  258. p.allowRelativeURLs = require
  259. return p
  260. }
  261. // AllowURLSchemes will append URL schemes to the whitelist
  262. // Example: p.AllowURLSchemes("mailto", "http", "https")
  263. func (p *Policy) AllowURLSchemes(schemes ...string) *Policy {
  264. p.init()
  265. p.RequireParseableURLs(true)
  266. for _, scheme := range schemes {
  267. scheme = strings.ToLower(scheme)
  268. // Allow all URLs with matching scheme.
  269. p.allowURLSchemes[scheme] = nil
  270. }
  271. return p
  272. }
  273. // AllowURLSchemeWithCustomPolicy will append URL schemes with
  274. // a custom URL policy to the whitelist.
  275. // Only the URLs with matching schema and urlPolicy(url)
  276. // returning true will be allowed.
  277. func (p *Policy) AllowURLSchemeWithCustomPolicy(
  278. scheme string,
  279. urlPolicy func(url *url.URL) (allowUrl bool),
  280. ) *Policy {
  281. p.init()
  282. p.RequireParseableURLs(true)
  283. scheme = strings.ToLower(scheme)
  284. p.allowURLSchemes[scheme] = urlPolicy
  285. return p
  286. }
  287. // AllowDocType states whether the HTML sanitised by the sanitizer is allowed to
  288. // contain the HTML DocType tag: <!DOCTYPE HTML> or one of it's variants.
  289. //
  290. // The HTML spec only permits one doctype per document, and as you know how you
  291. // are using the output of this, you know best as to whether we should ignore it
  292. // (default) or not.
  293. //
  294. // If you are sanitizing a HTML fragment the default (false) is fine.
  295. func (p *Policy) AllowDocType(allow bool) *Policy {
  296. p.allowDocType = allow
  297. return p
  298. }
  299. // AddSpaceWhenStrippingTag states whether to add a single space " " when
  300. // removing tags that are not whitelisted by the policy.
  301. //
  302. // This is useful if you expect to strip tags in dense markup and may lose the
  303. // value of whitespace.
  304. //
  305. // For example: "<p>Hello</p><p>World</p>"" would be sanitized to "HelloWorld"
  306. // with the default value of false, but you may wish to sanitize this to
  307. // " Hello World " by setting AddSpaceWhenStrippingTag to true as this would
  308. // retain the intent of the text.
  309. func (p *Policy) AddSpaceWhenStrippingTag(allow bool) *Policy {
  310. p.addSpaces = allow
  311. return p
  312. }
  313. // SkipElementsContent adds the HTML elements whose tags is needed to be removed
  314. // with it's content.
  315. func (p *Policy) SkipElementsContent(names ...string) *Policy {
  316. p.init()
  317. for _, element := range names {
  318. element = strings.ToLower(element)
  319. if _, ok := p.setOfElementsToSkipContent[element]; !ok {
  320. p.setOfElementsToSkipContent[element] = struct{}{}
  321. }
  322. }
  323. return p
  324. }
  325. // AllowElementsContent marks the HTML elements whose content should be
  326. // retained after removing the tag.
  327. func (p *Policy) AllowElementsContent(names ...string) *Policy {
  328. p.init()
  329. for _, element := range names {
  330. delete(p.setOfElementsToSkipContent, strings.ToLower(element))
  331. }
  332. return p
  333. }
  334. // addDefaultElementsWithoutAttrs adds the HTML elements that we know are valid
  335. // without any attributes to an internal map.
  336. // i.e. we know that <table> is valid, but <bdo> isn't valid as the "dir" attr
  337. // is mandatory
  338. func (p *Policy) addDefaultElementsWithoutAttrs() {
  339. p.init()
  340. p.setOfElementsAllowedWithoutAttrs["abbr"] = struct{}{}
  341. p.setOfElementsAllowedWithoutAttrs["acronym"] = struct{}{}
  342. p.setOfElementsAllowedWithoutAttrs["article"] = struct{}{}
  343. p.setOfElementsAllowedWithoutAttrs["aside"] = struct{}{}
  344. p.setOfElementsAllowedWithoutAttrs["audio"] = struct{}{}
  345. p.setOfElementsAllowedWithoutAttrs["b"] = struct{}{}
  346. p.setOfElementsAllowedWithoutAttrs["bdi"] = struct{}{}
  347. p.setOfElementsAllowedWithoutAttrs["blockquote"] = struct{}{}
  348. p.setOfElementsAllowedWithoutAttrs["body"] = struct{}{}
  349. p.setOfElementsAllowedWithoutAttrs["br"] = struct{}{}
  350. p.setOfElementsAllowedWithoutAttrs["button"] = struct{}{}
  351. p.setOfElementsAllowedWithoutAttrs["canvas"] = struct{}{}
  352. p.setOfElementsAllowedWithoutAttrs["caption"] = struct{}{}
  353. p.setOfElementsAllowedWithoutAttrs["cite"] = struct{}{}
  354. p.setOfElementsAllowedWithoutAttrs["code"] = struct{}{}
  355. p.setOfElementsAllowedWithoutAttrs["col"] = struct{}{}
  356. p.setOfElementsAllowedWithoutAttrs["colgroup"] = struct{}{}
  357. p.setOfElementsAllowedWithoutAttrs["datalist"] = struct{}{}
  358. p.setOfElementsAllowedWithoutAttrs["dd"] = struct{}{}
  359. p.setOfElementsAllowedWithoutAttrs["del"] = struct{}{}
  360. p.setOfElementsAllowedWithoutAttrs["details"] = struct{}{}
  361. p.setOfElementsAllowedWithoutAttrs["dfn"] = struct{}{}
  362. p.setOfElementsAllowedWithoutAttrs["div"] = struct{}{}
  363. p.setOfElementsAllowedWithoutAttrs["dl"] = struct{}{}
  364. p.setOfElementsAllowedWithoutAttrs["dt"] = struct{}{}
  365. p.setOfElementsAllowedWithoutAttrs["em"] = struct{}{}
  366. p.setOfElementsAllowedWithoutAttrs["fieldset"] = struct{}{}
  367. p.setOfElementsAllowedWithoutAttrs["figcaption"] = struct{}{}
  368. p.setOfElementsAllowedWithoutAttrs["figure"] = struct{}{}
  369. p.setOfElementsAllowedWithoutAttrs["footer"] = struct{}{}
  370. p.setOfElementsAllowedWithoutAttrs["h1"] = struct{}{}
  371. p.setOfElementsAllowedWithoutAttrs["h2"] = struct{}{}
  372. p.setOfElementsAllowedWithoutAttrs["h3"] = struct{}{}
  373. p.setOfElementsAllowedWithoutAttrs["h4"] = struct{}{}
  374. p.setOfElementsAllowedWithoutAttrs["h5"] = struct{}{}
  375. p.setOfElementsAllowedWithoutAttrs["h6"] = struct{}{}
  376. p.setOfElementsAllowedWithoutAttrs["head"] = struct{}{}
  377. p.setOfElementsAllowedWithoutAttrs["header"] = struct{}{}
  378. p.setOfElementsAllowedWithoutAttrs["hgroup"] = struct{}{}
  379. p.setOfElementsAllowedWithoutAttrs["hr"] = struct{}{}
  380. p.setOfElementsAllowedWithoutAttrs["html"] = struct{}{}
  381. p.setOfElementsAllowedWithoutAttrs["i"] = struct{}{}
  382. p.setOfElementsAllowedWithoutAttrs["ins"] = struct{}{}
  383. p.setOfElementsAllowedWithoutAttrs["kbd"] = struct{}{}
  384. p.setOfElementsAllowedWithoutAttrs["li"] = struct{}{}
  385. p.setOfElementsAllowedWithoutAttrs["mark"] = struct{}{}
  386. p.setOfElementsAllowedWithoutAttrs["nav"] = struct{}{}
  387. p.setOfElementsAllowedWithoutAttrs["ol"] = struct{}{}
  388. p.setOfElementsAllowedWithoutAttrs["optgroup"] = struct{}{}
  389. p.setOfElementsAllowedWithoutAttrs["option"] = struct{}{}
  390. p.setOfElementsAllowedWithoutAttrs["p"] = struct{}{}
  391. p.setOfElementsAllowedWithoutAttrs["pre"] = struct{}{}
  392. p.setOfElementsAllowedWithoutAttrs["q"] = struct{}{}
  393. p.setOfElementsAllowedWithoutAttrs["rp"] = struct{}{}
  394. p.setOfElementsAllowedWithoutAttrs["rt"] = struct{}{}
  395. p.setOfElementsAllowedWithoutAttrs["ruby"] = struct{}{}
  396. p.setOfElementsAllowedWithoutAttrs["s"] = struct{}{}
  397. p.setOfElementsAllowedWithoutAttrs["samp"] = struct{}{}
  398. p.setOfElementsAllowedWithoutAttrs["section"] = struct{}{}
  399. p.setOfElementsAllowedWithoutAttrs["select"] = struct{}{}
  400. p.setOfElementsAllowedWithoutAttrs["small"] = struct{}{}
  401. p.setOfElementsAllowedWithoutAttrs["span"] = struct{}{}
  402. p.setOfElementsAllowedWithoutAttrs["strike"] = struct{}{}
  403. p.setOfElementsAllowedWithoutAttrs["strong"] = struct{}{}
  404. p.setOfElementsAllowedWithoutAttrs["style"] = struct{}{}
  405. p.setOfElementsAllowedWithoutAttrs["sub"] = struct{}{}
  406. p.setOfElementsAllowedWithoutAttrs["summary"] = struct{}{}
  407. p.setOfElementsAllowedWithoutAttrs["sup"] = struct{}{}
  408. p.setOfElementsAllowedWithoutAttrs["svg"] = struct{}{}
  409. p.setOfElementsAllowedWithoutAttrs["table"] = struct{}{}
  410. p.setOfElementsAllowedWithoutAttrs["tbody"] = struct{}{}
  411. p.setOfElementsAllowedWithoutAttrs["td"] = struct{}{}
  412. p.setOfElementsAllowedWithoutAttrs["textarea"] = struct{}{}
  413. p.setOfElementsAllowedWithoutAttrs["tfoot"] = struct{}{}
  414. p.setOfElementsAllowedWithoutAttrs["th"] = struct{}{}
  415. p.setOfElementsAllowedWithoutAttrs["thead"] = struct{}{}
  416. p.setOfElementsAllowedWithoutAttrs["title"] = struct{}{}
  417. p.setOfElementsAllowedWithoutAttrs["time"] = struct{}{}
  418. p.setOfElementsAllowedWithoutAttrs["tr"] = struct{}{}
  419. p.setOfElementsAllowedWithoutAttrs["tt"] = struct{}{}
  420. p.setOfElementsAllowedWithoutAttrs["u"] = struct{}{}
  421. p.setOfElementsAllowedWithoutAttrs["ul"] = struct{}{}
  422. p.setOfElementsAllowedWithoutAttrs["var"] = struct{}{}
  423. p.setOfElementsAllowedWithoutAttrs["video"] = struct{}{}
  424. p.setOfElementsAllowedWithoutAttrs["wbr"] = struct{}{}
  425. }
  426. // addDefaultSkipElementContent adds the HTML elements that we should skip
  427. // rendering the character content of, if the element itself is not allowed.
  428. // This is all character data that the end user would not normally see.
  429. // i.e. if we exclude a <script> tag then we shouldn't render the JavaScript or
  430. // anything else until we encounter the closing </script> tag.
  431. func (p *Policy) addDefaultSkipElementContent() {
  432. p.init()
  433. p.setOfElementsToSkipContent["frame"] = struct{}{}
  434. p.setOfElementsToSkipContent["frameset"] = struct{}{}
  435. p.setOfElementsToSkipContent["iframe"] = struct{}{}
  436. p.setOfElementsToSkipContent["noembed"] = struct{}{}
  437. p.setOfElementsToSkipContent["noframes"] = struct{}{}
  438. p.setOfElementsToSkipContent["noscript"] = struct{}{}
  439. p.setOfElementsToSkipContent["nostyle"] = struct{}{}
  440. p.setOfElementsToSkipContent["object"] = struct{}{}
  441. p.setOfElementsToSkipContent["script"] = struct{}{}
  442. p.setOfElementsToSkipContent["style"] = struct{}{}
  443. p.setOfElementsToSkipContent["title"] = struct{}{}
  444. }