You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

179 lines
3.5 KiB

10 years ago
  1. package mahonia
  2. // decoding HTML entities
  3. import (
  4. "sort"
  5. )
  6. // EntityDecoder returns a Decoder that decodes HTML character entities.
  7. // If there is no valid character entity at the current position, it returns INVALID_CHAR.
  8. // So it needs to be combined with another Decoder via FallbackDecoder.
  9. func EntityDecoder() Decoder {
  10. var leftover rune // leftover rune from two-rune entity
  11. return func(p []byte) (r rune, size int, status Status) {
  12. if leftover != 0 {
  13. r = leftover
  14. leftover = 0
  15. return r, 0, SUCCESS
  16. }
  17. if len(p) == 0 {
  18. return 0, 0, NO_ROOM
  19. }
  20. if p[0] != '&' {
  21. return 0xfffd, 1, INVALID_CHAR
  22. }
  23. if len(p) < 3 {
  24. return 0, 1, NO_ROOM
  25. }
  26. r, size, status = 0xfffd, 1, INVALID_CHAR
  27. n := 1 // number of bytes read so far
  28. if p[n] == '#' {
  29. n++
  30. c := p[n]
  31. hex := false
  32. if c == 'x' || c == 'X' {
  33. hex = true
  34. n++
  35. }
  36. var x rune
  37. for n < len(p) {
  38. c = p[n]
  39. n++
  40. if hex {
  41. if '0' <= c && c <= '9' {
  42. x = 16*x + rune(c) - '0'
  43. continue
  44. } else if 'a' <= c && c <= 'f' {
  45. x = 16*x + rune(c) - 'a' + 10
  46. continue
  47. } else if 'A' <= c && c <= 'F' {
  48. x = 16*x + rune(c) - 'A' + 10
  49. continue
  50. }
  51. } else if '0' <= c && c <= '9' {
  52. x = 10*x + rune(c) - '0'
  53. continue
  54. }
  55. if c != ';' {
  56. n--
  57. }
  58. break
  59. }
  60. if n == len(p) && p[n-1] != ';' {
  61. return 0, 0, NO_ROOM
  62. }
  63. size = n
  64. if p[n-1] == ';' {
  65. n--
  66. }
  67. if hex {
  68. n--
  69. }
  70. n--
  71. // Now n is the number of actual digits read.
  72. if n == 0 {
  73. return 0xfffd, 1, INVALID_CHAR
  74. }
  75. if 0x80 <= x && x <= 0x9F {
  76. // Replace characters from Windows-1252 with UTF-8 equivalents.
  77. x = replacementTable[x-0x80]
  78. } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
  79. // Replace invalid characters with the replacement character.
  80. return 0xfffd, size, INVALID_CHAR
  81. }
  82. r = x
  83. status = SUCCESS
  84. return
  85. }
  86. // Look for a named entity in EntityList.
  87. possible := entityList
  88. for len(possible) > 0 {
  89. if len(p) <= n {
  90. leftover = 0
  91. return 0, 0, NO_ROOM
  92. }
  93. c := p[n]
  94. // Narrow down the selection in possible to those items that have c in the
  95. // appropriate byte.
  96. first := sort.Search(len(possible), func(i int) bool {
  97. e := possible[i].name
  98. if len(e) < n {
  99. return false
  100. }
  101. return e[n-1] >= c
  102. })
  103. possible = possible[first:]
  104. last := sort.Search(len(possible), func(i int) bool {
  105. return possible[i].name[n-1] > c
  106. })
  107. possible = possible[:last]
  108. n++
  109. if len(possible) > 0 && len(possible[0].name) == n-1 {
  110. r, leftover = possible[0].r1, possible[0].r2
  111. size = n
  112. status = SUCCESS
  113. // but don't return yet, since we need the longest match
  114. }
  115. }
  116. return
  117. }
  118. }
  119. // This table is copied from /src/pkg/html/escape.go in the Go source
  120. //
  121. // These replacements permit compatibility with old numeric entities that
  122. // assumed Windows-1252 encoding.
  123. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  124. var replacementTable = [...]rune{
  125. '\u20AC', // First entry is what 0x80 should be replaced with.
  126. '\u0081',
  127. '\u201A',
  128. '\u0192',
  129. '\u201E',
  130. '\u2026',
  131. '\u2020',
  132. '\u2021',
  133. '\u02C6',
  134. '\u2030',
  135. '\u0160',
  136. '\u2039',
  137. '\u0152',
  138. '\u008D',
  139. '\u017D',
  140. '\u008F',
  141. '\u0090',
  142. '\u2018',
  143. '\u2019',
  144. '\u201C',
  145. '\u201D',
  146. '\u2022',
  147. '\u2013',
  148. '\u2014',
  149. '\u02DC',
  150. '\u2122',
  151. '\u0161',
  152. '\u203A',
  153. '\u0153',
  154. '\u009D',
  155. '\u017E',
  156. '\u0178', // Last entry is 0x9F.
  157. // 0x00->'\uFFFD' is handled programmatically.
  158. // 0x0D->'\u000D' is a no-op.
  159. }