|
|
- package mahonia
-
- // decoding HTML entities
-
- import (
- "sort"
- )
-
- // EntityDecoder returns a Decoder that decodes HTML character entities.
- // If there is no valid character entity at the current position, it returns INVALID_CHAR.
- // So it needs to be combined with another Decoder via FallbackDecoder.
- func EntityDecoder() Decoder {
- var leftover rune // leftover rune from two-rune entity
- return func(p []byte) (r rune, size int, status Status) {
- if leftover != 0 {
- r = leftover
- leftover = 0
- return r, 0, SUCCESS
- }
-
- if len(p) == 0 {
- return 0, 0, NO_ROOM
- }
-
- if p[0] != '&' {
- return 0xfffd, 1, INVALID_CHAR
- }
-
- if len(p) < 3 {
- return 0, 1, NO_ROOM
- }
-
- r, size, status = 0xfffd, 1, INVALID_CHAR
- n := 1 // number of bytes read so far
-
- if p[n] == '#' {
- n++
- c := p[n]
- hex := false
- if c == 'x' || c == 'X' {
- hex = true
- n++
- }
-
- var x rune
- for n < len(p) {
- c = p[n]
- n++
- if hex {
- if '0' <= c && c <= '9' {
- x = 16*x + rune(c) - '0'
- continue
- } else if 'a' <= c && c <= 'f' {
- x = 16*x + rune(c) - 'a' + 10
- continue
- } else if 'A' <= c && c <= 'F' {
- x = 16*x + rune(c) - 'A' + 10
- continue
- }
- } else if '0' <= c && c <= '9' {
- x = 10*x + rune(c) - '0'
- continue
- }
- if c != ';' {
- n--
- }
- break
- }
-
- if n == len(p) && p[n-1] != ';' {
- return 0, 0, NO_ROOM
- }
-
- size = n
- if p[n-1] == ';' {
- n--
- }
- if hex {
- n--
- }
- n--
- // Now n is the number of actual digits read.
- if n == 0 {
- return 0xfffd, 1, INVALID_CHAR
- }
-
- if 0x80 <= x && x <= 0x9F {
- // Replace characters from Windows-1252 with UTF-8 equivalents.
- x = replacementTable[x-0x80]
- } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
- // Replace invalid characters with the replacement character.
- return 0xfffd, size, INVALID_CHAR
- }
-
- r = x
- status = SUCCESS
- return
- }
-
- // Look for a named entity in EntityList.
-
- possible := entityList
- for len(possible) > 0 {
- if len(p) <= n {
- leftover = 0
- return 0, 0, NO_ROOM
- }
-
- c := p[n]
-
- // Narrow down the selection in possible to those items that have c in the
- // appropriate byte.
- first := sort.Search(len(possible), func(i int) bool {
- e := possible[i].name
- if len(e) < n {
- return false
- }
- return e[n-1] >= c
- })
- possible = possible[first:]
- last := sort.Search(len(possible), func(i int) bool {
- return possible[i].name[n-1] > c
- })
- possible = possible[:last]
-
- n++
- if len(possible) > 0 && len(possible[0].name) == n-1 {
- r, leftover = possible[0].r1, possible[0].r2
- size = n
- status = SUCCESS
- // but don't return yet, since we need the longest match
- }
- }
-
- return
- }
- }
-
// replacementTable lists, for each numeric character reference in the range
// 0x80 through 0x9F, the code point it is replaced with. Entry i corresponds
// to the reference 0x80+i.
//
// The table is copied from /src/pkg/html/escape.go in the Go source. The
// replacements permit compatibility with old numeric entities that assumed
// Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two related cases are not part of the table: 0x00->U+FFFD is handled
// programmatically, and 0x0D->U+000D is a no-op.
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
}
|