You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
2.8 KiB

10 years ago
  1. package mahonia
  2. import (
  3. "sync"
  4. )
  5. // Converters for GB18030 encoding.
  6. func init() {
  7. RegisterCharset(&Charset{
  8. Name: "GB18030",
  9. NewDecoder: func() Decoder {
  10. gb18030Once.Do(buildGB18030Tables)
  11. return decodeGB18030Rune
  12. },
  13. NewEncoder: func() Encoder {
  14. gb18030Once.Do(buildGB18030Tables)
  15. return encodeGB18030Rune
  16. },
  17. })
  18. }
  19. func decodeGB18030Rune(p []byte) (r rune, size int, status Status) {
  20. if len(p) == 0 {
  21. status = NO_ROOM
  22. return
  23. }
  24. b := p[0]
  25. if b < 128 {
  26. return rune(b), 1, SUCCESS
  27. }
  28. if len(p) < 2 {
  29. status = NO_ROOM
  30. return
  31. }
  32. if p[0] < 0x81 || p[0] > 0xfe {
  33. return 0xfffd, 1, INVALID_CHAR
  34. }
  35. if p[1] >= 0x40 {
  36. // 2-byte character
  37. c := uint16(p[0])<<8 + uint16(p[1])
  38. r = rune(gbkToUnicode[c])
  39. if r == 0 {
  40. r = gbkToUnicodeExtra[c]
  41. }
  42. if r != 0 {
  43. return r, 2, SUCCESS
  44. }
  45. } else if p[1] >= 0x30 {
  46. // 4-byte character
  47. if len(p) < 4 {
  48. return 0, 0, NO_ROOM
  49. }
  50. if p[2] < 0x81 || p[2] > 0xfe || p[3] < 0x30 || p[3] > 0x39 {
  51. return 0xfffd, 1, INVALID_CHAR
  52. }
  53. code := uint32(p[0])<<24 + uint32(p[1])<<16 + uint32(p[2])<<8 + uint32(p[3])
  54. lin := gb18030Linear(code)
  55. if lin <= maxGB18030Linear {
  56. r = rune(gb18030LinearToUnicode[lin])
  57. if r != 0 {
  58. return r, 4, SUCCESS
  59. }
  60. }
  61. for _, rng := range gb18030Ranges {
  62. if lin >= rng.firstGB && lin <= rng.lastGB {
  63. return rng.firstRune + rune(lin) - rune(rng.firstGB), 4, SUCCESS
  64. }
  65. }
  66. }
  67. return 0xfffd, 1, INVALID_CHAR
  68. }
  69. func encodeGB18030Rune(p []byte, r rune) (size int, status Status) {
  70. if len(p) == 0 {
  71. status = NO_ROOM
  72. return
  73. }
  74. if r < 128 {
  75. p[0] = byte(r)
  76. return 1, SUCCESS
  77. }
  78. if len(p) < 2 {
  79. status = NO_ROOM
  80. return
  81. }
  82. var c uint16
  83. if r < 0x10000 {
  84. c = unicodeToGBK[r]
  85. } else {
  86. c = unicodeToGBKExtra[r]
  87. }
  88. if c != 0 {
  89. p[0] = byte(c >> 8)
  90. p[1] = byte(c)
  91. return 2, SUCCESS
  92. }
  93. if len(p) < 4 {
  94. return 0, NO_ROOM
  95. }
  96. if r < 0x10000 {
  97. f := unicodeToGB18030[r]
  98. if f != 0 {
  99. p[0] = byte(f >> 24)
  100. p[1] = byte(f >> 16)
  101. p[2] = byte(f >> 8)
  102. p[3] = byte(f)
  103. return 4, SUCCESS
  104. }
  105. }
  106. for _, rng := range gb18030Ranges {
  107. if r >= rng.firstRune && r <= rng.lastRune {
  108. lin := rng.firstGB + uint32(r) - uint32(rng.firstRune)
  109. p[0] = byte(lin/(10*126*10)) + 0x81
  110. p[1] = byte(lin/(126*10)%10) + 0x30
  111. p[2] = byte(lin/10%126) + 0x81
  112. p[3] = byte(lin%10) + 0x30
  113. return 4, SUCCESS
  114. }
  115. }
  116. p[0] = 0x1a
  117. return 1, INVALID_CHAR
  118. }
  119. var gb18030Once sync.Once
  120. // Mapping from gb18039Linear values to Unicode.
  121. var gb18030LinearToUnicode []uint16
  122. var unicodeToGB18030 []uint32
  123. func buildGB18030Tables() {
  124. gb18030LinearToUnicode = make([]uint16, maxGB18030Linear+1)
  125. unicodeToGB18030 = make([]uint32, 65536)
  126. for _, data := range gb18030Data {
  127. gb18030LinearToUnicode[gb18030Linear(data.gb18030)] = data.unicode
  128. unicodeToGB18030[data.unicode] = data.gb18030
  129. }
  130. }