You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

229 lines
7.3 KiB

10 years ago
  1. package mahonia
  2. import (
  3. "bytes"
  4. "io/ioutil"
  5. "testing"
  6. )
  // nameTests maps a charset name, as a caller might spell it, to the
  // simplified form that TestSimplifyName expects simplifyName to return.
  var nameTests = map[string]string{
  	"":           "",
  	"Big5":       "big5",
  	"ISO 8859-1": "iso88591",
  	"utf8":       "utf8",
  }
  13. func TestSimplifyName(t *testing.T) {
  14. for name, simple := range nameTests {
  15. if simple != simplifyName(name) {
  16. t.Errorf("%s came out as %s instead of as %s", name, simplifyName(name), simple)
  17. }
  18. }
  19. }
  20. var testData = []struct {
  21. utf8, other, otherEncoding string
  22. }{
  23. {"Résumé", "Résumé", "utf8"},
  24. {"Résumé", "R\xe9sum\xe9", "latin-1"},
  25. {"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"},
  26. {"これは漢字です。", "0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16BE"},
  27. {"これは漢字です。", "\xfe\xff0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16"},
  28. {"𝄢𝄞𝄪𝄫", "\xfe\xff\xd8\x34\xdd\x22\xd8\x34\xdd\x1e\xd8\x34\xdd\x2a\xd8\x34\xdd\x2b", "UTF-16"},
  29. {"Hello, world", "Hello, world", "ASCII"},
  30. {"Gdańsk", "Gda\xf1sk", "ISO-8859-2"},
  31. {"Ââ Čč Đđ Ŋŋ Õõ Šš Žž Åå Ää", "\xc2\xe2 \xc8\xe8 \xa9\xb9 \xaf\xbf \xd5\xf5 \xaa\xba \xac\xbc \xc5\xe5 \xc4\xe4", "ISO-8859-10"},
  32. {"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "ISO-8859-11"},
  33. {"latviešu", "latvie\xf0u", "ISO-8859-13"},
  34. {"Seònaid", "Se\xf2naid", "ISO-8859-14"},
  35. {"€1 is cheap", "\xa41 is cheap", "ISO-8859-15"},
  36. {"românește", "rom\xe2ne\xbate", "ISO-8859-16"},
  37. {"nutraĵo", "nutra\xbco", "ISO-8859-3"},
  38. {"Kalâdlit", "Kal\xe2dlit", "ISO-8859-4"},
  39. {"русский", "\xe0\xe3\xe1\xe1\xda\xd8\xd9", "ISO-8859-5"},
  40. {"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "ISO-8859-7"},
  41. {"Kağan", "Ka\xf0an", "ISO-8859-9"},
  42. {"Résumé", "R\x8esum\x8e", "macintosh"},
  43. {"Gdańsk", "Gda\xf1sk", "windows-1250"},
  44. {"русский", "\xf0\xf3\xf1\xf1\xea\xe8\xe9", "windows-1251"},
  45. {"Résumé", "R\xe9sum\xe9", "windows-1252"},
  46. {"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "windows-1253"},
  47. {"Kağan", "Ka\xf0an", "windows-1254"},
  48. {"עִבְרִית", "\xf2\xc4\xe1\xc0\xf8\xc4\xe9\xfa", "windows-1255"},
  49. {"العربية", "\xc7\xe1\xda\xd1\xc8\xed\xc9", "windows-1256"},
  50. {"latviešu", "latvie\xf0u", "windows-1257"},
  51. {"Việt", "Vi\xea\xf2t", "windows-1258"},
  52. {"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "windows-874"},
  53. {"русский", "\xd2\xd5\xd3\xd3\xcb\xc9\xca", "KOI8-R"},
  54. {"українська", "\xd5\xcb\xd2\xc1\xa7\xce\xd3\xd8\xcb\xc1", "KOI8-U"},
  55. {"Hello 常用國字標準字體表", "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed", "big5"},
  56. {"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gbk"},
  57. {"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gb18030"},
  58. {"עִבְרִית", "\x81\x30\xfb\x30\x81\x30\xf6\x34\x81\x30\xf9\x33\x81\x30\xf6\x30\x81\x30\xfb\x36\x81\x30\xf6\x34\x81\x30\xfa\x31\x81\x30\xfb\x38", "gb18030"},
  59. {"㧯", "\x82\x31\x89\x38", "gb18030"},
  60. {"これは漢字です。", "\x82\xb1\x82\xea\x82\xcd\x8a\xbf\x8e\x9a\x82\xc5\x82\xb7\x81B", "SJIS"},
  61. {"Hello, 世界!", "Hello, \x90\xa2\x8aE!", "SJIS"},
  62. {"イウエオカ", "\xb2\xb3\xb4\xb5\xb6", "SJIS"},
  63. {"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "EUC-JP"},
  64. {"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "CP51932"},
  65. {"Thông tin bạn đồng hànhỌ", "Th\xabng tin b\xb9n \xae\xe5ng h\xb5nhO\xe4", "TCVN3"},
  66. {"Hello, 世界!", "Hello, \x1b$B@$3&\x1b(B!", "ISO-2022-JP"},
  67. {"네이트 | 즐거움의 시작, 슈파스(Spaβ) NATE", "\xb3\xd7\xc0\xcc\xc6\xae | \xc1\xf1\xb0\xc5\xbf\xf2\xc0\xc7 \xbd\xc3\xc0\xdb, \xbd\xb4\xc6\xc4\xbd\xba(Spa\xa5\xe2) NATE", "EUC-KR"},
  68. }
  69. func TestDecode(t *testing.T) {
  70. for _, data := range testData {
  71. d := NewDecoder(data.otherEncoding)
  72. if d == nil {
  73. t.Errorf("Could not create decoder for %s", data.otherEncoding)
  74. continue
  75. }
  76. str := d.ConvertString(data.other)
  77. if str != data.utf8 {
  78. t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
  79. }
  80. }
  81. }
  82. func TestDecodeTranslate(t *testing.T) {
  83. for _, data := range testData {
  84. d := NewDecoder(data.otherEncoding)
  85. if d == nil {
  86. t.Errorf("Could not create decoder for %s", data.otherEncoding)
  87. continue
  88. }
  89. _, cdata, _ := d.Translate([]byte(data.other), true)
  90. str := string(cdata)
  91. if str != data.utf8 {
  92. t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
  93. }
  94. }
  95. }
  96. func TestEncode(t *testing.T) {
  97. for _, data := range testData {
  98. e := NewEncoder(data.otherEncoding)
  99. if e == nil {
  100. t.Errorf("Could not create encoder for %s", data.otherEncoding)
  101. continue
  102. }
  103. str := e.ConvertString(data.utf8)
  104. if str != data.other {
  105. t.Errorf("Unexpected value: %#v (expected %#v)", str, data.other)
  106. }
  107. }
  108. }
  109. func TestReader(t *testing.T) {
  110. for _, data := range testData {
  111. d := NewDecoder(data.otherEncoding)
  112. if d == nil {
  113. t.Errorf("Could not create decoder for %s", data.otherEncoding)
  114. continue
  115. }
  116. b := bytes.NewBufferString(data.other)
  117. r := d.NewReader(b)
  118. result, _ := ioutil.ReadAll(r)
  119. str := string(result)
  120. if str != data.utf8 {
  121. t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
  122. }
  123. }
  124. }
  125. func TestWriter(t *testing.T) {
  126. for _, data := range testData {
  127. e := NewEncoder(data.otherEncoding)
  128. if e == nil {
  129. t.Errorf("Could not create encoder for %s", data.otherEncoding)
  130. continue
  131. }
  132. b := new(bytes.Buffer)
  133. w := e.NewWriter(b)
  134. w.Write([]byte(data.utf8))
  135. str := b.String()
  136. if str != data.other {
  137. t.Errorf("Unexpected value: %#v (expected %#v)", str, data.other)
  138. }
  139. }
  140. }
  141. func TestFallback(t *testing.T) {
  142. mixed := "résum\xe9 " // The space is needed because of the issue mentioned in the Note: in fallback.go
  143. pure := "résumé "
  144. d := FallbackDecoder(NewDecoder("utf8"), NewDecoder("ISO-8859-1"))
  145. result := d.ConvertString(mixed)
  146. if result != pure {
  147. t.Errorf("Unexpected value: %#v (expected %#v)", result, pure)
  148. }
  149. }
  150. func TestEntities(t *testing.T) {
  151. escaped := "&notit; I'm ∉ I tell you&#X82 ≪⃒ "
  152. plain := "¬it; I'm ∉ I tell you\u201a \u226A\u20D2 "
  153. d := FallbackDecoder(EntityDecoder(), NewDecoder("ISO-8859-1"))
  154. result := d.ConvertString(escaped)
  155. if result != plain {
  156. t.Errorf("Unexpected value: %#v (expected %#v)", result, plain)
  157. }
  158. }
  159. func TestConvertStringOK(t *testing.T) {
  160. d := NewDecoder("ASCII")
  161. if d == nil {
  162. t.Fatal("Could not create decoder for ASCII")
  163. }
  164. str, ok := d.ConvertStringOK("hello")
  165. if !ok {
  166. t.Error("Spurious error found while decoding")
  167. }
  168. if str != "hello" {
  169. t.Errorf("expected %#v, got %#v", "hello", str)
  170. }
  171. str, ok = d.ConvertStringOK("\x80")
  172. if ok {
  173. t.Error(`Failed to detect error decoding "\x80"`)
  174. }
  175. e := NewEncoder("ISO-8859-3")
  176. if e == nil {
  177. t.Fatal("Could not create encoder for ISO-8859-1")
  178. }
  179. str, ok = e.ConvertStringOK("nutraĵo")
  180. if !ok {
  181. t.Error("spurious error while encoding")
  182. }
  183. if str != "nutra\xbco" {
  184. t.Errorf("expected %#v, got %#v", "nutra\xbco", str)
  185. }
  186. str, ok = e.ConvertStringOK("\x80abc")
  187. if ok {
  188. t.Error("failed to detect invalid UTF-8 while encoding")
  189. }
  190. str, ok = e.ConvertStringOK("русский")
  191. if ok {
  192. t.Error("failed to detect characters that couldn't be encoded")
  193. }
  194. }
  195. func TestBadCharset(t *testing.T) {
  196. d := NewDecoder("this is not a valid charset")
  197. if d != nil {
  198. t.Fatal("got a non-nil decoder for an invalid charset")
  199. }
  200. }