You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

396 lines
12 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "bytes"
  7. "fmt"
  8. "sort"
  9. "strconv"
  10. "golang.org/x/text/internal/tag"
  11. )
  12. // findIndex tries to find the given tag in idx and returns a standardized error
  13. // if it could not be found.
  14. func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
  15. if !tag.FixCase(form, key) {
  16. return 0, errSyntax
  17. }
  18. i := idx.Index(key)
  19. if i == -1 {
  20. return 0, mkErrInvalid(key)
  21. }
  22. return i, nil
  23. }
  24. func searchUint(imap []uint16, key uint16) int {
  25. return sort.Search(len(imap), func(i int) bool {
  26. return imap[i] >= key
  27. })
  28. }
// langID is a compact numeric identifier for a base language subtag.
// The zero value is rendered as "und". Values at or above langNoIndexOffset
// carry a 3-letter code packed with strToInt (offset by langNoIndexOffset);
// any other value is used as a position in the lang table.
type langID uint16
  30. // getLangID returns the langID of s if s is a canonical subtag
  31. // or langUnknown if s is not a canonical subtag.
  32. func getLangID(s []byte) (langID, error) {
  33. if len(s) == 2 {
  34. return getLangISO2(s)
  35. }
  36. return getLangISO3(s)
  37. }
  38. // mapLang returns the mapped langID of id according to mapping m.
  39. func normLang(id langID) (langID, langAliasType) {
  40. k := sort.Search(len(langAliasMap), func(i int) bool {
  41. return langAliasMap[i].from >= uint16(id)
  42. })
  43. if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
  44. return langID(langAliasMap[k].to), langAliasTypes[k]
  45. }
  46. return id, langAliasTypeUnknown
  47. }
  48. // getLangISO2 returns the langID for the given 2-letter ISO language code
  49. // or unknownLang if this does not exist.
  50. func getLangISO2(s []byte) (langID, error) {
  51. if !tag.FixCase("zz", s) {
  52. return 0, errSyntax
  53. }
  54. if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
  55. return langID(i), nil
  56. }
  57. return 0, mkErrInvalid(s)
  58. }
  59. const base = 'z' - 'a' + 1
  60. func strToInt(s []byte) uint {
  61. v := uint(0)
  62. for i := 0; i < len(s); i++ {
  63. v *= base
  64. v += uint(s[i] - 'a')
  65. }
  66. return v
  67. }
  68. // converts the given integer to the original ASCII string passed to strToInt.
  69. // len(s) must match the number of characters obtained.
  70. func intToStr(v uint, s []byte) {
  71. for i := len(s) - 1; i >= 0; i-- {
  72. s[i] = byte(v%base) + 'a'
  73. v /= base
  74. }
  75. }
  76. // getLangISO3 returns the langID for the given 3-letter ISO language code
  77. // or unknownLang if this does not exist.
  78. func getLangISO3(s []byte) (langID, error) {
  79. if tag.FixCase("und", s) {
  80. // first try to match canonical 3-letter entries
  81. for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
  82. if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
  83. // We treat "und" as special and always translate it to "unspecified".
  84. // Note that ZZ and Zzzz are private use and are not treated as
  85. // unspecified by default.
  86. id := langID(i)
  87. if id == nonCanonicalUnd {
  88. return 0, nil
  89. }
  90. return id, nil
  91. }
  92. }
  93. if i := altLangISO3.Index(s); i != -1 {
  94. return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
  95. }
  96. n := strToInt(s)
  97. if langNoIndex[n/8]&(1<<(n%8)) != 0 {
  98. return langID(n) + langNoIndexOffset, nil
  99. }
  100. // Check for non-canonical uses of ISO3.
  101. for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
  102. if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
  103. return langID(i), nil
  104. }
  105. }
  106. return 0, mkErrInvalid(s)
  107. }
  108. return 0, errSyntax
  109. }
  110. // stringToBuf writes the string to b and returns the number of bytes
  111. // written. cap(b) must be >= 3.
  112. func (id langID) stringToBuf(b []byte) int {
  113. if id >= langNoIndexOffset {
  114. intToStr(uint(id)-langNoIndexOffset, b[:3])
  115. return 3
  116. } else if id == 0 {
  117. return copy(b, "und")
  118. }
  119. l := lang[id<<2:]
  120. if l[3] == 0 {
  121. return copy(b, l[:3])
  122. }
  123. return copy(b, l[:2])
  124. }
  125. // String returns the BCP 47 representation of the langID.
  126. // Use b as variable name, instead of id, to ensure the variable
  127. // used is consistent with that of Base in which this type is embedded.
  128. func (b langID) String() string {
  129. if b == 0 {
  130. return "und"
  131. } else if b >= langNoIndexOffset {
  132. b -= langNoIndexOffset
  133. buf := [3]byte{}
  134. intToStr(uint(b), buf[:])
  135. return string(buf[:])
  136. }
  137. l := lang.Elem(int(b))
  138. if l[3] == 0 {
  139. return l[:3]
  140. }
  141. return l[:2]
  142. }
  143. // ISO3 returns the ISO 639-3 language code.
  144. func (b langID) ISO3() string {
  145. if b == 0 || b >= langNoIndexOffset {
  146. return b.String()
  147. }
  148. l := lang.Elem(int(b))
  149. if l[3] == 0 {
  150. return l[:3]
  151. } else if l[2] == 0 {
  152. return altLangISO3.Elem(int(l[3]))[:3]
  153. }
  154. // This allocation will only happen for 3-letter ISO codes
  155. // that are non-canonical BCP 47 language identifiers.
  156. return l[0:1] + l[2:4]
  157. }
  158. // IsPrivateUse reports whether this language code is reserved for private use.
  159. func (b langID) IsPrivateUse() bool {
  160. return langPrivateStart <= b && b <= langPrivateEnd
  161. }
// regionID is a compact numeric identifier for a region subtag.
// The zero value is rendered as "ZZ". Other values below isoRegionOffset
// are rendered by their zero-padded M49 number; values at or above
// isoRegionOffset are positions in regionISO once the offset is subtracted.
type regionID uint16
  163. // getRegionID returns the region id for s if s is a valid 2-letter region code
  164. // or unknownRegion.
  165. func getRegionID(s []byte) (regionID, error) {
  166. if len(s) == 3 {
  167. if isAlpha(s[0]) {
  168. return getRegionISO3(s)
  169. }
  170. if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
  171. return getRegionM49(int(i))
  172. }
  173. }
  174. return getRegionISO2(s)
  175. }
  176. // getRegionISO2 returns the regionID for the given 2-letter ISO country code
  177. // or unknownRegion if this does not exist.
  178. func getRegionISO2(s []byte) (regionID, error) {
  179. i, err := findIndex(regionISO, s, "ZZ")
  180. if err != nil {
  181. return 0, err
  182. }
  183. return regionID(i) + isoRegionOffset, nil
  184. }
  185. // getRegionISO3 returns the regionID for the given 3-letter ISO country code
  186. // or unknownRegion if this does not exist.
  187. func getRegionISO3(s []byte) (regionID, error) {
  188. if tag.FixCase("ZZZ", s) {
  189. for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
  190. if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
  191. return regionID(i) + isoRegionOffset, nil
  192. }
  193. }
  194. for i := 0; i < len(altRegionISO3); i += 3 {
  195. if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
  196. return regionID(altRegionIDs[i/3]), nil
  197. }
  198. }
  199. return 0, mkErrInvalid(s)
  200. }
  201. return 0, errSyntax
  202. }
  203. func getRegionM49(n int) (regionID, error) {
  204. if 0 < n && n <= 999 {
  205. const (
  206. searchBits = 7
  207. regionBits = 9
  208. regionMask = 1<<regionBits - 1
  209. )
  210. idx := n >> searchBits
  211. buf := fromM49[m49Index[idx]:m49Index[idx+1]]
  212. val := uint16(n) << regionBits // we rely on bits shifting out
  213. i := sort.Search(len(buf), func(i int) bool {
  214. return buf[i] >= val
  215. })
  216. if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
  217. return regionID(r & regionMask), nil
  218. }
  219. }
  220. var e ValueError
  221. fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
  222. return 0, e
  223. }
  224. // normRegion returns a region if r is deprecated or 0 otherwise.
  225. // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
  226. // TODO: consider mapping split up regions to new most populous one (like CLDR).
  227. func normRegion(r regionID) regionID {
  228. m := regionOldMap
  229. k := sort.Search(len(m), func(i int) bool {
  230. return m[i].from >= uint16(r)
  231. })
  232. if k < len(m) && m[k].from == uint16(r) {
  233. return regionID(m[k].to)
  234. }
  235. return 0
  236. }
// Region type flags. Each constant is a distinct bit (1, 2 and 4).
// iso3166UserAssigned is tested against the value of regionID.typ by
// regionID.IsPrivateUse.
// NOTE(review): ccTLD and bcp47Region are presumably stored in regionTypes
// the same way; confirm against the code that generates that table.
const (
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)
  242. func (r regionID) typ() byte {
  243. return regionTypes[r]
  244. }
  245. // String returns the BCP 47 representation for the region.
  246. // It returns "ZZ" for an unspecified region.
  247. func (r regionID) String() string {
  248. if r < isoRegionOffset {
  249. if r == 0 {
  250. return "ZZ"
  251. }
  252. return fmt.Sprintf("%03d", r.M49())
  253. }
  254. r -= isoRegionOffset
  255. return regionISO.Elem(int(r))[:2]
  256. }
  257. // ISO3 returns the 3-letter ISO code of r.
  258. // Note that not all regions have a 3-letter ISO code.
  259. // In such cases this method returns "ZZZ".
  260. func (r regionID) ISO3() string {
  261. if r < isoRegionOffset {
  262. return "ZZZ"
  263. }
  264. r -= isoRegionOffset
  265. reg := regionISO.Elem(int(r))
  266. switch reg[2] {
  267. case 0:
  268. return altRegionISO3[reg[3]:][:3]
  269. case ' ':
  270. return "ZZZ"
  271. }
  272. return reg[0:1] + reg[2:4]
  273. }
  274. // M49 returns the UN M.49 encoding of r, or 0 if this encoding
  275. // is not defined for r.
  276. func (r regionID) M49() int {
  277. return int(m49[r])
  278. }
  279. // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
  280. // may include private-use tags that are assigned by CLDR and used in this
  281. // implementation. So IsPrivateUse and IsCountry can be simultaneously true.
  282. func (r regionID) IsPrivateUse() bool {
  283. return r.typ()&iso3166UserAssigned != 0
  284. }
// scriptID is a compact numeric identifier for a script subtag.
// The zero value is rendered as "Zzzz"; any other value is used as a
// position in the script table.
type scriptID uint8
  286. // getScriptID returns the script id for string s. It assumes that s
  287. // is of the format [A-Z][a-z]{3}.
  288. func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
  289. i, err := findIndex(idx, s, "Zzzz")
  290. return scriptID(i), err
  291. }
  292. // String returns the script code in title case.
  293. // It returns "Zzzz" for an unspecified script.
  294. func (s scriptID) String() string {
  295. if s == 0 {
  296. return "Zzzz"
  297. }
  298. return script.Elem(int(s))
  299. }
  300. // IsPrivateUse reports whether this script code is reserved for private use.
  301. func (s scriptID) IsPrivateUse() bool {
  302. return _Qaaa <= s && s <= _Qabx
  303. }
// Size limits for the grandfathered-tag lookup.
const (
	// maxAltTaglen is the byte length of "en-US-POSIX"; it is the array
	// length of the argument accepted by grandfathered.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the array length of the keys of grandfatheredMap. It is
	// defined as maxAltTaglen.
	maxLen = maxAltTaglen
)
  308. var (
  309. // grandfatheredMap holds a mapping from legacy and grandfathered tags to
  310. // their base language or index to more elaborate tag.
  311. grandfatheredMap = map[[maxLen]byte]int16{
  312. [maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
  313. [maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
  314. [maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
  315. [maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
  316. [maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
  317. [maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
  318. [maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
  319. [maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
  320. [maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
  321. [maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
  322. [maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
  323. [maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
  324. [maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
  325. [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
  326. [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
  327. [maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
  328. [maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
  329. [maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
  330. [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
  331. [maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
  332. // Grandfathered tags with no modern replacement will be converted as
  333. // follows:
  334. [maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
  335. [maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
  336. [maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
  337. [maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
  338. [maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
  339. [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
  340. // CLDR-specific tag.
  341. [maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
  342. [maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
  343. }
  344. altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
  345. altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
  346. )
  347. func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
  348. if v, ok := grandfatheredMap[s]; ok {
  349. if v < 0 {
  350. return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
  351. }
  352. t.lang = langID(v)
  353. return t, true
  354. }
  355. return t, false
  356. }