You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

305 lines
8.6 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Language tag table generator.
  6. // Data read from the web.
  7. package main
  8. import (
  9. "flag"
  10. "fmt"
  11. "io"
  12. "log"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "golang.org/x/text/internal/gen"
  17. "golang.org/x/text/internal/language"
  18. "golang.org/x/text/unicode/cldr"
  19. )
  20. var (
  21. test = flag.Bool("test",
  22. false,
  23. "test existing tables; can be used to compare web data with package data.")
  24. outputFile = flag.String("output",
  25. "tables.go",
  26. "output file for generated tables")
  27. )
  28. func main() {
  29. gen.Init()
  30. w := gen.NewCodeWriter()
  31. defer w.WriteGoFile("tables.go", "language")
  32. b := newBuilder(w)
  33. gen.WriteCLDRVersion(w)
  34. b.writeConstants()
  35. b.writeMatchData()
  36. }
  37. type builder struct {
  38. w *gen.CodeWriter
  39. hw io.Writer // MultiWriter for w and w.Hash
  40. data *cldr.CLDR
  41. supp *cldr.SupplementalData
  42. }
  43. func (b *builder) langIndex(s string) uint16 {
  44. return uint16(language.MustParseBase(s))
  45. }
  46. func (b *builder) regionIndex(s string) int {
  47. return int(language.MustParseRegion(s))
  48. }
  49. func (b *builder) scriptIndex(s string) int {
  50. return int(language.MustParseScript(s))
  51. }
  52. func newBuilder(w *gen.CodeWriter) *builder {
  53. r := gen.OpenCLDRCoreZip()
  54. defer r.Close()
  55. d := &cldr.Decoder{}
  56. data, err := d.DecodeZip(r)
  57. if err != nil {
  58. log.Fatal(err)
  59. }
  60. b := builder{
  61. w: w,
  62. hw: io.MultiWriter(w, w.Hash),
  63. data: data,
  64. supp: data.Supplemental(),
  65. }
  66. return &b
  67. }
  68. // writeConsts computes f(v) for all v in values and writes the results
  69. // as constants named _v to a single constant block.
  70. func (b *builder) writeConsts(f func(string) int, values ...string) {
  71. fmt.Fprintln(b.w, "const (")
  72. for _, v := range values {
  73. fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
  74. }
  75. fmt.Fprintln(b.w, ")")
  76. }
// TODO: region inclusion data will probably not be used in future matchers.
  78. var langConsts = []string{
  79. "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
  80. }
  81. var scriptConsts = []string{
  82. "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
  83. "Zzzz",
  84. }
  85. var regionConsts = []string{
  86. "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
  87. "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
  88. }
  89. func (b *builder) writeConstants() {
  90. b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
  91. b.writeConsts(b.regionIndex, regionConsts...)
  92. b.writeConsts(b.scriptIndex, scriptConsts...)
  93. }
  94. type mutualIntelligibility struct {
  95. want, have uint16
  96. distance uint8
  97. oneway bool
  98. }
  99. type scriptIntelligibility struct {
  100. wantLang, haveLang uint16
  101. wantScript, haveScript uint8
  102. distance uint8
  103. // Always oneway
  104. }
  105. type regionIntelligibility struct {
  106. lang uint16 // compact language id
  107. script uint8 // 0 means any
  108. group uint8 // 0 means any; if bit 7 is set it means inverse
  109. distance uint8
  110. // Always twoway.
  111. }
  112. // writeMatchData writes tables with languages and scripts for which there is
  113. // mutual intelligibility. The data is based on CLDR's languageMatching data.
  114. // Note that we use a different algorithm than the one defined by CLDR and that
  115. // we slightly modify the data. For example, we convert scores to confidence levels.
  116. // We also drop all region-related data as we use a different algorithm to
  117. // determine region equivalence.
  118. func (b *builder) writeMatchData() {
  119. lm := b.supp.LanguageMatching.LanguageMatches
  120. cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
  121. regionHierarchy := map[string][]string{}
  122. for _, g := range b.supp.TerritoryContainment.Group {
  123. regions := strings.Split(g.Contains, " ")
  124. regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
  125. }
  126. regionToGroups := make([]uint8, language.NumRegions)
  127. idToIndex := map[string]uint8{}
  128. for i, mv := range lm[0].MatchVariable {
  129. if i > 6 {
  130. log.Fatalf("Too many groups: %d", i)
  131. }
  132. idToIndex[mv.Id] = uint8(i + 1)
  133. // TODO: also handle '-'
  134. for _, r := range strings.Split(mv.Value, "+") {
  135. todo := []string{r}
  136. for k := 0; k < len(todo); k++ {
  137. r := todo[k]
  138. regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
  139. todo = append(todo, regionHierarchy[r]...)
  140. }
  141. }
  142. }
  143. b.w.WriteVar("regionToGroups", regionToGroups)
  144. // maps language id to in- and out-of-group region.
  145. paradigmLocales := [][3]uint16{}
  146. locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
  147. for i := 0; i < len(locales); i += 2 {
  148. x := [3]uint16{}
  149. for j := 0; j < 2; j++ {
  150. pc := strings.SplitN(locales[i+j], "-", 2)
  151. x[0] = b.langIndex(pc[0])
  152. if len(pc) == 2 {
  153. x[1+j] = uint16(b.regionIndex(pc[1]))
  154. }
  155. }
  156. paradigmLocales = append(paradigmLocales, x)
  157. }
  158. b.w.WriteVar("paradigmLocales", paradigmLocales)
  159. b.w.WriteType(mutualIntelligibility{})
  160. b.w.WriteType(scriptIntelligibility{})
  161. b.w.WriteType(regionIntelligibility{})
  162. matchLang := []mutualIntelligibility{}
  163. matchScript := []scriptIntelligibility{}
  164. matchRegion := []regionIntelligibility{}
  165. // Convert the languageMatch entries in lists keyed by desired language.
  166. for _, m := range lm[0].LanguageMatch {
  167. // Different versions of CLDR use different separators.
  168. desired := strings.Replace(m.Desired, "-", "_", -1)
  169. supported := strings.Replace(m.Supported, "-", "_", -1)
  170. d := strings.Split(desired, "_")
  171. s := strings.Split(supported, "_")
  172. if len(d) != len(s) {
  173. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  174. continue
  175. }
  176. distance, _ := strconv.ParseInt(m.Distance, 10, 8)
  177. switch len(d) {
  178. case 2:
  179. if desired == supported && desired == "*_*" {
  180. continue
  181. }
  182. // language-script pair.
  183. matchScript = append(matchScript, scriptIntelligibility{
  184. wantLang: uint16(b.langIndex(d[0])),
  185. haveLang: uint16(b.langIndex(s[0])),
  186. wantScript: uint8(b.scriptIndex(d[1])),
  187. haveScript: uint8(b.scriptIndex(s[1])),
  188. distance: uint8(distance),
  189. })
  190. if m.Oneway != "true" {
  191. matchScript = append(matchScript, scriptIntelligibility{
  192. wantLang: uint16(b.langIndex(s[0])),
  193. haveLang: uint16(b.langIndex(d[0])),
  194. wantScript: uint8(b.scriptIndex(s[1])),
  195. haveScript: uint8(b.scriptIndex(d[1])),
  196. distance: uint8(distance),
  197. })
  198. }
  199. case 1:
  200. if desired == supported && desired == "*" {
  201. continue
  202. }
  203. if distance == 1 {
  204. // nb == no is already handled by macro mapping. Check there
  205. // really is only this case.
  206. if d[0] != "no" || s[0] != "nb" {
  207. log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
  208. }
  209. continue
  210. }
  211. // TODO: consider dropping oneway field and just doubling the entry.
  212. matchLang = append(matchLang, mutualIntelligibility{
  213. want: uint16(b.langIndex(d[0])),
  214. have: uint16(b.langIndex(s[0])),
  215. distance: uint8(distance),
  216. oneway: m.Oneway == "true",
  217. })
  218. case 3:
  219. if desired == supported && desired == "*_*_*" {
  220. continue
  221. }
  222. if desired != supported {
  223. // This is now supported by CLDR, but only one case, which
  224. // should already be covered by paradigm locales. For instance,
  225. // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
  226. // testdata/CLDRLocaleMatcherTest.txt tests this.
  227. if supported != "en_*_GB" {
  228. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  229. }
  230. continue
  231. }
  232. ri := regionIntelligibility{
  233. lang: b.langIndex(d[0]),
  234. distance: uint8(distance),
  235. }
  236. if d[1] != "*" {
  237. ri.script = uint8(b.scriptIndex(d[1]))
  238. }
  239. switch {
  240. case d[2] == "*":
  241. ri.group = 0x80 // not contained in anything
  242. case strings.HasPrefix(d[2], "$!"):
  243. ri.group = 0x80
  244. d[2] = "$" + d[2][len("$!"):]
  245. fallthrough
  246. case strings.HasPrefix(d[2], "$"):
  247. ri.group |= idToIndex[d[2]]
  248. }
  249. matchRegion = append(matchRegion, ri)
  250. default:
  251. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  252. }
  253. }
  254. sort.SliceStable(matchLang, func(i, j int) bool {
  255. return matchLang[i].distance < matchLang[j].distance
  256. })
  257. b.w.WriteComment(`
  258. matchLang holds pairs of langIDs of base languages that are typically
  259. mutually intelligible. Each pair is associated with a confidence and
  260. whether the intelligibility goes one or both ways.`)
  261. b.w.WriteVar("matchLang", matchLang)
  262. b.w.WriteComment(`
  263. matchScript holds pairs of scriptIDs where readers of one script
  264. can typically also read the other. Each is associated with a confidence.`)
  265. sort.SliceStable(matchScript, func(i, j int) bool {
  266. return matchScript[i].distance < matchScript[j].distance
  267. })
  268. b.w.WriteVar("matchScript", matchScript)
  269. sort.SliceStable(matchRegion, func(i, j int) bool {
  270. return matchRegion[i].distance < matchRegion[j].distance
  271. })
  272. b.w.WriteVar("matchRegion", matchRegion)
  273. }