- // Copyright 2013 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- // +build ignore
-
- // Language tag table generator.
- // Data read from the web.
-
- package main
-
- import (
- "flag"
- "fmt"
- "io"
- "log"
- "sort"
- "strconv"
- "strings"
-
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/internal/language"
- "golang.org/x/text/unicode/cldr"
- )
-
// Command-line flags of the generator.
var (
	// test is the boolean -test flag; it defaults to false.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data.")

	// outputFile is the string -output flag; it defaults to "tables.go".
	outputFile = flag.String("output", "tables.go", "output file for generated tables")
)
-
- func main() {
- gen.Init()
-
- w := gen.NewCodeWriter()
- defer w.WriteGoFile("tables.go", "language")
-
- b := newBuilder(w)
- gen.WriteCLDRVersion(w)
-
- b.writeConstants()
- b.writeMatchData()
- }
-
- type builder struct {
- w *gen.CodeWriter
- hw io.Writer // MultiWriter for w and w.Hash
- data *cldr.CLDR
- supp *cldr.SupplementalData
- }
-
- func (b *builder) langIndex(s string) uint16 {
- return uint16(language.MustParseBase(s))
- }
-
- func (b *builder) regionIndex(s string) int {
- return int(language.MustParseRegion(s))
- }
-
- func (b *builder) scriptIndex(s string) int {
- return int(language.MustParseScript(s))
- }
-
- func newBuilder(w *gen.CodeWriter) *builder {
- r := gen.OpenCLDRCoreZip()
- defer r.Close()
- d := &cldr.Decoder{}
- data, err := d.DecodeZip(r)
- if err != nil {
- log.Fatal(err)
- }
- b := builder{
- w: w,
- hw: io.MultiWriter(w, w.Hash),
- data: data,
- supp: data.Supplemental(),
- }
- return &b
- }
-
- // writeConsts computes f(v) for all v in values and writes the results
- // as constants named _v to a single constant block.
- func (b *builder) writeConsts(f func(string) int, values ...string) {
- fmt.Fprintln(b.w, "const (")
- for _, v := range values {
- fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
- }
- fmt.Fprintln(b.w, ")")
- }
-
- // TODO: region inclusion data will probably not be used in future matchers.
-
// langConsts lists the base languages for which writeConstants emits a
// constant named after the language with a leading underscore.
var langConsts = []string{
	"de", "en", "fr", "it",
	"mo", "no", "nb", "pt", "sh",
	"mul", "und",
}
-
// scriptConsts lists the scripts for which writeConstants emits a constant
// named after the script with a leading underscore.
var scriptConsts = []string{
	"Latn",
	"Hani", "Hans", "Hant",
	"Qaaa", "Qaai", "Qabx",
	"Zinh", "Zyyy", "Zzzz",
}
-
// regionConsts lists the regions for which writeConstants emits a constant
// named after the region with a leading underscore.
var regionConsts = []string{
	"001", "419",
	"BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC",
	"XK", // Unofficial tag for Kosovo.
}
-
- func (b *builder) writeConstants() {
- b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
- b.writeConsts(b.regionIndex, regionConsts...)
- b.writeConsts(b.scriptIndex, scriptConsts...)
- }
-
// mutualIntelligibility is the element type of the generated matchLang
// table: a pair of base languages and the distance between them.
type mutualIntelligibility struct {
	want     uint16 // index of the desired language
	have     uint16 // index of the supported language
	distance uint8
	oneway   bool // set when the CLDR entry is marked oneway
}
-
// scriptIntelligibility is the element type of the generated matchScript
// table: a desired and a supported language-script pair and the distance
// between them. Entries are always oneway; writeMatchData adds a second,
// swapped entry for CLDR matches that are not marked oneway.
type scriptIntelligibility struct {
	wantLang   uint16
	haveLang   uint16
	wantScript uint8
	haveScript uint8
	distance   uint8
}
-
// regionIntelligibility is the element type of the generated matchRegion
// table. Entries always apply in both directions (twoway).
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script id; 0 means any.
	script uint8
	// group is the region group; 0 means any. If bit 7 is set it means
	// inverse.
	group    uint8
	distance uint8
}
-
- // writeMatchData writes tables with languages and scripts for which there is
- // mutual intelligibility. The data is based on CLDR's languageMatching data.
- // Note that we use a different algorithm than the one defined by CLDR and that
- // we slightly modify the data. For example, we convert scores to confidence levels.
- // We also drop all region-related data as we use a different algorithm to
- // determine region equivalence.
- func (b *builder) writeMatchData() {
- lm := b.supp.LanguageMatching.LanguageMatches
- cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
-
- regionHierarchy := map[string][]string{}
- for _, g := range b.supp.TerritoryContainment.Group {
- regions := strings.Split(g.Contains, " ")
- regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
- }
- regionToGroups := make([]uint8, language.NumRegions)
-
- idToIndex := map[string]uint8{}
- for i, mv := range lm[0].MatchVariable {
- if i > 6 {
- log.Fatalf("Too many groups: %d", i)
- }
- idToIndex[mv.Id] = uint8(i + 1)
- // TODO: also handle '-'
- for _, r := range strings.Split(mv.Value, "+") {
- todo := []string{r}
- for k := 0; k < len(todo); k++ {
- r := todo[k]
- regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
- todo = append(todo, regionHierarchy[r]...)
- }
- }
- }
- b.w.WriteVar("regionToGroups", regionToGroups)
-
- // maps language id to in- and out-of-group region.
- paradigmLocales := [][3]uint16{}
- locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
- for i := 0; i < len(locales); i += 2 {
- x := [3]uint16{}
- for j := 0; j < 2; j++ {
- pc := strings.SplitN(locales[i+j], "-", 2)
- x[0] = b.langIndex(pc[0])
- if len(pc) == 2 {
- x[1+j] = uint16(b.regionIndex(pc[1]))
- }
- }
- paradigmLocales = append(paradigmLocales, x)
- }
- b.w.WriteVar("paradigmLocales", paradigmLocales)
-
- b.w.WriteType(mutualIntelligibility{})
- b.w.WriteType(scriptIntelligibility{})
- b.w.WriteType(regionIntelligibility{})
-
- matchLang := []mutualIntelligibility{}
- matchScript := []scriptIntelligibility{}
- matchRegion := []regionIntelligibility{}
- // Convert the languageMatch entries in lists keyed by desired language.
- for _, m := range lm[0].LanguageMatch {
- // Different versions of CLDR use different separators.
- desired := strings.Replace(m.Desired, "-", "_", -1)
- supported := strings.Replace(m.Supported, "-", "_", -1)
- d := strings.Split(desired, "_")
- s := strings.Split(supported, "_")
- if len(d) != len(s) {
- log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
- continue
- }
- distance, _ := strconv.ParseInt(m.Distance, 10, 8)
- switch len(d) {
- case 2:
- if desired == supported && desired == "*_*" {
- continue
- }
- // language-script pair.
- matchScript = append(matchScript, scriptIntelligibility{
- wantLang: uint16(b.langIndex(d[0])),
- haveLang: uint16(b.langIndex(s[0])),
- wantScript: uint8(b.scriptIndex(d[1])),
- haveScript: uint8(b.scriptIndex(s[1])),
- distance: uint8(distance),
- })
- if m.Oneway != "true" {
- matchScript = append(matchScript, scriptIntelligibility{
- wantLang: uint16(b.langIndex(s[0])),
- haveLang: uint16(b.langIndex(d[0])),
- wantScript: uint8(b.scriptIndex(s[1])),
- haveScript: uint8(b.scriptIndex(d[1])),
- distance: uint8(distance),
- })
- }
- case 1:
- if desired == supported && desired == "*" {
- continue
- }
- if distance == 1 {
- // nb == no is already handled by macro mapping. Check there
- // really is only this case.
- if d[0] != "no" || s[0] != "nb" {
- log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
- }
- continue
- }
- // TODO: consider dropping oneway field and just doubling the entry.
- matchLang = append(matchLang, mutualIntelligibility{
- want: uint16(b.langIndex(d[0])),
- have: uint16(b.langIndex(s[0])),
- distance: uint8(distance),
- oneway: m.Oneway == "true",
- })
- case 3:
- if desired == supported && desired == "*_*_*" {
- continue
- }
- if desired != supported {
- // This is now supported by CLDR, but only one case, which
- // should already be covered by paradigm locales. For instance,
- // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
- // testdata/CLDRLocaleMatcherTest.txt tests this.
- if supported != "en_*_GB" {
- log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
- }
- continue
- }
- ri := regionIntelligibility{
- lang: b.langIndex(d[0]),
- distance: uint8(distance),
- }
- if d[1] != "*" {
- ri.script = uint8(b.scriptIndex(d[1]))
- }
- switch {
- case d[2] == "*":
- ri.group = 0x80 // not contained in anything
- case strings.HasPrefix(d[2], "$!"):
- ri.group = 0x80
- d[2] = "$" + d[2][len("$!"):]
- fallthrough
- case strings.HasPrefix(d[2], "$"):
- ri.group |= idToIndex[d[2]]
- }
- matchRegion = append(matchRegion, ri)
- default:
- log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
- }
- }
- sort.SliceStable(matchLang, func(i, j int) bool {
- return matchLang[i].distance < matchLang[j].distance
- })
- b.w.WriteComment(`
- matchLang holds pairs of langIDs of base languages that are typically
- mutually intelligible. Each pair is associated with a confidence and
- whether the intelligibility goes one or both ways.`)
- b.w.WriteVar("matchLang", matchLang)
-
- b.w.WriteComment(`
- matchScript holds pairs of scriptIDs where readers of one script
- can typically also read the other. Each is associated with a confidence.`)
- sort.SliceStable(matchScript, func(i, j int) bool {
- return matchScript[i].distance < matchScript[j].distance
- })
- b.w.WriteVar("matchScript", matchScript)
-
- sort.SliceStable(matchRegion, func(i, j int) bool {
- return matchRegion[i].distance < matchRegion[j].distance
- })
- b.w.WriteVar("matchRegion", matchRegion)
- }
|