closed-social
/
gitea


								// Copyright 2013 The Go Authors. All rights reserved.

								// Use of this source code is governed by a BSD-style

								// license that can be found in the LICENSE file.


								// +build ignore


								// Language tag table generator.

								// Data read from the web.


								package main


								import (

									"flag"

									"fmt"

									"io"

									"log"

									"sort"

									"strconv"

									"strings"


									"golang.org/x/text/internal/gen"

									"golang.org/x/text/internal/language"

									"golang.org/x/text/unicode/cldr"

								)


								var (

									test = flag.Bool("test",

										false,

										"test existing tables; can be used to compare web data with package data.")

									outputFile = flag.String("output",

										"tables.go",

										"output file for generated tables")

								)


								func main() {

									gen.Init()


									w := gen.NewCodeWriter()

									defer w.WriteGoFile("tables.go", "language")


									b := newBuilder(w)

									gen.WriteCLDRVersion(w)


									b.writeConstants()

									b.writeMatchData()

								}


								type builder struct {

									w    *gen.CodeWriter

									hw   io.Writer // MultiWriter for w and w.Hash

									data *cldr.CLDR

									supp *cldr.SupplementalData

								}


								func (b *builder) langIndex(s string) uint16 {

									return uint16(language.MustParseBase(s))

								}


								func (b *builder) regionIndex(s string) int {

									return int(language.MustParseRegion(s))

								}


								func (b *builder) scriptIndex(s string) int {

									return int(language.MustParseScript(s))

								}


								func newBuilder(w *gen.CodeWriter) *builder {

									r := gen.OpenCLDRCoreZip()

									defer r.Close()

									d := &cldr.Decoder{}

									data, err := d.DecodeZip(r)

									if err != nil {

										log.Fatal(err)

									}

									b := builder{

										w:    w,

										hw:   io.MultiWriter(w, w.Hash),

										data: data,

										supp: data.Supplemental(),

									}

									return &b

								}


								// writeConsts computes f(v) for all v in values and writes the results

								// as constants named _v to a single constant block.

								func (b *builder) writeConsts(f func(string) int, values ...string) {

									fmt.Fprintln(b.w, "const (")

									for _, v := range values {

										fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))

									}

									fmt.Fprintln(b.w, ")")

								}


								// TODO: region inclusion data will probably not be use used in future matchers.


								var langConsts = []string{

									"de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",

								}


								var scriptConsts = []string{

									"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",

									"Zzzz",

								}


								var regionConsts = []string{

									"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",

									"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.

								}


								func (b *builder) writeConstants() {

									b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)

									b.writeConsts(b.regionIndex, regionConsts...)

									b.writeConsts(b.scriptIndex, scriptConsts...)

								}


								type mutualIntelligibility struct {

									want, have uint16

									distance   uint8

									oneway     bool

								}


								type scriptIntelligibility struct {

									wantLang, haveLang     uint16

									wantScript, haveScript uint8

									distance               uint8

									// Always oneway

								}


								type regionIntelligibility struct {

									lang     uint16 // compact language id

									script   uint8  // 0 means any

									group    uint8  // 0 means any; if bit 7 is set it means inverse

									distance uint8

									// Always twoway.

								}


								// writeMatchData writes tables with languages and scripts for which there is

								// mutual intelligibility. The data is based on CLDR's languageMatching data.

								// Note that we use a different algorithm than the one defined by CLDR and that

								// we slightly modify the data. For example, we convert scores to confidence levels.

								// We also drop all region-related data as we use a different algorithm to

								// determine region equivalence.

								func (b *builder) writeMatchData() {

									lm := b.supp.LanguageMatching.LanguageMatches

									cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")


									regionHierarchy := map[string][]string{}

									for _, g := range b.supp.TerritoryContainment.Group {

										regions := strings.Split(g.Contains, " ")

										regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)

									}

									regionToGroups := make([]uint8, language.NumRegions)


									idToIndex := map[string]uint8{}

									for i, mv := range lm[0].MatchVariable {

										if i > 6 {

											log.Fatalf("Too many groups: %d", i)

										}

										idToIndex[mv.Id] = uint8(i + 1)

										// TODO: also handle '-'

										for _, r := range strings.Split(mv.Value, "+") {

											todo := []string{r}

											for k := 0; k < len(todo); k++ {

												r := todo[k]

												regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)

												todo = append(todo, regionHierarchy[r]...)

											}

										}

									}

									b.w.WriteVar("regionToGroups", regionToGroups)


									// maps language id to in- and out-of-group region.

									paradigmLocales := [][3]uint16{}

									locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")

									for i := 0; i < len(locales); i += 2 {

										x := [3]uint16{}

										for j := 0; j < 2; j++ {

											pc := strings.SplitN(locales[i+j], "-", 2)

											x[0] = b.langIndex(pc[0])

											if len(pc) == 2 {

												x[1+j] = uint16(b.regionIndex(pc[1]))

											}

										}

										paradigmLocales = append(paradigmLocales, x)

									}

									b.w.WriteVar("paradigmLocales", paradigmLocales)


									b.w.WriteType(mutualIntelligibility{})

									b.w.WriteType(scriptIntelligibility{})

									b.w.WriteType(regionIntelligibility{})


									matchLang := []mutualIntelligibility{}

									matchScript := []scriptIntelligibility{}

									matchRegion := []regionIntelligibility{}

									// Convert the languageMatch entries in lists keyed by desired language.

									for _, m := range lm[0].LanguageMatch {

										// Different versions of CLDR use different separators.

										desired := strings.Replace(m.Desired, "-", "_", -1)

										supported := strings.Replace(m.Supported, "-", "_", -1)

										d := strings.Split(desired, "_")

										s := strings.Split(supported, "_")

										if len(d) != len(s) {

											log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)

											continue

										}

										distance, _ := strconv.ParseInt(m.Distance, 10, 8)

										switch len(d) {

										case 2:

											if desired == supported && desired == "*_*" {

												continue

											}

											// language-script pair.

											matchScript = append(matchScript, scriptIntelligibility{

												wantLang:   uint16(b.langIndex(d[0])),

												haveLang:   uint16(b.langIndex(s[0])),

												wantScript: uint8(b.scriptIndex(d[1])),

												haveScript: uint8(b.scriptIndex(s[1])),

												distance:   uint8(distance),

											})

											if m.Oneway != "true" {

												matchScript = append(matchScript, scriptIntelligibility{

													wantLang:   uint16(b.langIndex(s[0])),

													haveLang:   uint16(b.langIndex(d[0])),

													wantScript: uint8(b.scriptIndex(s[1])),

													haveScript: uint8(b.scriptIndex(d[1])),

													distance:   uint8(distance),

												})

											}

										case 1:

											if desired == supported && desired == "*" {

												continue

											}

											if distance == 1 {

												// nb == no is already handled by macro mapping. Check there

												// really is only this case.

												if d[0] != "no" || s[0] != "nb" {

													log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])

												}

												continue

											}

											// TODO: consider dropping oneway field and just doubling the entry.

											matchLang = append(matchLang, mutualIntelligibility{

												want:     uint16(b.langIndex(d[0])),

												have:     uint16(b.langIndex(s[0])),

												distance: uint8(distance),

												oneway:   m.Oneway == "true",

											})

										case 3:

											if desired == supported && desired == "*_*_*" {

												continue

											}

											if desired != supported {

												// This is now supported by CLDR, but only one case, which

												// should already be covered by paradigm locales. For instance,

												// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in

												// testdata/CLDRLocaleMatcherTest.txt tests this.

												if supported != "en_*_GB" {

													log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)

												}

												continue

											}

											ri := regionIntelligibility{

												lang:     b.langIndex(d[0]),

												distance: uint8(distance),

											}

											if d[1] != "*" {

												ri.script = uint8(b.scriptIndex(d[1]))

											}

											switch {

											case d[2] == "*":

												ri.group = 0x80 // not contained in anything

											case strings.HasPrefix(d[2], "$!"):

												ri.group = 0x80

												d[2] = "$" + d[2][len("$!"):]

												fallthrough

											case strings.HasPrefix(d[2], "$"):

												ri.group |= idToIndex[d[2]]

											}

											matchRegion = append(matchRegion, ri)

										default:

											log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)

										}

									}

									sort.SliceStable(matchLang, func(i, j int) bool {

										return matchLang[i].distance < matchLang[j].distance

									})

									b.w.WriteComment(`

										matchLang holds pairs of langIDs of base languages that are typically

										mutually intelligible. Each pair is associated with a confidence and

										whether the intelligibility goes one or both ways.`)

									b.w.WriteVar("matchLang", matchLang)


									b.w.WriteComment(`

										matchScript holds pairs of scriptIDs where readers of one script

										can typically also read the other. Each is associated with a confidence.`)

									sort.SliceStable(matchScript, func(i, j int) bool {

										return matchScript[i].distance < matchScript[j].distance

									})

									b.w.WriteVar("matchScript", matchScript)


									sort.SliceStable(matchRegion, func(i, j int) bool {

										return matchRegion[i].distance < matchRegion[j].distance

									})

									b.w.WriteVar("matchRegion", matchRegion)

								}