#!/usr/bin/env python3
#
# Copyright 2022 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
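"""Unit tests for the gflanguages language metadata: exemplar characters,
sample texts, scripts, regions, and language naming."""
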
from collections import defaultdict, Counter
import unicodedata

import pytest
import regex
import youseedee

from gflanguages import (
    LoadLanguages,
    languages_public_pb2,
    LoadScripts,
    LoadRegions,
    parse,
)

LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()
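
# CLDR and the Unicode Character Database (UCD) sometimes use different
# names for the same script; map CLDR display names to UCD script names so
# the two sources can be compared.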
CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Makasar": "Buginese",
    "Lanna": "Tai Tham",
    "Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
    "S-A Cuneiform": "Cuneiform",
    "Pollard Phonetic": "Miao",
    "Egyptian hieroglyphs": "Egyptian Hieroglyphs",
    "Zanabazar": "Zanabazar Square",
    "Nüshu": "Nushu",
    "Mandaean": "Mandaic",
    "N’Ko": "Nko",
    "Varang Kshiti": "Warang Citi",
    "Mende": "Mende Kikakui",
    "Phags-pa": "Phags Pa",
    "Fraser": "Lisu",
    "Georgian Khutsuri": "Georgian",
    "Orkhon": "Old Turkic",
}
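
# Languages whose exemplar data legitimately mixes scripts; the in-script
# checks below skip these, for the reasons recorded here.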
SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "ykg_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "ady_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "sla_Latn": "Does indeed use Cyrillic glyphs (ь) when written in Latin",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Uses Latin glyphs that will be added in Unicode 16",
    "gov_Latn": "Uses Latin glyphs that will be added in Unicode 16",
}
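
# Entries with no meaningful region: language groups and constructed
# languages. test_region_is_known skips these.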
SKIP_REGION = {
    "cpf_Latn": "French-based creole languages are a group of languages.",
    "gem_Latn": "Germanic languages are a group of languages.",
    "sla_Latn": "Slavic languages are a group of languages.",
    "hmn_Latn": "Hmongic languages are a group of languages.",
    "ie_Latn": "Interlingue is an artificial language.",
    "io_Latn": "Ido is an artificial language.",
    "jbo_Latn": "Lojban is an artificial language.",
    "tlh_Latn": "Klingon is an artificial language.",
}
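
# A language name is "LANGUAGE", "LANGUAGE, MODIFIER", or
# "LANGUAGE, MODIFIER (SCRIPT)", using letters, spaces, hyphens and
# apostrophes only.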
LANGUAGE_NAME_REGEX = regex.compile(
    r"^[-'’ʼ\p{L} ]+(, [-'’ʼ\p{L}/ ]+)?( [(][-'’ʼ\p{L} ]+[)])?$"
)

# Some scripts have abbreviated names for reference in language names that are
# sufficient in context. If an alternate is listed here, it should be used
# universally and consistently across all language names.
ALTERNATE_SCRIPT_NAMES = {
    "Dupl": "Duployan",
    "Hans": "Simplified",
    "Hant": "Traditional",
}


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
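    """No two entries in an exemplar list may be canonically equivalent,
    i.e. normalize to the same NFC string."""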
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    normalized = defaultdict(set)
    for g in exemplar:
        if g[0] == "{" and g[-1] == "}":
            g = g.lstrip("{").rstrip("}")
        normalized[unicodedata.normalize("NFC", g)].add(g)
    result = [(len(gs), n) for n, gs in normalized.items()]
    expected = [(1, n) for n in normalized]
    assert result == expected


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_duplicates(lang_code, exemplar_name):
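    """An exemplar list must not contain literal duplicate entries."""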
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    counter = Counter(exemplar)
    counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
    assert counts == [(v, 1) for v in exemplar]


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
)
def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
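    """For Latin-script languages, multi-character exemplar entries must be
    brace-wrapped sequences of at least two characters."""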
    lang = LANGUAGES[lang_code]
    if lang.script != "Latn":
        return
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    for chars in exemplar:
        if len(chars) > 1:
            assert chars.startswith("{") and chars.endswith("}")
            assert len(chars[1:-1]) > 1


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_languages_exemplars_marks_in_base(lang_code):
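    """Base exemplar entries must not start with a bare combining mark, nor
    contain U+25CC DOTTED CIRCLE."""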
    lang = LANGUAGES[lang_code]
    bases = lang.exemplar_chars.base
    problems = []
    for chars in bases.split():
        if len(chars) > 1:
            chars = chars.lstrip("{").rstrip("}")
        if unicodedata.category(chars[0]) == "Mn":
            problems.append("\u25CC" + chars)
        if "\u25CC" in chars:
            problems.append(chars)
    assert not problems, f"Found marks in base: {problems}"


SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_language_samples(lang_code):
    # Although marked as optional in the protobuf file, all SampleText
    # fields (except note) are required, so make sure they are present.
    lang = LANGUAGES[lang_code]
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
    for field in SampleText.fields:
        if field.name == "note":
            continue
        assert getattr(lang.sample_text, field.name)


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_script_is_known(lang_code):
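    """Each language's script must exist in the scripts dataset."""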
    lang = LANGUAGES[lang_code]
    script = lang.script
    assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_region_is_known(lang_code):
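    """Each region listed for a language must exist in the regions dataset."""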
    lang = LANGUAGES[lang_code]
    if lang.id in SKIP_REGION:
        pytest.skip(SKIP_REGION[lang.id])
    for region in lang.region:
        assert region in REGIONS


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_exemplars_are_in_script(lang_code):
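    """Exemplar characters (except auxiliary and index) must belong to the
    language's own script; Common and Inherited characters are allowed."""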
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.exemplar_chars.ListFields():
        pytest.skip("No exemplars for language " + lang_code)
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
    out_of_script = {}
    for field in ExemplarChars.fields:
        if field.name in ("auxiliary", "index"):
            continue
        exemplars = getattr(lang.exemplar_chars, field.name)
        # Each entry is either a brace-wrapped sequence or a run of
        # non-space characters.
        group_of_chars = regex.findall(r"(\{[^}]+\}|\S+)", exemplars)
        for chars in group_of_chars:
            for char in chars:
                char_script = youseedee.ucd_data(ord(char)).get("Script")
                if char_script in ("Common", "Inherited"):
                    continue
                char_script = char_script.replace("_", " ")
                if char_script != script_name:
                    out_of_script[chars] = char_script
                    break
    assert not out_of_script, (
        f"{lang_code} exemplars contained out-of-script characters"
        f": {', '.join(out_of_script.keys())}"
        f" from scripts {', '.join(set(out_of_script.values()))}"
    )


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_sample_texts_are_in_script(lang_code):
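    """Sample text must be written in the language's own script; Common and
    Inherited characters and Script_Extensions matches are allowed."""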
    if lang_code in [
        "mak_Maka",
        "orv_Cyrl",
        "cu_Cyrl",
        "ff_Adlm",
        "idu_Latn",
        "ban_Bali",
    ]:
        pytest.xfail("These languages have known issues with their sample text")
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
    out_of_script = defaultdict(set)
    for field in SampleText.fields:
        if field.name == "note":
            continue
        samples = getattr(lang.sample_text, field.name)
        for char in set(samples):
            char_script = (
                youseedee.ucd_data(ord(char)).get("Script", "").replace("_", " ")
            )
            if char_script in ("Common", "Inherited"):
                continue
            if char_script != script_name:
                # A character is still acceptable if the language's script
                # appears in its Script_Extensions property.
                extensions = (
                    youseedee.ucd_data(ord(char))
                    .get("Script_Extensions", "")
                    .split(" ")
                )
                if any(ext == lang.script for ext in extensions):
                    continue
                out_of_script[char_script].add(char)
                break
    msg = []
    for script, chars in out_of_script.items():
        msg.append(f"'{''.join(chars)}' ({script} != {script_name})")
    assert not out_of_script, (
        f"{lang_code} sample text contained out-of-script characters"
        f": {', '.join(msg)}"
    )


def test_exemplar_parser():
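    """parse() should expand brace-wrapped sequences into their component
    characters, keeping combining marks as separate entries."""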
    bases = "a A ā Ā {a̍} {A̍} {kl}"
    parsed_bases = parse(bases)
    assert parsed_bases == {
        "a",
        "A",
        "ā",
        "Ā",
        "k",
        "l",
        "̍",
    }


def test_language_uniqueness():
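    """No two languages may share a display name; preferred_name, when set,
    takes precedence over name."""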
    names = Counter()
    for lang in LANGUAGES.values():
        if lang.preferred_name:
            names[lang.preferred_name] += 1
        else:
            names[lang.name] += 1
    duplicates = {name: count for name, count in names.items() if count > 1}
    if duplicates:
        pytest.fail(f"Duplicate language names: {duplicates}")


def test_language_name_structure():
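    """Language names must match LANGUAGE_NAME_REGEX, and a trailing
    parenthetical, if any, must be the (possibly abbreviated) script name."""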
    languages_with_bad_name_structure = {}
    for lang in LANGUAGES.values():
        script_name = (
            SCRIPTS[lang.script].name
            if lang.script not in ALTERNATE_SCRIPT_NAMES
            else ALTERNATE_SCRIPT_NAMES[lang.script]
        )
        names = [("name", lang.name)]
        if lang.preferred_name:
            names.append(("preferred_name", lang.preferred_name))
        bad_names = []
        for name_type, name in names:
            bad_structure = not regex.match(LANGUAGE_NAME_REGEX, name)
            bad_script_suffix = name.endswith(")") and not name.endswith(
                f"({script_name})"
            )
            if bad_structure or bad_script_suffix:
                bad_names.append(name_type)
        if bad_names:
            languages_with_bad_name_structure[lang.id] = bad_names
    if languages_with_bad_name_structure:
        misstructured_language_names = [
            language_id if len(types) == 1 else f"{language_id}: {types}"
            for language_id, types in languages_with_bad_name_structure.items()
        ]
        pytest.fail(
            "Language names without expected structure "
            f'("LANGUAGE, MODIFIER (SCRIPT)"): {misstructured_language_names}'
        )


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_id_well_formed(lang_code):
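    """A language id must start with "<language>_<script>"."""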
    if lang_code in ["tw_akuapem_Latn"]:
        pytest.xfail("Well we need to have a conversation about that")
    lang = LANGUAGES[lang_code]
    assert lang.id.startswith(lang.language + "_" + lang.script)
|