#!/usr/bin/env python3
#
# Copyright 2022 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
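"""Sanity checks for the language metadata shipped with gflanguages.

The tests below check exemplar-character hygiene (no duplicate or
canonically equivalent entries, correct {} bracketing, characters that
belong to the declared script), that sample texts are complete, and that
every language references a known script and region.
"""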

from collections import defaultdict, Counter
import re
import unicodedata

from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts, LoadRegions
import pytest
import youseedee

LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()
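
# The script names in the gflanguages data are CLDR display names, while
# youseedee reports UCD script names; map the ones that differ so the two
# can be compared in test_exemplars_are_in_script below.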
CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Ol Chiki": "Ol_Chiki",
}
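
# Languages whose exemplars are excluded from the script check, keyed by
# language id, with the reason for skipping.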
SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
    "gov_Latn": "Does use future Unicode 16 Latin glyphs",
}
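
# Entries that have no meaningful region (language groups and constructed
# languages), keyed by language id, with the reason for skipping the
# region check.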
SKIP_REGION = {
    "cpf_Latn": "French-based creole languages are a group of languages.",
    "gem_Latn": "Germanic languages are a group of languages.",
    "sla_Latn": "Slavic languages are a group of languages.",
    "hmn_Latn": "Hmongic languages are a group of languages.",
    "ie_Latn": "Interlingue is an artificial language.",
    "io_Latn": "Ido is an artificial language.",
    "jbo_Latn": "Lojban is an artificial language.",
    "tlh_Latn": "Klingon is an artificial language.",
}


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
    """No two exemplar entries may be canonically equivalent, i.e. collapse
    to the same NFC form."""
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    normalized = defaultdict(set)
    for g in exemplar:
        if g[0] == "{" and g[-1] == "}":
            g = g.lstrip("{").rstrip("}")
        normalized[unicodedata.normalize("NFC", g)].add(g)
    result = [(len(gs), n) for n, gs in normalized.items()]
    expected = [(1, n) for n, _ in normalized.items()]
    assert result == expected


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_duplicates(lang_code, exemplar_name):
    """No exemplar entry may be listed more than once."""
    lang = LANGUAGES[lang_code]
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    counter = Counter(exemplar)
    counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
    assert counts == [(v, 1) for v in exemplar]


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
@pytest.mark.parametrize(
    "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
)
def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
    """For Latin-script languages, any exemplar entry longer than a single
    character must be a {...} sequence wrapping more than one character."""
    lang = LANGUAGES[lang_code]
    if lang.script != "Latn":
        return
    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
    for chars in exemplar:
        if len(chars) > 1:
            assert chars.startswith("{") and chars.endswith("}")
            assert len(chars[1:-1]) > 1
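

# Protobuf descriptors, used below to iterate over the SampleText and
# ExemplarChars fields without hard-coding their names.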
SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_language_samples(lang_code):
    # Although marked as optional in the protobuf file, all
    # SampleText fields (except note) are required, so make
    # sure they are present.
    lang = LANGUAGES[lang_code]
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
        return
    for field in SampleText.fields:
        if field.name == "note":
            continue
        assert getattr(lang.sample_text, field.name)


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_script_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    script = lang.script
    assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"


@pytest.mark.parametrize("lang_code", LANGUAGES)
def test_region_is_known(lang_code):
    lang = LANGUAGES[lang_code]
    if lang.id in SKIP_REGION:
        pytest.skip(SKIP_REGION[lang.id])
        return
    regions = lang.region
    for region in regions:
        assert region in REGIONS.keys()


@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_exemplars_are_in_script(lang_code):
    """Exemplar characters must belong to the language's declared script
    (characters whose script is Common or Inherited are always allowed)."""
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.exemplar_chars.ListFields():
        pytest.skip("No exemplars for language " + lang_code)
        return
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
        return
    out_of_script = {}
    for field in ExemplarChars.fields:
        if field.name == "auxiliary" or field.name == "index":
            continue
        exemplars = getattr(lang.exemplar_chars, field.name)
        group_of_chars = re.findall(r"(\{[^}]+\}|\S+)", exemplars)
        for chars in group_of_chars:
            for char in chars:
                char_script = youseedee.ucd_data(ord(char)).get("Script")
                if char_script == "Common" or char_script == "Inherited":
                    continue
                if char_script is not None and char_script != script_name:
                    out_of_script[chars] = char_script
                    break
    assert not out_of_script, (
        f"{lang_code} exemplars contained out-of-script characters"
        f": {', '.join(out_of_script.keys())}"
        f" from scripts {', '.join(set(out_of_script.values()))}"
    )
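

# A minimal sketch of how to run these checks locally, assuming pytest plus the
# gflanguages and youseedee packages are installed (the file path below is
# illustrative, not a guarantee of where this module lives):
#
#   pip install gflanguages youseedee pytest
#   pytest -q tests/test_data_languages.py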