# test_data_languages.py
#!/usr/bin/env python3
#
# Copyright 2022 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
  17. from collections import defaultdict, Counter
  18. import re
  19. import unicodedata
  20. from gflanguages import LoadLanguages, languages_public_pb2, LoadScripts, LoadRegions
  21. import pytest
  22. import youseedee
  23. LANGUAGES = LoadLanguages()
  24. SCRIPTS = LoadScripts()
  25. REGIONS = LoadRegions()
  26. CLDR_SCRIPT_TO_UCD_SCRIPT = {
  27. "Bangla": "Bengali",
  28. "Traditional Han": "Han",
  29. "Simplified Han": "Han",
  30. "Korean": "Hangul",
  31. "Odia": "Oriya",
  32. "Ol Chiki": "Ol_Chiki",
  33. }
  34. SKIP_EXEMPLARS = {
  35. "ja_Jpan": "Contains multiple scripts",
  36. "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
  37. "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
  38. "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
  39. "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
  40. "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
  41. "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
  42. "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
  43. "gov_Latn": "Does use future Unicode 16 Latin glyphs",
  44. }
  45. SKIP_REGION = {
  46. "cpf_Latn": "French-based creole languages is a group of languages.",
  47. "gem_Latn": "Germanic languages is a group of languages.",
  48. "sla_Latn": "Slavic languages is a group of languages.",
  49. "hmn_Latn": "Homnic languages is a group of languages.",
  50. "ie_Latn": "Interlingue is an artifical language.",
  51. "io_Latn": "Ido is an artifical language.",
  52. "jbo_Latn": "Lobjan is an artifical language.",
  53. "tlh_Latn": "Klingon is an artifical language.",
  54. }
  55. @pytest.mark.parametrize("lang_code", LANGUAGES)
  56. @pytest.mark.parametrize(
  57. "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
  58. )
  59. def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
  60. lang = LANGUAGES[lang_code]
  61. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  62. normalized = defaultdict(set)
  63. for g in exemplar:
  64. if g[0] == "{" and g[-1] == "}":
  65. g = g.lstrip("{").rstrip("}")
  66. normalized[unicodedata.normalize("NFC", g)].add(g)
  67. result = [(len(gs), n) for n, gs in normalized.items()]
  68. expected = [(1, n) for n, _ in normalized.items()]
  69. assert result == expected
  70. @pytest.mark.parametrize("lang_code", LANGUAGES)
  71. @pytest.mark.parametrize(
  72. "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
  73. )
  74. def test_languages_exemplars_duplicates(lang_code, exemplar_name):
  75. lang = LANGUAGES[lang_code]
  76. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  77. counter = Counter(exemplar)
  78. counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
  79. assert counts == [(v, 1) for v in exemplar]
  80. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  81. @pytest.mark.parametrize(
  82. "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
  83. )
  84. def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
  85. lang = LANGUAGES[lang_code]
  86. if lang.script != "Latn":
  87. return
  88. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  89. for chars in exemplar:
  90. if len(chars) > 1:
  91. assert chars.startswith("{") and chars.endswith("}")
  92. assert len(chars[1:-1]) > 1
  93. SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
  94. ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR
  95. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  96. def test_language_samples(lang_code):
  97. # Although marked as optional in the protobuf file, all
  98. # SampleText fields (except note) are required, so make
  99. # sure they are present.
  100. lang = LANGUAGES[lang_code]
  101. if not lang.sample_text.ListFields():
  102. pytest.skip("No sample text for language " + lang_code)
  103. return
  104. for field in SampleText.fields:
  105. if field.name == "note":
  106. continue
  107. assert getattr(lang.sample_text, field.name)
  108. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  109. def test_script_is_known(lang_code):
  110. lang = LANGUAGES[lang_code]
  111. script = lang.script
  112. assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
  113. @pytest.mark.parametrize("lang_code", LANGUAGES)
  114. def test_region_is_known(lang_code):
  115. lang = LANGUAGES[lang_code]
  116. if lang.id in SKIP_REGION:
  117. pytest.skip(SKIP_REGION[lang.id])
  118. return
  119. regions = lang.region
  120. for region in regions:
  121. assert region in REGIONS.keys()
  122. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  123. def test_exemplars_are_in_script(lang_code):
  124. lang = LANGUAGES[lang_code]
  125. script_name = SCRIPTS[lang.script].name
  126. script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
  127. if not lang.exemplar_chars.ListFields():
  128. pytest.skip("No exemplars for language " + lang_code)
  129. return
  130. if lang.id in SKIP_EXEMPLARS:
  131. pytest.skip(SKIP_EXEMPLARS[lang.id])
  132. return
  133. out_of_script = {}
  134. for field in ExemplarChars.fields:
  135. if field.name == "auxiliary" or field.name == "index":
  136. continue
  137. exemplars = getattr(lang.exemplar_chars, field.name)
  138. group_of_chars = re.findall(r"(\{[^}]+\}|\S+)", exemplars)
  139. for chars in group_of_chars:
  140. for char in chars:
  141. char_script = youseedee.ucd_data(ord(char)).get("Script")
  142. if char_script == "Common" or char_script == "Inherited":
  143. continue
  144. if char_script is not None and char_script != script_name:
  145. out_of_script[chars] = char_script
  146. break
  147. assert not out_of_script, (
  148. f"{lang_code} exemplars contained out-of-script characters"
  149. f": {', '.join(out_of_script.keys())}"
  150. f" from scripts {', '.join(set(out_of_script.values()))}"
  151. )