test_data_languages.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright 2022 Google LLC All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS-IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. from collections import defaultdict, Counter
  18. import regex
  19. import unicodedata
  20. from gflanguages import (
  21. LoadLanguages,
  22. languages_public_pb2,
  23. LoadScripts,
  24. LoadRegions,
  25. parse,
  26. )
  27. import pytest
  28. import youseedee
# Load the gflanguages datasets once at import time; all tests below
# parametrize over these shared, read-only mappings (id -> proto message).
LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()
# Maps CLDR display names for scripts (as used in the gflanguages script
# data) to the script names used by the Unicode Character Database, for
# the cases where the two spellings differ.
CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Makasar": "Buginese",
    "Lanna": "Tai Tham",
    "Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
    "S-A Cuneiform": "Cuneiform",
    "Pollard Phonetic": "Miao",
    "Egyptian hieroglyphs": "Egyptian Hieroglyphs",
    "Zanabazar": "Zanabazar Square",
    "Nüshu": "Nushu",
    "Mandaean": "Mandaic",
    "N’Ko": "Nko",
    "Varang Kshiti": "Warang Citi",
    "Mende": "Mende Kikakui",
    "Phags-pa": "Phags Pa",
    "Fraser": "Lisu",
    "Georgian Khutsuri": "Georgian",
    "Orkhon": "Old Turkic",
}
# Language IDs whose script-consistency checks are skipped, mapped to the
# reason string reported via pytest.skip().
SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "ykg_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "ady_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "sla_Latn": "Does indeed use Cyrillic glyphs (ь) when written in Latin",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
    "gov_Latn": "Does use future Unicode 16 Latin glyphs",
}
  69. SKIP_REGION = {
  70. "cpf_Latn": "French-based creole languages is a group of languages.",
  71. "gem_Latn": "Germanic languages is a group of languages.",
  72. "sla_Latn": "Slavic languages is a group of languages.",
  73. "hmn_Latn": "Homnic languages is a group of languages.",
  74. "ie_Latn": "Interlingue is an artifical language.",
  75. "io_Latn": "Ido is an artifical language.",
  76. "jbo_Latn": "Lobjan is an artifical language.",
  77. "tlh_Latn": "Klingon is an artifical language.",
  78. }
# Expected shape of a language name: letters/hyphens/apostrophes/spaces,
# optionally ", MODIFIER", optionally a trailing parenthesized qualifier —
# i.e. "LANGUAGE, MODIFIER (SCRIPT)".
LANGUAGE_NAME_REGEX = regex.compile(r"^[-'’ʼ\p{L} ]+(, [-'’ʼ\p{L}/ ]+)?( [(][-'’ʼ\p{L} ]+[)])?$")
# Some scripts have abbreviated names for reference in language names that are
# sufficient in context. If an alternate is listed here, it should be used
# universally and consistently across all language names.
# Keys are script codes (as stored in lang.script).
ALTERNATE_SCRIPT_NAMES = {
    "Dupl": "Duployan",
    "Hans": "Simplified",
    "Hant": "Traditional",
}
  88. @pytest.mark.parametrize("lang_code", LANGUAGES)
  89. @pytest.mark.parametrize(
  90. "exemplar_name", ["base", "auxiliary", "marks",
  91. "numerals", "punctuation", "index"]
  92. )
  93. def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
  94. lang = LANGUAGES[lang_code]
  95. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  96. normalized = defaultdict(set)
  97. for g in exemplar:
  98. if g[0] == "{" and g[-1] == "}":
  99. g = g.lstrip("{").rstrip("}")
  100. normalized[unicodedata.normalize("NFC", g)].add(g)
  101. result = [(len(gs), n) for n, gs in normalized.items()]
  102. expected = [(1, n) for n, _ in normalized.items()]
  103. assert result == expected
  104. @pytest.mark.parametrize("lang_code", LANGUAGES)
  105. @pytest.mark.parametrize(
  106. "exemplar_name", ["base", "auxiliary", "marks",
  107. "numerals", "punctuation", "index"]
  108. )
  109. def test_languages_exemplars_duplicates(lang_code, exemplar_name):
  110. lang = LANGUAGES[lang_code]
  111. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  112. counter = Counter(exemplar)
  113. counts = sorted(counter.most_common(),
  114. key=lambda pair: exemplar.index(pair[0]))
  115. assert counts == [(v, 1) for v in exemplar]
  116. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  117. @pytest.mark.parametrize(
  118. "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
  119. )
  120. def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
  121. lang = LANGUAGES[lang_code]
  122. if lang.script != "Latn":
  123. return
  124. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  125. for chars in exemplar:
  126. if len(chars) > 1:
  127. assert chars.startswith("{") and chars.endswith("}")
  128. assert len(chars[1:-1]) > 1
# Protobuf descriptors, used below to enumerate the field names of the
# sample-text and exemplar-chars messages.
SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR
  131. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  132. def test_language_samples(lang_code):
  133. # Although marked as optional in the protobuf file, all
  134. # SampleText fields (except note) are required, so make
  135. # sure they are present.
  136. lang = LANGUAGES[lang_code]
  137. if not lang.sample_text.ListFields():
  138. pytest.skip("No sample text for language " + lang_code)
  139. return
  140. for field in SampleText.fields:
  141. if field.name == "note":
  142. continue
  143. assert getattr(lang.sample_text, field.name)
  144. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  145. def test_script_is_known(lang_code):
  146. lang = LANGUAGES[lang_code]
  147. script = lang.script
  148. assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
  149. @pytest.mark.parametrize("lang_code", LANGUAGES)
  150. def test_region_is_known(lang_code):
  151. lang = LANGUAGES[lang_code]
  152. if lang.id in SKIP_REGION:
  153. pytest.skip(SKIP_REGION[lang.id])
  154. return
  155. regions = lang.region
  156. for region in regions:
  157. assert region in REGIONS.keys()
  158. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  159. def test_exemplars_are_in_script(lang_code):
  160. lang = LANGUAGES[lang_code]
  161. script_name = SCRIPTS[lang.script].name
  162. script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
  163. if not lang.exemplar_chars.ListFields():
  164. pytest.skip("No exemplars for language " + lang_code)
  165. return
  166. if lang.id in SKIP_EXEMPLARS:
  167. pytest.skip(SKIP_EXEMPLARS[lang.id])
  168. return
  169. out_of_script = {}
  170. for field in ExemplarChars.fields:
  171. if field.name == "auxiliary" or field.name == "index":
  172. continue
  173. exemplars = getattr(lang.exemplar_chars, field.name)
  174. group_of_chars = regex.findall(r"(\{[^}]+\}|\S+)", exemplars)
  175. for chars in group_of_chars:
  176. for char in chars:
  177. char_script = youseedee.ucd_data(ord(char)).get("Script")
  178. if char_script == "Common" or char_script == "Inherited":
  179. continue
  180. char_script = char_script.replace("_", " ")
  181. if char_script != script_name:
  182. out_of_script[chars] = char_script
  183. break
  184. assert not out_of_script, (
  185. f"{lang_code} exemplars contained out-of-script characters"
  186. f": {', '.join(out_of_script.keys())}"
  187. f" from scripts {', '.join(set(out_of_script.values()))}"
  188. )
@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_sample_texts_are_in_script(lang_code):
    """Sample-text fields must contain only characters whose UCD Script
    (or Script_Extensions) matches the language's declared script,
    ignoring Common/Inherited characters."""
    # Known-bad sample texts: marked as expected failures, not skips.
    if lang_code in [
        "mak_Maka",
        "orv_Cyrl",
        "cu_Cyrl",
        "ff_Adlm",
        "idu_Latn",
        "ban_Bali",
    ]:
        pytest.xfail(
            "These languages have known issues with their sample text")
        return
    lang = LANGUAGES[lang_code]
    # Translate the CLDR script name to the UCD spelling where they differ.
    script_name = SCRIPTS[lang.script].name
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
        return
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
        return
    out_of_script = defaultdict(set)  # UCD script name -> offending chars
    for field in SampleText.fields:
        if field.name == "note":
            continue
        samples = getattr(lang.sample_text, field.name)
        chars = set(samples)
        for char in chars:
            char_script = (
                youseedee.ucd_data(ord(char)).get(
                    "Script", "").replace("_", " ")
            )
            if char_script == "Common" or char_script == "Inherited":
                continue
            if char_script != script_name:
                # Characters shared across scripts list them in
                # Script_Extensions; accept the char if the language's
                # script code appears there.
                extensions = (
                    youseedee.ucd_data(ord(char))
                    .get("Script_Extensions", "")
                    .split(" ")
                )
                if any(ext == lang.script for ext in extensions):
                    continue
                out_of_script[char_script].add(char)
                # NOTE(review): this records at most one offender per
                # field — presumably to keep the failure message short;
                # confirm that is the intent.
                break
    msg = []
    for script, chars in out_of_script.items():
        msg.append(f"'{''.join(chars)}' ({script} != {script_name})")
    assert not out_of_script, (
        f"{lang_code} sample text contained out-of-script characters"
        f": {', '.join(msg)}"
    )
  241. def test_exemplar_parser():
  242. bases = "a A ā Ā {a̍} {A̍} {kl}"
  243. parsed_bases = parse(bases)
  244. assert parsed_bases == {
  245. "a",
  246. "A",
  247. "ā",
  248. "Ā",
  249. "k",
  250. "l",
  251. "̍",
  252. }
  253. def test_language_uniqueness():
  254. names = Counter([])
  255. for lang in LANGUAGES.values():
  256. if lang.preferred_name:
  257. names[lang.preferred_name] += 1
  258. else:
  259. names[lang.name] += 1
  260. if any(count > 1 for count in names.values()):
  261. duplicates = {name: count for name,
  262. count in names.items() if count > 1}
  263. pytest.fail(f"Duplicate language names: {duplicates}")
  264. def test_language_name_structure():
  265. languages_with_bad_name_structure = {}
  266. for lang in LANGUAGES.values():
  267. script_name = SCRIPTS[lang.script].name if lang.script not in ALTERNATE_SCRIPT_NAMES else ALTERNATE_SCRIPT_NAMES[lang.script]
  268. names = [["name", lang.name]]
  269. if lang.preferred_name:
  270. names += [["preferred_name", lang.preferred_name]]
  271. bad_names = []
  272. for type, name in names:
  273. bad_structure = not regex.match(LANGUAGE_NAME_REGEX, name)
  274. bad_script_suffix = name.endswith(
  275. ")") and not name.endswith(f"({script_name})")
  276. if bad_structure or bad_script_suffix:
  277. bad_names.append(type)
  278. if len(bad_names) > 0:
  279. languages_with_bad_name_structure[lang.id] = bad_names
  280. if len(languages_with_bad_name_structure) > 0:
  281. misstructured_language_names = [f"{language_id}" if len(
  282. types) == 1 else f"{language_id}: {types}" for language_id, types in languages_with_bad_name_structure.items() if len(types) > 0]
  283. pytest.fail(
  284. f"Languages names without expected structure (\"LANGUAGE, MODIFIER (SCRIPT)\"): {misstructured_language_names}")