  1. #!/usr/bin/env python3
  2. #
  3. # Copyright 2022 Google LLC All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS-IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. from collections import defaultdict, Counter
  18. import regex
  19. import unicodedata
  20. from gflanguages import (
  21. LoadLanguages,
  22. languages_public_pb2,
  23. LoadScripts,
  24. LoadRegions,
  25. parse,
  26. )
  27. import pytest
  28. import youseedee
# Shared fixtures: the gflanguages data sets, loaded once at import time
# and reused by every test below.
LANGUAGES = LoadLanguages()
SCRIPTS = LoadScripts()
REGIONS = LoadRegions()

# CLDR display names for scripts, mapped to the script names used by the
# Unicode Character Database (as returned by youseedee), for the cases
# where the two disagree.
CLDR_SCRIPT_TO_UCD_SCRIPT = {
    "Bangla": "Bengali",
    "Traditional Han": "Han",
    "Simplified Han": "Han",
    "Korean": "Hangul",
    "Odia": "Oriya",
    "Makasar": "Buginese",
    "Lanna": "Tai Tham",
    "Unified Canadian Aboriginal Syllabics": "Canadian Aboriginal",
    "S-A Cuneiform": "Cuneiform",
    "Pollard Phonetic": "Miao",
    "Egyptian hieroglyphs": "Egyptian Hieroglyphs",
    "Zanabazar": "Zanabazar Square",
    "Nüshu": "Nushu",
    "Mandaean": "Mandaic",
    "N’Ko": "Nko",
    "Varang Kshiti": "Warang Citi",
    "Mende": "Mende Kikakui",
    "Phags-pa": "Phags Pa",
    "Fraser": "Lisu",
    "Georgian Khutsuri": "Georgian",
    "Orkhon": "Old Turkic",
}

# Language ids whose exemplars legitimately contain characters outside the
# declared script; the value is the justification shown when skipping.
SKIP_EXEMPLARS = {
    "ja_Jpan": "Contains multiple scripts",
    "aii_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "sel_Cyrl": "Does indeed use Latin glyphs while writing Cyrillic",
    "ykg_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "ady_Cyrl": "Does indeed use Latin glyphs (w) while writing Cyrillic",
    "sla_Latn": "Does indeed use Cyrillic glyphs (ь) when written in Latin",
    "coo_Latn": "Does indeed use Greek glyphs while writing Latin",
    "hur_Latn": "Does indeed use Greek glyphs while writing Latin",
    "kwk_Latn": "Does indeed use Greek glyphs while writing Latin",
    "thp_Latn": "Does indeed use Greek glyphs while writing Latin",
    "dnj_Latn": "Does use future Unicode 16 Latin glyphs",
    "gov_Latn": "Does use future Unicode 16 Latin glyphs",
}

# Language ids exempt from the region check: language groups and artificial
# languages have no geographic region.
SKIP_REGION = {
    "cpf_Latn": "French-based creole languages is a group of languages.",
    "gem_Latn": "Germanic languages is a group of languages.",
    "sla_Latn": "Slavic languages is a group of languages.",
    "hmn_Latn": "Homnic languages is a group of languages.",
    "ie_Latn": "Interlingue is an artifical language.",
    "io_Latn": "Ido is an artifical language.",
    "jbo_Latn": "Lobjan is an artifical language.",
    "tlh_Latn": "Klingon is an artifical language.",
}

# Expected shape of a language name: "LANGUAGE[, MODIFIER][ (SCRIPT)]",
# built from letters, spaces, hyphens, slashes and apostrophe variants.
LANGUAGE_NAME_REGEX = regex.compile(
    r"^[-'’ʼ\p{L} ]+(, [-'’ʼ\p{L}/ ]+)?( [(][-'’ʼ\p{L} ]+[)])?$"
)

# Some scripts have abbreviated names for reference in language names that are
# sufficient in context. If an alternate is listed here, it should be used
# universally and consistently across all language names.
ALTERNATE_SCRIPT_NAMES = {
    "Dupl": "Duployan",
    "Hans": "Simplified",
    "Hant": "Traditional",
}
  90. @pytest.mark.parametrize("lang_code", LANGUAGES)
  91. @pytest.mark.parametrize(
  92. "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
  93. )
  94. def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
  95. lang = LANGUAGES[lang_code]
  96. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  97. normalized = defaultdict(set)
  98. for g in exemplar:
  99. if g[0] == "{" and g[-1] == "}":
  100. g = g.lstrip("{").rstrip("}")
  101. normalized[unicodedata.normalize("NFC", g)].add(g)
  102. result = [(len(gs), n) for n, gs in normalized.items()]
  103. expected = [(1, n) for n, _ in normalized.items()]
  104. assert result == expected
  105. @pytest.mark.parametrize("lang_code", LANGUAGES)
  106. @pytest.mark.parametrize(
  107. "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
  108. )
  109. def test_languages_exemplars_duplicates(lang_code, exemplar_name):
  110. lang = LANGUAGES[lang_code]
  111. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  112. counter = Counter(exemplar)
  113. counts = sorted(counter.most_common(), key=lambda pair: exemplar.index(pair[0]))
  114. assert counts == [(v, 1) for v in exemplar]
  115. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  116. @pytest.mark.parametrize(
  117. "exemplar_name", ["base", "auxiliary", "numerals", "punctuation", "index"]
  118. )
  119. def test_exemplars_bracketed_sequences(lang_code, exemplar_name):
  120. lang = LANGUAGES[lang_code]
  121. if lang.script != "Latn":
  122. return
  123. exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
  124. for chars in exemplar:
  125. if len(chars) > 1:
  126. assert chars.startswith("{") and chars.endswith("}")
  127. assert len(chars[1:-1]) > 1
  128. @pytest.mark.parametrize("lang_code", LANGUAGES)
  129. def test_languages_exemplars_marks_in_base(lang_code):
  130. lang = LANGUAGES[lang_code]
  131. bases = lang.exemplar_chars.base
  132. problems = []
  133. for chars in bases.split():
  134. if len(chars) > 1:
  135. chars = chars.lstrip("{").rstrip("}")
  136. if unicodedata.category(chars[0]) == "Mn":
  137. problems.append("\u25CC" + chars)
  138. if "\u25CC" in chars:
  139. problems.append(chars)
  140. assert not problems, f"Found marks in base: {problems}"
# Protobuf message descriptors, used below to iterate over every field of
# the sample-text and exemplar-chars messages.
SampleText = languages_public_pb2.SampleTextProto().DESCRIPTOR
ExemplarChars = languages_public_pb2.ExemplarCharsProto().DESCRIPTOR
  143. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  144. def test_language_samples(lang_code):
  145. # Although marked as optional in the protobuf file, all
  146. # SampleText fields (except note) are required, so make
  147. # sure they are present.
  148. lang = LANGUAGES[lang_code]
  149. if not lang.sample_text.ListFields():
  150. pytest.skip("No sample text for language " + lang_code)
  151. return
  152. for field in SampleText.fields:
  153. if field.name == "note":
  154. continue
  155. assert getattr(lang.sample_text, field.name)
  156. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  157. def test_script_is_known(lang_code):
  158. lang = LANGUAGES[lang_code]
  159. script = lang.script
  160. assert script in SCRIPTS, f"{lang_code} used unknown script {lang.script}"
  161. @pytest.mark.parametrize("lang_code", LANGUAGES)
  162. def test_region_is_known(lang_code):
  163. lang = LANGUAGES[lang_code]
  164. if lang.id in SKIP_REGION:
  165. pytest.skip(SKIP_REGION[lang.id])
  166. return
  167. regions = lang.region
  168. for region in regions:
  169. assert region in REGIONS.keys()
  170. @pytest.mark.parametrize("lang_code", LANGUAGES.keys())
  171. def test_exemplars_are_in_script(lang_code):
  172. lang = LANGUAGES[lang_code]
  173. script_name = SCRIPTS[lang.script].name
  174. script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
  175. if not lang.exemplar_chars.ListFields():
  176. pytest.skip("No exemplars for language " + lang_code)
  177. return
  178. if lang.id in SKIP_EXEMPLARS:
  179. pytest.skip(SKIP_EXEMPLARS[lang.id])
  180. return
  181. out_of_script = {}
  182. for field in ExemplarChars.fields:
  183. if field.name == "auxiliary" or field.name == "index":
  184. continue
  185. exemplars = getattr(lang.exemplar_chars, field.name)
  186. group_of_chars = regex.findall(r"(\{[^}]+\}|\S+)", exemplars)
  187. for chars in group_of_chars:
  188. for char in chars:
  189. char_script = youseedee.ucd_data(ord(char)).get("Script")
  190. if char_script == "Common" or char_script == "Inherited":
  191. continue
  192. char_script = char_script.replace("_", " ")
  193. if char_script != script_name:
  194. out_of_script[chars] = char_script
  195. break
  196. assert not out_of_script, (
  197. f"{lang_code} exemplars contained out-of-script characters"
  198. f": {', '.join(out_of_script.keys())}"
  199. f" from scripts {', '.join(set(out_of_script.values()))}"
  200. )
@pytest.mark.parametrize("lang_code", LANGUAGES.keys())
def test_sample_texts_are_in_script(lang_code):
    # Every character of every sample-text field must be in the language's
    # declared script, be Common/Inherited, or carry the script among its
    # UCD Script_Extensions.
    if lang_code in [
        "mak_Maka",
        "orv_Cyrl",
        "cu_Cyrl",
        "ff_Adlm",
        "idu_Latn",
        "ban_Bali",
    ]:
        pytest.xfail("These languages have known issues with their sample text")
        return
    lang = LANGUAGES[lang_code]
    script_name = SCRIPTS[lang.script].name
    # CLDR and the UCD disagree on some script names; use the UCD name.
    script_name = CLDR_SCRIPT_TO_UCD_SCRIPT.get(script_name, script_name)
    if not lang.sample_text.ListFields():
        pytest.skip("No sample text for language " + lang_code)
        return
    if lang.id in SKIP_EXEMPLARS:
        pytest.skip(SKIP_EXEMPLARS[lang.id])
        return
    out_of_script = defaultdict(set)
    for field in SampleText.fields:
        # `note` is free-form and not required to match the script.
        if field.name == "note":
            continue
        samples = getattr(lang.sample_text, field.name)
        chars = set(samples)
        for char in chars:
            char_script = (
                youseedee.ucd_data(ord(char)).get("Script", "").replace("_", " ")
            )
            if char_script == "Common" or char_script == "Inherited":
                continue
            if char_script != script_name:
                # Script_Extensions lists additional scripts a character is
                # used with — presumably as short codes (e.g. "Latn") that
                # compare against lang.script; verify against the UCD data.
                extensions = (
                    youseedee.ucd_data(ord(char))
                    .get("Script_Extensions", "")
                    .split(" ")
                )
                if any(ext == lang.script for ext in extensions):
                    continue
                out_of_script[char_script].add(char)
                # NOTE(review): this break records at most one offending
                # character per field, and `chars` is a set, so which one is
                # captured is arbitrary — confirm whether collecting all
                # offenders was intended.
                break
    msg = []
    for script, chars in out_of_script.items():
        msg.append(f"'{''.join(chars)}' ({script} != {script_name})")
    assert not out_of_script, (
        f"{lang_code} sample text contained out-of-script characters"
        f": {', '.join(msg)}"
    )
  251. def test_exemplar_parser():
  252. bases = "a A ā Ā {a̍} {A̍} {kl}"
  253. parsed_bases = parse(bases)
  254. assert parsed_bases == {
  255. "a",
  256. "A",
  257. "ā",
  258. "Ā",
  259. "k",
  260. "l",
  261. "̍",
  262. }
  263. def test_language_uniqueness():
  264. names = Counter([])
  265. for lang in LANGUAGES.values():
  266. if lang.preferred_name:
  267. names[lang.preferred_name] += 1
  268. else:
  269. names[lang.name] += 1
  270. if any(count > 1 for count in names.values()):
  271. duplicates = {name: count for name, count in names.items() if count > 1}
  272. pytest.fail(f"Duplicate language names: {duplicates}")
  273. def test_language_name_structure():
  274. languages_with_bad_name_structure = {}
  275. for lang in LANGUAGES.values():
  276. script_name = (
  277. SCRIPTS[lang.script].name
  278. if lang.script not in ALTERNATE_SCRIPT_NAMES
  279. else ALTERNATE_SCRIPT_NAMES[lang.script]
  280. )
  281. names = [["name", lang.name]]
  282. if lang.preferred_name:
  283. names += [["preferred_name", lang.preferred_name]]
  284. bad_names = []
  285. for type, name in names:
  286. bad_structure = not regex.match(LANGUAGE_NAME_REGEX, name)
  287. bad_script_suffix = name.endswith(")") and not name.endswith(
  288. f"({script_name})"
  289. )
  290. if bad_structure or bad_script_suffix:
  291. bad_names.append(type)
  292. if len(bad_names) > 0:
  293. languages_with_bad_name_structure[lang.id] = bad_names
  294. if len(languages_with_bad_name_structure) > 0:
  295. misstructured_language_names = [
  296. f"{language_id}" if len(types) == 1 else f"{language_id}: {types}"
  297. for language_id, types in languages_with_bad_name_structure.items()
  298. if len(types) > 0
  299. ]
  300. pytest.fail(
  301. f'Languages names without expected structure ("LANGUAGE, MODIFIER (SCRIPT)"): {misstructured_language_names}'
  302. )
  303. @pytest.mark.parametrize("lang_code", LANGUAGES)
  304. def test_id_well_formed(lang_code):
  305. if lang_code in ["tw_akuapem_Latn"]:
  306. pytest.xfail("Well we need to have a conversation about that")
  307. return
  308. lang = LANGUAGES[lang_code]
  309. assert lang.id.startswith(lang.language + "_" + lang.script)