Просмотр исходного кода

Merge commit 'a495359cdf3c536688f79f712554d55afb902615' into lang-0.7.0

Simon Cozens 4 месяцев назад
Родитель
Коммит
dbb63139ab

+ 25 - 35
lang/Lib/gflanguages/__init__.py

@@ -25,56 +25,46 @@ import unicodedata
 
 from gflanguages import languages_public_pb2
 from google.protobuf import text_format
-from pkg_resources import resource_filename
+from importlib_resources import files
 
 try:
     from ._version import version as __version__  # type: ignore
 except ImportError:
     __version__ = "0.0.0+unknown"
 
-DATA_DIR = resource_filename("gflanguages", "data")
 
+def _load_thing(thing_type, proto_class, base_dir=None):
+    things = {}
 
-def LoadLanguages(base_dir=DATA_DIR):
-    if base_dir is None:
-        base_dir = DATA_DIR
+    def read_a_thing(contents):
+        proto = proto_class()
+        thing = text_format.Parse(contents, proto)
+        assert thing.id not in things, f"Duplicate {thing_type} id: {thing.id}"
+        things[thing.id] = thing
 
-    languages_dir = os.path.join(base_dir, "languages")
-    langs = {}
-    for textproto_file in glob.iglob(os.path.join(languages_dir, "*.textproto")):
-        with open(textproto_file, "r", encoding="utf-8") as f:
-            language = text_format.Parse(f.read(), languages_public_pb2.LanguageProto())
-            assert language.id not in langs, f"Duplicate language id: {language.id}"
-            langs[language.id] = language
-    return langs
+    if base_dir is not None:
+        thing_dir = os.path.join(base_dir, thing_type)
+        for textproto_file in glob.iglob(os.path.join(thing_dir, "*.textproto")):
+            with open(textproto_file, "r", encoding="utf-8") as f:
+                read_a_thing(f.read())
+    else:
+        for textproto_file in files("gflanguages.data").joinpath(thing_type).iterdir():
+            if not textproto_file.name.endswith(".textproto"):
+                continue
+            read_a_thing(textproto_file.read_text(encoding="utf-8"))
+    return things
 
 
-def LoadScripts(base_dir=DATA_DIR):
-    if base_dir is None:
-        base_dir = DATA_DIR
+def LoadLanguages(base_dir=None):
+    return _load_thing("languages", languages_public_pb2.LanguageProto, base_dir)
 
-    scripts_dir = os.path.join(base_dir, "scripts")
-    scripts = {}
-    for textproto_file in glob.iglob(os.path.join(scripts_dir, "*.textproto")):
-        with open(textproto_file, "r", encoding="utf-8") as f:
-            script = text_format.Parse(f.read(), languages_public_pb2.ScriptProto())
-            assert script.id not in scripts, f"Duplicate script id: {script.id}"
-            scripts[script.id] = script
-    return scripts
 
+def LoadScripts(base_dir=None):
+    return _load_thing("scripts", languages_public_pb2.ScriptProto, base_dir)
 
-def LoadRegions(base_dir=DATA_DIR):
-    if base_dir is None:
-        base_dir = DATA_DIR
 
-    regions_dir = os.path.join(base_dir, "regions")
-    regions = {}
-    for textproto_file in glob.iglob(os.path.join(regions_dir, "*.textproto")):
-        with open(textproto_file, "r", encoding="utf-8") as f:
-            region = text_format.Parse(f.read(), languages_public_pb2.RegionProto())
-            assert region.id not in regions, f"Duplicate region id: {region.id}"
-            regions[region.id] = region
-    return regions
+def LoadRegions(base_dir=None):
+    return _load_thing("regions", languages_public_pb2.RegionProto, base_dir)
 
 
 def parse(exemplars: str):

+ 3 - 1
lang/Lib/gflanguages/data/languages/abn_Latn.textproto

@@ -7,7 +7,9 @@ population: 25000
 region: "NG"
 exemplar_chars {
   base: "A Ạ B Ḅ D Ḍ E Ẹ F G H I Ị J K L M N O Ọ P R S T U Ụ V W Y Z a ạ b ḅ d ḍ e ẹ f g h i ị j k l m n o ọ p r s t u ụ v w y z"
-  marks: "◌̣"
+  auxiliary: "á Á {ạ́} {Ạ́} é É {ẹ́} {Ẹ́} í Í {ị́} {Ị́} ó Ó {ọ́} {Ọ́} ú Ú {ụ́} {Ụ́} ā Ā {ạ̄} {Ạ̄} ē Ē {ẹ̄} {Ẹ̄} ī Ī {ị̄} {Ị̄} ō Ō {ọ̄} {Ọ̄} ū Ū {ụ̄} {Ụ̄}"
+  marks: "◌́ ◌̄ ◌̣"
 }
 source: "Ethelbert Emmanuel Kari, Kiikpoye Johnny Joshua, \"Abuan orthography\", in Tony Enyia, Orthographies of Nigerian Languages: Manual IX, Nigerian Educational Research and Development Council (NERDC), 2011, p. 1–12"
 source: "Wikipedia"
+note: "Kari & Kiikpoye 2011 p. 8-9 indicates tones can be marked (with acute for high tone, macron for downstepped-high tone) in technical works or advanced works like grammars, dictionaries, or literary texts such as poems."

+ 2 - 2
lang/Lib/gflanguages/data/languages/ar_Arab.textproto

@@ -42,9 +42,9 @@ region: "TR"
 region: "YE"
 exemplar_chars {
   base: "ء أ ؤ إ ئ ا آ ب ة ت ث ج ح خ د ذ ر ز س ش ص ض ط ظ ع غ ف ق ك ل م ن ه و ى ي ـ"
-  auxiliary: "ـ ‌ ‍  ‏ پ چ ژ ڜ ڢ ڤ ڥ ٯ ڧ ڨ ک گ ی"
+  auxiliary: "ـ ‌ ‍  ‏ پ چ ژ ڜ ڢ ڤ ڥ ٯ ڧ ڨ ک گ ی"
   marks: "◌ٰ ◌ٓ ◌ٔ ◌ٕ ◌ً ◌ٌ ◌ٍ ◌َ ◌ُ ◌ِ ◌ّ ◌ْ"
-  numerals: "؜  - , ٫ ٬ . % ٪ ؉ + 0٠ 1 ١ 2 ٢ 3 ٣ 4 ٤ 5 ٥ 6 ٦ 7 ٧ 8 ٨ 9 ٩"
+  numerals: "  - , ٫ ٬ . % ٪ ؉ + 0٠ 1 ١ 2 ٢ 3 ٣ 4 ٤ 5 ٥ 6 ٦ 7 ٧ 8 ٨ 9 ٩"
   punctuation: "- – — ۔ ، ؛ : ! ؟ ٭ . … \' \" « » ﴾ ﴿ ( ) [ ] ؍"
   index: "ا ب ت ث ج ح خ د ذ ر ز س ش ص ض ط ظ ع غ ف ق ك ل م ن ه و ي"
 }

+ 1 - 1
lang/Lib/gflanguages/data/languages/bap_Deva.textproto

@@ -1,7 +1,7 @@
 id: "bap_Deva"
 language: "bap"
 script: "Deva"
-name: "Bantawa"
+name: "Bantawa (Devanagari)"
 population: 454918
 region: "NP"
 sample_text {

Разница между файлами не показана из-за своего большого размера
+ 19 - 0
lang/Lib/gflanguages/data/languages/bap_Krai.textproto


+ 2 - 2
lang/Lib/gflanguages/data/languages/ckb_Arab.textproto

@@ -7,7 +7,7 @@ region: "IQ"
 region: "IR"
 exemplar_chars {
   base: "ئ ا ب پ ت ج چ ح خ د ر ز ڕ ژ س ش ع غ ف ڤ ق ک گ ل ڵ م ن ھ ە و ۆ ی ێ"
-  auxiliary: "‏ ً ٌ ٍ َ ُ ِ ّ ْ ء آ أ ؤ إ ة ث ذ ص ض ط ظ ك ه ى ي"
-  numerals: "‏ - , ٫ ٬ . % ٪ ؉ + 0٠ 1١ 2٢ 3٣ 4٤ 5٥ 6٦ 7٧ 8٨ 9٩"
+  auxiliary: "‏ ً ٌ ٍ َ ُ ِ ّ ْ ء آ أ ؤ إ ة ث ذ ص ض ط ظ ك ه ى ي"
+  numerals: "‏ - , ٫ ٬ . % ٪ ؉ + 0٠ 1١ 2٢ 3٣ 4٤ 5٥ 6٦ 7٧ 8٨ 9٩"
   index: "ئ ا ب پ ت ج چ ح خ د ر ز ڕ ژ س ش ع غ ف ڤ ق ک گ ل ڵ م ن ھ ە و ۆ ی ێ"
 }

+ 2 - 1
lang/Lib/gflanguages/data/languages/ckt_Cyrl.textproto

@@ -3,7 +3,8 @@ language: "ckt"
 script: "Cyrl"
 name: "Chukot"
 autonym: "Ԓыгъоравэтԓьэн"
-population: 0
+region: "RU"
+population: 8526
 exemplar_chars {
   base: "А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я Ё Ӄ Ӈ Ԓ а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я ё ӄ ӈ ԓ ’"
   marks: "◌̆ ◌̈"

+ 5 - 1
lang/Lib/gflanguages/data/languages/dng_Cyrl.textproto

@@ -3,7 +3,11 @@ language: "dng"
 script: "Cyrl"
 name: "Dungan"
 autonym: "Хуэйзў йүян"
-population: 0
+population: 110000
+region: "KG"
+region: "KZ"
+region: "RU"
+region: "UZ"
 exemplar_chars {
   base: "А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я Ё Ў Җ Ң Ү Ә а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я ё ў җ ң ү ә"
   marks: "◌̆ ◌̈"

+ 1 - 1
lang/Lib/gflanguages/data/languages/dnj_Latn_LR.textproto

@@ -6,7 +6,7 @@ autonym: "Gio"
 population: 1099244
 region: "LR"
 exemplar_chars {
-  base: "a A à À á Á â Â ã Ã {ã̀} {Ã̀} {ã́} {Ã́} b B ɓ Ɓ d D ɗ Ɗ e E è È é É ê Ê ǝ Ǝ {ǝ̀} {Ǝ̀} {ǝ́} {Ǝ́} {ǝ̂} {Ǝ̂} {ǝ̃} {Ǝ̃} {ǝ̃̀} {Ǝ̃̀} {ǝ̃́} {Ǝ̃́} {ǝ̃̂} {Ǝ̃̂} ɛ Ɛ {ɛ̀} {Ɛ̀} {ɛ́} {Ɛ́} {ɛ̃} {Ɛ̃} {ɛ̃̀} {Ɛ̃̀} {ɛ̃́} {Ɛ̃́} f F g G h H i I ì Ì í Í î Î ĩ Ĩ {ĩ̀} {Ĩ̀} {ĩ́} {Ĩ́} j J k K l L m M n N o O ò Ò ó Ó ô Ô ɔ Ɔ {ɔ̀} {Ɔ̀} {ɔ́} {Ɔ́} {ɔ̂} {Ɔ̂} {ɔ̃} {Ɔ̃} {ɔ̃̀} {Ɔ̃̀} {ɔ̃́} {Ɔ̃́} {ɔ̃̂} {Ɔ̃̂} ɵ Ɵ {ɵ̀} {Ɵ̀} {ɵ́} {Ɵ́} {ɵ̂} {Ɵ̂} p P r R s S t T u U ù Ù ú Ú û Û ũ Ũ {ũ̀} {Ũ̀} ṹ Ṹ ɥ Ɥ {ɥ̀} {Ɥ̀} {ɥ́} {Ɥ́} {ɥ̂} {Ɥ̂} {ɥ̃̀} {Ɥ̃̀} v V w W x X y Y z Z"
+  base: "a A à À á Á â Â ã Ã {ã̀} {Ã̀} {ã́} {Ã́} b B ɓ Ɓ d D ɗ Ɗ e E è È é É ê Ê ǝ Ǝ {ǝ̀} {Ǝ̀} {ǝ́} {Ǝ́} {ǝ̂} {Ǝ̂} {ǝ̃} {Ǝ̃} {ǝ̃̀} {Ǝ̃̀} {ǝ̃́} {Ǝ̃́} {ǝ̃̂} {Ǝ̃̂} ɛ Ɛ {ɛ̀} {Ɛ̀} {ɛ́} {Ɛ́} {ɛ̃} {Ɛ̃} {ɛ̃̀} {Ɛ̃̀} {ɛ̃́} {Ɛ̃́} f F g G h H i I ì Ì í Í î Î ĩ Ĩ {ĩ̀} {Ĩ̀} {ĩ́} {Ĩ́} j J k K l L m M n N o O ò Ò ó Ó ô Ô ɔ Ɔ {ɔ̀} {Ɔ̀} {ɔ́} {Ɔ́} {ɔ̂} {Ɔ̂} {ɔ̃} {Ɔ̃} {ɔ̃̀} {Ɔ̃̀} {ɔ̃́} {Ɔ̃́} {ɔ̃̂} {Ɔ̃̂} ɵ Ɵ {ɵ̀} {Ɵ̀} {ɵ́} {Ɵ́} {ɵ̂} {Ɵ̂} p P r R s S t T u U ù Ù ú Ú û Û ũ Ũ {ũ̀} {Ũ̀} ṹ Ṹ ɥ Ɥ {ɥ̀} {Ɥ̀} {ɥ́} {Ɥ́} {ɥ̂} {Ɥ̂} {ɥ̃̀} {Ɥ̃̀} {ɥ̃́} {Ɥ̃́} v V w W x X y Y z Z"
   marks: "◌́ ◌̀ ◌̂ ◌̃"
   auxiliary: "ƃ Ƃ ə Ə"
 }

+ 1 - 1
lang/Lib/gflanguages/data/languages/dv_Thaa.textproto

@@ -7,7 +7,7 @@ population: 372368
 region: "IN"
 region: "MV"
 exemplar_chars {
-  base: "އ ކ ޅ ބ ރ ނ ށ ހ ޏ ގ ލ ތ ދ ފ މ ވ ޗ ޖ ޕ ޔ ޓ ޒ ޑ ސ ޱ "
+  base: "އ ކ ޅ ބ ރ ނ ށ ހ ޏ ގ ލ ތ ދ ފ މ ވ ޗ ޖ ޕ ޔ ޓ ޒ ޑ ސ ޱ "
   marks: "◌ަ ◌ާ ◌ި ◌ީ ◌ު ◌ޫ ◌ެ ◌ޭ ◌ޮ ◌ޯ ◌ް"
 }
 sample_text {

Некоторые файлы не были показаны из-за большого количества измененных файлов