constant.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995
  1. # -*- coding: utf-8 -*-
  2. from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
  3. from encodings.aliases import aliases
  4. from re import IGNORECASE, compile as re_compile
  5. from typing import Dict, List, Set, Union
  6. # Contain for each eligible encoding a list of/item bytes SIG/BOM
  7. ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
  8. "utf_8": BOM_UTF8,
  9. "utf_7": [
  10. b"\x2b\x2f\x76\x38",
  11. b"\x2b\x2f\x76\x39",
  12. b"\x2b\x2f\x76\x2b",
  13. b"\x2b\x2f\x76\x2f",
  14. b"\x2b\x2f\x76\x38\x2d",
  15. ],
  16. "gb18030": b"\x84\x31\x95\x33",
  17. "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
  18. "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
  19. }
  20. TOO_SMALL_SEQUENCE: int = 32
  21. TOO_BIG_SEQUENCE: int = int(10e6)
  22. UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
# Unicode block table, up to date with Unicode ucd/15.0.0.
# Maps a block name to the ``range`` of code points it covers; as with any
# Python ``range`` the upper bound is exclusive. The table is ordered by code
# point and is not contiguous: some intervals (e.g. 12256-12271) belong to no
# entry.
UNICODE_RANGES_COMBINED: Dict[str, range] = {
    "Control character": range(32),
    "Basic Latin": range(32, 128),
    "Latin-1 Supplement": range(128, 256),
    "Latin Extended-A": range(256, 384),
    "Latin Extended-B": range(384, 592),
    "IPA Extensions": range(592, 688),
    "Spacing Modifier Letters": range(688, 768),
    "Combining Diacritical Marks": range(768, 880),
    "Greek and Coptic": range(880, 1024),
    "Cyrillic": range(1024, 1280),
    "Cyrillic Supplement": range(1280, 1328),
    "Armenian": range(1328, 1424),
    "Hebrew": range(1424, 1536),
    "Arabic": range(1536, 1792),
    "Syriac": range(1792, 1872),
    "Arabic Supplement": range(1872, 1920),
    "Thaana": range(1920, 1984),
    "NKo": range(1984, 2048),
    "Samaritan": range(2048, 2112),
    "Mandaic": range(2112, 2144),
    "Syriac Supplement": range(2144, 2160),
    "Arabic Extended-B": range(2160, 2208),
    "Arabic Extended-A": range(2208, 2304),
    "Devanagari": range(2304, 2432),
    "Bengali": range(2432, 2560),
    "Gurmukhi": range(2560, 2688),
    "Gujarati": range(2688, 2816),
    "Oriya": range(2816, 2944),
    "Tamil": range(2944, 3072),
    "Telugu": range(3072, 3200),
    "Kannada": range(3200, 3328),
    "Malayalam": range(3328, 3456),
    "Sinhala": range(3456, 3584),
    "Thai": range(3584, 3712),
    "Lao": range(3712, 3840),
    "Tibetan": range(3840, 4096),
    "Myanmar": range(4096, 4256),
    "Georgian": range(4256, 4352),
    "Hangul Jamo": range(4352, 4608),
    "Ethiopic": range(4608, 4992),
    "Ethiopic Supplement": range(4992, 5024),
    "Cherokee": range(5024, 5120),
    "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
    "Ogham": range(5760, 5792),
    "Runic": range(5792, 5888),
    "Tagalog": range(5888, 5920),
    "Hanunoo": range(5920, 5952),
    "Buhid": range(5952, 5984),
    "Tagbanwa": range(5984, 6016),
    "Khmer": range(6016, 6144),
    "Mongolian": range(6144, 6320),
    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
    "Limbu": range(6400, 6480),
    "Tai Le": range(6480, 6528),
    "New Tai Lue": range(6528, 6624),
    "Khmer Symbols": range(6624, 6656),
    "Buginese": range(6656, 6688),
    "Tai Tham": range(6688, 6832),
    "Combining Diacritical Marks Extended": range(6832, 6912),
    "Balinese": range(6912, 7040),
    "Sundanese": range(7040, 7104),
    "Batak": range(7104, 7168),
    "Lepcha": range(7168, 7248),
    "Ol Chiki": range(7248, 7296),
    "Cyrillic Extended-C": range(7296, 7312),
    "Georgian Extended": range(7312, 7360),
    "Sundanese Supplement": range(7360, 7376),
    "Vedic Extensions": range(7376, 7424),
    "Phonetic Extensions": range(7424, 7552),
    "Phonetic Extensions Supplement": range(7552, 7616),
    "Combining Diacritical Marks Supplement": range(7616, 7680),
    "Latin Extended Additional": range(7680, 7936),
    "Greek Extended": range(7936, 8192),
    "General Punctuation": range(8192, 8304),
    "Superscripts and Subscripts": range(8304, 8352),
    "Currency Symbols": range(8352, 8400),
    "Combining Diacritical Marks for Symbols": range(8400, 8448),
    "Letterlike Symbols": range(8448, 8528),
    "Number Forms": range(8528, 8592),
    "Arrows": range(8592, 8704),
    "Mathematical Operators": range(8704, 8960),
    "Miscellaneous Technical": range(8960, 9216),
    "Control Pictures": range(9216, 9280),
    "Optical Character Recognition": range(9280, 9312),
    "Enclosed Alphanumerics": range(9312, 9472),
    "Box Drawing": range(9472, 9600),
    "Block Elements": range(9600, 9632),
    "Geometric Shapes": range(9632, 9728),
    "Miscellaneous Symbols": range(9728, 9984),
    "Dingbats": range(9984, 10176),
    "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
    "Supplemental Arrows-A": range(10224, 10240),
    "Braille Patterns": range(10240, 10496),
    "Supplemental Arrows-B": range(10496, 10624),
    "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
    "Supplemental Mathematical Operators": range(10752, 11008),
    "Miscellaneous Symbols and Arrows": range(11008, 11264),
    "Glagolitic": range(11264, 11360),
    "Latin Extended-C": range(11360, 11392),
    "Coptic": range(11392, 11520),
    "Georgian Supplement": range(11520, 11568),
    "Tifinagh": range(11568, 11648),
    "Ethiopic Extended": range(11648, 11744),
    "Cyrillic Extended-A": range(11744, 11776),
    "Supplemental Punctuation": range(11776, 11904),
    "CJK Radicals Supplement": range(11904, 12032),
    "Kangxi Radicals": range(12032, 12256),
    "Ideographic Description Characters": range(12272, 12288),
    "CJK Symbols and Punctuation": range(12288, 12352),
    "Hiragana": range(12352, 12448),
    "Katakana": range(12448, 12544),
    "Bopomofo": range(12544, 12592),
    "Hangul Compatibility Jamo": range(12592, 12688),
    "Kanbun": range(12688, 12704),
    "Bopomofo Extended": range(12704, 12736),
    "CJK Strokes": range(12736, 12784),
    "Katakana Phonetic Extensions": range(12784, 12800),
    "Enclosed CJK Letters and Months": range(12800, 13056),
    "CJK Compatibility": range(13056, 13312),
    "CJK Unified Ideographs Extension A": range(13312, 19904),
    "Yijing Hexagram Symbols": range(19904, 19968),
    "CJK Unified Ideographs": range(19968, 40960),
    "Yi Syllables": range(40960, 42128),
    "Yi Radicals": range(42128, 42192),
    "Lisu": range(42192, 42240),
    "Vai": range(42240, 42560),
    "Cyrillic Extended-B": range(42560, 42656),
    "Bamum": range(42656, 42752),
    "Modifier Tone Letters": range(42752, 42784),
    "Latin Extended-D": range(42784, 43008),
    "Syloti Nagri": range(43008, 43056),
    "Common Indic Number Forms": range(43056, 43072),
    "Phags-pa": range(43072, 43136),
    "Saurashtra": range(43136, 43232),
    "Devanagari Extended": range(43232, 43264),
    "Kayah Li": range(43264, 43312),
    "Rejang": range(43312, 43360),
    "Hangul Jamo Extended-A": range(43360, 43392),
    "Javanese": range(43392, 43488),
    "Myanmar Extended-B": range(43488, 43520),
    "Cham": range(43520, 43616),
    "Myanmar Extended-A": range(43616, 43648),
    "Tai Viet": range(43648, 43744),
    "Meetei Mayek Extensions": range(43744, 43776),
    "Ethiopic Extended-A": range(43776, 43824),
    "Latin Extended-E": range(43824, 43888),
    "Cherokee Supplement": range(43888, 43968),
    "Meetei Mayek": range(43968, 44032),
    "Hangul Syllables": range(44032, 55216),
    "Hangul Jamo Extended-B": range(55216, 55296),
    "High Surrogates": range(55296, 56192),
    "High Private Use Surrogates": range(56192, 56320),
    "Low Surrogates": range(56320, 57344),
    "Private Use Area": range(57344, 63744),
    "CJK Compatibility Ideographs": range(63744, 64256),
    "Alphabetic Presentation Forms": range(64256, 64336),
    "Arabic Presentation Forms-A": range(64336, 65024),
    "Variation Selectors": range(65024, 65040),
    "Vertical Forms": range(65040, 65056),
    "Combining Half Marks": range(65056, 65072),
    "CJK Compatibility Forms": range(65072, 65104),
    "Small Form Variants": range(65104, 65136),
    "Arabic Presentation Forms-B": range(65136, 65280),
    "Halfwidth and Fullwidth Forms": range(65280, 65520),
    "Specials": range(65520, 65536),
    "Linear B Syllabary": range(65536, 65664),
    "Linear B Ideograms": range(65664, 65792),
    "Aegean Numbers": range(65792, 65856),
    "Ancient Greek Numbers": range(65856, 65936),
    "Ancient Symbols": range(65936, 66000),
    "Phaistos Disc": range(66000, 66048),
    "Lycian": range(66176, 66208),
    "Carian": range(66208, 66272),
    "Coptic Epact Numbers": range(66272, 66304),
    "Old Italic": range(66304, 66352),
    "Gothic": range(66352, 66384),
    "Old Permic": range(66384, 66432),
    "Ugaritic": range(66432, 66464),
    "Old Persian": range(66464, 66528),
    "Deseret": range(66560, 66640),
    "Shavian": range(66640, 66688),
    "Osmanya": range(66688, 66736),
    "Osage": range(66736, 66816),
    "Elbasan": range(66816, 66864),
    "Caucasian Albanian": range(66864, 66928),
    "Vithkuqi": range(66928, 67008),
    "Linear A": range(67072, 67456),
    "Latin Extended-F": range(67456, 67520),
    "Cypriot Syllabary": range(67584, 67648),
    "Imperial Aramaic": range(67648, 67680),
    "Palmyrene": range(67680, 67712),
    "Nabataean": range(67712, 67760),
    "Hatran": range(67808, 67840),
    "Phoenician": range(67840, 67872),
    "Lydian": range(67872, 67904),
    "Meroitic Hieroglyphs": range(67968, 68000),
    "Meroitic Cursive": range(68000, 68096),
    "Kharoshthi": range(68096, 68192),
    "Old South Arabian": range(68192, 68224),
    "Old North Arabian": range(68224, 68256),
    "Manichaean": range(68288, 68352),
    "Avestan": range(68352, 68416),
    "Inscriptional Parthian": range(68416, 68448),
    "Inscriptional Pahlavi": range(68448, 68480),
    "Psalter Pahlavi": range(68480, 68528),
    "Old Turkic": range(68608, 68688),
    "Old Hungarian": range(68736, 68864),
    "Hanifi Rohingya": range(68864, 68928),
    "Rumi Numeral Symbols": range(69216, 69248),
    "Yezidi": range(69248, 69312),
    "Arabic Extended-C": range(69312, 69376),
    "Old Sogdian": range(69376, 69424),
    "Sogdian": range(69424, 69488),
    "Old Uyghur": range(69488, 69552),
    "Chorasmian": range(69552, 69600),
    "Elymaic": range(69600, 69632),
    "Brahmi": range(69632, 69760),
    "Kaithi": range(69760, 69840),
    "Sora Sompeng": range(69840, 69888),
    "Chakma": range(69888, 69968),
    "Mahajani": range(69968, 70016),
    "Sharada": range(70016, 70112),
    "Sinhala Archaic Numbers": range(70112, 70144),
    "Khojki": range(70144, 70224),
    "Multani": range(70272, 70320),
    "Khudawadi": range(70320, 70400),
    "Grantha": range(70400, 70528),
    "Newa": range(70656, 70784),
    "Tirhuta": range(70784, 70880),
    "Siddham": range(71040, 71168),
    "Modi": range(71168, 71264),
    "Mongolian Supplement": range(71264, 71296),
    "Takri": range(71296, 71376),
    "Ahom": range(71424, 71504),
    "Dogra": range(71680, 71760),
    "Warang Citi": range(71840, 71936),
    "Dives Akuru": range(71936, 72032),
    "Nandinagari": range(72096, 72192),
    "Zanabazar Square": range(72192, 72272),
    "Soyombo": range(72272, 72368),
    "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
    "Pau Cin Hau": range(72384, 72448),
    "Devanagari Extended-A": range(72448, 72544),
    "Bhaiksuki": range(72704, 72816),
    "Marchen": range(72816, 72896),
    "Masaram Gondi": range(72960, 73056),
    "Gunjala Gondi": range(73056, 73136),
    "Makasar": range(73440, 73472),
    "Kawi": range(73472, 73568),
    "Lisu Supplement": range(73648, 73664),
    "Tamil Supplement": range(73664, 73728),
    "Cuneiform": range(73728, 74752),
    "Cuneiform Numbers and Punctuation": range(74752, 74880),
    "Early Dynastic Cuneiform": range(74880, 75088),
    "Cypro-Minoan": range(77712, 77824),
    "Egyptian Hieroglyphs": range(77824, 78896),
    "Egyptian Hieroglyph Format Controls": range(78896, 78944),
    "Anatolian Hieroglyphs": range(82944, 83584),
    "Bamum Supplement": range(92160, 92736),
    "Mro": range(92736, 92784),
    "Tangsa": range(92784, 92880),
    "Bassa Vah": range(92880, 92928),
    "Pahawh Hmong": range(92928, 93072),
    "Medefaidrin": range(93760, 93856),
    "Miao": range(93952, 94112),
    "Ideographic Symbols and Punctuation": range(94176, 94208),
    "Tangut": range(94208, 100352),
    "Tangut Components": range(100352, 101120),
    "Khitan Small Script": range(101120, 101632),
    "Tangut Supplement": range(101632, 101760),
    "Kana Extended-B": range(110576, 110592),
    "Kana Supplement": range(110592, 110848),
    "Kana Extended-A": range(110848, 110896),
    "Small Kana Extension": range(110896, 110960),
    "Nushu": range(110960, 111360),
    "Duployan": range(113664, 113824),
    "Shorthand Format Controls": range(113824, 113840),
    "Znamenny Musical Notation": range(118528, 118736),
    "Byzantine Musical Symbols": range(118784, 119040),
    "Musical Symbols": range(119040, 119296),
    "Ancient Greek Musical Notation": range(119296, 119376),
    "Kaktovik Numerals": range(119488, 119520),
    "Mayan Numerals": range(119520, 119552),
    "Tai Xuan Jing Symbols": range(119552, 119648),
    "Counting Rod Numerals": range(119648, 119680),
    "Mathematical Alphanumeric Symbols": range(119808, 120832),
    "Sutton SignWriting": range(120832, 121520),
    "Latin Extended-G": range(122624, 122880),
    "Glagolitic Supplement": range(122880, 122928),
    "Cyrillic Extended-D": range(122928, 123024),
    "Nyiakeng Puachue Hmong": range(123136, 123216),
    "Toto": range(123536, 123584),
    "Wancho": range(123584, 123648),
    "Nag Mundari": range(124112, 124160),
    "Ethiopic Extended-B": range(124896, 124928),
    "Mende Kikakui": range(124928, 125152),
    "Adlam": range(125184, 125280),
    "Indic Siyaq Numbers": range(126064, 126144),
    "Ottoman Siyaq Numbers": range(126208, 126288),
    "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
    "Mahjong Tiles": range(126976, 127024),
    "Domino Tiles": range(127024, 127136),
    "Playing Cards": range(127136, 127232),
    "Enclosed Alphanumeric Supplement": range(127232, 127488),
    "Enclosed Ideographic Supplement": range(127488, 127744),
    "Miscellaneous Symbols and Pictographs": range(127744, 128512),
    # NOTE(review): this key is not a plain block name like its neighbours;
    # left untouched because lookups elsewhere may match on its text — confirm
    # before renaming.
    "Emoticons range(Emoji)": range(128512, 128592),
    "Ornamental Dingbats": range(128592, 128640),
    "Transport and Map Symbols": range(128640, 128768),
    "Alchemical Symbols": range(128768, 128896),
    "Geometric Shapes Extended": range(128896, 129024),
    "Supplemental Arrows-C": range(129024, 129280),
    "Supplemental Symbols and Pictographs": range(129280, 129536),
    "Chess Symbols": range(129536, 129648),
    "Symbols and Pictographs Extended-A": range(129648, 129792),
    "Symbols for Legacy Computing": range(129792, 130048),
    "CJK Unified Ideographs Extension B": range(131072, 173792),
    "CJK Unified Ideographs Extension C": range(173824, 177984),
    "CJK Unified Ideographs Extension D": range(177984, 178208),
    "CJK Unified Ideographs Extension E": range(178208, 183984),
    "CJK Unified Ideographs Extension F": range(183984, 191472),
    "CJK Compatibility Ideographs Supplement": range(194560, 195104),
    "CJK Unified Ideographs Extension G": range(196608, 201552),
    "CJK Unified Ideographs Extension H": range(201552, 205744),
    "Tags": range(917504, 917632),
    "Variation Selectors Supplement": range(917760, 918000),
    "Supplementary Private Use Area-A": range(983040, 1048576),
    "Supplementary Private Use Area-B": range(1048576, 1114112),
}
  354. UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
  355. "Supplement",
  356. "Extended",
  357. "Extensions",
  358. "Modifier",
  359. "Marks",
  360. "Punctuation",
  361. "Symbols",
  362. "Forms",
  363. "Operators",
  364. "Miscellaneous",
  365. "Drawing",
  366. "Block",
  367. "Shapes",
  368. "Supplemental",
  369. "Tags",
  370. ]
  371. RE_POSSIBLE_ENCODING_INDICATION = re_compile(
  372. r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
  373. IGNORECASE,
  374. )
  375. IANA_NO_ALIASES = [
  376. "cp720",
  377. "cp737",
  378. "cp856",
  379. "cp874",
  380. "cp875",
  381. "cp1006",
  382. "koi8_r",
  383. "koi8_t",
  384. "koi8_u",
  385. ]
  386. IANA_SUPPORTED: List[str] = sorted(
  387. filter(
  388. lambda x: x.endswith("_codec") is False
  389. and x not in {"rot_13", "tactis", "mbcs"},
  390. list(set(aliases.values())) + IANA_NO_ALIASES,
  391. )
  392. )
  393. IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# Pre-computed table of code pages that are similar to each other, obtained
# with the function cp_similarity.
# Each key is a codec name; its value lists the codec names considered similar
# to it.
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_15": [
        "cp1252",
        "cp1254",
        "iso8859_10",
        "iso8859_14",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14",
        "iso8859_15",
        "iso8859_2",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}
# Maps a Python codec name (key) to a display label (value).
# NOTE(review): going by the constant's name, the labels are presumably the
# spellings chardet reports for the same encodings — confirm against the
# callers. Label casing is not uniform (e.g. "utf-8", "windows-1250",
# "Windows-1252") and is reproduced exactly as found.
CHARDET_CORRESPONDENCE: Dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}
  516. COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
  517. "<",
  518. ">",
  519. "=",
  520. ":",
  521. "/",
  522. "&",
  523. ";",
  524. "{",
  525. "}",
  526. "[",
  527. "]",
  528. ",",
  529. "|",
  530. '"',
  531. "-",
  532. }
  533. KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
  534. ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
  535. # Logging LEVEL below DEBUG
  536. TRACE: int = 5
# Language labels that contain the em dash "—" character are to be
# considered alternative sequences for the language named before the dash.
  539. FREQUENCIES: Dict[str, List[str]] = {
  540. "English": [
  541. "e",
  542. "a",
  543. "t",
  544. "i",
  545. "o",
  546. "n",
  547. "s",
  548. "r",
  549. "h",
  550. "l",
  551. "d",
  552. "c",
  553. "u",
  554. "m",
  555. "f",
  556. "p",
  557. "g",
  558. "w",
  559. "y",
  560. "b",
  561. "v",
  562. "k",
  563. "x",
  564. "j",
  565. "z",
  566. "q",
  567. ],
  568. "English—": [
  569. "e",
  570. "a",
  571. "t",
  572. "i",
  573. "o",
  574. "n",
  575. "s",
  576. "r",
  577. "h",
  578. "l",
  579. "d",
  580. "c",
  581. "m",
  582. "u",
  583. "f",
  584. "p",
  585. "g",
  586. "w",
  587. "b",
  588. "y",
  589. "v",
  590. "k",
  591. "j",
  592. "x",
  593. "z",
  594. "q",
  595. ],
  596. "German": [
  597. "e",
  598. "n",
  599. "i",
  600. "r",
  601. "s",
  602. "t",
  603. "a",
  604. "d",
  605. "h",
  606. "u",
  607. "l",
  608. "g",
  609. "o",
  610. "c",
  611. "m",
  612. "b",
  613. "f",
  614. "k",
  615. "w",
  616. "z",
  617. "p",
  618. "v",
  619. "ü",
  620. "ä",
  621. "ö",
  622. "j",
  623. ],
  624. "French": [
  625. "e",
  626. "a",
  627. "s",
  628. "n",
  629. "i",
  630. "t",
  631. "r",
  632. "l",
  633. "u",
  634. "o",
  635. "d",
  636. "c",
  637. "p",
  638. "m",
  639. "é",
  640. "v",
  641. "g",
  642. "f",
  643. "b",
  644. "h",
  645. "q",
  646. "à",
  647. "x",
  648. "è",
  649. "y",
  650. "j",
  651. ],
  652. "Dutch": [
  653. "e",
  654. "n",
  655. "a",
  656. "i",
  657. "r",
  658. "t",
  659. "o",
  660. "d",
  661. "s",
  662. "l",
  663. "g",
  664. "h",
  665. "v",
  666. "m",
  667. "u",
  668. "k",
  669. "c",
  670. "p",
  671. "b",
  672. "w",
  673. "j",
  674. "z",
  675. "f",
  676. "y",
  677. "x",
  678. "ë",
  679. ],
  680. "Italian": [
  681. "e",
  682. "i",
  683. "a",
  684. "o",
  685. "n",
  686. "l",
  687. "t",
  688. "r",
  689. "s",
  690. "c",
  691. "d",
  692. "u",
  693. "p",
  694. "m",
  695. "g",
  696. "v",
  697. "f",
  698. "b",
  699. "z",
  700. "h",
  701. "q",
  702. "è",
  703. "à",
  704. "k",
  705. "y",
  706. "ò",
  707. ],
  708. "Polish": [
  709. "a",
  710. "i",
  711. "o",
  712. "e",
  713. "n",
  714. "r",
  715. "z",
  716. "w",
  717. "s",
  718. "c",
  719. "t",
  720. "k",
  721. "y",
  722. "d",
  723. "p",
  724. "m",
  725. "u",
  726. "l",
  727. "j",
  728. "ł",
  729. "g",
  730. "b",
  731. "h",
  732. "ą",
  733. "ę",
  734. "ó",
  735. ],
  736. "Spanish": [
  737. "e",
  738. "a",
  739. "o",
  740. "n",
  741. "s",
  742. "r",
  743. "i",
  744. "l",
  745. "d",
  746. "t",
  747. "c",
  748. "u",
  749. "m",
  750. "p",
  751. "b",
  752. "g",
  753. "v",
  754. "f",
  755. "y",
  756. "ó",
  757. "h",
  758. "q",
  759. "í",
  760. "j",
  761. "z",
  762. "á",
  763. ],
  764. "Russian": [
  765. "о",
  766. "а",
  767. "е",
  768. "и",
  769. "н",
  770. "с",
  771. "т",
  772. "р",
  773. "в",
  774. "л",
  775. "к",
  776. "м",
  777. "д",
  778. "п",
  779. "у",
  780. "г",
  781. "я",
  782. "ы",
  783. "з",
  784. "б",
  785. "й",
  786. "ь",
  787. "ч",
  788. "х",
  789. "ж",
  790. "ц",
  791. ],
  792. # Jap-Kanji
  793. "Japanese": [
  794. "人",
  795. "一",
  796. "大",
  797. "亅",
  798. "丁",
  799. "丨",
  800. "竹",
  801. "笑",
  802. "口",
  803. "日",
  804. "今",
  805. "二",
  806. "彳",
  807. "行",
  808. "十",
  809. "土",
  810. "丶",
  811. "寸",
  812. "寺",
  813. "時",
  814. "乙",
  815. "丿",
  816. "乂",
  817. "气",
  818. "気",
  819. "冂",
  820. "巾",
  821. "亠",
  822. "市",
  823. "目",
  824. "儿",
  825. "見",
  826. "八",
  827. "小",
  828. "凵",
  829. "県",
  830. "月",
  831. "彐",
  832. "門",
  833. "間",
  834. "木",
  835. "東",
  836. "山",
  837. "出",
  838. "本",
  839. "中",
  840. "刀",
  841. "分",
  842. "耳",
  843. "又",
  844. "取",
  845. "最",
  846. "言",
  847. "田",
  848. "心",
  849. "思",
  850. "刂",
  851. "前",
  852. "京",
  853. "尹",
  854. "事",
  855. "生",
  856. "厶",
  857. "云",
  858. "会",
  859. "未",
  860. "来",
  861. "白",
  862. "冫",
  863. "楽",
  864. "灬",
  865. "馬",
  866. "尸",
  867. "尺",
  868. "駅",
  869. "明",
  870. "耂",
  871. "者",
  872. "了",
  873. "阝",
  874. "都",
  875. "高",
  876. "卜",
  877. "占",
  878. "厂",
  879. "广",
  880. "店",
  881. "子",
  882. "申",
  883. "奄",
  884. "亻",
  885. "俺",
  886. "上",
  887. "方",
  888. "冖",
  889. "学",
  890. "衣",
  891. "艮",
  892. "食",
  893. "自",
  894. ],
  895. # Japanese (Katakana)
  896. "Japanese—": [
  897. "ー",
  898. "ン",
  899. "ス",
  900. "・",
  901. "ル",
  902. "ト",
  903. "リ",
  904. "イ",
  905. "ア",
  906. "ラ",
  907. "ッ",
  908. "ク",
  909. "ド",
  910. "シ",
  911. "レ",
  912. "ジ",
  913. "タ",
  914. "フ",
  915. "ロ",
  916. "カ",
  917. "テ",
  918. "マ",
  919. "ィ",
  920. "グ",
  921. "バ",
  922. "ム",
  923. "プ",
  924. "オ",
  925. "コ",
  926. "デ",
  927. "ニ",
  928. "ウ",
  929. "メ",
  930. "サ",
  931. "ビ",
  932. "ナ",
  933. "ブ",
  934. "ャ",
  935. "エ",
  936. "ュ",
  937. "チ",
  938. "キ",
  939. "ズ",
  940. "ダ",
  941. "パ",
  942. "ミ",
  943. "ェ",
  944. "ョ",
  945. "ハ",
  946. "セ",
  947. "ベ",
  948. "ガ",
  949. "モ",
  950. "ツ",
  951. "ネ",
  952. "ボ",
  953. "ソ",
  954. "ノ",
  955. "ァ",
  956. "ヴ",
  957. "ワ",
  958. "ポ",
  959. "ペ",
  960. "ピ",
  961. "ケ",
  962. "ゴ",
  963. "ギ",
  964. "ザ",
  965. "ホ",
  966. "ゲ",
  967. "ォ",
  968. "ヤ",
  969. "ヒ",
  970. "ユ",
  971. "ヨ",
  972. "ヘ",
  973. "ゼ",
  974. "ヌ",
  975. "ゥ",
  976. "ゾ",
  977. "ヶ",
  978. "ヂ",
  979. "ヲ",
  980. "ヅ",
  981. "ヵ",
  982. "ヱ",
  983. "ヰ",
  984. "ヮ",
  985. "ヽ",
  986. "゠",
  987. "ヾ",
  988. "ヷ",
  989. "ヿ",
  990. "ヸ",
  991. "ヹ",
  992. "ヺ",
  993. ],
  994. # Japanese (Hiragana)
  995. "Japanese——": [
  996. "の",
  997. "に",
  998. "る",
  999. "た",
  1000. "と",
  1001. "は",
  1002. "し",
  1003. "い",
  1004. "を",
  1005. "で",
  1006. "て",
  1007. "が",
  1008. "な",
  1009. "れ",
  1010. "か",
  1011. "ら",
  1012. "さ",
  1013. "っ",
  1014. "り",
  1015. "す",
  1016. "あ",
  1017. "も",
  1018. "こ",
  1019. "ま",
  1020. "う",
  1021. "く",
  1022. "よ",
  1023. "き",
  1024. "ん",
  1025. "め",
  1026. "お",
  1027. "け",
  1028. "そ",
  1029. "つ",
  1030. "だ",
  1031. "や",
  1032. "え",
  1033. "ど",
  1034. "わ",
  1035. "ち",
  1036. "み",
  1037. "せ",
  1038. "じ",
  1039. "ば",
  1040. "へ",
  1041. "び",
  1042. "ず",
  1043. "ろ",
  1044. "ほ",
  1045. "げ",
  1046. "む",
  1047. "べ",
  1048. "ひ",
  1049. "ょ",
  1050. "ゆ",
  1051. "ぶ",
  1052. "ご",
  1053. "ゃ",
  1054. "ね",
  1055. "ふ",
  1056. "ぐ",
  1057. "ぎ",
  1058. "ぼ",
  1059. "ゅ",
  1060. "づ",
  1061. "ざ",
  1062. "ぞ",
  1063. "ぬ",
  1064. "ぜ",
  1065. "ぱ",
  1066. "ぽ",
  1067. "ぷ",
  1068. "ぴ",
  1069. "ぃ",
  1070. "ぁ",
  1071. "ぇ",
  1072. "ぺ",
  1073. "ゞ",
  1074. "ぢ",
  1075. "ぉ",
  1076. "ぅ",
  1077. "ゐ",
  1078. "ゝ",
  1079. "ゑ",
  1080. "゛",
  1081. "゜",
  1082. "ゎ",
  1083. "ゔ",
  1084. "゚",
  1085. "ゟ",
  1086. "゙",
  1087. "ゕ",
  1088. "ゖ",
  1089. ],
  1090. "Portuguese": [
  1091. "a",
  1092. "e",
  1093. "o",
  1094. "s",
  1095. "i",
  1096. "r",
  1097. "d",
  1098. "n",
  1099. "t",
  1100. "m",
  1101. "u",
  1102. "c",
  1103. "l",
  1104. "p",
  1105. "g",
  1106. "v",
  1107. "b",
  1108. "f",
  1109. "h",
  1110. "ã",
  1111. "q",
  1112. "é",
  1113. "ç",
  1114. "á",
  1115. "z",
  1116. "í",
  1117. ],
  1118. "Swedish": [
  1119. "e",
  1120. "a",
  1121. "n",
  1122. "r",
  1123. "t",
  1124. "s",
  1125. "i",
  1126. "l",
  1127. "d",
  1128. "o",
  1129. "m",
  1130. "k",
  1131. "g",
  1132. "v",
  1133. "h",
  1134. "f",
  1135. "u",
  1136. "p",
  1137. "ä",
  1138. "c",
  1139. "b",
  1140. "ö",
  1141. "å",
  1142. "y",
  1143. "j",
  1144. "x",
  1145. ],
  1146. "Chinese": [
  1147. "的",
  1148. "一",
  1149. "是",
  1150. "不",
  1151. "了",
  1152. "在",
  1153. "人",
  1154. "有",
  1155. "我",
  1156. "他",
  1157. "这",
  1158. "个",
  1159. "们",
  1160. "中",
  1161. "来",
  1162. "上",
  1163. "大",
  1164. "为",
  1165. "和",
  1166. "国",
  1167. "地",
  1168. "到",
  1169. "以",
  1170. "说",
  1171. "时",
  1172. "要",
  1173. "就",
  1174. "出",
  1175. "会",
  1176. "可",
  1177. "也",
  1178. "你",
  1179. "对",
  1180. "生",
  1181. "能",
  1182. "而",
  1183. "子",
  1184. "那",
  1185. "得",
  1186. "于",
  1187. "着",
  1188. "下",
  1189. "自",
  1190. "之",
  1191. "年",
  1192. "过",
  1193. "发",
  1194. "后",
  1195. "作",
  1196. "里",
  1197. "用",
  1198. "道",
  1199. "行",
  1200. "所",
  1201. "然",
  1202. "家",
  1203. "种",
  1204. "事",
  1205. "成",
  1206. "方",
  1207. "多",
  1208. "经",
  1209. "么",
  1210. "去",
  1211. "法",
  1212. "学",
  1213. "如",
  1214. "都",
  1215. "同",
  1216. "现",
  1217. "当",
  1218. "没",
  1219. "动",
  1220. "面",
  1221. "起",
  1222. "看",
  1223. "定",
  1224. "天",
  1225. "分",
  1226. "还",
  1227. "进",
  1228. "好",
  1229. "小",
  1230. "部",
  1231. "其",
  1232. "些",
  1233. "主",
  1234. "样",
  1235. "理",
  1236. "心",
  1237. "她",
  1238. "本",
  1239. "前",
  1240. "开",
  1241. "但",
  1242. "因",
  1243. "只",
  1244. "从",
  1245. "想",
  1246. "实",
  1247. ],
  1248. "Ukrainian": [
  1249. "о",
  1250. "а",
  1251. "н",
  1252. "і",
  1253. "и",
  1254. "р",
  1255. "в",
  1256. "т",
  1257. "е",
  1258. "с",
  1259. "к",
  1260. "л",
  1261. "у",
  1262. "д",
  1263. "м",
  1264. "п",
  1265. "з",
  1266. "я",
  1267. "ь",
  1268. "б",
  1269. "г",
  1270. "й",
  1271. "ч",
  1272. "х",
  1273. "ц",
  1274. "ї",
  1275. ],
  1276. "Norwegian": [
  1277. "e",
  1278. "r",
  1279. "n",
  1280. "t",
  1281. "a",
  1282. "s",
  1283. "i",
  1284. "o",
  1285. "l",
  1286. "d",
  1287. "g",
  1288. "k",
  1289. "m",
  1290. "v",
  1291. "f",
  1292. "p",
  1293. "u",
  1294. "b",
  1295. "h",
  1296. "å",
  1297. "y",
  1298. "j",
  1299. "ø",
  1300. "c",
  1301. "æ",
  1302. "w",
  1303. ],
  1304. "Finnish": [
  1305. "a",
  1306. "i",
  1307. "n",
  1308. "t",
  1309. "e",
  1310. "s",
  1311. "l",
  1312. "o",
  1313. "u",
  1314. "k",
  1315. "ä",
  1316. "m",
  1317. "r",
  1318. "v",
  1319. "j",
  1320. "h",
  1321. "p",
  1322. "y",
  1323. "d",
  1324. "ö",
  1325. "g",
  1326. "c",
  1327. "b",
  1328. "f",
  1329. "w",
  1330. "z",
  1331. ],
  1332. "Vietnamese": [
  1333. "n",
  1334. "h",
  1335. "t",
  1336. "i",
  1337. "c",
  1338. "g",
  1339. "a",
  1340. "o",
  1341. "u",
  1342. "m",
  1343. "l",
  1344. "r",
  1345. "à",
  1346. "đ",
  1347. "s",
  1348. "e",
  1349. "v",
  1350. "p",
  1351. "b",
  1352. "y",
  1353. "ư",
  1354. "d",
  1355. "á",
  1356. "k",
  1357. "ộ",
  1358. "ế",
  1359. ],
  1360. "Czech": [
  1361. "o",
  1362. "e",
  1363. "a",
  1364. "n",
  1365. "t",
  1366. "s",
  1367. "i",
  1368. "l",
  1369. "v",
  1370. "r",
  1371. "k",
  1372. "d",
  1373. "u",
  1374. "m",
  1375. "p",
  1376. "í",
  1377. "c",
  1378. "h",
  1379. "z",
  1380. "á",
  1381. "y",
  1382. "j",
  1383. "b",
  1384. "ě",
  1385. "é",
  1386. "ř",
  1387. ],
  1388. "Hungarian": [
  1389. "e",
  1390. "a",
  1391. "t",
  1392. "l",
  1393. "s",
  1394. "n",
  1395. "k",
  1396. "r",
  1397. "i",
  1398. "o",
  1399. "z",
  1400. "á",
  1401. "é",
  1402. "g",
  1403. "m",
  1404. "b",
  1405. "y",
  1406. "v",
  1407. "d",
  1408. "h",
  1409. "u",
  1410. "p",
  1411. "j",
  1412. "ö",
  1413. "f",
  1414. "c",
  1415. ],
  1416. "Korean": [
  1417. "이",
  1418. "다",
  1419. "에",
  1420. "의",
  1421. "는",
  1422. "로",
  1423. "하",
  1424. "을",
  1425. "가",
  1426. "고",
  1427. "지",
  1428. "서",
  1429. "한",
  1430. "은",
  1431. "기",
  1432. "으",
  1433. "년",
  1434. "대",
  1435. "사",
  1436. "시",
  1437. "를",
  1438. "리",
  1439. "도",
  1440. "인",
  1441. "스",
  1442. "일",
  1443. ],
  1444. "Indonesian": [
  1445. "a",
  1446. "n",
  1447. "e",
  1448. "i",
  1449. "r",
  1450. "t",
  1451. "u",
  1452. "s",
  1453. "d",
  1454. "k",
  1455. "m",
  1456. "l",
  1457. "g",
  1458. "p",
  1459. "b",
  1460. "o",
  1461. "h",
  1462. "y",
  1463. "j",
  1464. "c",
  1465. "w",
  1466. "f",
  1467. "v",
  1468. "z",
  1469. "x",
  1470. "q",
  1471. ],
  1472. "Turkish": [
  1473. "a",
  1474. "e",
  1475. "i",
  1476. "n",
  1477. "r",
  1478. "l",
  1479. "ı",
  1480. "k",
  1481. "d",
  1482. "t",
  1483. "s",
  1484. "m",
  1485. "y",
  1486. "u",
  1487. "o",
  1488. "b",
  1489. "ü",
  1490. "ş",
  1491. "v",
  1492. "g",
  1493. "z",
  1494. "h",
  1495. "c",
  1496. "p",
  1497. "ç",
  1498. "ğ",
  1499. ],
  1500. "Romanian": [
  1501. "e",
  1502. "i",
  1503. "a",
  1504. "r",
  1505. "n",
  1506. "t",
  1507. "u",
  1508. "l",
  1509. "o",
  1510. "c",
  1511. "s",
  1512. "d",
  1513. "p",
  1514. "m",
  1515. "ă",
  1516. "f",
  1517. "v",
  1518. "î",
  1519. "g",
  1520. "b",
  1521. "ș",
  1522. "ț",
  1523. "z",
  1524. "h",
  1525. "â",
  1526. "j",
  1527. ],
  1528. "Farsi": [
  1529. "ا",
  1530. "ی",
  1531. "ر",
  1532. "د",
  1533. "ن",
  1534. "ه",
  1535. "و",
  1536. "م",
  1537. "ت",
  1538. "ب",
  1539. "س",
  1540. "ل",
  1541. "ک",
  1542. "ش",
  1543. "ز",
  1544. "ف",
  1545. "گ",
  1546. "ع",
  1547. "خ",
  1548. "ق",
  1549. "ج",
  1550. "آ",
  1551. "پ",
  1552. "ح",
  1553. "ط",
  1554. "ص",
  1555. ],
  1556. "Arabic": [
  1557. "ا",
  1558. "ل",
  1559. "ي",
  1560. "م",
  1561. "و",
  1562. "ن",
  1563. "ر",
  1564. "ت",
  1565. "ب",
  1566. "ة",
  1567. "ع",
  1568. "د",
  1569. "س",
  1570. "ف",
  1571. "ه",
  1572. "ك",
  1573. "ق",
  1574. "أ",
  1575. "ح",
  1576. "ج",
  1577. "ش",
  1578. "ط",
  1579. "ص",
  1580. "ى",
  1581. "خ",
  1582. "إ",
  1583. ],
  1584. "Danish": [
  1585. "e",
  1586. "r",
  1587. "n",
  1588. "t",
  1589. "a",
  1590. "i",
  1591. "s",
  1592. "d",
  1593. "l",
  1594. "o",
  1595. "g",
  1596. "m",
  1597. "k",
  1598. "f",
  1599. "v",
  1600. "u",
  1601. "b",
  1602. "h",
  1603. "p",
  1604. "å",
  1605. "y",
  1606. "ø",
  1607. "æ",
  1608. "c",
  1609. "j",
  1610. "w",
  1611. ],
  1612. "Serbian": [
  1613. "а",
  1614. "и",
  1615. "о",
  1616. "е",
  1617. "н",
  1618. "р",
  1619. "с",
  1620. "у",
  1621. "т",
  1622. "к",
  1623. "ј",
  1624. "в",
  1625. "д",
  1626. "м",
  1627. "п",
  1628. "л",
  1629. "г",
  1630. "з",
  1631. "б",
  1632. "a",
  1633. "i",
  1634. "e",
  1635. "o",
  1636. "n",
  1637. "ц",
  1638. "ш",
  1639. ],
  1640. "Lithuanian": [
  1641. "i",
  1642. "a",
  1643. "s",
  1644. "o",
  1645. "r",
  1646. "e",
  1647. "t",
  1648. "n",
  1649. "u",
  1650. "k",
  1651. "m",
  1652. "l",
  1653. "p",
  1654. "v",
  1655. "d",
  1656. "j",
  1657. "g",
  1658. "ė",
  1659. "b",
  1660. "y",
  1661. "ų",
  1662. "š",
  1663. "ž",
  1664. "c",
  1665. "ą",
  1666. "į",
  1667. ],
  1668. "Slovene": [
  1669. "e",
  1670. "a",
  1671. "i",
  1672. "o",
  1673. "n",
  1674. "r",
  1675. "s",
  1676. "l",
  1677. "t",
  1678. "j",
  1679. "v",
  1680. "k",
  1681. "d",
  1682. "p",
  1683. "m",
  1684. "u",
  1685. "z",
  1686. "b",
  1687. "g",
  1688. "h",
  1689. "č",
  1690. "c",
  1691. "š",
  1692. "ž",
  1693. "f",
  1694. "y",
  1695. ],
  1696. "Slovak": [
  1697. "o",
  1698. "a",
  1699. "e",
  1700. "n",
  1701. "i",
  1702. "r",
  1703. "v",
  1704. "t",
  1705. "s",
  1706. "l",
  1707. "k",
  1708. "d",
  1709. "m",
  1710. "p",
  1711. "u",
  1712. "c",
  1713. "h",
  1714. "j",
  1715. "b",
  1716. "z",
  1717. "á",
  1718. "y",
  1719. "ý",
  1720. "í",
  1721. "č",
  1722. "é",
  1723. ],
  1724. "Hebrew": [
  1725. "י",
  1726. "ו",
  1727. "ה",
  1728. "ל",
  1729. "ר",
  1730. "ב",
  1731. "ת",
  1732. "מ",
  1733. "א",
  1734. "ש",
  1735. "נ",
  1736. "ע",
  1737. "ם",
  1738. "ד",
  1739. "ק",
  1740. "ח",
  1741. "פ",
  1742. "ס",
  1743. "כ",
  1744. "ג",
  1745. "ט",
  1746. "צ",
  1747. "ן",
  1748. "ז",
  1749. "ך",
  1750. ],
  1751. "Bulgarian": [
  1752. "а",
  1753. "и",
  1754. "о",
  1755. "е",
  1756. "н",
  1757. "т",
  1758. "р",
  1759. "с",
  1760. "в",
  1761. "л",
  1762. "к",
  1763. "д",
  1764. "п",
  1765. "м",
  1766. "з",
  1767. "г",
  1768. "я",
  1769. "ъ",
  1770. "у",
  1771. "б",
  1772. "ч",
  1773. "ц",
  1774. "й",
  1775. "ж",
  1776. "щ",
  1777. "х",
  1778. ],
  1779. "Croatian": [
  1780. "a",
  1781. "i",
  1782. "o",
  1783. "e",
  1784. "n",
  1785. "r",
  1786. "j",
  1787. "s",
  1788. "t",
  1789. "u",
  1790. "k",
  1791. "l",
  1792. "v",
  1793. "d",
  1794. "m",
  1795. "p",
  1796. "g",
  1797. "z",
  1798. "b",
  1799. "c",
  1800. "č",
  1801. "h",
  1802. "š",
  1803. "ž",
  1804. "ć",
  1805. "f",
  1806. ],
  1807. "Hindi": [
  1808. "क",
  1809. "र",
  1810. "स",
  1811. "न",
  1812. "त",
  1813. "म",
  1814. "ह",
  1815. "प",
  1816. "य",
  1817. "ल",
  1818. "व",
  1819. "ज",
  1820. "द",
  1821. "ग",
  1822. "ब",
  1823. "श",
  1824. "ट",
  1825. "अ",
  1826. "ए",
  1827. "थ",
  1828. "भ",
  1829. "ड",
  1830. "च",
  1831. "ध",
  1832. "ष",
  1833. "इ",
  1834. ],
  1835. "Estonian": [
  1836. "a",
  1837. "i",
  1838. "e",
  1839. "s",
  1840. "t",
  1841. "l",
  1842. "u",
  1843. "n",
  1844. "o",
  1845. "k",
  1846. "r",
  1847. "d",
  1848. "m",
  1849. "v",
  1850. "g",
  1851. "p",
  1852. "j",
  1853. "h",
  1854. "ä",
  1855. "b",
  1856. "õ",
  1857. "ü",
  1858. "f",
  1859. "c",
  1860. "ö",
  1861. "y",
  1862. ],
  1863. "Thai": [
  1864. "า",
  1865. "น",
  1866. "ร",
  1867. "อ",
  1868. "ก",
  1869. "เ",
  1870. "ง",
  1871. "ม",
  1872. "ย",
  1873. "ล",
  1874. "ว",
  1875. "ด",
  1876. "ท",
  1877. "ส",
  1878. "ต",
  1879. "ะ",
  1880. "ป",
  1881. "บ",
  1882. "ค",
  1883. "ห",
  1884. "แ",
  1885. "จ",
  1886. "พ",
  1887. "ช",
  1888. "ข",
  1889. "ใ",
  1890. ],
  1891. "Greek": [
  1892. "α",
  1893. "τ",
  1894. "ο",
  1895. "ι",
  1896. "ε",
  1897. "ν",
  1898. "ρ",
  1899. "σ",
  1900. "κ",
  1901. "η",
  1902. "π",
  1903. "ς",
  1904. "υ",
  1905. "μ",
  1906. "λ",
  1907. "ί",
  1908. "ό",
  1909. "ά",
  1910. "γ",
  1911. "έ",
  1912. "δ",
  1913. "ή",
  1914. "ω",
  1915. "χ",
  1916. "θ",
  1917. "ύ",
  1918. ],
  1919. "Tamil": [
  1920. "க",
  1921. "த",
  1922. "ப",
  1923. "ட",
  1924. "ர",
  1925. "ம",
  1926. "ல",
  1927. "ன",
  1928. "வ",
  1929. "ற",
  1930. "ய",
  1931. "ள",
  1932. "ச",
  1933. "ந",
  1934. "இ",
  1935. "ண",
  1936. "அ",
  1937. "ஆ",
  1938. "ழ",
  1939. "ங",
  1940. "எ",
  1941. "உ",
  1942. "ஒ",
  1943. "ஸ",
  1944. ],
  1945. "Kazakh": [
  1946. "а",
  1947. "ы",
  1948. "е",
  1949. "н",
  1950. "т",
  1951. "р",
  1952. "л",
  1953. "і",
  1954. "д",
  1955. "с",
  1956. "м",
  1957. "қ",
  1958. "к",
  1959. "о",
  1960. "б",
  1961. "и",
  1962. "у",
  1963. "ғ",
  1964. "ж",
  1965. "ң",
  1966. "з",
  1967. "ш",
  1968. "й",
  1969. "п",
  1970. "г",
  1971. "ө",
  1972. ],
  1973. }
  1974. LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)  # number of entries in FREQUENCIES; the "Japanese", "Japanese—" and "Japanese——" keys each count as a separate entry