udhr.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. from gflanguages import languages_public_pb2
  2. import enum
  3. import re
  4. class Udhr:
  5. def __init__(
  6. self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
  7. ):
  8. self.key = key
  9. self.iso639_3 = iso639_3
  10. self.iso15924 = iso15924
  11. self.bcp47 = bcp47
  12. self.direction = direction
  13. self.ohchr = ohchr
  14. self.stage = stage
  15. self.loc = loc
  16. self.name = name
  17. self.title = None
  18. self.preamble = None
  19. self.articles = []
  20. def Parse(self, translation_data):
  21. if translation_data is None or self.stage < 2:
  22. return
  23. if translation_data.find("./{*}title") is not None:
  24. self.title = translation_data.find("./{*}title").text
  25. preamble_data = translation_data.find("./{*}preamble")
  26. if preamble_data is not None:
  27. if preamble_data.find("./{*}title") is not None:
  28. self.preamble = {
  29. "title": preamble_data.find("./{*}title").text,
  30. "content": [
  31. para.text for para in preamble_data.findall("./{*}para")
  32. ],
  33. }
  34. articles_data = translation_data.findall("./{*}article")
  35. for article_data in articles_data:
  36. title_data = article_data.find("./{*}title")
  37. article = {
  38. "id": int(article_data.get("number")),
  39. "title": None if title_data is None else title_data.text,
  40. "content": [para.text for para in article_data.findall("./{*}para")],
  41. }
  42. self.articles.append(article)
  43. def LoadArticleOne(self, article_one):
  44. self.articles.append({"id": 0, "title": None, "content": [article_one]})
  45. def GetSampleTexts(self):
  46. extractor = SampleTextExtractor(self)
  47. return extractor.GetSampleTexts()
  48. class SampleTextExtractor:
  49. class TextType(enum.Enum):
  50. GLYPHS = 1
  51. WORD = 2
  52. PHRASE = 3
  53. SENTENCE = 4
  54. PARAGRAPH = 5
  55. PASSAGE = 6
  56. def __init__(self, udhr):
  57. self._udhr = udhr
  58. self._glyphs = iter(self._GetGlyphs())
  59. self._words = iter(self._GetWords())
  60. self._paragraphs = iter(self._GetParagraphs())
  61. self._phrase_history = set()
  62. self._non_word_regex = re.compile(r"[^\w]+")
  63. self._space_regex = re.compile(r"\s+")
  64. self._non_space_regex = re.compile(r"[^\s]+")
  65. self._non_word_space_regex = re.compile(r"[^\w\s]+")
  66. self._any_regex = re.compile(r".")
  67. def _DisplayLength(self, s):
  68. """Returns length of given string. Omits combining characters.
  69. Some entire scripts will not be counted; in those cases, the raw length of
  70. the string is returned.
  71. """
  72. word_space_length = len(self._non_word_space_regex.sub("", s))
  73. space_length = len(self._non_space_regex.sub("", s))
  74. if word_space_length == space_length:
  75. return len(s)
  76. return word_space_length
  77. def _GetGlyphs(self):
  78. seen = set()
  79. for article in self._udhr.articles:
  80. for para in article["content"]:
  81. for ch in self._non_word_regex.sub("", para) or self._space_regex.sub(
  82. "", para
  83. ):
  84. ch = ch.lower()
  85. if ch not in seen:
  86. seen.add(ch)
  87. yield ch
  88. def _GetWords(self):
  89. if self._space_regex.search(self._udhr.articles[0]["content"][0]) is not None:
  90. splitter = self._space_regex
  91. else:
  92. splitter = self._non_word_regex
  93. seen = set()
  94. for article in self._udhr.articles:
  95. for para in article["content"]:
  96. for s in splitter.split(para):
  97. if s not in seen:
  98. seen.add(s)
  99. yield s
  100. def _GetParagraphs(self):
  101. if self._udhr.preamble is not None:
  102. for para in self._udhr.preamble["content"]:
  103. yield para
  104. for article in self._udhr.articles:
  105. for para in article["content"]:
  106. yield para
  107. def _ExtractGlyphs(self, min_chars, max_chars):
  108. s = ""
  109. for ch in self._glyphs:
  110. s += ch.upper()
  111. if len(s) >= min_chars:
  112. break
  113. if ch != ch.upper():
  114. s += ch
  115. if len(s) >= min_chars:
  116. break
  117. return s
  118. def _ExtractWord(self, min_chars, max_chars):
  119. for iterator in [self._words, self._GetWords()]:
  120. for w in iterator:
  121. if w is None:
  122. continue
  123. if min_chars <= self._DisplayLength(w) <= max_chars:
  124. return w
  125. # Fallback to using multiple words for languages with very small words
  126. return self._ExtractPhrase(min_chars, max_chars)
  127. def _ExtractPhrase(self, min_chars, max_chars):
  128. for iterator in [self._paragraphs, self._GetParagraphs()]:
  129. for para in iterator:
  130. if para is None:
  131. continue
  132. for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
  133. breaks = [-1]
  134. for match in regex.finditer(para, min_chars):
  135. breaks.append(match.start())
  136. phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
  137. p_size = self._DisplayLength(phrase)
  138. while p_size > max_chars and len(breaks) > 1:
  139. breaks.pop()
  140. phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
  141. p_size = self._DisplayLength(phrase)
  142. if min_chars <= p_size and phrase not in self._phrase_history:
  143. self._phrase_history.add(phrase)
  144. return phrase
  145. return self._ExtractParagraph(min_chars, max_chars)
  146. def _ExtractSentence(self, min_chars, max_chars):
  147. # Sentence delimination may differ between scripts, so tokenizing on spaces
  148. # would be unreliable. Prefer to use _ExtractPhrase.
  149. return self._ExtractPhrase(min_chars, max_chars)
  150. def _ExtractParagraph(self, min_chars, max_chars):
  151. for iterator in [self._paragraphs, self._GetParagraphs()]:
  152. for para in iterator:
  153. if para is None:
  154. continue
  155. if min_chars <= self._DisplayLength(para) <= max_chars:
  156. return para
  157. # Paragraphs likely insufficient length; try combining into passages
  158. return self._ExtractPassage(min_chars, max_chars)
  159. def _ExtractPassage(self, min_chars, max_chars):
  160. p = []
  161. p_size = 0
  162. while p_size < min_chars:
  163. for iterator in [self._paragraphs, self._GetParagraphs()]:
  164. for para in iterator:
  165. if para is None:
  166. continue
  167. p.append(para)
  168. p_size = self._DisplayLength(" ".join(p))
  169. if max_chars < p_size:
  170. p.pop()
  171. elif min_chars <= p_size:
  172. return "\n".join(p)
  173. assert len(p) > 0, "Unable to extract passage: " + self._udhr.key
  174. if len(p) == 0:
  175. p.append([p for p in self._GetParagraphs()][0])
  176. return "\n".join(p)
  177. def _Get(self, text_type, **kwargs):
  178. if "char_count" in kwargs:
  179. min_chars = kwargs["char_count"]
  180. max_chars = kwargs["char_count"]
  181. else:
  182. min_chars = kwargs["min_chars"]
  183. max_chars = kwargs["max_chars"]
  184. if text_type == self.TextType.GLYPHS:
  185. return self._ExtractGlyphs(min_chars, max_chars)
  186. if text_type == self.TextType.WORD:
  187. return self._ExtractWord(min_chars, max_chars)
  188. if text_type == self.TextType.PHRASE:
  189. return self._ExtractPhrase(min_chars, max_chars)
  190. if text_type == self.TextType.SENTENCE:
  191. return self._ExtractSentence(min_chars, max_chars)
  192. if text_type == self.TextType.PARAGRAPH:
  193. return self._ExtractParagraph(min_chars, max_chars)
  194. if text_type == self.TextType.PASSAGE:
  195. return self._ExtractPassage(min_chars, max_chars)
  196. raise Exception("Unsupported text type: " + text_type)
  197. def GetSampleTexts(self):
  198. sample_text = languages_public_pb2.SampleTextProto()
  199. sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
  200. sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
  201. sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
  202. sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
  203. sample_text.poster_sm = self._Get(
  204. self.TextType.PHRASE, min_chars=10, max_chars=17
  205. )
  206. sample_text.poster_md = self._Get(
  207. self.TextType.PHRASE, min_chars=6, max_chars=12
  208. )
  209. sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
  210. sample_text.specimen_48 = self._Get(
  211. self.TextType.SENTENCE, min_chars=50, max_chars=80
  212. )
  213. sample_text.specimen_36 = self._Get(
  214. self.TextType.PARAGRAPH, min_chars=100, max_chars=120
  215. )
  216. sample_text.specimen_32 = self._Get(
  217. self.TextType.PARAGRAPH, min_chars=140, max_chars=180
  218. )
  219. sample_text.specimen_21 = self._Get(
  220. self.TextType.PASSAGE, min_chars=300, max_chars=500
  221. )
  222. sample_text.specimen_16 = self._Get(
  223. self.TextType.PASSAGE, min_chars=550, max_chars=750
  224. )
  225. return sample_text