import enum
import re

from gflanguages import languages_public_pb2


class Udhr:
    """Metadata and parsed text of one UDHR translation.

    The constructor only stores the given metadata; ``title``, ``preamble``
    and ``articles`` are filled in later by ``Parse`` or ``LoadArticleOne``.
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage
        self.loc = loc
        self.name = name
        # Populated by Parse() / LoadArticleOne().
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Fills title, preamble and articles from a parsed translation.

        Does nothing when ``translation_data`` is None or ``self.stage`` is
        below 2.

        Args:
            translation_data: Root node of the translation; it must offer
                ``find``/``findall`` with ``{*}`` namespace wildcards
                (presumably an ``xml.etree.ElementTree.Element`` -- confirm
                against the caller).
        """
        if translation_data is None or self.stage < 2:
            return

        title_data = translation_data.find("./{*}title")
        if title_data is not None:
            self.title = title_data.text

        preamble_data = translation_data.find("./{*}preamble")
        if preamble_data is not None:
            preamble_title = preamble_data.find("./{*}title")
            # A preamble without a title element is not recorded.
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": [
                        para.text for para in preamble_data.findall("./{*}para")
                    ],
                }

        for article_data in translation_data.findall("./{*}article"):
            article_title = article_data.find("./{*}title")
            self.articles.append(
                {
                    "id": int(article_data.get("number")),
                    "title": None if article_title is None else article_title.text,
                    "content": [
                        para.text for para in article_data.findall("./{*}para")
                    ],
                }
            )

    def LoadArticleOne(self, article_one):
        """Appends ``article_one`` as a single-paragraph article with id 0."""
        self.articles.append({"id": 0, "title": None, "content": [article_one]})

    def GetSampleTexts(self):
        """Returns the sample texts extracted from this translation."""
        extractor = SampleTextExtractor(self)
        return extractor.GetSampleTexts()


class SampleTextExtractor:
    """Extracts sample strings of requested lengths from a ``Udhr``.

    The glyph, word and paragraph iterators are shared between calls, so
    successive requests continue where the previous one stopped instead of
    starting over from the beginning of the text.
    """

    class TextType(enum.Enum):
        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        self._udhr = udhr
        # The generators are lazy: nothing is read from udhr until first use.
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        self._phrase_history = set()
        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

    def _DisplayLength(self, s):
        """Returns length of given string.

        Omits combining characters: everything that is neither a word
        character nor whitespace is dropped before counting.

        Some entire scripts will not be counted (only whitespace survives the
        filtering); in those cases, the raw length of the string is returned.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        if word_space_length == space_length:
            return len(s)
        return word_space_length

    def _GetArticleParagraphs(self):
        """Yields every article paragraph that has text, in document order."""
        for article in self._udhr.articles:
            for para in article["content"]:
                # Paragraph text may be None; there is nothing to tokenize.
                if para is not None:
                    yield para

    def _GetGlyphs(self):
        """Yields each distinct lower-cased character of the articles once.

        Non-word characters are stripped from a paragraph first; when that
        leaves nothing, only whitespace is stripped instead.
        """
        seen = set()
        for para in self._GetArticleParagraphs():
            chars = self._non_word_regex.sub("", para) or self._space_regex.sub(
                "", para
            )
            for ch in chars:
                ch = ch.lower()
                if ch not in seen:
                    seen.add(ch)
                    yield ch

    def _GetWords(self):
        """Yields each distinct token of the articles once.

        Tokens are split on whitespace when the first paragraph contains
        any, otherwise on runs of non-word characters. Yields nothing when
        there is no paragraph text at all.
        """
        first_para = next(self._GetArticleParagraphs(), None)
        if first_para is None:
            return
        if self._space_regex.search(first_para) is not None:
            splitter = self._space_regex
        else:
            splitter = self._non_word_regex
        seen = set()
        for para in self._GetArticleParagraphs():
            for s in splitter.split(para):
                if s not in seen:
                    seen.add(s)
                    yield s

    def _GetParagraphs(self):
        """Yields preamble paragraphs (if any), then all article paragraphs.

        Values are yielded as stored, so callers must expect None entries.
        """
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Builds a string of upper/lower-case glyph pairs.

        For every glyph the upper-case form is appended, followed by the
        glyph itself when it differs from its upper-case form. Accumulation
        stops as soon as the string holds at least ``min_chars`` characters
        or the glyph iterator runs out. ``max_chars`` is not used.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            if ch != ch.upper():
                s += ch
            if len(s) >= min_chars:
                break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Returns the next word whose display length is within the bounds.

        The shared word iterator is tried first, then a fresh pass over all
        words.
        """
        for iterator in [self._words, self._GetWords()]:
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractPhrase(self, min_chars, max_chars):
        """Returns a not-yet-used paragraph prefix within the length bounds.

        For each paragraph, candidate cut positions are collected from index
        ``min_chars`` onwards: first at any character, then at whitespace,
        then at non-word runs. The prefix is shortened cut by cut until its
        display length fits ``max_chars``. Falls back to ``_ExtractParagraph``
        when no paragraph yields a new phrase.
        """
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
                    # The -1 sentinel makes the slice start at index 0. When
                    # it is the only entry, the slice end is -1 as well, i.e.
                    # the paragraph without its last character.
                    breaks = [-1]
                    for match in regex.finditer(para, min_chars):
                        breaks.append(match.start())
                    phrase = para[breaks[0] + 1 : breaks[-1]]
                    p_size = self._DisplayLength(phrase)
                    while p_size > max_chars and len(breaks) > 1:
                        breaks.pop()
                        phrase = para[breaks[0] + 1 : breaks[-1]]
                        p_size = self._DisplayLength(phrase)
                    if min_chars <= p_size and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        """Returns a sentence-like sample; delegates to ``_ExtractPhrase``."""
        # Sentence delimitation may differ between scripts, so tokenizing on
        # spaces would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Returns the next paragraph whose display length is within bounds.

        The shared paragraph iterator is tried first, then a fresh pass over
        all paragraphs.
        """
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Returns consecutive paragraphs joined by newlines.

        Paragraphs are collected until their combined display length is
        within ``[min_chars, max_chars]``; a paragraph that would exceed
        ``max_chars`` is skipped. When the available text is too short, the
        paragraphs are traversed again and repeated.

        Raises:
            AssertionError: If a full pass collects no paragraph at all.
        """
        p = []
        p_size = 0
        while p_size < min_chars:
            # The list is rebuilt on every pass so that the second entry is
            # always a fresh generator.
            for iterator in [self._paragraphs, self._GetParagraphs()]:
                for para in iterator:
                    if para is None:
                        continue
                    p.append(para)
                    p_size = self._DisplayLength(" ".join(p))
                    if max_chars < p_size:
                        # p_size is not recomputed after the pop; if this was
                        # the last paragraph of the pass, the stale value
                        # ends the while loop.
                        p.pop()
                    elif min_chars <= p_size:
                        return "\n".join(p)
            # Raised explicitly instead of asserted so that the check (and
            # with it the loop's termination) survives ``python -O``.
            if not p:
                raise AssertionError(
                    f"Unable to extract passage: {self._udhr.key}"
                )
        return "\n".join(p)

    def _Get(self, text_type, **kwargs):
        """Dispatches to the extractor for ``text_type``.

        Args:
            text_type: A ``TextType`` member.
            **kwargs: Either ``char_count`` (used as both bounds) or both
                ``min_chars`` and ``max_chars``.

        Raises:
            KeyError: If neither ``char_count`` nor both bounds are given.
            ValueError: If ``text_type`` is not a supported ``TextType``.
        """
        if "char_count" in kwargs:
            min_chars = kwargs["char_count"]
            max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]
        if text_type == self.TextType.GLYPHS:
            return self._ExtractGlyphs(min_chars, max_chars)
        if text_type == self.TextType.WORD:
            return self._ExtractWord(min_chars, max_chars)
        if text_type == self.TextType.PHRASE:
            return self._ExtractPhrase(min_chars, max_chars)
        if text_type == self.TextType.SENTENCE:
            return self._ExtractSentence(min_chars, max_chars)
        if text_type == self.TextType.PARAGRAPH:
            return self._ExtractParagraph(min_chars, max_chars)
        if text_type == self.TextType.PASSAGE:
            return self._ExtractPassage(min_chars, max_chars)
        # Formatted rather than concatenated: text_type is not a str here.
        raise ValueError(f"Unsupported text type: {text_type}")

    def GetSampleTexts(self):
        """Builds a ``SampleTextProto`` with one sample per field.

        The fields are filled in a fixed order; because the extractors share
        their iterators, changing the order changes the resulting samples.
        """
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text