123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- from gflanguages import languages_public_pb2
- import enum
- import re
class Udhr:
    """A single UDHR translation: its metadata plus any parsed text.

    The text attributes (`title`, `preamble`, `articles`) start out empty and
    are populated by `Parse` and/or `LoadArticleOne`.
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        """Stores the metadata verbatim and initialises the text fields."""
        # Metadata, kept exactly as supplied by the caller.
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage
        self.loc = loc
        self.name = name

        # Text content; nothing is available until Parse()/LoadArticleOne().
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Fills title, preamble and articles from a translation element.

        Nothing is read when `translation_data` is None or when `self.stage`
        is below 2.

        Args:
          translation_data: element exposing `find`, `findall` and `get`
            (presumably an ElementTree element -- confirm against the caller).
        """
        if translation_data is None:
            return
        if self.stage < 2:
            return

        def paragraph_texts(node):
            # Text of each direct <para> child, in document order.
            return [para.text for para in node.findall("./{*}para")]

        document_title = translation_data.find("./{*}title")
        if document_title is not None:
            self.title = document_title.text

        # The preamble is only recorded when it carries its own title.
        preamble_node = translation_data.find("./{*}preamble")
        if preamble_node is not None:
            preamble_title = preamble_node.find("./{*}title")
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": paragraph_texts(preamble_node),
                }

        for node in translation_data.findall("./{*}article"):
            heading = node.find("./{*}title")
            self.articles.append(
                {
                    "id": int(node.get("number")),
                    "title": heading.text if heading is not None else None,
                    "content": paragraph_texts(node),
                }
            )

    def LoadArticleOne(self, article_one):
        """Appends `article_one` as a single-paragraph, untitled article with id 0."""
        entry = {"id": 0, "title": None, "content": [article_one]}
        self.articles.append(entry)

    def GetSampleTexts(self):
        """Returns the sample texts built by a `SampleTextExtractor` over self."""
        return SampleTextExtractor(self).GetSampleTexts()
class SampleTextExtractor:
    """Extracts sample strings of requested sizes from a parsed translation.

    The extractor holds one shared iterator each over the glyphs, words and
    paragraphs of the wrapped object. Successive requests continue where the
    previous one stopped, and each extraction falls back to a fresh pass over
    the text once the shared iterator is exhausted.
    """

    class TextType(enum.Enum):
        """Kinds of sample text that `_Get` knows how to produce."""

        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        """Wraps `udhr` and prepares the shared iterators and regexes.

        Args:
          udhr: object providing `key`, `preamble` and `articles` in the shape
            produced by `Udhr` (a list of dicts with a "content" list).
        """
        self._udhr = udhr
        # The generators are lazy, so nothing is read from `udhr` yet.
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        # Phrases already handed out, so _ExtractPhrase does not repeat itself.
        self._phrase_history = set()
        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

    def _DisplayLength(self, s):
        """Returns length of given string. Omits combining characters.

        Characters that are neither word characters (`\\w`) nor whitespace are
        not counted. Some entire scripts will not be counted; in those cases
        (nothing but whitespace survives the filtering), the raw length of the
        string is returned.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        if word_space_length == space_length:
            # No word characters were recognised at all: fall back to len().
            return len(s)
        return word_space_length

    def _GetGlyphs(self):
        """Yields each distinct lower-cased character of the articles once.

        Per paragraph, word characters are used; if a paragraph has none, its
        non-whitespace characters are used instead. `None` paragraphs are
        skipped, matching the paragraph-based extractors.
        """
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                if para is None:
                    continue
                chars = self._non_word_regex.sub("", para) or self._space_regex.sub(
                    "", para
                )
                for ch in chars:
                    ch = ch.lower()
                    if ch not in seen:
                        seen.add(ch)
                        yield ch

    def _GetWords(self):
        """Yields each distinct token of the articles once, in text order.

        Tokens are split on whitespace when the first available paragraph
        contains whitespace, otherwise on runs of non-word characters. `None`
        paragraphs are skipped; with no usable paragraph nothing is yielded.
        """
        probe = next(
            (
                para
                for article in self._udhr.articles
                for para in article["content"]
                if para is not None
            ),
            None,
        )
        if probe is None:
            return
        if self._space_regex.search(probe) is not None:
            splitter = self._space_regex
        else:
            splitter = self._non_word_regex
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                if para is None:
                    continue
                for s in splitter.split(para):
                    if s not in seen:
                        seen.add(s)
                        yield s

    def _GetParagraphs(self):
        """Yields the preamble paragraphs (if any), then every article paragraph."""
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Builds a string of upper/lower glyph pairs from the shared iterator.

        Each glyph contributes its upper-case form, followed by the glyph
        itself when that differs. Collection stops as soon as the string
        holds at least `min_chars` characters; `max_chars` is not consulted.
        The result may be shorter if the glyphs run out.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            if ch != ch.upper():
                s += ch
                if len(s) >= min_chars:
                    break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Returns the next word whose display length is within the bounds.

        The shared word iterator is tried first, then a fresh pass over all
        words. If no word fits, the request is delegated to `_ExtractPhrase`.
        """
        for iterator in [self._words, self._GetWords()]:
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractPhrase(self, min_chars, max_chars):
        """Returns a not-yet-used prefix of a paragraph sized to the bounds.

        For each paragraph, candidate cut points are the match starts of, in
        turn, any character, whitespace runs, and non-word runs, searched from
        index `min_chars` onwards. The longest prefix is shortened cut by cut
        until its display length is at most `max_chars`. A prefix is returned
        when it is at least `min_chars` long and has not been returned before.
        If no paragraph yields one, `_ExtractParagraph` is used.
        """
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
                    # The leading -1 makes every slice below start at index 0.
                    breaks = [-1]
                    breaks.extend(
                        match.start() for match in regex.finditer(para, min_chars)
                    )
                    phrase = para[breaks[0] + 1 : breaks[-1]]
                    p_size = self._DisplayLength(phrase)
                    while p_size > max_chars and len(breaks) > 1:
                        breaks.pop()
                        phrase = para[breaks[0] + 1 : breaks[-1]]
                        p_size = self._DisplayLength(phrase)
                    if min_chars <= p_size and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        """Returns a sentence-like sample; currently the same as a phrase."""
        # Sentence delimitation may differ between scripts, so tokenizing on
        # spaces would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Returns the next whole paragraph whose display length fits the bounds.

        The shared paragraph iterator is tried first, then a fresh pass. If no
        single paragraph fits, `_ExtractPassage` combines several.
        """
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Joins consecutive paragraphs with newlines until the bounds are met.

        Paragraphs that would push the display length past `max_chars` are
        dropped again. If the text runs out first, whatever was collected is
        returned even if it is shorter than `min_chars`.

        Raises:
          AssertionError: if a full pass over the text collects no paragraph.
            This is raised explicitly so the check (and loop termination)
            survives `python -O`, which strips `assert` statements.
        """
        p = []
        p_size = 0
        while p_size < min_chars:
            for iterator in [self._paragraphs, self._GetParagraphs()]:
                for para in iterator:
                    if para is None:
                        continue
                    p.append(para)
                    p_size = self._DisplayLength(" ".join(p))
                    if max_chars < p_size:
                        # NOTE(review): p_size is not recomputed after the pop,
                        # so an overflow on the last paragraph of a pass ends
                        # the outer loop. Kept as in the original.
                        p.pop()
                    elif min_chars <= p_size:
                        return "\n".join(p)
            if not p:
                raise AssertionError("Unable to extract passage: " + self._udhr.key)
        if not p:
            # Only reachable when the loop never ran (min_chars <= 0): hand
            # back the first paragraph, as before.
            paragraphs = list(self._GetParagraphs())
            p.append(paragraphs[0])
        return "\n".join(p)

    def _Get(self, text_type, **kwargs):
        """Dispatches to the extractor for `text_type`.

        Args:
          text_type: a `TextType` member.
          **kwargs: either `char_count` (used as both bounds) or both
            `min_chars` and `max_chars`.

        Returns:
          The extracted string.

        Raises:
          KeyError: if neither `char_count` nor both bounds are supplied.
          ValueError: if `text_type` is not a supported `TextType` member.
        """
        if "char_count" in kwargs:
            min_chars = kwargs["char_count"]
            max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]
        if text_type == self.TextType.GLYPHS:
            return self._ExtractGlyphs(min_chars, max_chars)
        if text_type == self.TextType.WORD:
            return self._ExtractWord(min_chars, max_chars)
        if text_type == self.TextType.PHRASE:
            return self._ExtractPhrase(min_chars, max_chars)
        if text_type == self.TextType.SENTENCE:
            return self._ExtractSentence(min_chars, max_chars)
        if text_type == self.TextType.PARAGRAPH:
            return self._ExtractParagraph(min_chars, max_chars)
        if text_type == self.TextType.PASSAGE:
            return self._ExtractPassage(min_chars, max_chars)
        # str() is required: concatenating a non-string (e.g. an enum member)
        # directly would raise TypeError and hide the real problem.
        raise ValueError("Unsupported text type: " + str(text_type))

    def GetSampleTexts(self):
        """Builds a `SampleTextProto` with every sample field populated.

        The fields are filled in the order below; because the extractor's
        iterators are shared, that order affects which text each field gets.
        """
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text
|