# fix-exemplars-bases.py
import sys
import unicodedata
from collections import Counter

from gflanguages import languages_public_pb2
from google.protobuf import text_format
  5. ATTRIBUTES = "base auxiliary marks punctuation index".split(" ")
  6. def main(args=None):
  7. for path in args:
  8. with open(path, encoding="utf-8") as fp:
  9. language = text_format.Parse(
  10. fp.read(), languages_public_pb2.LanguageProto()
  11. )
  12. changed = False
  13. exemplar_values = {}
  14. bases = language.exemplar_chars.base.split(" ")
  15. marks = language.exemplar_chars.marks.split(" ")
  16. if not len(bases) or bases == [""]:
  17. continue
  18. new_marks = []
  19. new_bases = []
  20. for chars in marks:
  21. if not chars:
  22. continue
  23. if chars[0] != "\u25CC":
  24. chars = "\u25CC" + chars
  25. if chars not in new_marks:
  26. new_marks.append(chars)
  27. for chars in bases:
  28. if not chars:
  29. continue
  30. if chars[0] == "\u25CC":
  31. chars = chars[1:]
  32. cat = unicodedata.category(chars[0])
  33. if cat in ["Mn", "Mc"]:
  34. if chars[0] != "\u25CC":
  35. chars = "\u25CC" + chars
  36. if chars not in new_marks:
  37. new_marks.append(chars)
  38. else:
  39. new_bases.append(chars)
  40. language.exemplar_chars.base = " ".join(new_bases)
  41. language.exemplar_chars.marks = " ".join(new_marks)
  42. with open(path, "w", encoding="utf-8") as fp:
  43. fp.write(text_format.MessageToString(language, as_utf8=True))
  44. fp.close()
  45. if __name__ == "__main__":
  46. import sys
  47. main(args=sys.argv[1:])