fix-exemplars-duplicates.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. from collections import Counter
  2. from google.protobuf import text_format
  3. from gflanguages import languages_public_pb2
  # Names of the exemplar_chars fields that are scanned for duplicated entries.
  ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]
  def _dedupe_preserving_order(values):
      """Return a list of *values* without repeats, keeping first occurrences."""
      # dict keys are unique and remember insertion order, so later repeats are
      # dropped while the surviving items stay in their original sequence.
      return list(dict.fromkeys(values))


  def main(args=None):
      """Remove duplicated exemplar characters from language textproto files.

      Each path is parsed as a ``LanguageProto`` text-format message. For every
      field named in ``ATTRIBUTES`` that exists on ``exemplar_chars``, the value
      is split on single spaces and repeated entries are dropped (the first
      occurrence wins, order is kept). A summary of each changed field is
      printed, and the file is rewritten only if at least one field changed.

      Args:
          args: Iterable of file paths to process. When ``None``, the
              command-line arguments ``sys.argv[1:]`` are used.

      Raises:
          SystemExit: With the message "Failed fixing exemplar." if the
              de-duplicated list does not hold the same number of distinct
              entries as the original list.
      """
      # Imported here because the module only binds ``sys`` under its
      # ``__main__`` guard; main() must also work when imported and called.
      import sys

      if args is None:
          args = sys.argv[1:]

      for path in args:
          with open(path, encoding="utf-8") as fp:
              language = text_format.Parse(
                  fp.read(), languages_public_pb2.LanguageProto()
              )

          if not hasattr(language, "exemplar_chars"):
              # Nothing to fix in this file; carry on with the remaining paths
              # instead of terminating the whole run.
              continue

          # attr name -> {"before": original list, "after": de-duplicated list},
          # recorded only for the fields that actually changed.
          exemplar_values = {}
          for attr in ATTRIBUTES:
              if not hasattr(language.exemplar_chars, attr):
                  continue
              values = getattr(language.exemplar_chars, attr).split(" ")
              clean_values = _dedupe_preserving_order(values)
              if clean_values == values:
                  continue
              # Sanity check: de-duplication must not lose any distinct entry.
              if len(set(values)) != len(set(clean_values)):
                  print("before: "+ " ".join(values))
                  print("after: "+ " ".join(clean_values))
                  sys.exit("Failed fixing exemplar.")
              setattr(language.exemplar_chars, attr, " ".join(clean_values))
              exemplar_values[attr] = {
                  "before": values,
                  "after": clean_values,
              }

          if not exemplar_values:
              # No field changed, so leave the file on disk untouched.
              continue

          for exemplar, change in exemplar_values.items():
              before = change["before"]
              after = change["after"]
              counter = Counter(before)
              # (entry, number of surplus copies) for every repeated entry.
              duplicates = [(g, c - 1) for g, c in counter.most_common() if c > 1]
              print(
                  f"Changed {path} {exemplar} exemplar:\n"
                  f"- from {len(before)} ({len(set(before))} as set) "
                  f"to {len(after)} elements\n"
                  f"- removing {len(before) - len(after)} duplicate(s):\n"
                  f" {duplicates}\n"
              )

          # The context manager closes the file; no explicit close() is needed.
          with open(path, "w", encoding="utf-8") as fp:
              fp.write(text_format.MessageToString(language, as_utf8=True))
  if __name__ == "__main__":
      # Run as a script: pass every command-line argument after the program
      # name to main() as the list of files to process.
      import sys

      main(sys.argv[1:])