123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- from collections import Counter
- from google.protobuf import text_format
- from gflanguages import languages_public_pb2
# Names of the exemplar_chars fields that main() inspects, in processing order.
ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]
def _dedupe_preserving_order(values):
    """Return ``values`` as a list with repeats dropped, keeping first occurrences."""
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(values))


def main(args=None):
    """Remove duplicated entries from the exemplar lists of language files.

    Each path is parsed as a text-format ``LanguageProto``. For every field
    named in ``ATTRIBUTES`` the space-separated value is split, repeated
    entries are dropped (the first occurrence wins, order is kept), and the
    field is rewritten. A summary is printed for every field that changed and
    the file is written back only when at least one field changed.

    Args:
        args: Iterable of file paths to process. When ``None``, the
            command-line arguments ``sys.argv[1:]`` are used.

    Raises:
        SystemExit: With the message ``"Failed fixing exemplar."`` if the
            deduplicated list does not contain the same distinct values as
            the original list.
    """
    # Imported locally so the function also works when this module is
    # imported rather than executed as a script.
    import sys

    if args is None:
        args = sys.argv[1:]

    for path in args:
        # Finish reading (and close the handle) before the file is possibly
        # reopened for writing below.
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        # NOTE(review): for generated protobuf messages hasattr() on a declared
        # field is presumably always true; HasField() may be what was meant.
        if not hasattr(language, "exemplar_chars"):
            # Skip this file instead of terminating the whole batch.
            continue

        # attr name -> {"before": [...], "after": [...]}, changed attrs only.
        exemplar_values = {}
        for attr in ATTRIBUTES:
            if not hasattr(language.exemplar_chars, attr):
                continue
            values = getattr(language.exemplar_chars, attr).split(" ")
            clean_values = _dedupe_preserving_order(values)
            if clean_values == values:
                continue
            # Safety net: deduplication must never lose a distinct value.
            if set(clean_values) != set(values):
                print("before: " + " ".join(values))
                print("after: " + " ".join(clean_values))
                sys.exit("Failed fixing exemplar.")
            setattr(language.exemplar_chars, attr, " ".join(clean_values))
            exemplar_values[attr] = {"before": values, "after": clean_values}

        if not exemplar_values:
            # Nothing changed: leave the file untouched.
            continue

        for exemplar, values in exemplar_values.items():
            before = values["before"]
            after = values["after"]
            # (value, number of surplus copies) for every repeated value.
            duplicates = [
                (g, c - 1) for g, c in Counter(before).most_common() if c > 1
            ]
            print(
                f"Changed {path} {exemplar} exemplar:\n"
                f"- from {len(before)} ({len(set(before))} as set) "
                f"to {len(after)} elements\n"
                f"- removing {len(before) - len(after)} duplicate(s):\n"
                f" {duplicates}\n"
            )

        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))
if __name__ == "__main__":
    # `sys` is imported here rather than with the imports at the top of the
    # file, so the name is bound at module level only when run as a script.
    import sys

    # Everything after the program name is treated as a file path.
    cli_paths = sys.argv[1:]
    main(args=cli_paths)
|