from collections import Counter from google.protobuf import text_format from gflanguages import languages_public_pb2 ATTRIBUTES = "base auxiliary marks punctuation index".split(" ") def main(args=None): for path in args: with open(path, encoding="utf-8") as fp: language = text_format.Parse( fp.read(), languages_public_pb2.LanguageProto() ) changed = False exemplar_values = {} if not hasattr(language, "exemplar_chars"): exit() for attr in ATTRIBUTES: if hasattr(language.exemplar_chars, attr): values = getattr(language.exemplar_chars, attr).split(" ") value_set = set() clean_values = [] for value in values: if value in value_set: continue else: value_set.add(value) clean_values.append(value) if clean_values != values: if {len(set(values))} != {len(set(clean_values))}: print("before: "+ " ".join(values)) print("after: "+ " ".join(clean_values)) sys.exit("Failed fixing exemplar.") setattr(language.exemplar_chars, attr, " ".join(clean_values)) changed = True exemplar_values[attr] = { "before": values, "after": clean_values } if changed: for exemplar, values in exemplar_values.items(): before = values["before"] after = values["after"] counter = Counter(before) duplicates = [(g, c - 1) for g, c in counter.most_common() if c > 1] print( f"Changed {path} {exemplar} exemplar:\n" f"- from {len(before)} ({len(set(before))} as set) " f"to {len(after)} elements\n" f"- removing {len(before) - len(after)} duplicate(s):\n" f" {duplicates}\n" ) with open(path, "w", encoding="utf-8") as fp: fp.write(text_format.MessageToString(language, as_utf8=True)) fp.close() if __name__ == "__main__": import sys main(args=sys.argv[1:])