fix_translation_memory.py 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
##% Script to repair a corrupted translation memory (TMX) using the translations found in existing gettext .po files
  2. import os
  3. import re
  4. import argparse
  5. from pathlib import Path
  6. from fuzzywuzzy import fuzz
  7. from fuzzywuzzy import process
  8. import xml.etree.ElementTree as ET
  9. from xml.sax.saxutils import unescape
  10. def load_existing_xmtm(path: Path) -> ET.Element:
  11. """Load existing xmtm file and return the root element"""
  12. tree = ET.parse(path)
  13. return tree.getroot()
  14. def load_existing_po(path: Path) -> dict:
  15. """Load existing po file and return a dictionary of msgid and msgstr"""
  16. content = path.read_text(encoding="utf-8")
  17. content = "".join(content.splitlines()[16:])
  18. return dict(re.findall(r'[^#]msgid.?\"+\s?([\s|\S]+?)\"*?msgstr.?\"([\s|\S]+?)\"?#', content))
  19. def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
  20. po_content = {}
  21. for file in i18n_path.rglob("cura.po"):
  22. print(os.path.join(i18n_path, file))
  23. po_content[file.relative_to(i18n_path).parts[0].replace("_", "-")] = load_existing_po(Path(os.path.join(i18n_path, file)))
  24. root = load_existing_xmtm(tmx_source_path)
  25. root_old = ET.ElementTree(root)
  26. root_old.write("old.tmx", encoding="utf-8", xml_declaration=True)
  27. for tu in root.iter("tu"):
  28. if "cura.pot" not in [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]:
  29. continue
  30. tuvs = tu.findall("tuv")
  31. key_source = tuvs[0].find("seg").text
  32. key_lang = tuvs[1].attrib["{http://www.w3.org/XML/1998/namespace}lang"]
  33. if key_lang in po_content and key_source in po_content[key_lang]:
  34. tuvs[1].find("seg").text = po_content[key_lang][key_source]
  35. else:
  36. fuzz_match_ratio = [fuzz.ratio(k, key_source) for k in po_content[key_lang].keys()]
  37. fuzz_max_ratio = max(fuzz_match_ratio)
  38. fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)]
  39. if fuzz_max_ratio > 90:
  40. fuzz_match_po_value = po_content[key_lang][fuzz_match_key]
  41. tuvs[0].find("seg").text = fuzz_match_key
  42. tuvs[1].find("seg").text = fuzz_match_po_value
  43. # print(f"[{key_lang}] Fuzz match: {key_source} -> {fuzz_match_key} -> {fuzz_match_po_value} with a ratio of {fuzz_max_ratio}")
  44. else:
  45. # print(f"[{key_lang}] No match for: {key_source} -> {tuvs[1].find('seg').text} -> highest ratio: {fuzz_max_ratio}")
  46. print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]")
  47. fixed_root = ET.ElementTree(root)
  48. fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True)
  49. if __name__ == "__main__":
  50. parser = argparse.ArgumentParser(description="Fix a corrupted Translation memory from existing po files")
  51. parser.add_argument("tmx_source_path", type=Path, help="Path to the source TMX file")
  52. parser.add_argument("tmx_target_path", type=Path, help="Path to the target TMX file")
  53. parser.add_argument("i18n_path", type=Path, help="Path to the i18n folder")
  54. args = parser.parse_args()
  55. main(args.tmx_source_path, args.tmx_target_path, args.i18n_path)