# fix_translation_memory.py
# Script to fix a corrupted translation memory (TMX) from existing po files.
import argparse
import os
import re
import xml.etree.ElementTree as ET
from pathlib import Path
from xml.sax.saxutils import unescape

from fuzzywuzzy import fuzz
  9. def load_existing_xmtm(path: Path) -> ET.Element:
  10. """Load existing xmtm file and return the root element"""
  11. tree = ET.parse(path)
  12. return tree.getroot()
  13. def load_existing_po(path: Path) -> dict:
  14. """Load existing po file and return a dictionary of msgid and msgstr"""
  15. content = path.read_text(encoding="utf-8")
  16. content = "".join(content.splitlines()[16:])
  17. # TODO: check languages with plural forms
  18. return dict(re.findall(r'[^#]msgid.?\"+\s?([\s|\S]+?)\"*?msgstr.?\"([\s|\S]+?)\"?#', content))
  19. def sanitize(text: str) -> str:
  20. """Sanitize the text"""
  21. # TODO: check if Digitial Factory Ultimaker etc handled correctly
  22. text = text.replace("\"\"", "").replace("\"#~", "")
  23. text = text.replace("Ultimaker", "UltiMaker")
  24. text = text.replace("UltiMaker Digital Library", "Ultimaker Digital Library")
  25. text = text.replace("UltiMaker Digital Factory", "Ultimaker Digital Factory")
  26. text = text.replace("UltiMaker Marketplace", "Ultimaker Marketplace")
  27. return unescape(text)
  28. def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
  29. po_content = {}
  30. for file in i18n_path.rglob("*.po"):
  31. print(os.path.join(i18n_path, file))
  32. po_content[file.relative_to(i18n_path).parts[0].replace("_", "-")] = load_existing_po(Path(os.path.join(i18n_path, file)))
  33. root = load_existing_xmtm(tmx_source_path)
  34. root_old = ET.ElementTree(root)
  35. # ET.indent(root_old, ' ')
  36. root_old.write("old.tmx", encoding="utf-8", xml_declaration=True)
  37. to_be_removed = []
  38. for tu in root.iter("tu"):
  39. # TODO: also add logic for other pot files
  40. if [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"][0] not in ("cura.pot", "fdmprinter.def.json.pot", "fdmextruder.def.json.pot", "uranium.pot"):
  41. continue
  42. tuvs = tu.findall("tuv")
  43. key_source = tuvs[0].find("seg").text
  44. key_lang = tuvs[1].attrib["{http://www.w3.org/XML/1998/namespace}lang"]
  45. if key_lang in po_content and key_source in po_content[key_lang]:
  46. replaced_translation = po_content[key_lang][key_source]
  47. else:
  48. fuzz_match_ratio = [fuzz.ratio(sanitize(k), key_source) for k in po_content[key_lang].keys()]
  49. fuzz_max_ratio = max(fuzz_match_ratio)
  50. fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)]
  51. if fuzz_max_ratio > 90:
  52. replaced_translation = po_content[key_lang][fuzz_match_key]
  53. tuvs[0].find("seg").text = sanitize(fuzz_match_key)
  54. else:
  55. print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]")
  56. continue
  57. tuvs[1].find("seg").text = sanitize(replaced_translation)
  58. # if the tvus[1].find("seg").text is a single ", remove the tu element as whole (since this is an untranslated string)
  59. if tuvs[1].find("seg").text == "\"":
  60. to_be_removed.append(tu)
  61. print(f"Removed {len(to_be_removed)} elements")
  62. body = root.find("body")
  63. for tu in to_be_removed:
  64. body.remove(tu)
  65. fixed_root = ET.ElementTree(root)
  66. fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True)
  67. if __name__ == "__main__":
  68. parser = argparse.ArgumentParser(description="Fix a corrupted Translation memory from existing po files")
  69. parser.add_argument("tmx_source_path", type=Path, help="Path to the source TMX file")
  70. parser.add_argument("tmx_target_path", type=Path, help="Path to the target TMX file")
  71. parser.add_argument("i18n_path", type=Path, help="Path to the i18n folder")
  72. args = parser.parse_args()
  73. main(args.tmx_source_path, args.tmx_target_path, args.i18n_path)