# knowledge_graph.py
  1. from pprint import pprint
  2. from absl import app
  3. from absl import flags
  4. from gftools import knowledge_pb2
  5. from google.protobuf import text_format
  6. import itertools
  7. import mistune # markdown => ast
  8. from xml.dom import minidom
  9. from pathlib import Path
  10. import re
  11. import sys
  12. from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
# Largest allowed size, in KB, of a non-SVG file under an images/ directory
# (enforced by _check_image_files).
MAX_RASTER_IMAGE_SIZE_KB = 800
# Largest allowed size, in KB, of an SVG file under an images/ directory
# (enforced by _check_image_files).
MAX_VECTOR_IMAGE_SIZE_KB = 1750
  15. def _topic_target_to_path(_: Set[str], target: str) -> str:
  16. # TODO sanity check if this is the only valid update
  17. return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
  18. def _module_target_to_path(_: Set[str], target: str) -> str:
  19. return Path(target.replace("/module/", "modules/")) / "module.textproto"
  20. def _content_md(path: str) -> Path:
  21. return Path(path) / "content.md"
  22. def _glossary_target_to_path(_: Set[str], target: str) -> str:
  23. # TODO sanity check if this is the only valid update
  24. return _content_md(target.replace("/glossary/", "glossary/terms/"))
  25. def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str:
  26. # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
  27. parts = target[1:].split("/")
  28. assert parts[0] == "lesson"
  29. if len(parts) == 2:
  30. path = names.get(parts[1], "")
  31. if not path.startswith("modules/"):
  32. return _content_md(target)
  33. return _content_md(path)
  34. elif len(parts) == 3:
  35. return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")
  36. else:
  37. return _content_md(target)
  38. def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> str:
  39. return _content_md(names.get(target, target))
# Ordered dispatch table consulted by _link_target_to_path: the first pattern
# that matches (via .search) a link target selects the function that converts
# that target into a path. Each function is called as fn(names, target).
_LINK_TO_PATH = [
    (re.compile("^/glossary/"), _glossary_target_to_path),
    (re.compile("^/topic/"), _topic_target_to_path),
    (re.compile("^/lesson/"), _lesson_target_to_path),
    (re.compile("^/module/"), _module_target_to_path),
    # Fallback: unanchored, so it matches any target that contains at least
    # one character other than "/". It must therefore stay last.
    (re.compile("[^/]+"), _any_unique_name_to_path)
]
# absl flag container; FLAGS.print_valid is read by _maybe_print_check.
FLAGS = flags.FLAGS
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")

# Annotation used for nodes of the markdown AST (see _markdown_ast and
# _ast_iter): a mapping from str keys to further MdValue entries.
MdValue = Union[Mapping[str, "MdValue"]]
  50. class KnowledgeContent(NamedTuple):
  51. repo_root: Path
  52. knowledge_dir: Path
  53. md_files: Tuple[Path, ...]
  54. textproto_files: Tuple[Path, ...]
  55. unambiguous_names: Mapping[str, Path]
  56. def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
  57. return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"
  58. def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  59. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)
  60. def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  61. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)
  62. def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  63. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)
  64. def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  65. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)
  66. @classmethod
  67. def load(cls, repo_root: Path) -> "KnowledgeContent":
  68. knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
  69. assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"
  70. md_files = []
  71. textproto_files = []
  72. for file in knowledge_dir.rglob("*"):
  73. if file.suffix.lower() == ".md":
  74. md_files.append(file)
  75. elif file.suffix.lower() == ".textproto":
  76. textproto_files.append(file)
  77. else:
  78. pass
  79. unambiguous_names = {}
  80. for name, entries in itertools.groupby(sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name):
  81. entries = list(entries)
  82. if len(entries) != 1:
  83. print(name, "is ambiguous")
  84. continue
  85. unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)
  86. return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
  87. def _markdown_ast(md_file: Path) -> List[MdValue]:
  88. return mistune.create_markdown(renderer='ast')(md_file.read_text())
  89. def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
  90. frontier = list(root)
  91. while frontier:
  92. current = frontier.pop(0)
  93. assert isinstance(current, dict), f"What is {current}"
  94. if filter_fn(current):
  95. yield current
  96. for entry in current.values():
  97. if isinstance(entry, list):
  98. frontier.extend(entry)
  99. def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
  100. for matcher, link_to_path_fn in _LINK_TO_PATH:
  101. if matcher.search(target):
  102. return link_to_path_fn(names, target)
  103. raise ValueError(f"Unrecognized target {target}")
  104. def _safe_relative_to(parent: Path, child: Path) -> Path:
  105. try:
  106. return child.relative_to(parent)
  107. except ValueError:
  108. return child
  109. def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
  110. if FLAGS.print_valid or not result:
  111. message = "valid "
  112. if not result:
  113. message = "INVALID "
  114. suffix = ""
  115. if target is not None:
  116. suffix = " => " + str(_safe_relative_to(repo_root, target))
  117. print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
  118. return result
  119. def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
  120. return _maybe_print_check(target.is_file(), repo_root, referrer, ref, target)
  121. def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
  122. return _maybe_print_check(ref in contributors, repo_root, referrer, ref, None)
  123. def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
  124. for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
  125. text = el.get("text", "")
  126. if re.search(' id="[^"]+"', text):
  127. print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
  128. return False
  129. f = open(md_file,"r")
  130. content = "".join(f.readlines())
  131. if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
  132. print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
  133. return False
  134. f.close()
  135. return True
  136. def _check_md_files(knowledge: KnowledgeContent) -> bool:
  137. result = True
  138. for md_file in knowledge.md_files:
  139. ast = _markdown_ast(md_file)
  140. result = _check_md_file_contents(knowledge.repo_root, md_file, ast) and result
  141. for link in _ast_iter(ast, lambda v: v.get("type", None) == "link"):
  142. target = link.get("link", "")
  143. if not target:
  144. continue # TODO: are empty links bad
  145. if re.search("^http(s)?://", target.lower()):
  146. continue # we aren't in the business of validating outbound links
  147. target_path = knowledge.link_target_to_path(target)
  148. result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result
  149. return result
  150. def _check_proto_files(knowledge: KnowledgeContent) -> bool:
  151. # TODO support alt_ids, many Knowledge constructs have them
  152. # The set of valid contributors is useful in upcoming validations
  153. contributors_file = knowledge.knowledge_dir / "contributors.textproto"
  154. assert contributors_file.is_file(), contributors_file
  155. contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}
  156. result = True
  157. for textproto_file in knowledge.textproto_files:
  158. expected_files = set()
  159. if textproto_file.stem == "contributors":
  160. pass # handled above
  161. elif textproto_file.stem == "knowledge":
  162. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
  163. expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
  164. elif textproto_file.stem == "term":
  165. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
  166. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
  167. elif textproto_file.stem == "lesson":
  168. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
  169. for author in set(proto.authors) | set(proto.reviewers):
  170. result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
  171. expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
  172. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
  173. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
  174. expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
  175. # thumbnail is mandatory
  176. expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
  177. elif textproto_file.stem == "module":
  178. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
  179. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
  180. elif textproto_file.stem == "topic":
  181. # The Topic parses. And that's enough.
  182. text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
  183. else:
  184. raise ValueError("No handler for " + textproto_file.relative_to(knowledge.repo_root))
  185. for ref, expected_file in expected_files:
  186. result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
  187. return result
  188. def _is_svg(image_file: Path) -> bool:
  189. return image_file.suffix == ".svg"
  190. def _is_svg(image_file: Path) -> bool:
  191. return image_file.suffix == ".svg"
  192. def _check_image_files(knowledge: KnowledgeContent) -> bool:
  193. result = True
  194. image_files = list(knowledge.knowledge_dir.glob("**/images/*"))
  195. for image_file in image_files:
  196. st_size = image_file.stat().st_size
  197. if _is_svg(image_file):
  198. if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
  199. print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
  200. result = False
  201. root = minidom.parseString(image_file.read_text()).documentElement
  202. if root.tagName != "svg":
  203. print("Root element must be <svg>:", image_file.relative_to(knowledge.repo_root))
  204. result = False
  205. has_view_box = "viewBox" in root.attributes
  206. has_width_and_height = "width" in root.attributes and "height" in root.attributes
  207. if not has_view_box and not has_width_and_height:
  208. print("Must specify viewBox and/or width+height on <svg>:", image_file.relative_to(knowledge.knowledge_dir))
  209. result = False
  210. for stopEl in root.getElementsByTagName("stop"):
  211. if "offset" not in stopEl.attributes:
  212. print("Must specify offset on <stop>:", image_file.relative_to(knowledge.knowledge_dir))
  213. result = False
  214. else:
  215. if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
  216. print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
  217. result = False
  218. return result
  219. def main(_):
  220. knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
  221. return_code = 1
  222. if (_check_md_files(knowledge)
  223. and _check_proto_files(knowledge)
  224. and _check_image_files(knowledge)):
  225. return_code = 0
  226. sys.exit(return_code)
# Script entry point: hand main to absl's app.run.
if __name__ == "__main__":
    app.run(main)