# knowledge_graph.py
  1. from pprint import pprint
  2. from absl import app
  3. from absl import flags
  4. from gftools import knowledge_pb2
  5. from google.protobuf import text_format
  6. import itertools
  7. import mistune # markdown => ast
  8. from xml.dom import minidom
  9. from pathlib import Path
  10. import re
  11. import sys
  12. from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
  13. import requests
  14. from functools import lru_cache
  15. from urllib.parse import urlparse
# Size ceiling, in KB (compared as KB * 1024 bytes), for non-SVG files found in
# an images/ directory.
MAX_RASTER_IMAGE_SIZE_KB = 800
# Size ceiling, in KB (compared as KB * 1024 bytes), for .svg files found in an
# images/ directory.
MAX_VECTOR_IMAGE_SIZE_KB = 1750
  18. def _topic_target_to_path(_: Set[str], target: str) -> str:
  19. # TODO sanity check if this is the only valid update
  20. return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
  21. def _module_target_to_path(_: Set[str], target: str) -> str:
  22. return Path(target.replace("/module/", "modules/")) / "module.textproto"
  23. def _content_md(path: str) -> Path:
  24. return Path(path) / "content.md"
  25. def _glossary_target_to_path(_: Set[str], target: str) -> str:
  26. # TODO sanity check if this is the only valid update
  27. return _content_md(target.replace("/glossary/", "glossary/terms/"))
  28. def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str:
  29. # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
  30. parts = target[1:].split("/")
  31. assert parts[0] == "lesson"
  32. if len(parts) == 2:
  33. path = names.get(parts[1], "")
  34. if not path.startswith("modules/"):
  35. return _content_md(target)
  36. return _content_md(path)
  37. elif len(parts) == 3:
  38. return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")
  39. else:
  40. return _content_md(target)
  41. def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> str:
  42. return _content_md(names.get(target, target))
# Ordered (pattern, resolver) pairs. _link_target_to_path walks the list and
# hands the target to the resolver of the first pattern that re.search finds.
_LINK_TO_PATH = [
    (re.compile("^/glossary/"), _glossary_target_to_path),
    (re.compile("^/topic/"), _topic_target_to_path),
    (re.compile("^/lesson/"), _lesson_target_to_path),
    (re.compile("^/module/"), _module_target_to_path),
    # Unanchored catch-all: matches any target that contains at least one
    # character other than "/", so it must stay last.
    (re.compile("[^/]+"), _any_unique_name_to_path)
]
# Command-line flags (absl). Both default to False.
FLAGS = flags.FLAGS
# When set, _maybe_print_check also reports checks that passed.
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")
# When set, _check_md_files probes http(s) links via _check_outbound_link.
flags.DEFINE_bool("check_outbound_links", False, "Check outbound urls")
# Type alias for a node of the markdown AST produced by _markdown_ast.
# A single-member Union is equivalent to that member.
# NOTE(review): nodes are also read as holding str and list values (see
# _ast_iter and _check_md_file_contents), so this alias is narrower than the
# data it describes — confirm before relying on it for type checking.
MdValue = Union[Mapping[str, "MdValue"]]
  54. class KnowledgeContent(NamedTuple):
  55. repo_root: Path
  56. knowledge_dir: Path
  57. md_files: Tuple[Path, ...]
  58. textproto_files: Tuple[Path, ...]
  59. unambiguous_names: Mapping[str, Path]
  60. def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
  61. return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"
  62. def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  63. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)
  64. def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  65. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)
  66. def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  67. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)
  68. def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
  69. return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)
  70. @classmethod
  71. def load(cls, repo_root: Path) -> "KnowledgeContent":
  72. knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
  73. assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"
  74. md_files = []
  75. textproto_files = []
  76. for file in knowledge_dir.rglob("*"):
  77. if file.suffix.lower() == ".md":
  78. md_files.append(file)
  79. elif file.suffix.lower() == ".textproto":
  80. textproto_files.append(file)
  81. else:
  82. pass
  83. unambiguous_names = {}
  84. for name, entries in itertools.groupby(sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name):
  85. entries = list(entries)
  86. if len(entries) != 1:
  87. print(name, "is ambiguous")
  88. continue
  89. unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)
  90. return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
  91. def _markdown_ast(md_file: Path) -> List[MdValue]:
  92. return mistune.create_markdown(renderer='ast')(md_file.read_text())
  93. def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
  94. frontier = list(root)
  95. while frontier:
  96. current = frontier.pop(0)
  97. assert isinstance(current, dict), f"What is {current}"
  98. if filter_fn(current):
  99. yield current
  100. for entry in current.values():
  101. if isinstance(entry, list):
  102. frontier.extend(entry)
  103. def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
  104. for matcher, link_to_path_fn in _LINK_TO_PATH:
  105. if matcher.search(target):
  106. return link_to_path_fn(names, target)
  107. raise ValueError(f"Unrecognized target {target}")
  108. def _safe_relative_to(parent: Path, child: Path) -> Path:
  109. try:
  110. return child.relative_to(parent)
  111. except ValueError:
  112. return child
  113. def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
  114. if FLAGS.print_valid or not result:
  115. message = "valid "
  116. if not result:
  117. message = "INVALID "
  118. suffix = ""
  119. if target is not None:
  120. suffix = " => " + str(_safe_relative_to(repo_root, target))
  121. print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
  122. return result
  123. def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
  124. return _maybe_print_check(target.is_file(), repo_root, referrer, ref, target)
  125. def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
  126. return _maybe_print_check(ref in contributors, repo_root, referrer, ref, None)
  127. def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
  128. for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
  129. text = el.get("text", "")
  130. if re.search(' id="[^"]+"', text):
  131. print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
  132. return False
  133. f = open(md_file,"r")
  134. content = "".join(f.readlines())
  135. if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
  136. print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
  137. return False
  138. f.close()
  139. return True
  140. @lru_cache()
  141. def _check_outbound_link(url: str):
  142. # Following urls work correctly on a web browser but raise a 400 code when using python requests
  143. whitelist = frozenset([
  144. 'circuitousroot.com',
  145. 'codepen.io',
  146. 'colourblindawareness.org',
  147. 'cortezlawfirmpllc.com',
  148. 'doi.org',
  149. 'figma.com',
  150. 'freepik.com',
  151. 'gigapress.net',
  152. 'help.figma.com',
  153. 'kupferschrift.de',
  154. 'languagegeek.com',
  155. 'layoutgridcalculator.com',
  156. 'medium.com',
  157. 'medium.engineering',
  158. 'nedwin.medium.com',
  159. 'nytimes.com',
  160. 'paulshawletterdesign.com',
  161. 'psycnet.apa.org',
  162. 'researchgate.net',
  163. 'sciencedirect.com',
  164. 'support.google.com',
  165. 'twitter.com',
  166. 'typetura.com',
  167. 'webmd.com',
  168. "jessicahische.is",
  169. "type.method.ac",
  170. ])
  171. # Following urls will be fixed at a later date. If the CI is failing and a suitable
  172. # replacement url cannot be found, please add them to this set.
  173. to_fix = frozenset([
  174. # bad SSL cert
  175. "clagnut.com",
  176. "xinreality.com"
  177. ])
  178. if urlparse(url).netloc.replace("www.", "") in whitelist | to_fix:
  179. return True
  180. response = requests.head(url, allow_redirects=True, timeout=30)
  181. if not response.ok:
  182. print(f"INVALID url {url}' returned response status code '{response.status_code}'")
  183. return response.ok
  184. def _check_md_files(knowledge: KnowledgeContent) -> bool:
  185. result = True
  186. for md_file in knowledge.md_files:
  187. ast = _markdown_ast(md_file)
  188. result = _check_md_file_contents(knowledge.repo_root, md_file, ast) and result
  189. for link in _ast_iter(ast, lambda v: v.get("type", None) == "link"):
  190. target = link["attrs"]["url"]
  191. # mistune cannot parse urls that end with a closing parenthesis,
  192. # https://github.com/lepture/mistune/issues/355
  193. # A possible fix is to do some regex acrobatics in:
  194. # https://github.com/lepture/mistune/blob/master/src/mistune/helpers.py#L12-L18,
  195. if "(" in target:
  196. target += ")"
  197. if not target:
  198. continue # TODO: are empty links bad
  199. if re.search("^http(s)?://", target.lower()):
  200. if FLAGS.check_outbound_links:
  201. result = _check_outbound_link(target) and result
  202. else:
  203. target_path = knowledge.link_target_to_path(target)
  204. result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result
  205. return result
  206. def _check_proto_files(knowledge: KnowledgeContent) -> bool:
  207. # TODO support alt_ids, many Knowledge constructs have them
  208. # The set of valid contributors is useful in upcoming validations
  209. contributors_file = knowledge.knowledge_dir / "contributors.textproto"
  210. assert contributors_file.is_file(), contributors_file
  211. contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors}
  212. result = True
  213. for textproto_file in knowledge.textproto_files:
  214. expected_files = set()
  215. if textproto_file.stem == "contributors":
  216. pass # handled above
  217. elif textproto_file.stem == "knowledge":
  218. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
  219. expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
  220. elif textproto_file.stem == "term":
  221. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
  222. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
  223. elif textproto_file.stem == "lesson":
  224. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
  225. for author in set(proto.authors) | set(proto.reviewers):
  226. result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
  227. expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
  228. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
  229. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
  230. expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
  231. # thumbnail is mandatory
  232. expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
  233. elif textproto_file.stem == "module":
  234. proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
  235. expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
  236. elif textproto_file.stem == "topic":
  237. # The Topic parses. And that's enough.
  238. text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
  239. else:
  240. raise ValueError("No handler for " + textproto_file.relative_to(knowledge.repo_root))
  241. for ref, expected_file in expected_files:
  242. result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
  243. return result
  244. def _is_svg(image_file: Path) -> bool:
  245. return image_file.suffix == ".svg"
  246. def _is_svg(image_file: Path) -> bool:
  247. return image_file.suffix == ".svg"
  248. def _check_image_files(knowledge: KnowledgeContent) -> bool:
  249. result = True
  250. image_files = list(knowledge.knowledge_dir.glob("**/images/*"))
  251. for image_file in image_files:
  252. st_size = image_file.stat().st_size
  253. if _is_svg(image_file):
  254. if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
  255. print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
  256. result = False
  257. root = minidom.parseString(image_file.read_text()).documentElement
  258. if root.tagName != "svg":
  259. print("Root element must be <svg>:", image_file.relative_to(knowledge.repo_root))
  260. result = False
  261. has_view_box = "viewBox" in root.attributes
  262. has_width_and_height = "width" in root.attributes and "height" in root.attributes
  263. if not has_view_box and not has_width_and_height:
  264. print("Must specify viewBox and/or width+height on <svg>:", image_file.relative_to(knowledge.knowledge_dir))
  265. result = False
  266. for stopEl in root.getElementsByTagName("stop"):
  267. if "offset" not in stopEl.attributes:
  268. print("Must specify offset on <stop>:", image_file.relative_to(knowledge.knowledge_dir))
  269. result = False
  270. else:
  271. if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
  272. print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
  273. result = False
  274. return result
  275. def main(_):
  276. knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
  277. return_code = 1
  278. if (_check_md_files(knowledge)
  279. and _check_proto_files(knowledge)
  280. and _check_image_files(knowledge)):
  281. return_code = 0
  282. sys.exit(return_code)
# Script entry point: hand main to absl's app.run when executed directly.
if __name__ == "__main__":
    app.run(main)