from pprint import pprint from absl import app from absl import flags from gftools import knowledge_pb2 from google.protobuf import text_format import itertools import mistune # markdown => ast from xml.dom import minidom from pathlib import Path import re import sys from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union import requests from functools import lru_cache from urllib.parse import urlparse MAX_RASTER_IMAGE_SIZE_KB = 800 MAX_VECTOR_IMAGE_SIZE_KB = 1750 def _topic_target_to_path(_: Set[str], target: str) -> str: # TODO sanity check if this is the only valid update return Path(target.replace("/topic/", "topics/")) / "topic.textproto" def _module_target_to_path(_: Set[str], target: str) -> str: return Path(target.replace("/module/", "modules/")) / "module.textproto" def _content_md(path: str) -> Path: return Path(path) / "content.md" def _glossary_target_to_path(_: Set[str], target: str) -> str: # TODO sanity check if this is the only valid update return _content_md(target.replace("/glossary/", "glossary/terms/")) def _lesson_target_to_path(names: Mapping[str, str], target: str) -> str: # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/ parts = target[1:].split("/") assert parts[0] == "lesson" if len(parts) == 2: path = names.get(parts[1], "") if not path.startswith("modules/"): return _content_md(target) return _content_md(path) elif len(parts) == 3: return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}") else: return _content_md(target) def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> str: return _content_md(names.get(target, target)) _LINK_TO_PATH = [ (re.compile("^/glossary/"), _glossary_target_to_path), (re.compile("^/topic/"), _topic_target_to_path), (re.compile("^/lesson/"), _lesson_target_to_path), (re.compile("^/module/"), _module_target_to_path), (re.compile("[^/]+"), _any_unique_name_to_path) ] FLAGS = flags.FLAGS flags.DEFINE_bool("print_valid", False, "Whether to print valid links") flags.DEFINE_bool("check_outbound_links", False, "Check outbound urls") MdValue = Union[Mapping[str, "MdValue"]] class KnowledgeContent(NamedTuple): repo_root: Path knowledge_dir: Path md_files: Tuple[Path, ...] textproto_files: Tuple[Path, ...] unambiguous_names: Mapping[str, Path] def module_name_to_path(self: "KnowledgeContent", name: str) -> Path: return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto" def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path: return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target) def term_target_to_path(self: "KnowledgeContent", target: str) -> Path: return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target) def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path: return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target) def link_target_to_path(self: "KnowledgeContent", target: str) -> Path: return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target) @classmethod def load(cls, repo_root: Path) -> "KnowledgeContent": knowledge_dir = repo_root / "cc-by-sa" / "knowledge" assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}" md_files = [] textproto_files = [] for file in knowledge_dir.rglob("*"): if file.suffix.lower() == ".md": md_files.append(file) elif file.suffix.lower() == ".textproto": textproto_files.append(file) else: pass unambiguous_names = {} for name, entries in itertools.groupby(sorted(md_files, key=lambda p: p.parent.name), key=lambda p: p.parent.name): entries = list(entries) if len(entries) != 1: print(name, "is ambiguous") continue unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent) return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names) def _markdown_ast(md_file: Path) -> List[MdValue]: return mistune.create_markdown(renderer='ast')(md_file.read_text()) def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]: frontier = list(root) while frontier: current = frontier.pop(0) assert isinstance(current, dict), f"What is {current}" if filter_fn(current): yield current for entry in current.values(): if isinstance(entry, list): frontier.extend(entry) def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path: for matcher, link_to_path_fn in _LINK_TO_PATH: if matcher.search(target): return link_to_path_fn(names, target) raise ValueError(f"Unrecognized target {target}") def _safe_relative_to(parent: Path, child: Path) -> Path: try: return child.relative_to(parent) except ValueError: return child def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool: if FLAGS.print_valid or not result: message = "valid " if not result: message = "INVALID " suffix = "" if target is not None: suffix = " => " + str(_safe_relative_to(repo_root, target)) print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}") return result def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool: return _maybe_print_check(target.is_file(), repo_root, referrer, ref, target) def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool: return _maybe_print_check(ref in contributors, repo_root, referrer, ref, None) def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool: for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"): text = el.get("text", "") if re.search(' id="[^"]+"', text): print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text) return False f = open(md_file,"r") content = "".join(f.readlines()) if re.search('(?!.*)', content, re.MULTILINE | re.DOTALL): print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a
outside of a
") return False f.close() return True @lru_cache() def _check_outbound_link(url: str): # Following urls work correctly on a web browser but raise a 400 code when using python requests whitelist = frozenset([ 'circuitousroot.com', 'codepen.io', 'colourblindawareness.org', 'cortezlawfirmpllc.com', 'doi.org', 'figma.com', 'freepik.com', 'gigapress.net', 'help.figma.com', 'kupferschrift.de', 'languagegeek.com', 'layoutgridcalculator.com', 'medium.com', 'medium.engineering', 'nedwin.medium.com', 'nytimes.com', 'paulshawletterdesign.com', 'psycnet.apa.org', 'researchgate.net', 'sciencedirect.com', 'support.google.com', 'twitter.com', 'typetura.com', 'webmd.com', "jessicahische.is", "type.method.ac", ]) # Following urls will be fixed at a later date. If the CI is failing and a suitable # replacement url cannot be found, please add them to this set. to_fix = frozenset([ # bad SSL cert "clagnut.com", "xinreality.com" ]) if urlparse(url).netloc.replace("www.", "") in whitelist | to_fix: return True response = requests.head(url, allow_redirects=True, timeout=30) if not response.ok: print(f"INVALID url {url}' returned response status code '{response.status_code}'") return response.ok def _check_md_files(knowledge: KnowledgeContent) -> bool: result = True for md_file in knowledge.md_files: ast = _markdown_ast(md_file) result = _check_md_file_contents(knowledge.repo_root, md_file, ast) and result for link in _ast_iter(ast, lambda v: v.get("type", None) == "link"): target = link["attrs"]["url"] # mistune cannot parse urls that end with a closing parenthesis, # https://github.com/lepture/mistune/issues/355 # A possible fix is to do some regex acrobatics in: # https://github.com/lepture/mistune/blob/master/src/mistune/helpers.py#L12-L18, if "(" in target: target += ")" if not target: continue # TODO: are empty links bad if re.search("^http(s)?://", target.lower()): if FLAGS.check_outbound_links: result = _check_outbound_link(target) and result else: target_path = knowledge.link_target_to_path(target) result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result return result def _check_proto_files(knowledge: KnowledgeContent) -> bool: # TODO support alt_ids, many Knowledge constructs have them # The set of valid contributors is useful in upcoming validations contributors_file = knowledge.knowledge_dir / "contributors.textproto" assert contributors_file.is_file(), contributors_file contributors = {c.name for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors} result = True for textproto_file in knowledge.textproto_files: expected_files = set() if textproto_file.stem == "contributors": pass # handled above elif textproto_file.stem == "knowledge": proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto()) expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules} elif textproto_file.stem == "term": proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto()) expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons} elif textproto_file.stem == "lesson": proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto()) for author in set(proto.authors) | set(proto.reviewers): result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics} expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons} expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons} expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms} # thumbnail is mandatory expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg")) elif textproto_file.stem == "module": proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto()) expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons} elif textproto_file.stem == "topic": # The Topic parses. And that's enough. text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto()) else: raise ValueError("No handler for " + textproto_file.relative_to(knowledge.repo_root)) for ref, expected_file in expected_files: result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result return result def _is_svg(image_file: Path) -> bool: return image_file.suffix == ".svg" def _is_svg(image_file: Path) -> bool: return image_file.suffix == ".svg" def _check_image_files(knowledge: KnowledgeContent) -> bool: result = True image_files = list(knowledge.knowledge_dir.glob("**/images/*")) for image_file in image_files: st_size = image_file.stat().st_size if _is_svg(image_file): if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024: print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir)) result = False root = minidom.parseString(image_file.read_text()).documentElement if root.tagName != "svg": print("Root element must be :", image_file.relative_to(knowledge.repo_root)) result = False has_view_box = "viewBox" in root.attributes has_width_and_height = "width" in root.attributes and "height" in root.attributes if not has_view_box and not has_width_and_height: print("Must specify viewBox and/or width+height on :", image_file.relative_to(knowledge.knowledge_dir)) result = False for stopEl in root.getElementsByTagName("stop"): if "offset" not in stopEl.attributes: print("Must specify offset on :", image_file.relative_to(knowledge.knowledge_dir)) result = False else: if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024: print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir)) result = False return result def main(_): knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent) return_code = 1 if (_check_md_files(knowledge) and _check_proto_files(knowledge) and _check_image_files(knowledge)): return_code = 0 sys.exit(return_code) if __name__ == "__main__": app.run(main)