"""Validate the Knowledge content tree.

Loads every markdown and textproto file under ``cc-by-sa/knowledge`` and checks
that internal links resolve to files, that textprotos parse and reference
existing files/contributors, and that images respect size and SVG rules.
Exits with status 0 when every check passes and 1 otherwise.
"""

from collections import deque
import itertools
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
from xml.dom import minidom

from absl import app
from absl import flags
from gftools import knowledge_pb2
from google.protobuf import text_format
import mistune  # markdown => ast


MAX_RASTER_IMAGE_SIZE_KB = 800
MAX_VECTOR_IMAGE_SIZE_KB = 1750


def _topic_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/topic/...`` link to its ``topics/.../topic.textproto`` path."""
    # TODO sanity check if this is the only valid update
    return Path(target.replace("/topic/", "topics/")) / "topic.textproto"


def _module_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/module/...`` link to its ``modules/.../module.textproto`` path."""
    return Path(target.replace("/module/", "modules/")) / "module.textproto"


def _content_md(path: str) -> Path:
    """Return the ``content.md`` file inside directory ``path``."""
    return Path(path) / "content.md"


def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/...`` link to ``glossary/terms/.../content.md``."""
    # TODO sanity check if this is the only valid update
    return _content_md(target.replace("/glossary/", "glossary/terms/"))


def _lesson_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Map a ``/lesson/...`` link to the lesson's ``content.md``.

    ``/lesson/<module>/<lesson>`` resolves structurally; ``/lesson/<name>``
    is looked up in ``names`` and only accepted when it lives under
    ``modules/``. Anything else falls back to ``<target>/content.md``.
    """
    # /lesson/choosing_type/choosing_reliable_typefaces
    #   => modules/choosing_type/lessons/choosing_reliable_typefaces/
    parts = target[1:].split("/")
    assert parts[0] == "lesson"
    if len(parts) == 2:
        path = names.get(parts[1], "")
        if not path.startswith("modules/"):
            return _content_md(target)
        return _content_md(path)
    if len(parts) == 3:
        return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")
    return _content_md(target)


def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name via ``names``, falling back to the name itself."""
    return _content_md(names.get(target, target))


# Ordered dispatch table: the first pattern that matches a link target wins,
# so the catch-all bare-name entry must stay last.
_LINK_TO_PATH = [
    (re.compile("^/glossary/"), _glossary_target_to_path),
    (re.compile("^/topic/"), _topic_target_to_path),
    (re.compile("^/lesson/"), _lesson_target_to_path),
    (re.compile("^/module/"), _module_target_to_path),
    (re.compile("[^/]+"), _any_unique_name_to_path),
]


FLAGS = flags.FLAGS
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")


MdValue = Union[Mapping[str, "MdValue"]]


class KnowledgeContent(NamedTuple):
    """Snapshot of the files found under the knowledge directory."""

    repo_root: Path
    knowledge_dir: Path
    md_files: Tuple[Path, ...]
    textproto_files: Tuple[Path, ...]
    # Directory name => path (relative to knowledge_dir, as a string) for
    # every markdown parent directory whose name occurs exactly once.
    unambiguous_names: Mapping[str, str]

    def module_name_to_path(self: "KnowledgeContent", name: str) -> Path:
        """Return the module.textproto path for a human-readable module name."""
        return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"

    def lesson_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a lesson reference (without the ``/lesson/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)

    def term_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a glossary term reference (without the ``/glossary/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)

    def topic_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve a topic reference (without the ``/topic/`` prefix)."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)

    def link_target_to_path(self: "KnowledgeContent", target: str) -> Path:
        """Resolve an arbitrary internal link target."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)

    @classmethod
    def load(cls, repo_root: Path) -> "KnowledgeContent":
        """Scan ``repo_root/cc-by-sa/knowledge`` and index its files.

        Names shared by more than one markdown parent directory are reported
        on stdout and left out of ``unambiguous_names``.
        """
        knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
        assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"

        md_files = []
        textproto_files = []
        for path in knowledge_dir.rglob("*"):
            suffix = path.suffix.lower()
            if suffix == ".md":
                md_files.append(path)
            elif suffix == ".textproto":
                textproto_files.append(path)

        def parent_name(p: Path) -> str:
            return p.parent.name

        unambiguous_names = {}
        # groupby only merges adjacent items, hence the sort by the same key.
        for name, entries in itertools.groupby(sorted(md_files, key=parent_name), key=parent_name):
            entries = list(entries)
            if len(entries) != 1:
                print(name, "is ambiguous")
                continue
            unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)

        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)


def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Parse ``md_file`` with mistune's AST renderer."""
    return mistune.create_markdown(renderer='ast')(md_file.read_text())


def _ast_iter(root: List[MdValue], filter_fn: Callable[[MdValue], bool]) -> Iterable[MdValue]:
    """Yield, breadth-first, every AST node for which ``filter_fn`` is true.

    Any list-valued entry of a node is treated as a list of child nodes.
    """
    # deque gives O(1) pops from the front; list.pop(0) is O(n).
    frontier = deque(root)
    while frontier:
        current = frontier.popleft()
        assert isinstance(current, dict), f"What is {current}"
        if filter_fn(current):
            yield current
        for entry in current.values():
            if isinstance(entry, list):
                frontier.extend(entry)


def _link_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Translate a link target into a path relative to the knowledge dir.

    Raises:
        ValueError: if no entry of ``_LINK_TO_PATH`` matches ``target``.
    """
    for matcher, link_to_path_fn in _LINK_TO_PATH:
        if matcher.search(target):
            return link_to_path_fn(names, target)
    raise ValueError(f"Unrecognized target {target}")


def _safe_relative_to(parent: Path, child: Path) -> Path:
    """Return ``child`` relative to ``parent``, or ``child`` itself if unrelated."""
    try:
        return child.relative_to(parent)
    except ValueError:
        return child


def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
    """Report one check outcome and pass ``result`` through.

    Failures are always printed; successes only with ``--print_valid``.
    """
    if FLAGS.print_valid or not result:
        message = "valid "
        if not result:
            message = "INVALID "
        suffix = ""
        if target is not None:
            suffix = " => " + str(_safe_relative_to(repo_root, target))
        print(message, _safe_relative_to(repo_root, referrer), f"\"{ref}\"{suffix}")
    return result


def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
    """Check that ``target``, referenced as ``ref`` from ``referrer``, is a file."""
    return _maybe_print_check(target.is_file(), repo_root, referrer, ref, target)


def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
    """Check that ``ref`` is one of the known ``contributors``."""
    return _maybe_print_check(ref in contributors, repo_root, referrer, ref, None)


def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
    """Check markdown content rules; return False on the first violation."""
    for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
        text = el.get("text", "")
        if re.search(' id="[^"]+"', text):
            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
            return False

    # read_text() opens and closes the file itself, so nothing leaks on the
    # early return below.
    content = md_file.read_text()
    # NOTE(review): the tag names in this pattern and its message were missing
    # from the copy under review (the pattern read '(?!.*)', which can never
    # match). Restored as a closing figcaption with no closing figure anywhere
    # after it -- confirm against the upstream script.
    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
        return False
    return True


def _check_md_files(knowledge: KnowledgeContent) -> bool:
    """Validate every markdown file's contents and internal links."""
    result = True
    for md_file in knowledge.md_files:
        ast = _markdown_ast(md_file)
        result = _check_md_file_contents(knowledge.repo_root, md_file, ast) and result
        for link in _ast_iter(ast, lambda v: v.get("type", None) == "link"):
            target = link.get("link", "")
            if not target:
                continue  # TODO: are empty links bad
            if re.search("^http(s)?://", target.lower()):
                continue  # we aren't in the business of validating outbound links
            target_path = knowledge.link_target_to_path(target)
            result = _check_file_present(knowledge.repo_root, md_file, target, target_path) and result
    return result


def _check_proto_files(knowledge: KnowledgeContent) -> bool:
    """Parse every textproto and check the files/contributors it references.

    Raises:
        ValueError: if a textproto's stem has no handler here.
    """
    # TODO support alt_ids, many Knowledge constructs have them

    # The set of valid contributors is useful in upcoming validations
    contributors_file = knowledge.knowledge_dir / "contributors.textproto"
    assert contributors_file.is_file(), contributors_file
    contributors = {
        c.name
        for c in text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto()).contributors
    }

    result = True
    for textproto_file in knowledge.textproto_files:
        expected_files = set()
        if textproto_file.stem == "contributors":
            pass  # handled above
        elif textproto_file.stem == "knowledge":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
        elif textproto_file.stem == "term":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
        elif textproto_file.stem == "lesson":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
            for author in set(proto.authors) | set(proto.reviewers):
                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
            # thumbnail is mandatory
            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
        elif textproto_file.stem == "module":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
        elif textproto_file.stem == "topic":
            # The Topic parses. And that's enough.
            text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
        else:
            # f-string: concatenating str + Path would raise TypeError and
            # hide the intended ValueError.
            raise ValueError(f"No handler for {textproto_file.relative_to(knowledge.repo_root)}")

        for ref, expected_file in expected_files:
            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result

    return result


def _is_svg(image_file: Path) -> bool:
    """Return True when ``image_file`` has a ``.svg`` suffix."""
    return image_file.suffix == ".svg"


def _check_image_files(knowledge: KnowledgeContent) -> bool:
    """Check size limits for all images and structural rules for SVGs."""
    result = True
    for image_file in knowledge.knowledge_dir.glob("**/images/*"):
        st_size = image_file.stat().st_size
        if _is_svg(image_file):
            if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
                print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
                result = False
            root = minidom.parseString(image_file.read_text()).documentElement
            if root.tagName != "svg":
                print("Root element must be <svg>:", image_file.relative_to(knowledge.repo_root))
                result = False
            has_view_box = "viewBox" in root.attributes
            has_width_and_height = "width" in root.attributes and "height" in root.attributes
            if not has_view_box and not has_width_and_height:
                print("Must specify viewBox and/or width+height on <svg>:", image_file.relative_to(knowledge.knowledge_dir))
                result = False
            for stopEl in root.getElementsByTagName("stop"):
                if "offset" not in stopEl.attributes:
                    print("Must specify offset on <stop>:", image_file.relative_to(knowledge.knowledge_dir))
                    result = False
        else:
            if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
                print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), image_file.relative_to(knowledge.knowledge_dir))
                result = False
    return result


def main(_):
    """Run all checks and exit 0 on success, 1 on any failure."""
    knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
    return_code = 1
    # `and` short-circuits: a later check only runs if the earlier ones passed.
    if (_check_md_files(knowledge)
            and _check_proto_files(knowledge)
            and _check_image_files(knowledge)):
        return_code = 0
    sys.exit(return_code)


if __name__ == "__main__":
    app.run(main)