123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367 |
- from pprint import pprint
- from absl import app
- from absl import flags
- from gftools import knowledge_pb2
- from google.protobuf import text_format
- import itertools
- import mistune # markdown => ast
- from xml.dom import minidom
- from pathlib import Path
- import re
- import sys
- from typing import Callable, Iterable, List, Mapping, NamedTuple, Optional, Tuple, Set, Union
- import requests
- from functools import lru_cache
- from urllib.parse import urlparse
# Size limits for files found in images/ directories, in KB (1 KB = 1024 bytes).
# The vector limit applies to .svg files, the raster limit to everything else.
MAX_VECTOR_IMAGE_SIZE_KB = 1750
MAX_RASTER_IMAGE_SIZE_KB = 800
def _topic_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/topic/<name>`` link target to its ``topic.textproto`` path.

    Args:
        _: Name lookup table; unused, accepted so every resolver shares one
            signature.
        target: Link target such as ``/topic/<name>``.

    Returns:
        ``topics/<name>/topic.textproto`` as a relative path.
    """
    # TODO sanity check if this is the only valid update
    return Path(target.replace("/topic/", "topics/")) / "topic.textproto"
def _module_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/module/<name>`` link target to its ``module.textproto`` path.

    Args:
        _: Name lookup table; unused, accepted so every resolver shares one
            signature.
        target: Link target such as ``/module/<name>``.

    Returns:
        ``modules/<name>/module.textproto`` as a relative path.
    """
    return Path(target.replace("/module/", "modules/")) / "module.textproto"
def _content_md(path: str) -> Path:
    """Return the path of the ``content.md`` file directly inside ``path``."""
    directory = Path(path)
    return directory.joinpath("content.md")
def _glossary_target_to_path(_: Mapping[str, str], target: str) -> Path:
    """Map a ``/glossary/<term>`` link target to the term's ``content.md``.

    Args:
        _: Name lookup table; unused, accepted so every resolver shares one
            signature.
        target: Link target such as ``/glossary/<term>``.

    Returns:
        ``glossary/terms/<term>/content.md`` as a relative path.
    """
    # TODO sanity check if this is the only valid update
    return _content_md(target.replace("/glossary/", "glossary/terms/"))
def _lesson_target_to_path(names: Mapping[str, str], target: str) -> Path:
    """Map a ``/lesson/...`` link target to the lesson's ``content.md``.

    Three target shapes are handled:

    * ``/lesson/<lesson>``: the lesson name is looked up in ``names``; the
      mapped directory is used only when it starts with ``modules/``,
      otherwise ``target`` itself is used as the directory.
    * ``/lesson/<module>/<lesson>``: resolves to
      ``modules/<module>/lessons/<lesson>``.
    * anything else: ``target`` itself is used as the directory.

    Args:
        names: Unique directory name => directory path (as a string).
        target: Link target beginning with ``/lesson/``.

    Returns:
        Path of the ``content.md`` file for the lesson.
    """
    # /lesson/choosing_type/choosing_reliable_typefaces => modules/choosing_type/lessons/choosing_reliable_typefaces/
    parts = target[1:].split("/")
    assert parts[0] == "lesson"
    if len(parts) == 2:
        path = names.get(parts[1], "")
        if not path.startswith("modules/"):
            return _content_md(target)
        return _content_md(path)
    if len(parts) == 3:
        return _content_md(f"modules/{parts[1]}/lessons/{parts[2]}")
    return _content_md(target)
def _any_unique_name_to_path(names: Mapping[str, str], target: str) -> Path:
    """Resolve a bare name to its ``content.md`` file.

    Args:
        names: Unique directory name => directory path (as a string).
        target: Link target; used as the directory itself when it is not a
            key of ``names``.

    Returns:
        Path of the ``content.md`` file inside the resolved directory.
    """
    return _content_md(names.get(target, target))
# Ordered (pattern, resolver) pairs. The first pattern found in a link target
# selects the resolver; the last pattern matches any target that contains at
# least one non-slash character, so it acts as the fallback.
_LINK_TO_PATH = [
    (re.compile(pattern), resolver)
    for pattern, resolver in (
        ("^/glossary/", _glossary_target_to_path),
        ("^/topic/", _topic_target_to_path),
        ("^/lesson/", _lesson_target_to_path),
        ("^/module/", _module_target_to_path),
        ("[^/]+", _any_unique_name_to_path),
    )
]
# Command-line flags; values are read through FLAGS.
flags.DEFINE_bool("print_valid", False, "Whether to print valid links")
flags.DEFINE_bool("check_outbound_links", False, "Check outbound urls")

FLAGS = flags.FLAGS
# Recursive alias for a markdown AST node: a mapping keyed by strings.
# (A Union of a single member is that member, so no Union wrapper is needed.)
MdValue = Mapping[str, "MdValue"]
class KnowledgeContent(NamedTuple):
    """Index of the files under ``<repo_root>/cc-by-sa/knowledge``.

    Instances are built by :meth:`load`. The ``*_to_path`` methods turn names
    and link targets into paths below ``knowledge_dir``.
    """

    repo_root: Path
    # <repo_root>/cc-by-sa/knowledge
    knowledge_dir: Path
    # Every *.md entry found below knowledge_dir.
    md_files: Tuple[Path, ...]
    # Every *.textproto entry found below knowledge_dir.
    textproto_files: Tuple[Path, ...]
    # Name of a markdown file's parent directory => that directory, as a string
    # relative to knowledge_dir. Names shared by several markdown files are
    # left out.
    unambiguous_names: Mapping[str, str]

    def module_name_to_path(self, name: str) -> Path:
        """Return the ``module.textproto`` path for a module display name.

        The name is lowercased and spaces become underscores to form the
        directory name.
        """
        return self.knowledge_dir / "modules" / name.lower().replace(" ", "_") / "module.textproto"

    def lesson_target_to_path(self, target: str) -> Path:
        """Resolve ``target`` as if it were the link ``/lesson/<target>``."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/lesson/" + target)

    def term_target_to_path(self, target: str) -> Path:
        """Resolve ``target`` as if it were the link ``/glossary/<target>``."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/glossary/" + target)

    def topic_target_to_path(self, target: str) -> Path:
        """Resolve ``target`` as if it were the link ``/topic/<target>``."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, "/topic/" + target)

    def link_target_to_path(self, target: str) -> Path:
        """Resolve an arbitrary link target to a path below ``knowledge_dir``."""
        return self.knowledge_dir / _link_target_to_path(self.unambiguous_names, target)

    @classmethod
    def load(cls, repo_root: Path) -> "KnowledgeContent":
        """Scan ``<repo_root>/cc-by-sa/knowledge`` and build the index.

        Prints ``<name> is ambiguous`` for every parent-directory name that is
        shared by more than one markdown file; such names are not added to
        ``unambiguous_names``.

        Raises:
            AssertionError: If the knowledge directory does not exist.
        """
        knowledge_dir = repo_root / "cc-by-sa" / "knowledge"
        assert knowledge_dir.is_dir(), f"No dir {knowledge_dir}"

        md_files = []
        textproto_files = []
        # Sorted so the order of files (and therefore of reported problems)
        # does not depend on filesystem enumeration order.
        for file in sorted(knowledge_dir.rglob("*")):
            suffix = file.suffix.lower()
            if suffix == ".md":
                md_files.append(file)
            elif suffix == ".textproto":
                textproto_files.append(file)

        def parent_name(md_file: Path) -> str:
            return md_file.parent.name

        unambiguous_names = {}
        # groupby only merges adjacent items, hence the sort on the same key.
        for name, entries in itertools.groupby(sorted(md_files, key=parent_name), key=parent_name):
            entries = list(entries)
            if len(entries) != 1:
                print(name, "is ambiguous")
                continue
            unambiguous_names[name] = str(entries[0].relative_to(knowledge_dir).parent)

        return cls(repo_root, knowledge_dir, tuple(md_files), tuple(textproto_files), unambiguous_names)
def _markdown_ast(md_file: Path) -> List[MdValue]:
    """Parse ``md_file`` with mistune's ``ast`` renderer and return the result."""
    to_ast = mistune.create_markdown(renderer='ast')
    return to_ast(md_file.read_text())
def _ast_iter(root: List["MdValue"], filter_fn: Callable[["MdValue"], bool]) -> Iterable["MdValue"]:
    """Yield, breadth-first, every AST node for which ``filter_fn`` is true.

    Args:
        root: Top-level nodes; each node must be a dict.
        filter_fn: Predicate deciding whether a node is yielded.

    Yields:
        Matching nodes. Any list-valued entry of a node is treated as a list
        of child nodes and traversed after the nodes already queued.

    Raises:
        AssertionError: If something that is not a dict is encountered.
    """
    from collections import deque

    # A deque gives O(1) removal from the front; list.pop(0) is O(n).
    frontier = deque(root)
    while frontier:
        current = frontier.popleft()
        assert isinstance(current, dict), f"What is {current}"
        if filter_fn(current):
            yield current
        for entry in current.values():
            if isinstance(entry, list):
                frontier.extend(entry)
def _link_target_to_path(names: Mapping[str, Path], target: str) -> Path:
    """Resolve ``target`` with the first resolver in ``_LINK_TO_PATH`` whose pattern matches.

    Raises:
        ValueError: If no pattern matches ``target``.
    """
    resolver = next(
        (fn for pattern, fn in _LINK_TO_PATH if pattern.search(target)),
        None,
    )
    if resolver is None:
        raise ValueError(f"Unrecognized target {target}")
    return resolver(names, target)
def _safe_relative_to(parent: Path, child: Path) -> Path:
    """Return ``child`` relative to ``parent``, or ``child`` unchanged if it is not below ``parent``."""
    try:
        relative = child.relative_to(parent)
    except ValueError:
        relative = child
    return relative
def _maybe_print_check(result: bool, repo_root: Path, referrer: Path, ref: str, target: Optional[Path]) -> bool:
    """Report the outcome of one reference check and pass ``result`` through.

    A line is printed when the check failed, or for every check when the
    ``print_valid`` flag is set. Paths are shown relative to ``repo_root``
    where possible.
    """
    if not FLAGS.print_valid and result:
        return result

    label = "valid " if result else "INVALID "
    destination = ""
    if target is not None:
        destination = " => " + str(_safe_relative_to(repo_root, target))
    print(label, _safe_relative_to(repo_root, referrer), f'"{ref}"{destination}')
    return result
def _check_file_present(repo_root: Path, referrer: Path, ref: str, target: Path) -> bool:
    """Check that ``target`` is an existing file, reporting via ``_maybe_print_check``."""
    exists = target.is_file()
    return _maybe_print_check(exists, repo_root, referrer, ref, target)
def _check_contributor(repo_root: Path, referrer: Path, ref: str, contributors: Set[str]) -> bool:
    """Check that ``ref`` is one of ``contributors``, reporting via ``_maybe_print_check``."""
    is_known = ref in contributors
    return _maybe_print_check(is_known, repo_root, referrer, ref, None)
def _check_md_file_contents(repo_root: Path, md_file: Path, ast: List[MdValue]) -> bool:
    """Check a markdown file for disallowed HTML constructs.

    Two rules are enforced, stopping at the first violation:

    * no ``inline_html`` node of ``ast`` may carry an ``id="..."`` attribute;
    * the raw file must not contain a ``</figcaption>`` that has no
      ``</figure>`` anywhere after it.

    Args:
        repo_root: Base directory used to shorten paths in messages.
        md_file: The markdown file; its raw text is read for the second rule.
        ast: Parsed AST of ``md_file``.

    Returns:
        True if both rules hold; False after printing an ``INVALID`` line
        otherwise.
    """
    for el in _ast_iter(ast, lambda v: v.get("type", None) == "inline_html"):
        text = el.get("text", "")
        if re.search(' id="[^"]+"', text):
            print("INVALID ", _safe_relative_to(repo_root, md_file), "attr.id not allowed:", text)
            return False

    # The context manager closes the file on every path, including the early
    # return below.
    with open(md_file, "r") as f:
        content = f.read()
    if re.search('</figcaption>(?!.*</figure>)', content, re.MULTILINE | re.DOTALL):
        print("INVALID ", _safe_relative_to(repo_root, md_file), "Cannot have a <figcaption> outside of a <figure>")
        return False
    return True
@lru_cache()
def _check_outbound_link(url: str) -> bool:
    """Return whether an outbound ``url`` is considered reachable.

    Hosts on the built-in exemption lists are accepted without any network
    access. Every other url is probed with an HTTP HEAD request (redirects
    followed, 30 second timeout). Results are memoized per url.

    Args:
        url: Absolute http(s) url.

    Returns:
        True for exempt hosts and for responses with an OK status; False,
        after printing an ``INVALID`` line, when the request fails or the
        status is not OK.
    """
    # Following urls work correctly on a web browser but raise a 400 code when using python requests
    whitelist = frozenset([
        'circuitousroot.com',
        'codepen.io',
        'colourblindawareness.org',
        'cortezlawfirmpllc.com',
        'doi.org',
        'figma.com',
        'freepik.com',
        'gigapress.net',
        'help.figma.com',
        'kupferschrift.de',
        'languagegeek.com',
        'layoutgridcalculator.com',
        'medium.com',
        'medium.engineering',
        'nedwin.medium.com',
        'nytimes.com',
        'paulshawletterdesign.com',
        'psycnet.apa.org',
        'researchgate.net',
        'sciencedirect.com',
        'support.google.com',
        'twitter.com',
        'typetura.com',
        'webmd.com',
        "jessicahische.is",
        "type.method.ac",
    ])
    # Following urls will be fixed at a later date. If the CI is failing and a suitable
    # replacement url cannot be found, please add them to this set.
    to_fix = frozenset([
        # bad SSL cert
        "clagnut.com",
        "xinreality.com"
    ])

    # hostname is lowercased and excludes any port or credentials. Only a
    # leading "www." is dropped, so a "www." elsewhere in the name is kept.
    host = urlparse(url).hostname or ""
    if host.startswith("www."):
        host = host[len("www."):]
    if host in whitelist | to_fix:
        return True

    try:
        response = requests.head(url, allow_redirects=True, timeout=30)
    except requests.RequestException as err:
        # Connection, TLS and timeout problems count as an invalid link
        # instead of aborting the whole validation run.
        print(f"INVALID url '{url}' could not be fetched: {err}")
        return False
    if not response.ok:
        print(f"INVALID url '{url}' returned response status code '{response.status_code}'")
    return response.ok
def _check_md_files(knowledge: KnowledgeContent) -> bool:
    """Validate the contents and links of every markdown file.

    Each file is parsed, checked with ``_check_md_file_contents``, and every
    link in it is verified: http(s) links only when the
    ``check_outbound_links`` flag is set, all other links by resolving them
    to a file that must exist. All files are processed even after a failure.

    Returns:
        True only if every check passed.
    """
    def is_link(node):
        return node.get("type", None) == "link"

    all_ok = True
    for md_file in knowledge.md_files:
        ast = _markdown_ast(md_file)
        all_ok = _check_md_file_contents(knowledge.repo_root, md_file, ast) and all_ok

        for link in _ast_iter(ast, is_link):
            target = link["attrs"]["url"]
            # mistune cannot parse urls that end with a closing parenthesis,
            # https://github.com/lepture/mistune/issues/355
            # A possible fix is to do some regex acrobatics in:
            # https://github.com/lepture/mistune/blob/master/src/mistune/helpers.py#L12-L18,
            if "(" in target:
                target += ")"
            if not target:
                continue  # TODO: are empty links bad

            if not re.search("^http(s)?://", target.lower()):
                target_path = knowledge.link_target_to_path(target)
                all_ok = _check_file_present(knowledge.repo_root, md_file, target, target_path) and all_ok
            elif FLAGS.check_outbound_links:
                all_ok = _check_outbound_link(target) and all_ok
    return all_ok
def _check_proto_files(knowledge: KnowledgeContent) -> bool:
    """Parse every textproto file and verify the things it refers to.

    The file stem selects the message type to parse. Depending on the type,
    referenced modules, lessons, topics and terms must resolve to existing
    files, lesson authors and reviewers must be listed in
    ``contributors.textproto``, and a lesson must have an
    ``images/thumbnail.svg`` next to it.

    Returns:
        True only if every reference check passed.

    Raises:
        AssertionError: If ``contributors.textproto`` is missing.
        ValueError: If a textproto file has a stem with no handler.
    """
    # TODO support alt_ids, many Knowledge constructs have them

    # The set of valid contributors is useful in upcoming validations
    contributors_file = knowledge.knowledge_dir / "contributors.textproto"
    assert contributors_file.is_file(), contributors_file
    contributors_proto = text_format.Parse(contributors_file.read_text(), knowledge_pb2.ContributorsProto())
    contributors = {c.name for c in contributors_proto.contributors}

    result = True
    for textproto_file in knowledge.textproto_files:
        # (reference as written, file it must resolve to)
        expected_files = set()
        stem = textproto_file.stem
        if stem == "contributors":
            pass  # handled above
        elif stem == "knowledge":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.KnowledgeProto())
            expected_files |= {(m, knowledge.module_name_to_path(m)) for m in proto.modules}
        elif stem == "term":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.TermProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.related_lessons}
        elif stem == "lesson":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.LessonProto())
            for author in set(proto.authors) | set(proto.reviewers):
                result = _check_contributor(knowledge.repo_root, textproto_file, author, contributors) and result
            expected_files |= {(n, knowledge.topic_target_to_path(n)) for n in proto.topics}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.prev_lessons}
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.next_lessons}
            expected_files |= {(n, knowledge.term_target_to_path(n)) for n in proto.related_terms}
            # thumbnail is mandatory
            expected_files.add(("thumbnail", textproto_file.parent / "images" / "thumbnail.svg"))
        elif stem == "module":
            proto = text_format.Parse(textproto_file.read_text(), knowledge_pb2.ModuleProto())
            expected_files |= {(n, knowledge.lesson_target_to_path(n)) for n in proto.lessons}
        elif stem == "topic":
            # The Topic parses. And that's enough.
            text_format.Parse(textproto_file.read_text(), knowledge_pb2.TopicProto())
        else:
            # An f-string is required here: str + Path raises TypeError.
            raise ValueError(f"No handler for {textproto_file.relative_to(knowledge.repo_root)}")

        # Sorted so the order of reported problems is stable between runs.
        for ref, expected_file in sorted(expected_files):
            result = _check_file_present(knowledge.repo_root, textproto_file, ref, expected_file) and result
    return result
def _is_svg(image_file: Path) -> bool:
    """Return True if ``image_file`` has the exact suffix ``.svg``."""
    return image_file.suffix == ".svg"
def _check_image_files(knowledge: KnowledgeContent) -> bool:
    """Validate every file that sits directly inside an ``images`` directory.

    All files must respect the size limit for their kind. ``.svg`` files must
    additionally be well-formed XML with an ``<svg>`` root element that
    specifies ``viewBox`` and/or both ``width`` and ``height``, and every
    ``<stop>`` element must have an ``offset`` attribute.

    Returns:
        True only if no problem was found. Each problem is printed with the
        file path relative to ``knowledge_dir``.
    """
    from xml.parsers.expat import ExpatError

    result = True
    # Sorted so the order of reported problems is stable between runs.
    for image_file in sorted(knowledge.knowledge_dir.glob("**/images/*")):
        if not image_file.is_file():
            # The glob also matches sub-directories; they are not images.
            continue
        shown_path = image_file.relative_to(knowledge.knowledge_dir)
        st_size = image_file.stat().st_size

        if not _is_svg(image_file):
            if st_size > MAX_RASTER_IMAGE_SIZE_KB * 1024:
                print("File exceeds max size of %s KB (%s KB):" % (MAX_RASTER_IMAGE_SIZE_KB, st_size // 1024), shown_path)
                result = False
            continue

        if st_size > MAX_VECTOR_IMAGE_SIZE_KB * 1024:
            print("File exceeds max size of %s KB (%s KB):" % (MAX_VECTOR_IMAGE_SIZE_KB, st_size // 1024), shown_path)
            result = False

        try:
            root = minidom.parseString(image_file.read_text()).documentElement
        except ExpatError as err:
            # A malformed SVG is a validation failure, not a reason to abort.
            print("Not well-formed XML:", shown_path, f"({err})")
            result = False
            continue

        if root.tagName != "svg":
            print("Root element must be <svg>:", shown_path)
            result = False
        has_view_box = "viewBox" in root.attributes
        has_width_and_height = "width" in root.attributes and "height" in root.attributes
        if not has_view_box and not has_width_and_height:
            print("Must specify viewBox and/or width+height on <svg>:", shown_path)
            result = False
        for stop_el in root.getElementsByTagName("stop"):
            if "offset" not in stop_el.attributes:
                print("Must specify offset on <stop>:", shown_path)
                result = False
    return result
def main(_):
    """Run all validations and exit with status 0 on success, 1 otherwise.

    The repository root is taken to be three directory levels above this
    file. Checks run in order (markdown, textproto, images) and a failing
    check prevents the later ones from running.
    """
    knowledge = KnowledgeContent.load(Path(__file__).parent.parent.parent)
    all_passed = (
        _check_md_files(knowledge)
        and _check_proto_files(knowledge)
        and _check_image_files(knowledge)
    )
    sys.exit(0 if all_passed else 1)
# Script entry point: hand main to absl's app runner.
if __name__ == "__main__":
    app.run(main)
|