fuzz_diff_parser.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
"""
A script to find bugs in the diff parser.

This script is extremely useful if changes are made to the diff parser. By
running a few thousand iterations, we can ensure that the diff parser is in
good shape.

Usage:
  fuzz_diff_parser.py [--pdb|--ipdb] [-l] [-n=<nr>] [-x=<nr>] random [<path>]
  fuzz_diff_parser.py [--pdb|--ipdb] [-l] redo [-o=<nr>] [-p]
  fuzz_diff_parser.py -h | --help

Options:
  -h --help              Show this screen
  -n, --maxtries=<nr>    Maximum of random tries [default: 1000]
  -x, --changes=<nr>     Amount of changes to be done to a file per try [default: 5]
  -l, --logging          Prints all the logs
  -o, --only-last=<nr>   Only runs the last n iterations; Defaults to running all
  -p, --print-code       Print all test diffs
  --pdb                  Launch pdb when error is raised
  --ipdb                 Launch ipdb when error is raised
"""
  20. from __future__ import print_function
  21. import logging
  22. import sys
  23. import os
  24. import random
  25. import pickle
  26. import parso
  27. from parso.utils import split_lines
  28. from test.test_diff_parser import _check_error_leaves_nodes
  29. _latest_grammar = parso.load_grammar(version='3.8')
  30. _python_reserved_strings = tuple(
  31. # Keywords are ususally only interesting in combination with spaces after
  32. # them. We don't put a space before keywords, to avoid indentation errors.
  33. s + (' ' if s.isalpha() else '')
  34. for s in _latest_grammar._pgen_grammar.reserved_syntax_strings.keys()
  35. )
  36. _random_python_fragments = _python_reserved_strings + (
  37. ' ', '\t', '\n', '\r', '\f', 'f"', 'F"""', "fr'", "RF'''", '"', '"""', "'",
  38. "'''", ';', ' some_random_word ', '\\', '#',
  39. )
  40. def find_python_files_in_tree(file_path):
  41. if not os.path.isdir(file_path):
  42. yield file_path
  43. return
  44. for root, dirnames, filenames in os.walk(file_path):
  45. if 'chardet' in root:
  46. # Stuff like chardet/langcyrillicmodel.py is just very slow to
  47. # parse and machine generated, so ignore those.
  48. continue
  49. for name in filenames:
  50. if name.endswith('.py'):
  51. yield os.path.join(root, name)
  52. def _print_copyable_lines(lines):
  53. for line in lines:
  54. line = repr(line)[1:-1]
  55. if line.endswith(r'\n'):
  56. line = line[:-2] + '\n'
  57. print(line, end='')
  58. def _get_first_error_start_pos_or_none(module):
  59. error_leaf = _check_error_leaves_nodes(module)
  60. return None if error_leaf is None else error_leaf.start_pos
  61. class LineReplacement:
  62. def __init__(self, line_nr, new_line):
  63. self._line_nr = line_nr
  64. self._new_line = new_line
  65. def apply(self, code_lines):
  66. # print(repr(self._new_line))
  67. code_lines[self._line_nr] = self._new_line
  68. class LineDeletion:
  69. def __init__(self, line_nr):
  70. self.line_nr = line_nr
  71. def apply(self, code_lines):
  72. del code_lines[self.line_nr]
  73. class LineCopy:
  74. def __init__(self, copy_line, insertion_line):
  75. self._copy_line = copy_line
  76. self._insertion_line = insertion_line
  77. def apply(self, code_lines):
  78. code_lines.insert(
  79. self._insertion_line,
  80. # Use some line from the file. This doesn't feel totally
  81. # random, but for the diff parser it will feel like it.
  82. code_lines[self._copy_line]
  83. )
  84. class FileModification:
  85. @classmethod
  86. def generate(cls, code_lines, change_count, previous_file_modification=None):
  87. if previous_file_modification is not None and random.random() > 0.5:
  88. # We want to keep the previous modifications in some cases to make
  89. # more complex parser issues visible.
  90. code_lines = previous_file_modification.apply(code_lines)
  91. added_modifications = previous_file_modification.modification_list
  92. else:
  93. added_modifications = []
  94. return cls(
  95. added_modifications
  96. + list(cls._generate_line_modifications(code_lines, change_count)),
  97. # work with changed trees more than with normal ones.
  98. check_original=random.random() > 0.8,
  99. )
  100. @staticmethod
  101. def _generate_line_modifications(lines, change_count):
  102. def random_line(include_end=False):
  103. return random.randint(0, len(lines) - (not include_end))
  104. lines = list(lines)
  105. for _ in range(change_count):
  106. rand = random.randint(1, 4)
  107. if rand == 1:
  108. if len(lines) == 1:
  109. # We cannot delete every line, that doesn't make sense to
  110. # fuzz and it would be annoying to rewrite everything here.
  111. continue
  112. l = LineDeletion(random_line())
  113. elif rand == 2:
  114. # Copy / Insertion
  115. # Make it possible to insert into the first and the last line
  116. l = LineCopy(random_line(), random_line(include_end=True))
  117. elif rand in (3, 4):
  118. # Modify a line in some weird random ways.
  119. line_nr = random_line()
  120. line = lines[line_nr]
  121. column = random.randint(0, len(line))
  122. random_string = ''
  123. for _ in range(random.randint(1, 3)):
  124. if random.random() > 0.8:
  125. # The lower characters cause way more issues.
  126. unicode_range = 0x1f if random.randint(0, 1) else 0x3000
  127. random_string += chr(random.randint(0, unicode_range))
  128. else:
  129. # These insertions let us understand how random
  130. # keyword/operator insertions work. Theoretically this
  131. # could also be done with unicode insertions, but the
  132. # fuzzer is just way more effective here.
  133. random_string += random.choice(_random_python_fragments)
  134. if random.random() > 0.5:
  135. # In this case we insert at a very random place that
  136. # probably breaks syntax.
  137. line = line[:column] + random_string + line[column:]
  138. else:
  139. # Here we have better chances to not break syntax, because
  140. # we really replace the line with something that has
  141. # indentation.
  142. line = ' ' * random.randint(0, 12) + random_string + '\n'
  143. l = LineReplacement(line_nr, line)
  144. l.apply(lines)
  145. yield l
  146. def __init__(self, modification_list, check_original):
  147. self.modification_list = modification_list
  148. self._check_original = check_original
  149. def apply(self, code_lines):
  150. changed_lines = list(code_lines)
  151. for modification in self.modification_list:
  152. modification.apply(changed_lines)
  153. return changed_lines
  154. def run(self, grammar, code_lines, print_code):
  155. code = ''.join(code_lines)
  156. modified_lines = self.apply(code_lines)
  157. modified_code = ''.join(modified_lines)
  158. if print_code:
  159. if self._check_original:
  160. print('Original:')
  161. _print_copyable_lines(code_lines)
  162. print('\nModified:')
  163. _print_copyable_lines(modified_lines)
  164. print()
  165. if self._check_original:
  166. m = grammar.parse(code, diff_cache=True)
  167. start1 = _get_first_error_start_pos_or_none(m)
  168. grammar.parse(modified_code, diff_cache=True)
  169. if self._check_original:
  170. # Also check if it's possible to "revert" the changes.
  171. m = grammar.parse(code, diff_cache=True)
  172. start2 = _get_first_error_start_pos_or_none(m)
  173. assert start1 == start2, (start1, start2)
  174. class FileTests:
  175. def __init__(self, file_path, test_count, change_count):
  176. self._path = file_path
  177. with open(file_path, errors='replace') as f:
  178. code = f.read()
  179. self._code_lines = split_lines(code, keepends=True)
  180. self._test_count = test_count
  181. self._code_lines = self._code_lines
  182. self._change_count = change_count
  183. self._file_modifications = []
  184. def _run(self, grammar, file_modifications, debugger, print_code=False):
  185. try:
  186. for i, fm in enumerate(file_modifications, 1):
  187. fm.run(grammar, self._code_lines, print_code=print_code)
  188. print('.', end='')
  189. sys.stdout.flush()
  190. print()
  191. except Exception:
  192. print("Issue in file: %s" % self._path)
  193. if debugger:
  194. einfo = sys.exc_info()
  195. pdb = __import__(debugger)
  196. pdb.post_mortem(einfo[2])
  197. raise
  198. def redo(self, grammar, debugger, only_last, print_code):
  199. mods = self._file_modifications
  200. if only_last is not None:
  201. mods = mods[-only_last:]
  202. self._run(grammar, mods, debugger, print_code=print_code)
  203. def run(self, grammar, debugger):
  204. def iterate():
  205. fm = None
  206. for _ in range(self._test_count):
  207. fm = FileModification.generate(
  208. self._code_lines, self._change_count,
  209. previous_file_modification=fm
  210. )
  211. self._file_modifications.append(fm)
  212. yield fm
  213. self._run(grammar, iterate(), debugger)
  214. def main(arguments):
  215. debugger = 'pdb' if arguments['--pdb'] else \
  216. 'ipdb' if arguments['--ipdb'] else None
  217. redo_file = os.path.join(os.path.dirname(__file__), 'fuzz-redo.pickle')
  218. if arguments['--logging']:
  219. root = logging.getLogger()
  220. root.setLevel(logging.DEBUG)
  221. ch = logging.StreamHandler(sys.stdout)
  222. ch.setLevel(logging.DEBUG)
  223. root.addHandler(ch)
  224. grammar = parso.load_grammar()
  225. parso.python.diff.DEBUG_DIFF_PARSER = True
  226. if arguments['redo']:
  227. with open(redo_file, 'rb') as f:
  228. file_tests_obj = pickle.load(f)
  229. only_last = arguments['--only-last'] and int(arguments['--only-last'])
  230. file_tests_obj.redo(
  231. grammar,
  232. debugger,
  233. only_last=only_last,
  234. print_code=arguments['--print-code']
  235. )
  236. elif arguments['random']:
  237. # A random file is used to do diff parser checks if no file is given.
  238. # This helps us to find errors in a lot of different files.
  239. file_paths = list(find_python_files_in_tree(arguments['<path>'] or '.'))
  240. max_tries = int(arguments['--maxtries'])
  241. tries = 0
  242. try:
  243. while tries < max_tries:
  244. path = random.choice(file_paths)
  245. print("Checking %s: %s tries" % (path, tries))
  246. now_tries = min(1000, max_tries - tries)
  247. file_tests_obj = FileTests(path, now_tries, int(arguments['--changes']))
  248. file_tests_obj.run(grammar, debugger)
  249. tries += now_tries
  250. except Exception:
  251. with open(redo_file, 'wb') as f:
  252. pickle.dump(file_tests_obj, f)
  253. raise
  254. else:
  255. raise NotImplementedError('Command is not implemented')
  256. if __name__ == '__main__':
  257. from docopt import docopt
  258. arguments = docopt(__doc__)
  259. main(arguments)