regexopt.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. """
  2. pygments.regexopt
  3. ~~~~~~~~~~~~~~~~~
  4. An algorithm that generates optimized regexes for matching long lists of
  5. literal strings.
  6. :copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
  7. :license: BSD, see LICENSE for details.
  8. """
  9. import re
  10. from re import escape
  11. from os.path import commonprefix
  12. from itertools import groupby
  13. from operator import itemgetter
  14. CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
  15. FIRST_ELEMENT = itemgetter(0)
  16. def make_charset(letters):
  17. return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
  18. def regex_opt_inner(strings, open_paren):
  19. """Return a regex that matches any string in the sorted list of strings."""
  20. close_paren = open_paren and ')' or ''
  21. # print strings, repr(open_paren)
  22. if not strings:
  23. # print '-> nothing left'
  24. return ''
  25. first = strings[0]
  26. if len(strings) == 1:
  27. # print '-> only 1 string'
  28. return open_paren + escape(first) + close_paren
  29. if not first:
  30. # print '-> first string empty'
  31. return open_paren + regex_opt_inner(strings[1:], '(?:') \
  32. + '?' + close_paren
  33. if len(first) == 1:
  34. # multiple one-char strings? make a charset
  35. oneletter = []
  36. rest = []
  37. for s in strings:
  38. if len(s) == 1:
  39. oneletter.append(s)
  40. else:
  41. rest.append(s)
  42. if len(oneletter) > 1: # do we have more than one oneletter string?
  43. if rest:
  44. # print '-> 1-character + rest'
  45. return open_paren + regex_opt_inner(rest, '') + '|' \
  46. + make_charset(oneletter) + close_paren
  47. # print '-> only 1-character'
  48. return open_paren + make_charset(oneletter) + close_paren
  49. prefix = commonprefix(strings)
  50. if prefix:
  51. plen = len(prefix)
  52. # we have a prefix for all strings
  53. # print '-> prefix:', prefix
  54. return open_paren + escape(prefix) \
  55. + regex_opt_inner([s[plen:] for s in strings], '(?:') \
  56. + close_paren
  57. # is there a suffix?
  58. strings_rev = [s[::-1] for s in strings]
  59. suffix = commonprefix(strings_rev)
  60. if suffix:
  61. slen = len(suffix)
  62. # print '-> suffix:', suffix[::-1]
  63. return open_paren \
  64. + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
  65. + escape(suffix[::-1]) + close_paren
  66. # recurse on common 1-string prefixes
  67. # print '-> last resort'
  68. return open_paren + \
  69. '|'.join(regex_opt_inner(list(group[1]), '')
  70. for group in groupby(strings, lambda s: s[0] == first[0])) \
  71. + close_paren
  72. def regex_opt(strings, prefix='', suffix=''):
  73. """Return a compiled regex that matches any string in the given list.
  74. The strings to match must be literal strings, not regexes. They will be
  75. regex-escaped.
  76. *prefix* and *suffix* are pre- and appended to the final regex.
  77. """
  78. strings = sorted(strings)
  79. return prefix + regex_opt_inner(strings, '(') + suffix