regexopt.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. # -*- coding: utf-8 -*-
  2. """
  3. pygments.regexopt
  4. ~~~~~~~~~~~~~~~~~
  5. An algorithm that generates optimized regexes for matching long lists of
  6. literal strings.
  7. :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
  8. :license: BSD, see LICENSE for details.
  9. """
  10. import re
  11. from re import escape
  12. from os.path import commonprefix
  13. from itertools import groupby
  14. from operator import itemgetter
  15. CS_ESCAPE = re.compile(r'[\^\\\-\]]')
  16. FIRST_ELEMENT = itemgetter(0)
  17. def make_charset(letters):
  18. return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
  19. def regex_opt_inner(strings, open_paren):
  20. """Return a regex that matches any string in the sorted list of strings."""
  21. close_paren = open_paren and ')' or ''
  22. # print strings, repr(open_paren)
  23. if not strings:
  24. # print '-> nothing left'
  25. return ''
  26. first = strings[0]
  27. if len(strings) == 1:
  28. # print '-> only 1 string'
  29. return open_paren + escape(first) + close_paren
  30. if not first:
  31. # print '-> first string empty'
  32. return open_paren + regex_opt_inner(strings[1:], '(?:') \
  33. + '?' + close_paren
  34. if len(first) == 1:
  35. # multiple one-char strings? make a charset
  36. oneletter = []
  37. rest = []
  38. for s in strings:
  39. if len(s) == 1:
  40. oneletter.append(s)
  41. else:
  42. rest.append(s)
  43. if len(oneletter) > 1: # do we have more than one oneletter string?
  44. if rest:
  45. # print '-> 1-character + rest'
  46. return open_paren + regex_opt_inner(rest, '') + '|' \
  47. + make_charset(oneletter) + close_paren
  48. # print '-> only 1-character'
  49. return open_paren + make_charset(oneletter) + close_paren
  50. prefix = commonprefix(strings)
  51. if prefix:
  52. plen = len(prefix)
  53. # we have a prefix for all strings
  54. # print '-> prefix:', prefix
  55. return open_paren + escape(prefix) \
  56. + regex_opt_inner([s[plen:] for s in strings], '(?:') \
  57. + close_paren
  58. # is there a suffix?
  59. strings_rev = [s[::-1] for s in strings]
  60. suffix = commonprefix(strings_rev)
  61. if suffix:
  62. slen = len(suffix)
  63. # print '-> suffix:', suffix[::-1]
  64. return open_paren \
  65. + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
  66. + escape(suffix[::-1]) + close_paren
  67. # recurse on common 1-string prefixes
  68. # print '-> last resort'
  69. return open_paren + \
  70. '|'.join(regex_opt_inner(list(group[1]), '')
  71. for group in groupby(strings, lambda s: s[0] == first[0])) \
  72. + close_paren
  73. def regex_opt(strings, prefix='', suffix=''):
  74. """Return a compiled regex that matches any string in the given list.
  75. The strings to match must be literal strings, not regexes. They will be
  76. regex-escaped.
  77. *prefix* and *suffix* are pre- and appended to the final regex.
  78. """
  79. strings = sorted(strings)
  80. return prefix + regex_opt_inner(strings, '(') + suffix