_parser.py 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the __init__.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. from ._constants import *
  13. SPECIAL_CHARS = ".\\[{()*+?^$|"
  14. REPEAT_CHARS = "*+?{"
  15. DIGITS = frozenset("0123456789")
  16. OCTDIGITS = frozenset("01234567")
  17. HEXDIGITS = frozenset("0123456789abcdefABCDEF")
  18. ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  19. WHITESPACE = frozenset(" \t\n\r\v\f")
  20. _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
  21. _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
  22. ESCAPES = {
  23. r"\a": (LITERAL, ord("\a")),
  24. r"\b": (LITERAL, ord("\b")),
  25. r"\f": (LITERAL, ord("\f")),
  26. r"\n": (LITERAL, ord("\n")),
  27. r"\r": (LITERAL, ord("\r")),
  28. r"\t": (LITERAL, ord("\t")),
  29. r"\v": (LITERAL, ord("\v")),
  30. r"\\": (LITERAL, ord("\\"))
  31. }
  32. CATEGORIES = {
  33. r"\A": (AT, AT_BEGINNING_STRING), # start of string
  34. r"\b": (AT, AT_BOUNDARY),
  35. r"\B": (AT, AT_NON_BOUNDARY),
  36. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
  37. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
  38. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
  39. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
  40. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
  41. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
  42. r"\Z": (AT, AT_END_STRING), # end of string
  43. }
  44. FLAGS = {
  45. # standard flags
  46. "i": SRE_FLAG_IGNORECASE,
  47. "L": SRE_FLAG_LOCALE,
  48. "m": SRE_FLAG_MULTILINE,
  49. "s": SRE_FLAG_DOTALL,
  50. "x": SRE_FLAG_VERBOSE,
  51. # extensions
  52. "a": SRE_FLAG_ASCII,
  53. "t": SRE_FLAG_TEMPLATE,
  54. "u": SRE_FLAG_UNICODE,
  55. }
  56. TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
  57. GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
  58. # Maximal value returned by SubPattern.getwidth().
  59. # Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
  60. MAXWIDTH = 1 << 64
  61. class State:
  62. # keeps track of state for parsing
  63. def __init__(self):
  64. self.flags = 0
  65. self.groupdict = {}
  66. self.groupwidths = [None] # group 0
  67. self.lookbehindgroups = None
  68. self.grouprefpos = {}
  69. @property
  70. def groups(self):
  71. return len(self.groupwidths)
  72. def opengroup(self, name=None):
  73. gid = self.groups
  74. self.groupwidths.append(None)
  75. if self.groups > MAXGROUPS:
  76. raise error("too many groups")
  77. if name is not None:
  78. ogid = self.groupdict.get(name, None)
  79. if ogid is not None:
  80. raise error("redefinition of group name %r as group %d; "
  81. "was group %d" % (name, gid, ogid))
  82. self.groupdict[name] = gid
  83. return gid
  84. def closegroup(self, gid, p):
  85. self.groupwidths[gid] = p.getwidth()
  86. def checkgroup(self, gid):
  87. return gid < self.groups and self.groupwidths[gid] is not None
  88. def checklookbehindgroup(self, gid, source):
  89. if self.lookbehindgroups is not None:
  90. if not self.checkgroup(gid):
  91. raise source.error('cannot refer to an open group')
  92. if gid >= self.lookbehindgroups:
  93. raise source.error('cannot refer to group defined in the same '
  94. 'lookbehind subpattern')
  95. class SubPattern:
  96. # a subpattern, in intermediate form
  97. def __init__(self, state, data=None):
  98. self.state = state
  99. if data is None:
  100. data = []
  101. self.data = data
  102. self.width = None
  103. def dump(self, level=0):
  104. seqtypes = (tuple, list)
  105. for op, av in self.data:
  106. print(level*" " + str(op), end='')
  107. if op is IN:
  108. # member sublanguage
  109. print()
  110. for op, a in av:
  111. print((level+1)*" " + str(op), a)
  112. elif op is BRANCH:
  113. print()
  114. for i, a in enumerate(av[1]):
  115. if i:
  116. print(level*" " + "OR")
  117. a.dump(level+1)
  118. elif op is GROUPREF_EXISTS:
  119. condgroup, item_yes, item_no = av
  120. print('', condgroup)
  121. item_yes.dump(level+1)
  122. if item_no:
  123. print(level*" " + "ELSE")
  124. item_no.dump(level+1)
  125. elif isinstance(av, SubPattern):
  126. print()
  127. av.dump(level+1)
  128. elif isinstance(av, seqtypes):
  129. nl = False
  130. for a in av:
  131. if isinstance(a, SubPattern):
  132. if not nl:
  133. print()
  134. a.dump(level+1)
  135. nl = True
  136. else:
  137. if not nl:
  138. print(' ', end='')
  139. print(a, end='')
  140. nl = False
  141. if not nl:
  142. print()
  143. else:
  144. print('', av)
  145. def __repr__(self):
  146. return repr(self.data)
  147. def __len__(self):
  148. return len(self.data)
  149. def __delitem__(self, index):
  150. del self.data[index]
  151. def __getitem__(self, index):
  152. if isinstance(index, slice):
  153. return SubPattern(self.state, self.data[index])
  154. return self.data[index]
  155. def __setitem__(self, index, code):
  156. self.data[index] = code
  157. def insert(self, index, code):
  158. self.data.insert(index, code)
  159. def append(self, code):
  160. self.data.append(code)
  161. def getwidth(self):
  162. # determine the width (min, max) for this subpattern
  163. if self.width is not None:
  164. return self.width
  165. lo = hi = 0
  166. for op, av in self.data:
  167. if op is BRANCH:
  168. i = MAXWIDTH
  169. j = 0
  170. for av in av[1]:
  171. l, h = av.getwidth()
  172. i = min(i, l)
  173. j = max(j, h)
  174. lo = lo + i
  175. hi = hi + j
  176. elif op is ATOMIC_GROUP:
  177. i, j = av.getwidth()
  178. lo = lo + i
  179. hi = hi + j
  180. elif op is SUBPATTERN:
  181. i, j = av[-1].getwidth()
  182. lo = lo + i
  183. hi = hi + j
  184. elif op in _REPEATCODES:
  185. i, j = av[2].getwidth()
  186. lo = lo + i * av[0]
  187. if av[1] == MAXREPEAT and j:
  188. hi = MAXWIDTH
  189. else:
  190. hi = hi + j * av[1]
  191. elif op in _UNITCODES:
  192. lo = lo + 1
  193. hi = hi + 1
  194. elif op is GROUPREF:
  195. i, j = self.state.groupwidths[av]
  196. lo = lo + i
  197. hi = hi + j
  198. elif op is GROUPREF_EXISTS:
  199. i, j = av[1].getwidth()
  200. if av[2] is not None:
  201. l, h = av[2].getwidth()
  202. i = min(i, l)
  203. j = max(j, h)
  204. else:
  205. i = 0
  206. lo = lo + i
  207. hi = hi + j
  208. elif op is SUCCESS:
  209. break
  210. self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
  211. return self.width
  212. class Tokenizer:
  213. def __init__(self, string):
  214. self.istext = isinstance(string, str)
  215. self.string = string
  216. if not self.istext:
  217. string = str(string, 'latin1')
  218. self.decoded_string = string
  219. self.index = 0
  220. self.next = None
  221. self.__next()
  222. def __next(self):
  223. index = self.index
  224. try:
  225. char = self.decoded_string[index]
  226. except IndexError:
  227. self.next = None
  228. return
  229. if char == "\\":
  230. index += 1
  231. try:
  232. char += self.decoded_string[index]
  233. except IndexError:
  234. raise error("bad escape (end of pattern)",
  235. self.string, len(self.string) - 1) from None
  236. self.index = index + 1
  237. self.next = char
  238. def match(self, char):
  239. if char == self.next:
  240. self.__next()
  241. return True
  242. return False
  243. def get(self):
  244. this = self.next
  245. self.__next()
  246. return this
  247. def getwhile(self, n, charset):
  248. result = ''
  249. for _ in range(n):
  250. c = self.next
  251. if c not in charset:
  252. break
  253. result += c
  254. self.__next()
  255. return result
  256. def getuntil(self, terminator, name):
  257. result = ''
  258. while True:
  259. c = self.next
  260. self.__next()
  261. if c is None:
  262. if not result:
  263. raise self.error("missing " + name)
  264. raise self.error("missing %s, unterminated name" % terminator,
  265. len(result))
  266. if c == terminator:
  267. if not result:
  268. raise self.error("missing " + name, 1)
  269. break
  270. result += c
  271. return result
  272. @property
  273. def pos(self):
  274. return self.index - len(self.next or '')
  275. def tell(self):
  276. return self.index - len(self.next or '')
  277. def seek(self, index):
  278. self.index = index
  279. self.__next()
  280. def error(self, msg, offset=0):
  281. if not self.istext:
  282. msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
  283. return error(msg, self.string, self.tell() - offset)
  284. def checkgroupname(self, name, offset):
  285. if not (self.istext or name.isascii()):
  286. msg = "bad character in group name %a" % name
  287. raise self.error(msg, len(name) + offset)
  288. if not name.isidentifier():
  289. msg = "bad character in group name %r" % name
  290. raise self.error(msg, len(name) + offset)
  291. def _class_escape(source, escape):
  292. # handle escape code inside character class
  293. code = ESCAPES.get(escape)
  294. if code:
  295. return code
  296. code = CATEGORIES.get(escape)
  297. if code and code[0] is IN:
  298. return code
  299. try:
  300. c = escape[1:2]
  301. if c == "x":
  302. # hexadecimal escape (exactly two digits)
  303. escape += source.getwhile(2, HEXDIGITS)
  304. if len(escape) != 4:
  305. raise source.error("incomplete escape %s" % escape, len(escape))
  306. return LITERAL, int(escape[2:], 16)
  307. elif c == "u" and source.istext:
  308. # unicode escape (exactly four digits)
  309. escape += source.getwhile(4, HEXDIGITS)
  310. if len(escape) != 6:
  311. raise source.error("incomplete escape %s" % escape, len(escape))
  312. return LITERAL, int(escape[2:], 16)
  313. elif c == "U" and source.istext:
  314. # unicode escape (exactly eight digits)
  315. escape += source.getwhile(8, HEXDIGITS)
  316. if len(escape) != 10:
  317. raise source.error("incomplete escape %s" % escape, len(escape))
  318. c = int(escape[2:], 16)
  319. chr(c) # raise ValueError for invalid code
  320. return LITERAL, c
  321. elif c == "N" and source.istext:
  322. import unicodedata
  323. # named unicode escape e.g. \N{EM DASH}
  324. if not source.match('{'):
  325. raise source.error("missing {")
  326. charname = source.getuntil('}', 'character name')
  327. try:
  328. c = ord(unicodedata.lookup(charname))
  329. except (KeyError, TypeError):
  330. raise source.error("undefined character name %r" % charname,
  331. len(charname) + len(r'\N{}')) from None
  332. return LITERAL, c
  333. elif c in OCTDIGITS:
  334. # octal escape (up to three digits)
  335. escape += source.getwhile(2, OCTDIGITS)
  336. c = int(escape[1:], 8)
  337. if c > 0o377:
  338. raise source.error('octal escape value %s outside of '
  339. 'range 0-0o377' % escape, len(escape))
  340. return LITERAL, c
  341. elif c in DIGITS:
  342. raise ValueError
  343. if len(escape) == 2:
  344. if c in ASCIILETTERS:
  345. raise source.error('bad escape %s' % escape, len(escape))
  346. return LITERAL, ord(escape[1])
  347. except ValueError:
  348. pass
  349. raise source.error("bad escape %s" % escape, len(escape))
  350. def _escape(source, escape, state):
  351. # handle escape code in expression
  352. code = CATEGORIES.get(escape)
  353. if code:
  354. return code
  355. code = ESCAPES.get(escape)
  356. if code:
  357. return code
  358. try:
  359. c = escape[1:2]
  360. if c == "x":
  361. # hexadecimal escape
  362. escape += source.getwhile(2, HEXDIGITS)
  363. if len(escape) != 4:
  364. raise source.error("incomplete escape %s" % escape, len(escape))
  365. return LITERAL, int(escape[2:], 16)
  366. elif c == "u" and source.istext:
  367. # unicode escape (exactly four digits)
  368. escape += source.getwhile(4, HEXDIGITS)
  369. if len(escape) != 6:
  370. raise source.error("incomplete escape %s" % escape, len(escape))
  371. return LITERAL, int(escape[2:], 16)
  372. elif c == "U" and source.istext:
  373. # unicode escape (exactly eight digits)
  374. escape += source.getwhile(8, HEXDIGITS)
  375. if len(escape) != 10:
  376. raise source.error("incomplete escape %s" % escape, len(escape))
  377. c = int(escape[2:], 16)
  378. chr(c) # raise ValueError for invalid code
  379. return LITERAL, c
  380. elif c == "N" and source.istext:
  381. import unicodedata
  382. # named unicode escape e.g. \N{EM DASH}
  383. if not source.match('{'):
  384. raise source.error("missing {")
  385. charname = source.getuntil('}', 'character name')
  386. try:
  387. c = ord(unicodedata.lookup(charname))
  388. except (KeyError, TypeError):
  389. raise source.error("undefined character name %r" % charname,
  390. len(charname) + len(r'\N{}')) from None
  391. return LITERAL, c
  392. elif c == "0":
  393. # octal escape
  394. escape += source.getwhile(2, OCTDIGITS)
  395. return LITERAL, int(escape[1:], 8)
  396. elif c in DIGITS:
  397. # octal escape *or* decimal group reference (sigh)
  398. if source.next in DIGITS:
  399. escape += source.get()
  400. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  401. source.next in OCTDIGITS):
  402. # got three octal digits; this is an octal escape
  403. escape += source.get()
  404. c = int(escape[1:], 8)
  405. if c > 0o377:
  406. raise source.error('octal escape value %s outside of '
  407. 'range 0-0o377' % escape,
  408. len(escape))
  409. return LITERAL, c
  410. # not an octal escape, so this is a group reference
  411. group = int(escape[1:])
  412. if group < state.groups:
  413. if not state.checkgroup(group):
  414. raise source.error("cannot refer to an open group",
  415. len(escape))
  416. state.checklookbehindgroup(group, source)
  417. return GROUPREF, group
  418. raise source.error("invalid group reference %d" % group, len(escape) - 1)
  419. if len(escape) == 2:
  420. if c in ASCIILETTERS:
  421. raise source.error("bad escape %s" % escape, len(escape))
  422. return LITERAL, ord(escape[1])
  423. except ValueError:
  424. pass
  425. raise source.error("bad escape %s" % escape, len(escape))
  426. def _uniq(items):
  427. return list(dict.fromkeys(items))
  428. def _parse_sub(source, state, verbose, nested):
  429. # parse an alternation: a|b|c
  430. items = []
  431. itemsappend = items.append
  432. sourcematch = source.match
  433. start = source.tell()
  434. while True:
  435. itemsappend(_parse(source, state, verbose, nested + 1,
  436. not nested and not items))
  437. if not sourcematch("|"):
  438. break
  439. if not nested:
  440. verbose = state.flags & SRE_FLAG_VERBOSE
  441. if len(items) == 1:
  442. return items[0]
  443. subpattern = SubPattern(state)
  444. # check if all items share a common prefix
  445. while True:
  446. prefix = None
  447. for item in items:
  448. if not item:
  449. break
  450. if prefix is None:
  451. prefix = item[0]
  452. elif item[0] != prefix:
  453. break
  454. else:
  455. # all subitems start with a common "prefix".
  456. # move it out of the branch
  457. for item in items:
  458. del item[0]
  459. subpattern.append(prefix)
  460. continue # check next one
  461. break
  462. # check if the branch can be replaced by a character set
  463. set = []
  464. for item in items:
  465. if len(item) != 1:
  466. break
  467. op, av = item[0]
  468. if op is LITERAL:
  469. set.append((op, av))
  470. elif op is IN and av[0][0] is not NEGATE:
  471. set.extend(av)
  472. else:
  473. break
  474. else:
  475. # we can store this as a character set instead of a
  476. # branch (the compiler may optimize this even more)
  477. subpattern.append((IN, _uniq(set)))
  478. return subpattern
  479. subpattern.append((BRANCH, (None, items)))
  480. return subpattern
  481. def _parse(source, state, verbose, nested, first=False):
  482. # parse a simple pattern
  483. subpattern = SubPattern(state)
  484. # precompute constants into local variables
  485. subpatternappend = subpattern.append
  486. sourceget = source.get
  487. sourcematch = source.match
  488. _len = len
  489. _ord = ord
  490. while True:
  491. this = source.next
  492. if this is None:
  493. break # end of pattern
  494. if this in "|)":
  495. break # end of subpattern
  496. sourceget()
  497. if verbose:
  498. # skip whitespace and comments
  499. if this in WHITESPACE:
  500. continue
  501. if this == "#":
  502. while True:
  503. this = sourceget()
  504. if this is None or this == "\n":
  505. break
  506. continue
  507. if this[0] == "\\":
  508. code = _escape(source, this, state)
  509. subpatternappend(code)
  510. elif this not in SPECIAL_CHARS:
  511. subpatternappend((LITERAL, _ord(this)))
  512. elif this == "[":
  513. here = source.tell() - 1
  514. # character set
  515. set = []
  516. setappend = set.append
  517. ## if sourcematch(":"):
  518. ## pass # handle character classes
  519. if source.next == '[':
  520. import warnings
  521. warnings.warn(
  522. 'Possible nested set at position %d' % source.tell(),
  523. FutureWarning, stacklevel=nested + 6
  524. )
  525. negate = sourcematch("^")
  526. # check remaining characters
  527. while True:
  528. this = sourceget()
  529. if this is None:
  530. raise source.error("unterminated character set",
  531. source.tell() - here)
  532. if this == "]" and set:
  533. break
  534. elif this[0] == "\\":
  535. code1 = _class_escape(source, this)
  536. else:
  537. if set and this in '-&~|' and source.next == this:
  538. import warnings
  539. warnings.warn(
  540. 'Possible set %s at position %d' % (
  541. 'difference' if this == '-' else
  542. 'intersection' if this == '&' else
  543. 'symmetric difference' if this == '~' else
  544. 'union',
  545. source.tell() - 1),
  546. FutureWarning, stacklevel=nested + 6
  547. )
  548. code1 = LITERAL, _ord(this)
  549. if sourcematch("-"):
  550. # potential range
  551. that = sourceget()
  552. if that is None:
  553. raise source.error("unterminated character set",
  554. source.tell() - here)
  555. if that == "]":
  556. if code1[0] is IN:
  557. code1 = code1[1][0]
  558. setappend(code1)
  559. setappend((LITERAL, _ord("-")))
  560. break
  561. if that[0] == "\\":
  562. code2 = _class_escape(source, that)
  563. else:
  564. if that == '-':
  565. import warnings
  566. warnings.warn(
  567. 'Possible set difference at position %d' % (
  568. source.tell() - 2),
  569. FutureWarning, stacklevel=nested + 6
  570. )
  571. code2 = LITERAL, _ord(that)
  572. if code1[0] != LITERAL or code2[0] != LITERAL:
  573. msg = "bad character range %s-%s" % (this, that)
  574. raise source.error(msg, len(this) + 1 + len(that))
  575. lo = code1[1]
  576. hi = code2[1]
  577. if hi < lo:
  578. msg = "bad character range %s-%s" % (this, that)
  579. raise source.error(msg, len(this) + 1 + len(that))
  580. setappend((RANGE, (lo, hi)))
  581. else:
  582. if code1[0] is IN:
  583. code1 = code1[1][0]
  584. setappend(code1)
  585. set = _uniq(set)
  586. # XXX: <fl> should move set optimization to compiler!
  587. if _len(set) == 1 and set[0][0] is LITERAL:
  588. # optimization
  589. if negate:
  590. subpatternappend((NOT_LITERAL, set[0][1]))
  591. else:
  592. subpatternappend(set[0])
  593. else:
  594. if negate:
  595. set.insert(0, (NEGATE, None))
  596. # charmap optimization can't be added here because
  597. # global flags still are not known
  598. subpatternappend((IN, set))
  599. elif this in REPEAT_CHARS:
  600. # repeat previous item
  601. here = source.tell()
  602. if this == "?":
  603. min, max = 0, 1
  604. elif this == "*":
  605. min, max = 0, MAXREPEAT
  606. elif this == "+":
  607. min, max = 1, MAXREPEAT
  608. elif this == "{":
  609. if source.next == "}":
  610. subpatternappend((LITERAL, _ord(this)))
  611. continue
  612. min, max = 0, MAXREPEAT
  613. lo = hi = ""
  614. while source.next in DIGITS:
  615. lo += sourceget()
  616. if sourcematch(","):
  617. while source.next in DIGITS:
  618. hi += sourceget()
  619. else:
  620. hi = lo
  621. if not sourcematch("}"):
  622. subpatternappend((LITERAL, _ord(this)))
  623. source.seek(here)
  624. continue
  625. if lo:
  626. min = int(lo)
  627. if min >= MAXREPEAT:
  628. raise OverflowError("the repetition number is too large")
  629. if hi:
  630. max = int(hi)
  631. if max >= MAXREPEAT:
  632. raise OverflowError("the repetition number is too large")
  633. if max < min:
  634. raise source.error("min repeat greater than max repeat",
  635. source.tell() - here)
  636. else:
  637. raise AssertionError("unsupported quantifier %r" % (char,))
  638. # figure out which item to repeat
  639. if subpattern:
  640. item = subpattern[-1:]
  641. else:
  642. item = None
  643. if not item or item[0][0] is AT:
  644. raise source.error("nothing to repeat",
  645. source.tell() - here + len(this))
  646. if item[0][0] in _REPEATCODES:
  647. raise source.error("multiple repeat",
  648. source.tell() - here + len(this))
  649. if item[0][0] is SUBPATTERN:
  650. group, add_flags, del_flags, p = item[0][1]
  651. if group is None and not add_flags and not del_flags:
  652. item = p
  653. if sourcematch("?"):
  654. # Non-Greedy Match
  655. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  656. elif sourcematch("+"):
  657. # Possessive Match (Always Greedy)
  658. subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
  659. else:
  660. # Greedy Match
  661. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  662. elif this == ".":
  663. subpatternappend((ANY, None))
  664. elif this == "(":
  665. start = source.tell() - 1
  666. capture = True
  667. atomic = False
  668. name = None
  669. add_flags = 0
  670. del_flags = 0
  671. if sourcematch("?"):
  672. # options
  673. char = sourceget()
  674. if char is None:
  675. raise source.error("unexpected end of pattern")
  676. if char == "P":
  677. # python extensions
  678. if sourcematch("<"):
  679. # named group: skip forward to end of name
  680. name = source.getuntil(">", "group name")
  681. source.checkgroupname(name, 1)
  682. elif sourcematch("="):
  683. # named backreference
  684. name = source.getuntil(")", "group name")
  685. source.checkgroupname(name, 1)
  686. gid = state.groupdict.get(name)
  687. if gid is None:
  688. msg = "unknown group name %r" % name
  689. raise source.error(msg, len(name) + 1)
  690. if not state.checkgroup(gid):
  691. raise source.error("cannot refer to an open group",
  692. len(name) + 1)
  693. state.checklookbehindgroup(gid, source)
  694. subpatternappend((GROUPREF, gid))
  695. continue
  696. else:
  697. char = sourceget()
  698. if char is None:
  699. raise source.error("unexpected end of pattern")
  700. raise source.error("unknown extension ?P" + char,
  701. len(char) + 2)
  702. elif char == ":":
  703. # non-capturing group
  704. capture = False
  705. elif char == "#":
  706. # comment
  707. while True:
  708. if source.next is None:
  709. raise source.error("missing ), unterminated comment",
  710. source.tell() - start)
  711. if sourceget() == ")":
  712. break
  713. continue
  714. elif char in "=!<":
  715. # lookahead assertions
  716. dir = 1
  717. if char == "<":
  718. char = sourceget()
  719. if char is None:
  720. raise source.error("unexpected end of pattern")
  721. if char not in "=!":
  722. raise source.error("unknown extension ?<" + char,
  723. len(char) + 2)
  724. dir = -1 # lookbehind
  725. lookbehindgroups = state.lookbehindgroups
  726. if lookbehindgroups is None:
  727. state.lookbehindgroups = state.groups
  728. p = _parse_sub(source, state, verbose, nested + 1)
  729. if dir < 0:
  730. if lookbehindgroups is None:
  731. state.lookbehindgroups = None
  732. if not sourcematch(")"):
  733. raise source.error("missing ), unterminated subpattern",
  734. source.tell() - start)
  735. if char == "=":
  736. subpatternappend((ASSERT, (dir, p)))
  737. else:
  738. subpatternappend((ASSERT_NOT, (dir, p)))
  739. continue
  740. elif char == "(":
  741. # conditional backreference group
  742. condname = source.getuntil(")", "group name")
  743. if not (condname.isdecimal() and condname.isascii()):
  744. source.checkgroupname(condname, 1)
  745. condgroup = state.groupdict.get(condname)
  746. if condgroup is None:
  747. msg = "unknown group name %r" % condname
  748. raise source.error(msg, len(condname) + 1)
  749. else:
  750. condgroup = int(condname)
  751. if not condgroup:
  752. raise source.error("bad group number",
  753. len(condname) + 1)
  754. if condgroup >= MAXGROUPS:
  755. msg = "invalid group reference %d" % condgroup
  756. raise source.error(msg, len(condname) + 1)
  757. if condgroup not in state.grouprefpos:
  758. state.grouprefpos[condgroup] = (
  759. source.tell() - len(condname) - 1
  760. )
  761. if not (condname.isdecimal() and condname.isascii()):
  762. import warnings
  763. warnings.warn(
  764. "bad character in group name %s at position %d" %
  765. (repr(condname) if source.istext else ascii(condname),
  766. source.tell() - len(condname) - 1),
  767. DeprecationWarning, stacklevel=nested + 6
  768. )
  769. state.checklookbehindgroup(condgroup, source)
  770. item_yes = _parse(source, state, verbose, nested + 1)
  771. if source.match("|"):
  772. item_no = _parse(source, state, verbose, nested + 1)
  773. if source.next == "|":
  774. raise source.error("conditional backref with more than two branches")
  775. else:
  776. item_no = None
  777. if not source.match(")"):
  778. raise source.error("missing ), unterminated subpattern",
  779. source.tell() - start)
  780. subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  781. continue
  782. elif char == ">":
  783. # non-capturing, atomic group
  784. capture = False
  785. atomic = True
  786. elif char in FLAGS or char == "-":
  787. # flags
  788. flags = _parse_flags(source, state, char)
  789. if flags is None: # global flags
  790. if not first or subpattern:
  791. raise source.error('global flags not at the start '
  792. 'of the expression',
  793. source.tell() - start)
  794. verbose = state.flags & SRE_FLAG_VERBOSE
  795. continue
  796. add_flags, del_flags = flags
  797. capture = False
  798. else:
  799. raise source.error("unknown extension ?" + char,
  800. len(char) + 1)
  801. # parse group contents
  802. if capture:
  803. try:
  804. group = state.opengroup(name)
  805. except error as err:
  806. raise source.error(err.msg, len(name) + 1) from None
  807. else:
  808. group = None
  809. sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
  810. not (del_flags & SRE_FLAG_VERBOSE))
  811. p = _parse_sub(source, state, sub_verbose, nested + 1)
  812. if not source.match(")"):
  813. raise source.error("missing ), unterminated subpattern",
  814. source.tell() - start)
  815. if group is not None:
  816. state.closegroup(group, p)
  817. if atomic:
  818. assert group is None
  819. subpatternappend((ATOMIC_GROUP, p))
  820. else:
  821. subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
  822. elif this == "^":
  823. subpatternappend((AT, AT_BEGINNING))
  824. elif this == "$":
  825. subpatternappend((AT, AT_END))
  826. else:
  827. raise AssertionError("unsupported special character %r" % (char,))
  828. # unpack non-capturing groups
  829. for i in range(len(subpattern))[::-1]:
  830. op, av = subpattern[i]
  831. if op is SUBPATTERN:
  832. group, add_flags, del_flags, p = av
  833. if group is None and not add_flags and not del_flags:
  834. subpattern[i: i+1] = p
  835. return subpattern
  836. def _parse_flags(source, state, char):
  837. sourceget = source.get
  838. add_flags = 0
  839. del_flags = 0
  840. if char != "-":
  841. while True:
  842. flag = FLAGS[char]
  843. if source.istext:
  844. if char == 'L':
  845. msg = "bad inline flags: cannot use 'L' flag with a str pattern"
  846. raise source.error(msg)
  847. else:
  848. if char == 'u':
  849. msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
  850. raise source.error(msg)
  851. add_flags |= flag
  852. if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
  853. msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
  854. raise source.error(msg)
  855. char = sourceget()
  856. if char is None:
  857. raise source.error("missing -, : or )")
  858. if char in ")-:":
  859. break
  860. if char not in FLAGS:
  861. msg = "unknown flag" if char.isalpha() else "missing -, : or )"
  862. raise source.error(msg, len(char))
  863. if char == ")":
  864. state.flags |= add_flags
  865. return None
  866. if add_flags & GLOBAL_FLAGS:
  867. raise source.error("bad inline flags: cannot turn on global flag", 1)
  868. if char == "-":
  869. char = sourceget()
  870. if char is None:
  871. raise source.error("missing flag")
  872. if char not in FLAGS:
  873. msg = "unknown flag" if char.isalpha() else "missing flag"
  874. raise source.error(msg, len(char))
  875. while True:
  876. flag = FLAGS[char]
  877. if flag & TYPE_FLAGS:
  878. msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
  879. raise source.error(msg)
  880. del_flags |= flag
  881. char = sourceget()
  882. if char is None:
  883. raise source.error("missing :")
  884. if char == ":":
  885. break
  886. if char not in FLAGS:
  887. msg = "unknown flag" if char.isalpha() else "missing :"
  888. raise source.error(msg, len(char))
  889. assert char == ":"
  890. if del_flags & GLOBAL_FLAGS:
  891. raise source.error("bad inline flags: cannot turn off global flag", 1)
  892. if add_flags & del_flags:
  893. raise source.error("bad inline flags: flag turned on and off", 1)
  894. return add_flags, del_flags
  895. def fix_flags(src, flags):
  896. # Check and fix flags according to the type of pattern (str or bytes)
  897. if isinstance(src, str):
  898. if flags & SRE_FLAG_LOCALE:
  899. raise ValueError("cannot use LOCALE flag with a str pattern")
  900. if not flags & SRE_FLAG_ASCII:
  901. flags |= SRE_FLAG_UNICODE
  902. elif flags & SRE_FLAG_UNICODE:
  903. raise ValueError("ASCII and UNICODE flags are incompatible")
  904. else:
  905. if flags & SRE_FLAG_UNICODE:
  906. raise ValueError("cannot use UNICODE flag with a bytes pattern")
  907. if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
  908. raise ValueError("ASCII and LOCALE flags are incompatible")
  909. return flags
  910. def parse(str, flags=0, state=None):
  911. # parse 're' pattern into list of (opcode, argument) tuples
  912. source = Tokenizer(str)
  913. if state is None:
  914. state = State()
  915. state.flags = flags
  916. state.str = str
  917. p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  918. p.state.flags = fix_flags(str, p.state.flags)
  919. if source.next is not None:
  920. assert source.next == ")"
  921. raise source.error("unbalanced parenthesis")
  922. for g in p.state.grouprefpos:
  923. if g >= p.state.groups:
  924. msg = "invalid group reference %d" % g
  925. raise error(msg, str, p.state.grouprefpos[g])
  926. if flags & SRE_FLAG_DEBUG:
  927. p.dump()
  928. return p
  929. def parse_template(source, pattern):
  930. # parse 're' replacement string into list of literals and
  931. # group references
  932. s = Tokenizer(source)
  933. sget = s.get
  934. result = []
  935. literal = []
  936. lappend = literal.append
  937. def addliteral():
  938. if s.istext:
  939. result.append(''.join(literal))
  940. else:
  941. # The tokenizer implicitly decodes bytes objects as latin-1, we must
  942. # therefore re-encode the final representation.
  943. result.append(''.join(literal).encode('latin-1'))
  944. del literal[:]
  945. def addgroup(index, pos):
  946. if index > pattern.groups:
  947. raise s.error("invalid group reference %d" % index, pos)
  948. addliteral()
  949. result.append(index)
  950. groupindex = pattern.groupindex
  951. while True:
  952. this = sget()
  953. if this is None:
  954. break # end of replacement string
  955. if this[0] == "\\":
  956. # group
  957. c = this[1]
  958. if c == "g":
  959. if not s.match("<"):
  960. raise s.error("missing <")
  961. name = s.getuntil(">", "group name")
  962. if not (name.isdecimal() and name.isascii()):
  963. s.checkgroupname(name, 1)
  964. try:
  965. index = groupindex[name]
  966. except KeyError:
  967. raise IndexError("unknown group name %r" % name) from None
  968. else:
  969. index = int(name)
  970. if index >= MAXGROUPS:
  971. raise s.error("invalid group reference %d" % index,
  972. len(name) + 1)
  973. if not (name.isdecimal() and name.isascii()):
  974. import warnings
  975. warnings.warn(
  976. "bad character in group name %s at position %d" %
  977. (repr(name) if s.istext else ascii(name),
  978. s.tell() - len(name) - 1),
  979. DeprecationWarning, stacklevel=5
  980. )
  981. addgroup(index, len(name) + 1)
  982. elif c == "0":
  983. if s.next in OCTDIGITS:
  984. this += sget()
  985. if s.next in OCTDIGITS:
  986. this += sget()
  987. lappend(chr(int(this[1:], 8) & 0xff))
  988. elif c in DIGITS:
  989. isoctal = False
  990. if s.next in DIGITS:
  991. this += sget()
  992. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  993. s.next in OCTDIGITS):
  994. this += sget()
  995. isoctal = True
  996. c = int(this[1:], 8)
  997. if c > 0o377:
  998. raise s.error('octal escape value %s outside of '
  999. 'range 0-0o377' % this, len(this))
  1000. lappend(chr(c))
  1001. if not isoctal:
  1002. addgroup(int(this[1:]), len(this) - 1)
  1003. else:
  1004. try:
  1005. this = chr(ESCAPES[this][1])
  1006. except KeyError:
  1007. if c in ASCIILETTERS:
  1008. raise s.error('bad escape %s' % this, len(this)) from None
  1009. lappend(this)
  1010. else:
  1011. lappend(this)
  1012. addliteral()
  1013. return result