regengine.inc 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034
  1. /*-
  2. * This code is derived from OpenBSD's libc/regex, original license follows:
  3. *
  4. * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  5. * Copyright (c) 1992, 1993, 1994
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This code is derived from software contributed to Berkeley by
  9. * Henry Spencer.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. *
  35. * @(#)engine.c 8.5 (Berkeley) 3/20/94
  36. */
  37. /*
  38. * The matching engine and friends. This file is #included by regexec.c
  39. * after suitable #defines of a variety of macros used herein, so that
  40. * different state representations can be used without duplicating masses
  41. * of code.
  42. */
  43. #ifdef SNAMES
  44. #define matcher smatcher
  45. #define fast sfast
  46. #define slow sslow
  47. #define dissect sdissect
  48. #define backref sbackref
  49. #define step sstep
  50. #define print sprint
  51. #define at sat
  52. #define match smat
  53. #define nope snope
  54. #endif
  55. #ifdef LNAMES
  56. #define matcher lmatcher
  57. #define fast lfast
  58. #define slow lslow
  59. #define dissect ldissect
  60. #define backref lbackref
  61. #define step lstep
  62. #define print lprint
  63. #define at lat
  64. #define match lmat
  65. #define nope lnope
  66. #endif
  67. /* another structure passed up and down to avoid zillions of parameters */
  68. struct match {
  69. struct re_guts *g;
  70. int eflags;
  71. llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
  72. const char *offp; /* offsets work from here */
  73. const char *beginp; /* start of string -- virtual NUL precedes */
  74. const char *endp; /* end of string -- virtual NUL here */
  75. const char *coldp; /* can be no match starting before here */
  76. const char **lastpos; /* [nplus+1] */
  77. STATEVARS;
  78. states st; /* current states */
  79. states fresh; /* states for a fresh start */
  80. states tmp; /* temporary */
  81. states empty; /* empty set of states */
  82. };
  83. static int matcher(struct re_guts *, const char *, size_t,
  84. llvm_regmatch_t[], int);
  85. static const char *dissect(struct match *, const char *, const char *, sopno,
  86. sopno);
  87. static const char *backref(struct match *, const char *, const char *, sopno,
  88. sopno, sopno, int);
  89. static const char *fast(struct match *, const char *, const char *, sopno, sopno);
  90. static const char *slow(struct match *, const char *, const char *, sopno, sopno);
  91. static states step(struct re_guts *, sopno, sopno, states, int, states);
  92. #define MAX_RECURSION 100
  93. #define BOL (OUT+1)
  94. #define EOL (BOL+1)
  95. #define BOLEOL (BOL+2)
  96. #define NOTHING (BOL+3)
  97. #define BOW (BOL+4)
  98. #define EOW (BOL+5)
  99. #define CODEMAX (BOL+5) /* highest code used */
  100. #define NONCHAR(c) ((c) > CHAR_MAX)
  101. #define NNONCHAR (CODEMAX-CHAR_MAX)
  102. #ifdef REDEBUG
  103. static void print(struct match *, char *, states, int, FILE *);
  104. #endif
  105. #ifdef REDEBUG
  106. static void at(struct match *, char *, char *, char *, sopno, sopno);
  107. #endif
  108. #ifdef REDEBUG
  109. static char *pchar(int);
  110. #endif
  111. #ifdef REDEBUG
  112. #define SP(t, s, c) print(m, t, s, c, stdout)
  113. #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
  114. #define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
  115. static int nope = 0;
  116. #else
  117. #define SP(t, s, c) /* nothing */
  118. #define AT(t, p1, p2, s1, s2) /* nothing */
  119. #define NOTE(s) /* nothing */
  120. #endif
  121. /*
  122. - matcher - the actual matching engine
  123. */
  124. static int /* 0 success, REG_NOMATCH failure */
  125. matcher(struct re_guts *g, const char *string, size_t nmatch,
  126. llvm_regmatch_t pmatch[],
  127. int eflags)
  128. {
  129. const char *endp;
  130. size_t i;
  131. struct match mv;
  132. struct match *m = &mv;
  133. const char *dp;
  134. const sopno gf = g->firststate+1; /* +1 for OEND */
  135. const sopno gl = g->laststate;
  136. const char *start;
  137. const char *stop;
  138. /* simplify the situation where possible */
  139. if (g->cflags&REG_NOSUB)
  140. nmatch = 0;
  141. if (eflags&REG_STARTEND) {
  142. start = string + pmatch[0].rm_so;
  143. stop = string + pmatch[0].rm_eo;
  144. } else {
  145. start = string;
  146. stop = start + strlen(start);
  147. }
  148. if (stop < start)
  149. return(REG_INVARG);
  150. /* prescreening; this does wonders for this rather slow code */
  151. if (g->must != NULL) {
  152. for (dp = start; dp < stop; dp++)
  153. if (*dp == g->must[0] && stop - dp >= g->mlen &&
  154. memcmp(dp, g->must, (size_t)g->mlen) == 0)
  155. break;
  156. if (dp == stop) /* we didn't find g->must */
  157. return(REG_NOMATCH);
  158. }
  159. /* match struct setup */
  160. m->g = g;
  161. m->eflags = eflags;
  162. m->pmatch = NULL;
  163. m->lastpos = NULL;
  164. m->offp = string;
  165. m->beginp = start;
  166. m->endp = stop;
  167. STATESETUP(m, 4);
  168. SETUP(m->st);
  169. SETUP(m->fresh);
  170. SETUP(m->tmp);
  171. SETUP(m->empty);
  172. CLEAR(m->empty);
  173. /* this loop does only one repetition except for backrefs */
  174. for (;;) {
  175. endp = fast(m, start, stop, gf, gl);
  176. if (endp == NULL) { /* a miss */
  177. free(m->pmatch);
  178. free((void*)m->lastpos);
  179. STATETEARDOWN(m);
  180. return(REG_NOMATCH);
  181. }
  182. if (nmatch == 0 && !g->backrefs)
  183. break; /* no further info needed */
  184. /* where? */
  185. assert(m->coldp != NULL);
  186. for (;;) {
  187. NOTE("finding start");
  188. endp = slow(m, m->coldp, stop, gf, gl);
  189. if (endp != NULL)
  190. break;
  191. assert(m->coldp < m->endp);
  192. m->coldp++;
  193. }
  194. if (nmatch == 1 && !g->backrefs)
  195. break; /* no further info needed */
  196. /* oh my, they want the subexpressions... */
  197. if (m->pmatch == NULL)
  198. m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
  199. sizeof(llvm_regmatch_t));
  200. if (m->pmatch == NULL) {
  201. STATETEARDOWN(m);
  202. return(REG_ESPACE);
  203. }
  204. for (i = 1; i <= m->g->nsub; i++)
  205. m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
  206. if (!g->backrefs && !(m->eflags&REG_BACKR)) {
  207. NOTE("dissecting");
  208. dp = dissect(m, m->coldp, endp, gf, gl);
  209. } else {
  210. if (g->nplus > 0 && m->lastpos == NULL)
  211. m->lastpos = (const char **)malloc((g->nplus+1) *
  212. sizeof(char *));
  213. if (g->nplus > 0 && m->lastpos == NULL) {
  214. free(m->pmatch);
  215. STATETEARDOWN(m);
  216. return(REG_ESPACE);
  217. }
  218. NOTE("backref dissect");
  219. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
  220. }
  221. if (dp != NULL)
  222. break;
  223. /* uh-oh... we couldn't find a subexpression-level match */
  224. assert(g->backrefs); /* must be back references doing it */
  225. assert(g->nplus == 0 || m->lastpos != NULL);
  226. for (;;) {
  227. if (dp != NULL || endp <= m->coldp)
  228. break; /* defeat */
  229. NOTE("backoff");
  230. endp = slow(m, m->coldp, endp-1, gf, gl);
  231. if (endp == NULL)
  232. break; /* defeat */
  233. /* try it on a shorter possibility */
  234. #ifndef NDEBUG
  235. for (i = 1; i <= m->g->nsub; i++) {
  236. assert(m->pmatch[i].rm_so == -1);
  237. assert(m->pmatch[i].rm_eo == -1);
  238. }
  239. #endif
  240. NOTE("backoff dissect");
  241. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
  242. }
  243. assert(dp == NULL || dp == endp);
  244. if (dp != NULL) /* found a shorter one */
  245. break;
  246. /* despite initial appearances, there is no match here */
  247. NOTE("false alarm");
  248. if (m->coldp == stop)
  249. break;
  250. start = m->coldp + 1; /* recycle starting later */
  251. }
  252. /* fill in the details if requested */
  253. if (nmatch > 0) {
  254. pmatch[0].rm_so = m->coldp - m->offp;
  255. pmatch[0].rm_eo = endp - m->offp;
  256. }
  257. if (nmatch > 1) {
  258. assert(m->pmatch != NULL);
  259. for (i = 1; i < nmatch; i++)
  260. if (i <= m->g->nsub)
  261. pmatch[i] = m->pmatch[i];
  262. else {
  263. pmatch[i].rm_so = -1;
  264. pmatch[i].rm_eo = -1;
  265. }
  266. }
  267. if (m->pmatch != NULL)
  268. free((char *)m->pmatch);
  269. if (m->lastpos != NULL)
  270. free((char *)m->lastpos);
  271. STATETEARDOWN(m);
  272. return(0);
  273. }
  274. /*
  275. - dissect - figure out what matched what, no back references
  276. */
  277. static const char * /* == stop (success) always */
  278. dissect(struct match *m, const char *start, const char *stop, sopno startst,
  279. sopno stopst)
  280. {
  281. int i;
  282. sopno ss; /* start sop of current subRE */
  283. sopno es; /* end sop of current subRE */
  284. const char *sp; /* start of string matched by it */
  285. const char *stp; /* string matched by it cannot pass here */
  286. const char *rest; /* start of rest of string */
  287. const char *tail; /* string unmatched by rest of RE */
  288. sopno ssub; /* start sop of subsubRE */
  289. sopno esub; /* end sop of subsubRE */
  290. const char *ssp; /* start of string matched by subsubRE */
  291. const char *sep; /* end of string matched by subsubRE */
  292. const char *oldssp; /* previous ssp */
  293. AT("diss", start, stop, startst, stopst);
  294. sp = start;
  295. for (ss = startst; ss < stopst; ss = es) {
  296. /* identify end of subRE */
  297. es = ss;
  298. switch (OP(m->g->strip[es])) {
  299. case OPLUS_:
  300. case OQUEST_:
  301. es += OPND(m->g->strip[es]);
  302. break;
  303. case OCH_:
  304. while (OP(m->g->strip[es]) != O_CH)
  305. es += OPND(m->g->strip[es]);
  306. break;
  307. }
  308. es++;
  309. /* figure out what it matched */
  310. switch (OP(m->g->strip[ss])) {
  311. case OEND:
  312. assert(nope);
  313. break;
  314. case OCHAR:
  315. sp++;
  316. break;
  317. case OBOL:
  318. case OEOL:
  319. case OBOW:
  320. case OEOW:
  321. break;
  322. case OANY:
  323. case OANYOF:
  324. sp++;
  325. break;
  326. case OBACK_:
  327. case O_BACK:
  328. assert(nope);
  329. break;
  330. /* cases where length of match is hard to find */
  331. case OQUEST_:
  332. stp = stop;
  333. for (;;) {
  334. /* how long could this one be? */
  335. rest = slow(m, sp, stp, ss, es);
  336. assert(rest != NULL); /* it did match */
  337. /* could the rest match the rest? */
  338. tail = slow(m, rest, stop, es, stopst);
  339. if (tail == stop)
  340. break; /* yes! */
  341. /* no -- try a shorter match for this one */
  342. stp = rest - 1;
  343. assert(stp >= sp); /* it did work */
  344. }
  345. ssub = ss + 1;
  346. esub = es - 1;
  347. /* did innards match? */
  348. if (slow(m, sp, rest, ssub, esub) != NULL) {
  349. const char *dp = dissect(m, sp, rest, ssub, esub);
  350. (void)dp; /* avoid warning if assertions off */
  351. assert(dp == rest);
  352. } else /* no */
  353. assert(sp == rest);
  354. sp = rest;
  355. break;
  356. case OPLUS_:
  357. stp = stop;
  358. for (;;) {
  359. /* how long could this one be? */
  360. rest = slow(m, sp, stp, ss, es);
  361. assert(rest != NULL); /* it did match */
  362. /* could the rest match the rest? */
  363. tail = slow(m, rest, stop, es, stopst);
  364. if (tail == stop)
  365. break; /* yes! */
  366. /* no -- try a shorter match for this one */
  367. stp = rest - 1;
  368. assert(stp >= sp); /* it did work */
  369. }
  370. ssub = ss + 1;
  371. esub = es - 1;
  372. ssp = sp;
  373. oldssp = ssp;
  374. for (;;) { /* find last match of innards */
  375. sep = slow(m, ssp, rest, ssub, esub);
  376. if (sep == NULL || sep == ssp)
  377. break; /* failed or matched null */
  378. oldssp = ssp; /* on to next try */
  379. ssp = sep;
  380. }
  381. if (sep == NULL) {
  382. /* last successful match */
  383. sep = ssp;
  384. ssp = oldssp;
  385. }
  386. assert(sep == rest); /* must exhaust substring */
  387. assert(slow(m, ssp, sep, ssub, esub) == rest);
  388. {
  389. const char *dp = dissect(m, ssp, sep, ssub, esub);
  390. (void)dp; /* avoid warning if assertions off */
  391. assert(dp == sep);
  392. }
  393. sp = rest;
  394. break;
  395. case OCH_:
  396. stp = stop;
  397. for (;;) {
  398. /* how long could this one be? */
  399. rest = slow(m, sp, stp, ss, es);
  400. assert(rest != NULL); /* it did match */
  401. /* could the rest match the rest? */
  402. tail = slow(m, rest, stop, es, stopst);
  403. if (tail == stop)
  404. break; /* yes! */
  405. /* no -- try a shorter match for this one */
  406. stp = rest - 1;
  407. assert(stp >= sp); /* it did work */
  408. }
  409. ssub = ss + 1;
  410. esub = ss + OPND(m->g->strip[ss]) - 1;
  411. assert(OP(m->g->strip[esub]) == OOR1);
  412. for (;;) { /* find first matching branch */
  413. if (slow(m, sp, rest, ssub, esub) == rest)
  414. break; /* it matched all of it */
  415. /* that one missed, try next one */
  416. assert(OP(m->g->strip[esub]) == OOR1);
  417. esub++;
  418. assert(OP(m->g->strip[esub]) == OOR2);
  419. ssub = esub + 1;
  420. esub += OPND(m->g->strip[esub]);
  421. if (OP(m->g->strip[esub]) == OOR2)
  422. esub--;
  423. else
  424. assert(OP(m->g->strip[esub]) == O_CH);
  425. }
  426. {
  427. const char *dp = dissect(m, sp, rest, ssub, esub);
  428. (void)dp; /* avoid warning if assertions off */
  429. assert(dp == rest);
  430. }
  431. sp = rest;
  432. break;
  433. case O_PLUS:
  434. case O_QUEST:
  435. case OOR1:
  436. case OOR2:
  437. case O_CH:
  438. assert(nope);
  439. break;
  440. case OLPAREN:
  441. i = OPND(m->g->strip[ss]);
  442. assert(0 < i && i <= m->g->nsub);
  443. m->pmatch[i].rm_so = sp - m->offp;
  444. break;
  445. case ORPAREN:
  446. i = OPND(m->g->strip[ss]);
  447. assert(0 < i && i <= m->g->nsub);
  448. m->pmatch[i].rm_eo = sp - m->offp;
  449. break;
  450. default: /* uh oh */
  451. assert(nope);
  452. break;
  453. }
  454. }
  455. assert(sp == stop);
  456. return(sp);
  457. }
  458. /*
  459. - backref - figure out what matched what, figuring in back references
  460. */
  461. static const char * /* == stop (success) or NULL (failure) */
  462. backref(struct match *m, const char *start, const char *stop, sopno startst,
  463. sopno stopst, sopno lev, int rec) /* PLUS nesting level */
  464. {
  465. int i;
  466. sopno ss; /* start sop of current subRE */
  467. const char *sp; /* start of string matched by it */
  468. sopno ssub; /* start sop of subsubRE */
  469. sopno esub; /* end sop of subsubRE */
  470. const char *ssp; /* start of string matched by subsubRE */
  471. const char *dp;
  472. size_t len;
  473. int hard;
  474. sop s;
  475. llvm_regoff_t offsave;
  476. cset *cs;
  477. AT("back", start, stop, startst, stopst);
  478. sp = start;
  479. /* get as far as we can with easy stuff */
  480. hard = 0;
  481. for (ss = startst; !hard && ss < stopst; ss++)
  482. switch (OP(s = m->g->strip[ss])) {
  483. case OCHAR:
  484. if (sp == stop || *sp++ != (char)OPND(s))
  485. return(NULL);
  486. break;
  487. case OANY:
  488. if (sp == stop)
  489. return(NULL);
  490. sp++;
  491. break;
  492. case OANYOF:
  493. cs = &m->g->sets[OPND(s)];
  494. if (sp == stop || !CHIN(cs, *sp++))
  495. return(NULL);
  496. break;
  497. case OBOL:
  498. if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  499. (sp < m->endp && *(sp-1) == '\n' &&
  500. (m->g->cflags&REG_NEWLINE)) )
  501. { /* yes */ }
  502. else
  503. return(NULL);
  504. break;
  505. case OEOL:
  506. if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  507. (sp < m->endp && *sp == '\n' &&
  508. (m->g->cflags&REG_NEWLINE)) )
  509. { /* yes */ }
  510. else
  511. return(NULL);
  512. break;
  513. case OBOW:
  514. if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  515. (sp < m->endp && *(sp-1) == '\n' &&
  516. (m->g->cflags&REG_NEWLINE)) ||
  517. (sp > m->beginp &&
  518. !ISWORD(*(sp-1))) ) &&
  519. (sp < m->endp && ISWORD(*sp)) )
  520. { /* yes */ }
  521. else
  522. return(NULL);
  523. break;
  524. case OEOW:
  525. if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  526. (sp < m->endp && *sp == '\n' &&
  527. (m->g->cflags&REG_NEWLINE)) ||
  528. (sp < m->endp && !ISWORD(*sp)) ) &&
  529. (sp > m->beginp && ISWORD(*(sp-1))) )
  530. { /* yes */ }
  531. else
  532. return(NULL);
  533. break;
  534. case O_QUEST:
  535. break;
  536. case OOR1: /* matches null but needs to skip */
  537. ss++;
  538. s = m->g->strip[ss];
  539. do {
  540. assert(OP(s) == OOR2);
  541. ss += OPND(s);
  542. } while (OP(s = m->g->strip[ss]) != O_CH);
  543. /* note that the ss++ gets us past the O_CH */
  544. break;
  545. default: /* have to make a choice */
  546. hard = 1;
  547. break;
  548. }
  549. if (!hard) { /* that was it! */
  550. if (sp != stop)
  551. return(NULL);
  552. return(sp);
  553. }
  554. ss--; /* adjust for the for's final increment */
  555. /* the hard stuff */
  556. AT("hard", sp, stop, ss, stopst);
  557. s = m->g->strip[ss];
  558. switch (OP(s)) {
  559. case OBACK_: /* the vilest depths */
  560. i = OPND(s);
  561. assert(0 < i && i <= m->g->nsub);
  562. if (m->pmatch[i].rm_eo == -1)
  563. return(NULL);
  564. assert(m->pmatch[i].rm_so != -1);
  565. len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
  566. if (len == 0 && rec++ > MAX_RECURSION)
  567. return(NULL);
  568. assert(stop - m->beginp >= len);
  569. if (sp > stop - len)
  570. return(NULL); /* not enough left to match */
  571. ssp = m->offp + m->pmatch[i].rm_so;
  572. if (memcmp(sp, ssp, len) != 0)
  573. return(NULL);
  574. while (m->g->strip[ss] != SOP(O_BACK, i))
  575. ss++;
  576. return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
  577. break;
  578. case OQUEST_: /* to null or not */
  579. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  580. if (dp != NULL)
  581. return(dp); /* not */
  582. return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
  583. break;
  584. case OPLUS_:
  585. assert(m->lastpos != NULL);
  586. assert(lev+1 <= m->g->nplus);
  587. m->lastpos[lev+1] = sp;
  588. return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
  589. break;
  590. case O_PLUS:
  591. if (sp == m->lastpos[lev]) /* last pass matched null */
  592. return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
  593. /* try another pass */
  594. m->lastpos[lev] = sp;
  595. dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
  596. if (dp == NULL)
  597. return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
  598. else
  599. return(dp);
  600. break;
  601. case OCH_: /* find the right one, if any */
  602. ssub = ss + 1;
  603. esub = ss + OPND(s) - 1;
  604. assert(OP(m->g->strip[esub]) == OOR1);
  605. for (;;) { /* find first matching branch */
  606. dp = backref(m, sp, stop, ssub, esub, lev, rec);
  607. if (dp != NULL)
  608. return(dp);
  609. /* that one missed, try next one */
  610. if (OP(m->g->strip[esub]) == O_CH)
  611. return(NULL); /* there is none */
  612. esub++;
  613. assert(OP(m->g->strip[esub]) == OOR2);
  614. ssub = esub + 1;
  615. esub += OPND(m->g->strip[esub]);
  616. if (OP(m->g->strip[esub]) == OOR2)
  617. esub--;
  618. else
  619. assert(OP(m->g->strip[esub]) == O_CH);
  620. }
  621. break;
  622. case OLPAREN: /* must undo assignment if rest fails */
  623. i = OPND(s);
  624. assert(0 < i && i <= m->g->nsub);
  625. offsave = m->pmatch[i].rm_so;
  626. m->pmatch[i].rm_so = sp - m->offp;
  627. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  628. if (dp != NULL)
  629. return(dp);
  630. m->pmatch[i].rm_so = offsave;
  631. return(NULL);
  632. break;
  633. case ORPAREN: /* must undo assignment if rest fails */
  634. i = OPND(s);
  635. assert(0 < i && i <= m->g->nsub);
  636. offsave = m->pmatch[i].rm_eo;
  637. m->pmatch[i].rm_eo = sp - m->offp;
  638. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  639. if (dp != NULL)
  640. return(dp);
  641. m->pmatch[i].rm_eo = offsave;
  642. return(NULL);
  643. break;
  644. default: /* uh oh */
  645. assert(nope);
  646. break;
  647. }
  648. /* "can't happen" */
  649. assert(nope);
  650. /* NOTREACHED */
  651. return NULL;
  652. }
  653. /*
  654. - fast - step through the string at top speed
  655. */
  656. static const char * /* where tentative match ended, or NULL */
  657. fast(struct match *m, const char *start, const char *stop, sopno startst,
  658. sopno stopst)
  659. {
  660. states st = m->st;
  661. states fresh = m->fresh;
  662. states tmp = m->tmp;
  663. const char *p = start;
  664. int c = (start == m->beginp) ? OUT : *(start-1);
  665. int lastc; /* previous c */
  666. int flagch;
  667. int i;
  668. const char *coldp; /* last p after which no match was underway */
  669. CLEAR(st);
  670. SET1(st, startst);
  671. st = step(m->g, startst, stopst, st, NOTHING, st);
  672. ASSIGN(fresh, st);
  673. SP("start", st, *p);
  674. coldp = NULL;
  675. for (;;) {
  676. /* next character */
  677. lastc = c;
  678. c = (p == m->endp) ? OUT : *p;
  679. if (EQ(st, fresh))
  680. coldp = p;
  681. /* is there an EOL and/or BOL between lastc and c? */
  682. flagch = '\0';
  683. i = 0;
  684. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  685. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  686. flagch = BOL;
  687. i = m->g->nbol;
  688. }
  689. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  690. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  691. flagch = (flagch == BOL) ? BOLEOL : EOL;
  692. i += m->g->neol;
  693. }
  694. if (i != 0) {
  695. for (; i > 0; i--)
  696. st = step(m->g, startst, stopst, st, flagch, st);
  697. SP("boleol", st, c);
  698. }
  699. /* how about a word boundary? */
  700. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  701. (c != OUT && ISWORD(c)) ) {
  702. flagch = BOW;
  703. }
  704. if ( (lastc != OUT && ISWORD(lastc)) &&
  705. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  706. flagch = EOW;
  707. }
  708. if (flagch == BOW || flagch == EOW) {
  709. st = step(m->g, startst, stopst, st, flagch, st);
  710. SP("boweow", st, c);
  711. }
  712. /* are we done? */
  713. if (ISSET(st, stopst) || p == stop)
  714. break; /* NOTE BREAK OUT */
  715. /* no, we must deal with this character */
  716. ASSIGN(tmp, st);
  717. ASSIGN(st, fresh);
  718. assert(c != OUT);
  719. st = step(m->g, startst, stopst, tmp, c, st);
  720. SP("aft", st, c);
  721. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  722. p++;
  723. }
  724. assert(coldp != NULL);
  725. m->coldp = coldp;
  726. if (ISSET(st, stopst))
  727. return(p+1);
  728. else
  729. return(NULL);
  730. }
  731. /*
  732. - slow - step through the string more deliberately
  733. */
  734. static const char * /* where it ended */
  735. slow(struct match *m, const char *start, const char *stop, sopno startst,
  736. sopno stopst)
  737. {
  738. states st = m->st;
  739. states empty = m->empty;
  740. states tmp = m->tmp;
  741. const char *p = start;
  742. int c = (start == m->beginp) ? OUT : *(start-1);
  743. int lastc; /* previous c */
  744. int flagch;
  745. int i;
  746. const char *matchp; /* last p at which a match ended */
  747. AT("slow", start, stop, startst, stopst);
  748. CLEAR(st);
  749. SET1(st, startst);
  750. SP("sstart", st, *p);
  751. st = step(m->g, startst, stopst, st, NOTHING, st);
  752. matchp = NULL;
  753. for (;;) {
  754. /* next character */
  755. lastc = c;
  756. c = (p == m->endp) ? OUT : *p;
  757. /* is there an EOL and/or BOL between lastc and c? */
  758. flagch = '\0';
  759. i = 0;
  760. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  761. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  762. flagch = BOL;
  763. i = m->g->nbol;
  764. }
  765. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  766. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  767. flagch = (flagch == BOL) ? BOLEOL : EOL;
  768. i += m->g->neol;
  769. }
  770. if (i != 0) {
  771. for (; i > 0; i--)
  772. st = step(m->g, startst, stopst, st, flagch, st);
  773. SP("sboleol", st, c);
  774. }
  775. /* how about a word boundary? */
  776. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  777. (c != OUT && ISWORD(c)) ) {
  778. flagch = BOW;
  779. }
  780. if ( (lastc != OUT && ISWORD(lastc)) &&
  781. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  782. flagch = EOW;
  783. }
  784. if (flagch == BOW || flagch == EOW) {
  785. st = step(m->g, startst, stopst, st, flagch, st);
  786. SP("sboweow", st, c);
  787. }
  788. /* are we done? */
  789. if (ISSET(st, stopst))
  790. matchp = p;
  791. if (EQ(st, empty) || p == stop)
  792. break; /* NOTE BREAK OUT */
  793. /* no, we must deal with this character */
  794. ASSIGN(tmp, st);
  795. ASSIGN(st, empty);
  796. assert(c != OUT);
  797. st = step(m->g, startst, stopst, tmp, c, st);
  798. SP("saft", st, c);
  799. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  800. p++;
  801. }
  802. return(matchp);
  803. }
  804. /*
  805. - step - map set of states reachable before char to set reachable after
  806. */
  807. static states
  808. step(struct re_guts *g,
  809. sopno start, /* start state within strip */
  810. sopno stop, /* state after stop state within strip */
  811. states bef, /* states reachable before */
  812. int ch, /* character or NONCHAR code */
  813. states aft) /* states already known reachable after */
  814. {
  815. cset *cs;
  816. sop s;
  817. sopno pc;
  818. onestate here; /* note, macros know this name */
  819. sopno look;
  820. int i;
  821. for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
  822. s = g->strip[pc];
  823. switch (OP(s)) {
  824. case OEND:
  825. assert(pc == stop-1);
  826. break;
  827. case OCHAR:
  828. /* only characters can match */
  829. assert(!NONCHAR(ch) || ch != (char)OPND(s));
  830. if (ch == (char)OPND(s))
  831. FWD(aft, bef, 1);
  832. break;
  833. case OBOL:
  834. if (ch == BOL || ch == BOLEOL)
  835. FWD(aft, bef, 1);
  836. break;
  837. case OEOL:
  838. if (ch == EOL || ch == BOLEOL)
  839. FWD(aft, bef, 1);
  840. break;
  841. case OBOW:
  842. if (ch == BOW)
  843. FWD(aft, bef, 1);
  844. break;
  845. case OEOW:
  846. if (ch == EOW)
  847. FWD(aft, bef, 1);
  848. break;
  849. case OANY:
  850. if (!NONCHAR(ch))
  851. FWD(aft, bef, 1);
  852. break;
  853. case OANYOF:
  854. cs = &g->sets[OPND(s)];
  855. if (!NONCHAR(ch) && CHIN(cs, ch))
  856. FWD(aft, bef, 1);
  857. break;
  858. case OBACK_: /* ignored here */
  859. case O_BACK:
  860. FWD(aft, aft, 1);
  861. break;
  862. case OPLUS_: /* forward, this is just an empty */
  863. FWD(aft, aft, 1);
  864. break;
  865. case O_PLUS: /* both forward and back */
  866. FWD(aft, aft, 1);
  867. i = ISSETBACK(aft, OPND(s));
  868. BACK(aft, aft, OPND(s));
  869. if (!i && ISSETBACK(aft, OPND(s))) {
  870. /* oho, must reconsider loop body */
  871. pc -= OPND(s) + 1;
  872. INIT(here, pc);
  873. }
  874. break;
  875. case OQUEST_: /* two branches, both forward */
  876. FWD(aft, aft, 1);
  877. FWD(aft, aft, OPND(s));
  878. break;
  879. case O_QUEST: /* just an empty */
  880. FWD(aft, aft, 1);
  881. break;
  882. case OLPAREN: /* not significant here */
  883. case ORPAREN:
  884. FWD(aft, aft, 1);
  885. break;
  886. case OCH_: /* mark the first two branches */
  887. FWD(aft, aft, 1);
  888. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  889. FWD(aft, aft, OPND(s));
  890. break;
  891. case OOR1: /* done a branch, find the O_CH */
  892. if (ISSTATEIN(aft, here)) {
  893. for (look = 1;
  894. OP(s = g->strip[pc+look]) != O_CH;
  895. look += OPND(s))
  896. assert(OP(s) == OOR2);
  897. FWD(aft, aft, look);
  898. }
  899. break;
  900. case OOR2: /* propagate OCH_'s marking */
  901. FWD(aft, aft, 1);
  902. if (OP(g->strip[pc+OPND(s)]) != O_CH) {
  903. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  904. FWD(aft, aft, OPND(s));
  905. }
  906. break;
  907. case O_CH: /* just empty */
  908. FWD(aft, aft, 1);
  909. break;
  910. default: /* ooooops... */
  911. assert(nope);
  912. break;
  913. }
  914. }
  915. return(aft);
  916. }
  917. #ifdef REDEBUG
  918. /*
  919. - print - print a set of states
  920. */
  921. static void
  922. print(struct match *m, char *caption, states st, int ch, FILE *d)
  923. {
  924. struct re_guts *g = m->g;
  925. int i;
  926. int first = 1;
  927. if (!(m->eflags&REG_TRACE))
  928. return;
  929. (void)fprintf(d, "%s", caption);
  930. if (ch != '\0')
  931. (void)fprintf(d, " %s", pchar(ch));
  932. for (i = 0; i < g->nstates; i++)
  933. if (ISSET(st, i)) {
  934. (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
  935. first = 0;
  936. }
  937. (void)fprintf(d, "\n");
  938. }
  939. /*
  940. - at - print current situation
  941. */
  942. static void
  943. at(struct match *m, char *title, char *start, char *stop, sopno startst,
  944. sopno stopst)
  945. {
  946. if (!(m->eflags&REG_TRACE))
  947. return;
  948. (void)printf("%s %s-", title, pchar(*start));
  949. (void)printf("%s ", pchar(*stop));
  950. (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
  951. }
  952. #ifndef PCHARDONE
  953. #define PCHARDONE /* never again */
  954. /*
  955. - pchar - make a character printable
  956. *
  957. * Is this identical to regchar() over in debug.c? Well, yes. But a
  958. * duplicate here avoids having a debugging-capable regexec.o tied to
  959. * a matching debug.o, and this is convenient. It all disappears in
  960. * the non-debug compilation anyway, so it doesn't matter much.
  961. */
  962. static char * /* -> representation */
  963. pchar(int ch)
  964. {
  965. static char pbuf[10];
  966. if (isPrint(ch) || ch == ' ')
  967. (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
  968. else
  969. (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
  970. return(pbuf);
  971. }
  972. #endif
  973. #endif
  974. #undef matcher
  975. #undef fast
  976. #undef slow
  977. #undef dissect
  978. #undef backref
  979. #undef step
  980. #undef print
  981. #undef at
  982. #undef match
  983. #undef nope