regengine.inc 28 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099
  1. /*-
  2. * This code is derived from OpenBSD's libc/regex, original license follows:
  3. *
  4. * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  5. * Copyright (c) 1992, 1993, 1994
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This code is derived from software contributed to Berkeley by
  9. * Henry Spencer.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. *
  35. * @(#)engine.c 8.5 (Berkeley) 3/20/94
  36. */
  37. /*
  38. * The matching engine and friends. This file is #included by regexec.c
  39. * after suitable #defines of a variety of macros used herein, so that
  40. * different state representations can be used without duplicating masses
  41. * of code.
  42. */
  43. #ifdef SNAMES
  44. #define matcher smatcher
  45. #define fast sfast
  46. #define slow sslow
  47. #define dissect sdissect
  48. #define backref sbackref
  49. #define step sstep
  50. #define print sprint
  51. #define at sat
  52. #define match smat
  53. #define nope snope
  54. #define step_back sstep_back
  55. #endif
  56. #ifdef LNAMES
  57. #define matcher lmatcher
  58. #define fast lfast
  59. #define slow lslow
  60. #define dissect ldissect
  61. #define backref lbackref
  62. #define step lstep
  63. #define print lprint
  64. #define at lat
  65. #define match lmat
  66. #define nope lnope
  67. #define step_back lstep_back
  68. #endif
  69. /* another structure passed up and down to avoid zillions of parameters */
  70. struct match {
  71. struct re_guts *g;
  72. int eflags;
  73. llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
  74. const char *offp; /* offsets work from here */
  75. const char *beginp; /* start of string -- virtual NUL precedes */
  76. const char *endp; /* end of string -- virtual NUL here */
  77. const char *coldp; /* can be no match starting before here */
  78. const char **lastpos; /* [nplus+1] */
  79. STATEVARS;
  80. states st; /* current states */
  81. states fresh; /* states for a fresh start */
  82. states tmp; /* temporary */
  83. states empty; /* empty set of states */
  84. };
  85. static int matcher(struct re_guts *, const char *, size_t,
  86. llvm_regmatch_t[], int);
  87. static const char *dissect(struct match *, const char *, const char *, sopno,
  88. sopno);
  89. static const char *backref(struct match *, const char *, const char *, sopno,
  90. sopno, sopno, int);
  91. static const char *fast(struct match *, const char *, const char *, sopno, sopno);
  92. static const char *slow(struct match *, const char *, const char *, sopno, sopno);
  93. static states step(struct re_guts *, sopno, sopno, states, int, states);
  94. #define MAX_RECURSION 100
  95. #define BOL (OUT+1)
  96. #define EOL (BOL+1)
  97. #define BOLEOL (BOL+2)
  98. #define NOTHING (BOL+3)
  99. #define BOW (BOL+4)
  100. #define EOW (BOL+5)
  101. #define CODEMAX (BOL+5) /* highest code used */
  102. #define NONCHAR(c) ((c) > CHAR_MAX)
  103. #define NNONCHAR (CODEMAX-CHAR_MAX)
  104. #ifdef REDEBUG
  105. static void print(struct match *, const char *, states, int, FILE *);
  106. #endif
  107. #ifdef REDEBUG
  108. static void at(
  109. struct match *, const char *, const char *, const char *, sopno, sopno);
  110. #endif
  111. #ifdef REDEBUG
  112. static char *pchar(int);
  113. #endif
  114. #ifdef REDEBUG
  115. #define SP(t, s, c) print(m, t, s, c, stdout)
  116. #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
  117. #define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
  118. static int nope = 0;
  119. #else
  120. #define SP(t, s, c) /* nothing */
  121. #define AT(t, p1, p2, s1, s2) /* nothing */
  122. #define NOTE(s) /* nothing */
  123. #endif
  124. /*
  125. - matcher - the actual matching engine
  126. */
  127. static int /* 0 success, REG_NOMATCH failure */
  128. matcher(struct re_guts *g, const char *string, size_t nmatch,
  129. llvm_regmatch_t pmatch[],
  130. int eflags)
  131. {
  132. const char *endp;
  133. size_t i;
  134. struct match mv;
  135. struct match *m = &mv;
  136. const char *dp;
  137. const sopno gf = g->firststate+1; /* +1 for OEND */
  138. const sopno gl = g->laststate;
  139. const char *start;
  140. const char *stop;
  141. /* simplify the situation where possible */
  142. if (g->cflags&REG_NOSUB)
  143. nmatch = 0;
  144. if (eflags&REG_STARTEND) {
  145. start = string + pmatch[0].rm_so;
  146. stop = string + pmatch[0].rm_eo;
  147. } else {
  148. start = string;
  149. stop = start + strlen(start);
  150. }
  151. if (stop < start)
  152. return(REG_INVARG);
  153. /* prescreening; this does wonders for this rather slow code */
  154. if (g->must != NULL) {
  155. for (dp = start; dp < stop; dp++)
  156. if (*dp == g->must[0] && stop - dp >= g->mlen &&
  157. memcmp(dp, g->must, (size_t)g->mlen) == 0)
  158. break;
  159. if (dp == stop) /* we didn't find g->must */
  160. return(REG_NOMATCH);
  161. }
  162. /* match struct setup */
  163. m->g = g;
  164. m->eflags = eflags;
  165. m->pmatch = NULL;
  166. m->lastpos = NULL;
  167. m->offp = string;
  168. m->beginp = start;
  169. m->endp = stop;
  170. STATESETUP(m, 4);
  171. SETUP(m->st);
  172. SETUP(m->fresh);
  173. SETUP(m->tmp);
  174. SETUP(m->empty);
  175. CLEAR(m->empty);
  176. /* this loop does only one repetition except for backrefs */
  177. for (;;) {
  178. endp = fast(m, start, stop, gf, gl);
  179. if (endp == NULL) { /* a miss */
  180. free(m->pmatch);
  181. free((void*)m->lastpos);
  182. STATETEARDOWN(m);
  183. return(REG_NOMATCH);
  184. }
  185. if (nmatch == 0 && !g->backrefs)
  186. break; /* no further info needed */
  187. /* where? */
  188. assert(m->coldp != NULL);
  189. for (;;) {
  190. NOTE("finding start");
  191. endp = slow(m, m->coldp, stop, gf, gl);
  192. if (endp != NULL)
  193. break;
  194. assert(m->coldp < m->endp);
  195. m->coldp++;
  196. }
  197. if (nmatch == 1 && !g->backrefs)
  198. break; /* no further info needed */
  199. /* oh my, they want the subexpressions... */
  200. if (m->pmatch == NULL)
  201. m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
  202. sizeof(llvm_regmatch_t));
  203. if (m->pmatch == NULL) {
  204. STATETEARDOWN(m);
  205. return(REG_ESPACE);
  206. }
  207. for (i = 1; i <= m->g->nsub; i++)
  208. m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
  209. if (!g->backrefs && !(m->eflags&REG_BACKR)) {
  210. NOTE("dissecting");
  211. dp = dissect(m, m->coldp, endp, gf, gl);
  212. } else {
  213. if (g->nplus > 0 && m->lastpos == NULL)
  214. m->lastpos = (const char **)malloc((g->nplus+1) *
  215. sizeof(char *));
  216. if (g->nplus > 0 && m->lastpos == NULL) {
  217. free(m->pmatch);
  218. STATETEARDOWN(m);
  219. return(REG_ESPACE);
  220. }
  221. NOTE("backref dissect");
  222. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
  223. }
  224. if (dp != NULL)
  225. break;
  226. /* uh-oh... we couldn't find a subexpression-level match */
  227. assert(g->backrefs); /* must be back references doing it */
  228. assert(g->nplus == 0 || m->lastpos != NULL);
  229. for (;;) {
  230. if (dp != NULL || endp <= m->coldp)
  231. break; /* defeat */
  232. NOTE("backoff");
  233. endp = slow(m, m->coldp, endp-1, gf, gl);
  234. if (endp == NULL)
  235. break; /* defeat */
  236. /* try it on a shorter possibility */
  237. #ifndef NDEBUG
  238. for (i = 1; i <= m->g->nsub; i++) {
  239. assert(m->pmatch[i].rm_so == -1);
  240. assert(m->pmatch[i].rm_eo == -1);
  241. }
  242. #endif
  243. NOTE("backoff dissect");
  244. dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
  245. }
  246. assert(dp == NULL || dp == endp);
  247. if (dp != NULL) /* found a shorter one */
  248. break;
  249. /* despite initial appearances, there is no match here */
  250. NOTE("false alarm");
  251. if (m->coldp == stop)
  252. break;
  253. start = m->coldp + 1; /* recycle starting later */
  254. }
  255. /* fill in the details if requested */
  256. if (nmatch > 0) {
  257. pmatch[0].rm_so = m->coldp - m->offp;
  258. pmatch[0].rm_eo = endp - m->offp;
  259. }
  260. if (nmatch > 1) {
  261. assert(m->pmatch != NULL);
  262. for (i = 1; i < nmatch; i++)
  263. if (i <= m->g->nsub)
  264. pmatch[i] = m->pmatch[i];
  265. else {
  266. pmatch[i].rm_so = -1;
  267. pmatch[i].rm_eo = -1;
  268. }
  269. }
  270. if (m->pmatch != NULL)
  271. free((char *)m->pmatch);
  272. if (m->lastpos != NULL)
  273. free((char *)m->lastpos);
  274. STATETEARDOWN(m);
  275. return(0);
  276. }
  277. /* Step back from "stop" to a position where the strip startst..stopst might
  278. * match. This can always conservatively return "stop - 1", but may return an
  279. * earlier position if matches at later positions are impossible. */
  280. static const char *
  281. step_back(struct re_guts *g, const char *start, const char *stop, sopno startst,
  282. sopno stopst)
  283. {
  284. /* Always step back at least one character. */
  285. assert(stop > start);
  286. const char *res = stop - 1;
  287. /* Check whether the strip startst..stropst starts with a fixed character,
  288. * ignoring any closing parentheses. If not, return a conservative result. */
  289. for (;;) {
  290. if (startst >= stopst)
  291. return res;
  292. if (OP(g->strip[startst]) != ORPAREN)
  293. break;
  294. startst++;
  295. }
  296. if (OP(g->strip[startst]) != OCHAR)
  297. return res;
  298. /* Find the character that starts the following match. */
  299. char ch = OPND(g->strip[startst]);
  300. for (; res != start; --res) {
  301. if (*res == ch) {
  302. /* Try to check the next fixed character as well. */
  303. sopno nextst = startst + 1;
  304. const char *next = res + 1;
  305. if (nextst >= stopst || OP(g->strip[nextst]) != OCHAR || next >= stop ||
  306. *next == (char)OPND(g->strip[nextst]))
  307. break;
  308. }
  309. }
  310. return res;
  311. }
  312. /*
  313. - dissect - figure out what matched what, no back references
  314. */
  315. static const char * /* == stop (success) always */
  316. dissect(struct match *m, const char *start, const char *stop, sopno startst,
  317. sopno stopst)
  318. {
  319. int i;
  320. sopno ss; /* start sop of current subRE */
  321. sopno es; /* end sop of current subRE */
  322. const char *sp; /* start of string matched by it */
  323. const char *stp; /* string matched by it cannot pass here */
  324. const char *rest; /* start of rest of string */
  325. const char *tail; /* string unmatched by rest of RE */
  326. sopno ssub; /* start sop of subsubRE */
  327. sopno esub; /* end sop of subsubRE */
  328. const char *ssp; /* start of string matched by subsubRE */
  329. const char *sep; /* end of string matched by subsubRE */
  330. const char *oldssp; /* previous ssp */
  331. AT("diss", start, stop, startst, stopst);
  332. sp = start;
  333. for (ss = startst; ss < stopst; ss = es) {
  334. /* identify end of subRE */
  335. es = ss;
  336. switch (OP(m->g->strip[es])) {
  337. case OPLUS_:
  338. case OQUEST_:
  339. es += OPND(m->g->strip[es]);
  340. break;
  341. case OCH_:
  342. while (OP(m->g->strip[es]) != O_CH)
  343. es += OPND(m->g->strip[es]);
  344. break;
  345. }
  346. es++;
  347. /* figure out what it matched */
  348. switch (OP(m->g->strip[ss])) {
  349. case OEND:
  350. assert(nope);
  351. break;
  352. case OCHAR:
  353. sp++;
  354. break;
  355. case OBOL:
  356. case OEOL:
  357. case OBOW:
  358. case OEOW:
  359. break;
  360. case OANY:
  361. case OANYOF:
  362. sp++;
  363. break;
  364. case OBACK_:
  365. case O_BACK:
  366. assert(nope);
  367. break;
  368. /* cases where length of match is hard to find */
  369. case OQUEST_:
  370. stp = stop;
  371. for (;;) {
  372. /* how long could this one be? */
  373. rest = slow(m, sp, stp, ss, es);
  374. assert(rest != NULL); /* it did match */
  375. /* could the rest match the rest? */
  376. tail = slow(m, rest, stop, es, stopst);
  377. if (tail == stop)
  378. break; /* yes! */
  379. /* no -- try a shorter match for this one */
  380. stp = step_back(m->g, sp, rest, es, stopst);
  381. assert(stp >= sp); /* it did work */
  382. }
  383. ssub = ss + 1;
  384. esub = es - 1;
  385. /* did innards match? */
  386. if (slow(m, sp, rest, ssub, esub) != NULL) {
  387. const char *dp = dissect(m, sp, rest, ssub, esub);
  388. (void)dp; /* avoid warning if assertions off */
  389. assert(dp == rest);
  390. } else /* no */
  391. assert(sp == rest);
  392. sp = rest;
  393. break;
  394. case OPLUS_:
  395. stp = stop;
  396. for (;;) {
  397. /* how long could this one be? */
  398. rest = slow(m, sp, stp, ss, es);
  399. assert(rest != NULL); /* it did match */
  400. /* could the rest match the rest? */
  401. tail = slow(m, rest, stop, es, stopst);
  402. if (tail == stop)
  403. break; /* yes! */
  404. /* no -- try a shorter match for this one */
  405. stp = step_back(m->g, sp, rest, es, stopst);
  406. assert(stp >= sp); /* it did work */
  407. }
  408. ssub = ss + 1;
  409. esub = es - 1;
  410. ssp = sp;
  411. oldssp = ssp;
  412. for (;;) { /* find last match of innards */
  413. sep = slow(m, ssp, rest, ssub, esub);
  414. if (sep == NULL || sep == ssp)
  415. break; /* failed or matched null */
  416. oldssp = ssp; /* on to next try */
  417. ssp = sep;
  418. }
  419. if (sep == NULL) {
  420. /* last successful match */
  421. sep = ssp;
  422. ssp = oldssp;
  423. }
  424. assert(sep == rest); /* must exhaust substring */
  425. assert(slow(m, ssp, sep, ssub, esub) == rest);
  426. {
  427. const char *dp = dissect(m, ssp, sep, ssub, esub);
  428. (void)dp; /* avoid warning if assertions off */
  429. assert(dp == sep);
  430. }
  431. sp = rest;
  432. break;
  433. case OCH_:
  434. stp = stop;
  435. for (;;) {
  436. /* how long could this one be? */
  437. rest = slow(m, sp, stp, ss, es);
  438. assert(rest != NULL); /* it did match */
  439. /* could the rest match the rest? */
  440. tail = slow(m, rest, stop, es, stopst);
  441. if (tail == stop)
  442. break; /* yes! */
  443. /* no -- try a shorter match for this one */
  444. stp = rest - 1;
  445. assert(stp >= sp); /* it did work */
  446. }
  447. ssub = ss + 1;
  448. esub = ss + OPND(m->g->strip[ss]) - 1;
  449. assert(OP(m->g->strip[esub]) == OOR1);
  450. for (;;) { /* find first matching branch */
  451. if (slow(m, sp, rest, ssub, esub) == rest)
  452. break; /* it matched all of it */
  453. /* that one missed, try next one */
  454. assert(OP(m->g->strip[esub]) == OOR1);
  455. esub++;
  456. assert(OP(m->g->strip[esub]) == OOR2);
  457. ssub = esub + 1;
  458. esub += OPND(m->g->strip[esub]);
  459. if (OP(m->g->strip[esub]) == OOR2)
  460. esub--;
  461. else
  462. assert(OP(m->g->strip[esub]) == O_CH);
  463. }
  464. {
  465. const char *dp = dissect(m, sp, rest, ssub, esub);
  466. (void)dp; /* avoid warning if assertions off */
  467. assert(dp == rest);
  468. }
  469. sp = rest;
  470. break;
  471. case O_PLUS:
  472. case O_QUEST:
  473. case OOR1:
  474. case OOR2:
  475. case O_CH:
  476. assert(nope);
  477. break;
  478. case OLPAREN:
  479. i = OPND(m->g->strip[ss]);
  480. assert(0 < i && i <= m->g->nsub);
  481. m->pmatch[i].rm_so = sp - m->offp;
  482. break;
  483. case ORPAREN:
  484. i = OPND(m->g->strip[ss]);
  485. assert(0 < i && i <= m->g->nsub);
  486. m->pmatch[i].rm_eo = sp - m->offp;
  487. break;
  488. default: /* uh oh */
  489. assert(nope);
  490. break;
  491. }
  492. }
  493. assert(sp == stop);
  494. return(sp);
  495. }
  496. /*
  497. - backref - figure out what matched what, figuring in back references
  498. */
  499. static const char * /* == stop (success) or NULL (failure) */
  500. backref(struct match *m, const char *start, const char *stop, sopno startst,
  501. sopno stopst, sopno lev, int rec) /* PLUS nesting level */
  502. {
  503. int i;
  504. sopno ss; /* start sop of current subRE */
  505. const char *sp; /* start of string matched by it */
  506. sopno ssub; /* start sop of subsubRE */
  507. sopno esub; /* end sop of subsubRE */
  508. const char *ssp; /* start of string matched by subsubRE */
  509. const char *dp;
  510. size_t len;
  511. int hard;
  512. sop s;
  513. llvm_regoff_t offsave;
  514. cset *cs;
  515. AT("back", start, stop, startst, stopst);
  516. sp = start;
  517. /* get as far as we can with easy stuff */
  518. hard = 0;
  519. for (ss = startst; !hard && ss < stopst; ss++)
  520. switch (OP(s = m->g->strip[ss])) {
  521. case OCHAR:
  522. if (sp == stop || *sp++ != (char)OPND(s))
  523. return(NULL);
  524. break;
  525. case OANY:
  526. if (sp == stop)
  527. return(NULL);
  528. sp++;
  529. break;
  530. case OANYOF:
  531. cs = &m->g->sets[OPND(s)];
  532. if (sp == stop || !CHIN(cs, *sp++))
  533. return(NULL);
  534. break;
  535. case OBOL:
  536. if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  537. (sp < m->endp && *(sp-1) == '\n' &&
  538. (m->g->cflags&REG_NEWLINE)) )
  539. { /* yes */ }
  540. else
  541. return(NULL);
  542. break;
  543. case OEOL:
  544. if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  545. (sp < m->endp && *sp == '\n' &&
  546. (m->g->cflags&REG_NEWLINE)) )
  547. { /* yes */ }
  548. else
  549. return(NULL);
  550. break;
  551. case OBOW:
  552. if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
  553. (sp < m->endp && *(sp-1) == '\n' &&
  554. (m->g->cflags&REG_NEWLINE)) ||
  555. (sp > m->beginp &&
  556. !ISWORD(*(sp-1))) ) &&
  557. (sp < m->endp && ISWORD(*sp)) )
  558. { /* yes */ }
  559. else
  560. return(NULL);
  561. break;
  562. case OEOW:
  563. if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
  564. (sp < m->endp && *sp == '\n' &&
  565. (m->g->cflags&REG_NEWLINE)) ||
  566. (sp < m->endp && !ISWORD(*sp)) ) &&
  567. (sp > m->beginp && ISWORD(*(sp-1))) )
  568. { /* yes */ }
  569. else
  570. return(NULL);
  571. break;
  572. case O_QUEST:
  573. case O_CH:
  574. break;
  575. case OOR1: /* matches null but needs to skip */
  576. ss++;
  577. s = m->g->strip[ss];
  578. do {
  579. assert(OP(s) == OOR2);
  580. ss += OPND(s);
  581. } while (OP(s = m->g->strip[ss]) != O_CH);
  582. /* note that the ss++ gets us past the O_CH */
  583. break;
  584. default: /* have to make a choice */
  585. hard = 1;
  586. break;
  587. }
  588. if (!hard) { /* that was it! */
  589. if (sp != stop)
  590. return(NULL);
  591. return(sp);
  592. }
  593. ss--; /* adjust for the for's final increment */
  594. /* the hard stuff */
  595. AT("hard", sp, stop, ss, stopst);
  596. s = m->g->strip[ss];
  597. switch (OP(s)) {
  598. case OBACK_: /* the vilest depths */
  599. i = OPND(s);
  600. assert(0 < i && i <= m->g->nsub);
  601. if (m->pmatch[i].rm_eo == -1)
  602. return(NULL);
  603. assert(m->pmatch[i].rm_so != -1);
  604. len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
  605. if (len == 0 && rec++ > MAX_RECURSION)
  606. return(NULL);
  607. assert(stop - m->beginp >= len);
  608. if (sp > stop - len)
  609. return(NULL); /* not enough left to match */
  610. ssp = m->offp + m->pmatch[i].rm_so;
  611. if (memcmp(sp, ssp, len) != 0)
  612. return(NULL);
  613. while (m->g->strip[ss] != SOP(O_BACK, i))
  614. ss++;
  615. return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
  616. break;
  617. case OQUEST_: /* to null or not */
  618. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  619. if (dp != NULL)
  620. return(dp); /* not */
  621. return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
  622. break;
  623. case OPLUS_:
  624. assert(m->lastpos != NULL);
  625. assert(lev+1 <= m->g->nplus);
  626. m->lastpos[lev+1] = sp;
  627. return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
  628. break;
  629. case O_PLUS:
  630. if (sp == m->lastpos[lev]) /* last pass matched null */
  631. return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
  632. /* try another pass */
  633. m->lastpos[lev] = sp;
  634. dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
  635. if (dp == NULL)
  636. return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
  637. else
  638. return(dp);
  639. break;
  640. case OCH_: /* find the right one, if any */
  641. ssub = ss + 1;
  642. esub = ss + OPND(s) - 1;
  643. assert(OP(m->g->strip[esub]) == OOR1);
  644. for (;;) { /* find first matching branch */
  645. dp = backref(m, sp, stop, ssub, stopst, lev, rec);
  646. if (dp != NULL)
  647. return(dp);
  648. /* that one missed, try next one */
  649. if (OP(m->g->strip[esub]) == O_CH)
  650. return(NULL); /* there is none */
  651. esub++;
  652. assert(OP(m->g->strip[esub]) == OOR2);
  653. ssub = esub + 1;
  654. esub += OPND(m->g->strip[esub]);
  655. if (OP(m->g->strip[esub]) == OOR2)
  656. esub--;
  657. else
  658. assert(OP(m->g->strip[esub]) == O_CH);
  659. }
  660. break;
  661. case OLPAREN: /* must undo assignment if rest fails */
  662. i = OPND(s);
  663. assert(0 < i && i <= m->g->nsub);
  664. offsave = m->pmatch[i].rm_so;
  665. m->pmatch[i].rm_so = sp - m->offp;
  666. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  667. if (dp != NULL)
  668. return(dp);
  669. m->pmatch[i].rm_so = offsave;
  670. return(NULL);
  671. break;
  672. case ORPAREN: /* must undo assignment if rest fails */
  673. i = OPND(s);
  674. assert(0 < i && i <= m->g->nsub);
  675. offsave = m->pmatch[i].rm_eo;
  676. m->pmatch[i].rm_eo = sp - m->offp;
  677. dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
  678. if (dp != NULL)
  679. return(dp);
  680. m->pmatch[i].rm_eo = offsave;
  681. return(NULL);
  682. break;
  683. default: /* uh oh */
  684. assert(nope);
  685. break;
  686. }
  687. /* "can't happen" */
  688. assert(nope);
  689. /* NOTREACHED */
  690. return NULL;
  691. }
  692. /*
  693. - fast - step through the string at top speed
  694. */
  695. static const char * /* where tentative match ended, or NULL */
  696. fast(struct match *m, const char *start, const char *stop, sopno startst,
  697. sopno stopst)
  698. {
  699. states st = m->st;
  700. states fresh = m->fresh;
  701. states tmp = m->tmp;
  702. const char *p = start;
  703. int c = (start == m->beginp) ? OUT : *(start-1);
  704. int lastc; /* previous c */
  705. int flagch;
  706. int i;
  707. const char *coldp; /* last p after which no match was underway */
  708. CLEAR(st);
  709. SET1(st, startst);
  710. st = step(m->g, startst, stopst, st, NOTHING, st);
  711. ASSIGN(fresh, st);
  712. SP("start", st, *p);
  713. coldp = NULL;
  714. for (;;) {
  715. /* next character */
  716. lastc = c;
  717. c = (p == m->endp) ? OUT : *p;
  718. if (EQ(st, fresh))
  719. coldp = p;
  720. /* is there an EOL and/or BOL between lastc and c? */
  721. flagch = '\0';
  722. i = 0;
  723. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  724. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  725. flagch = BOL;
  726. i = m->g->nbol;
  727. }
  728. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  729. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  730. flagch = (flagch == BOL) ? BOLEOL : EOL;
  731. i += m->g->neol;
  732. }
  733. if (i != 0) {
  734. for (; i > 0; i--)
  735. st = step(m->g, startst, stopst, st, flagch, st);
  736. SP("boleol", st, c);
  737. }
  738. /* how about a word boundary? */
  739. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  740. (c != OUT && ISWORD(c)) ) {
  741. flagch = BOW;
  742. }
  743. if ( (lastc != OUT && ISWORD(lastc)) &&
  744. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  745. flagch = EOW;
  746. }
  747. if (flagch == BOW || flagch == EOW) {
  748. st = step(m->g, startst, stopst, st, flagch, st);
  749. SP("boweow", st, c);
  750. }
  751. /* are we done? */
  752. if (ISSET(st, stopst) || p == stop)
  753. break; /* NOTE BREAK OUT */
  754. /* no, we must deal with this character */
  755. ASSIGN(tmp, st);
  756. ASSIGN(st, fresh);
  757. assert(c != OUT);
  758. st = step(m->g, startst, stopst, tmp, c, st);
  759. SP("aft", st, c);
  760. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  761. p++;
  762. }
  763. assert(coldp != NULL);
  764. m->coldp = coldp;
  765. if (ISSET(st, stopst))
  766. return(p+1);
  767. else
  768. return(NULL);
  769. }
  770. /*
  771. - slow - step through the string more deliberately
  772. */
  773. static const char * /* where it ended */
  774. slow(struct match *m, const char *start, const char *stop, sopno startst,
  775. sopno stopst)
  776. {
  777. /* Quickly skip over fixed character matches at the start. */
  778. const char *p = start;
  779. for (; startst < stopst; ++startst) {
  780. int hard = 0;
  781. sop s = m->g->strip[startst];
  782. switch (OP(s)) {
  783. case OLPAREN:
  784. case ORPAREN:
  785. /* Not relevant here. */
  786. break;
  787. case OCHAR:
  788. if (p == stop || *p != (char)OPND(s))
  789. return NULL;
  790. ++p;
  791. break;
  792. default:
  793. hard = 1;
  794. break;
  795. }
  796. if (hard)
  797. break;
  798. }
  799. states st = m->st;
  800. states empty = m->empty;
  801. states tmp = m->tmp;
  802. int c = (p == m->beginp) ? OUT : *(p-1);
  803. int lastc; /* previous c */
  804. int flagch;
  805. int i;
  806. const char *matchp; /* last p at which a match ended */
  807. AT("slow", start, stop, startst, stopst);
  808. CLEAR(st);
  809. SET1(st, startst);
  810. SP("sstart", st, *p);
  811. st = step(m->g, startst, stopst, st, NOTHING, st);
  812. matchp = NULL;
  813. for (;;) {
  814. /* next character */
  815. lastc = c;
  816. c = (p == m->endp) ? OUT : *p;
  817. /* is there an EOL and/or BOL between lastc and c? */
  818. flagch = '\0';
  819. i = 0;
  820. if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
  821. (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
  822. flagch = BOL;
  823. i = m->g->nbol;
  824. }
  825. if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
  826. (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
  827. flagch = (flagch == BOL) ? BOLEOL : EOL;
  828. i += m->g->neol;
  829. }
  830. if (i != 0) {
  831. for (; i > 0; i--)
  832. st = step(m->g, startst, stopst, st, flagch, st);
  833. SP("sboleol", st, c);
  834. }
  835. /* how about a word boundary? */
  836. if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
  837. (c != OUT && ISWORD(c)) ) {
  838. flagch = BOW;
  839. }
  840. if ( (lastc != OUT && ISWORD(lastc)) &&
  841. (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
  842. flagch = EOW;
  843. }
  844. if (flagch == BOW || flagch == EOW) {
  845. st = step(m->g, startst, stopst, st, flagch, st);
  846. SP("sboweow", st, c);
  847. }
  848. /* are we done? */
  849. if (ISSET(st, stopst))
  850. matchp = p;
  851. if (EQ(st, empty) || p == stop)
  852. break; /* NOTE BREAK OUT */
  853. /* no, we must deal with this character */
  854. ASSIGN(tmp, st);
  855. ASSIGN(st, empty);
  856. assert(c != OUT);
  857. st = step(m->g, startst, stopst, tmp, c, st);
  858. SP("saft", st, c);
  859. assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
  860. p++;
  861. }
  862. return(matchp);
  863. }
  864. /*
  865. - step - map set of states reachable before char to set reachable after
  866. */
  867. static states
  868. step(struct re_guts *g,
  869. sopno start, /* start state within strip */
  870. sopno stop, /* state after stop state within strip */
  871. states bef, /* states reachable before */
  872. int ch, /* character or NONCHAR code */
  873. states aft) /* states already known reachable after */
  874. {
  875. cset *cs;
  876. sop s;
  877. sopno pc;
  878. onestate here; /* note, macros know this name */
  879. sopno look;
  880. int i;
  881. for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
  882. s = g->strip[pc];
  883. switch (OP(s)) {
  884. case OEND:
  885. assert(pc == stop-1);
  886. break;
  887. case OCHAR:
  888. /* only characters can match */
  889. assert(!NONCHAR(ch) || ch != (char)OPND(s));
  890. if (ch == (char)OPND(s))
  891. FWD(aft, bef, 1);
  892. break;
  893. case OBOL:
  894. if (ch == BOL || ch == BOLEOL)
  895. FWD(aft, bef, 1);
  896. break;
  897. case OEOL:
  898. if (ch == EOL || ch == BOLEOL)
  899. FWD(aft, bef, 1);
  900. break;
  901. case OBOW:
  902. if (ch == BOW)
  903. FWD(aft, bef, 1);
  904. break;
  905. case OEOW:
  906. if (ch == EOW)
  907. FWD(aft, bef, 1);
  908. break;
  909. case OANY:
  910. if (!NONCHAR(ch))
  911. FWD(aft, bef, 1);
  912. break;
  913. case OANYOF:
  914. cs = &g->sets[OPND(s)];
  915. if (!NONCHAR(ch) && CHIN(cs, ch))
  916. FWD(aft, bef, 1);
  917. break;
  918. case OBACK_: /* ignored here */
  919. case O_BACK:
  920. FWD(aft, aft, 1);
  921. break;
  922. case OPLUS_: /* forward, this is just an empty */
  923. FWD(aft, aft, 1);
  924. break;
  925. case O_PLUS: /* both forward and back */
  926. FWD(aft, aft, 1);
  927. i = ISSETBACK(aft, OPND(s));
  928. BACK(aft, aft, OPND(s));
  929. if (!i && ISSETBACK(aft, OPND(s))) {
  930. /* oho, must reconsider loop body */
  931. pc -= OPND(s) + 1;
  932. INIT(here, pc);
  933. }
  934. break;
  935. case OQUEST_: /* two branches, both forward */
  936. FWD(aft, aft, 1);
  937. FWD(aft, aft, OPND(s));
  938. break;
  939. case O_QUEST: /* just an empty */
  940. FWD(aft, aft, 1);
  941. break;
  942. case OLPAREN: /* not significant here */
  943. case ORPAREN:
  944. FWD(aft, aft, 1);
  945. break;
  946. case OCH_: /* mark the first two branches */
  947. FWD(aft, aft, 1);
  948. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  949. FWD(aft, aft, OPND(s));
  950. break;
  951. case OOR1: /* done a branch, find the O_CH */
  952. if (ISSTATEIN(aft, here)) {
  953. for (look = 1;
  954. OP(s = g->strip[pc+look]) != O_CH;
  955. look += OPND(s))
  956. assert(OP(s) == OOR2);
  957. FWD(aft, aft, look);
  958. }
  959. break;
  960. case OOR2: /* propagate OCH_'s marking */
  961. FWD(aft, aft, 1);
  962. if (OP(g->strip[pc+OPND(s)]) != O_CH) {
  963. assert(OP(g->strip[pc+OPND(s)]) == OOR2);
  964. FWD(aft, aft, OPND(s));
  965. }
  966. break;
  967. case O_CH: /* just empty */
  968. FWD(aft, aft, 1);
  969. break;
  970. default: /* ooooops... */
  971. assert(nope);
  972. break;
  973. }
  974. }
  975. return(aft);
  976. }
  977. #ifdef REDEBUG
  978. /*
  979. - print - print a set of states
  980. */
  981. static void
  982. print(struct match *m, const char *caption, states st, int ch, FILE *d)
  983. {
  984. struct re_guts *g = m->g;
  985. int i;
  986. int first = 1;
  987. if (!(m->eflags&REG_TRACE))
  988. return;
  989. (void)fprintf(d, "%s", caption);
  990. if (ch != '\0')
  991. (void)fprintf(d, " %s", pchar(ch));
  992. for (i = 0; i < g->nstates; i++)
  993. if (ISSET(st, i)) {
  994. (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
  995. first = 0;
  996. }
  997. (void)fprintf(d, "\n");
  998. }
  999. /*
  1000. - at - print current situation
  1001. */
  1002. static void
  1003. at(struct match *m, const char *title, const char *start, const char *stop,
  1004. sopno startst, sopno stopst)
  1005. {
  1006. if (!(m->eflags&REG_TRACE))
  1007. return;
  1008. (void)printf("%s %s-", title, pchar(*start));
  1009. (void)printf("%s ", pchar(*stop));
  1010. (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
  1011. }
  1012. #ifndef PCHARDONE
  1013. #define PCHARDONE /* never again */
  1014. /*
  1015. - pchar - make a character printable
  1016. *
  1017. * Is this identical to regchar() over in debug.c? Well, yes. But a
  1018. * duplicate here avoids having a debugging-capable regexec.o tied to
  1019. * a matching debug.o, and this is convenient. It all disappears in
  1020. * the non-debug compilation anyway, so it doesn't matter much.
  1021. */
  1022. static char * /* -> representation */
  1023. pchar(int ch)
  1024. {
  1025. static char pbuf[10];
  1026. if (isprint(ch) || ch == ' ')
  1027. (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
  1028. else
  1029. (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
  1030. return(pbuf);
  1031. }
  1032. #endif
  1033. #endif
  1034. #undef matcher
  1035. #undef fast
  1036. #undef slow
  1037. #undef dissect
  1038. #undef backref
  1039. #undef step
  1040. #undef print
  1041. #undef at
  1042. #undef match
  1043. #undef nope
  1044. #undef step_back