regcomp.c 39 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702
  1. /*-
  2. * This code is derived from OpenBSD's libc/regex, original license follows:
  3. *
  4. * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  5. * Copyright (c) 1992, 1993, 1994
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This code is derived from software contributed to Berkeley by
  9. * Henry Spencer.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. *
  35. * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
  36. */
  37. #include <sys/types.h>
  38. #include <stdint.h>
  39. #include <stdio.h>
  40. #include <string.h>
  41. #include <ctype.h>
  42. #include <limits.h>
  43. #include <stdlib.h>
  44. #include "regex_impl.h"
  45. #include "regutils.h"
  46. #include "regex2.h"
  47. #include "llvm/Config/config.h"
  48. #include "llvm/Support/Compiler.h"
  49. /* character-class table */
  50. static struct cclass {
  51. const char *name;
  52. const char *chars;
  53. const char *multis;
  54. } cclasses[] = {
  55. { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  56. 0123456789", ""} ,
  57. { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  58. ""} ,
  59. { "blank", " \t", ""} ,
  60. { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
  61. \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
  62. { "digit", "0123456789", ""} ,
  63. { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  64. 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  65. ""} ,
  66. { "lower", "abcdefghijklmnopqrstuvwxyz",
  67. ""} ,
  68. { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  69. 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
  70. ""} ,
  71. { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  72. ""} ,
  73. { "space", "\t\n\v\f\r ", ""} ,
  74. { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  75. ""} ,
  76. { "xdigit", "0123456789ABCDEFabcdef",
  77. ""} ,
  78. { NULL, 0, "" }
  79. };
  80. /* character-name table */
  81. static struct cname {
  82. const char *name;
  83. char code;
  84. } cnames[] = {
  85. { "NUL", '\0' },
  86. { "SOH", '\001' },
  87. { "STX", '\002' },
  88. { "ETX", '\003' },
  89. { "EOT", '\004' },
  90. { "ENQ", '\005' },
  91. { "ACK", '\006' },
  92. { "BEL", '\007' },
  93. { "alert", '\007' },
  94. { "BS", '\010' },
  95. { "backspace", '\b' },
  96. { "HT", '\011' },
  97. { "tab", '\t' },
  98. { "LF", '\012' },
  99. { "newline", '\n' },
  100. { "VT", '\013' },
  101. { "vertical-tab", '\v' },
  102. { "FF", '\014' },
  103. { "form-feed", '\f' },
  104. { "CR", '\015' },
  105. { "carriage-return", '\r' },
  106. { "SO", '\016' },
  107. { "SI", '\017' },
  108. { "DLE", '\020' },
  109. { "DC1", '\021' },
  110. { "DC2", '\022' },
  111. { "DC3", '\023' },
  112. { "DC4", '\024' },
  113. { "NAK", '\025' },
  114. { "SYN", '\026' },
  115. { "ETB", '\027' },
  116. { "CAN", '\030' },
  117. { "EM", '\031' },
  118. { "SUB", '\032' },
  119. { "ESC", '\033' },
  120. { "IS4", '\034' },
  121. { "FS", '\034' },
  122. { "IS3", '\035' },
  123. { "GS", '\035' },
  124. { "IS2", '\036' },
  125. { "RS", '\036' },
  126. { "IS1", '\037' },
  127. { "US", '\037' },
  128. { "space", ' ' },
  129. { "exclamation-mark", '!' },
  130. { "quotation-mark", '"' },
  131. { "number-sign", '#' },
  132. { "dollar-sign", '$' },
  133. { "percent-sign", '%' },
  134. { "ampersand", '&' },
  135. { "apostrophe", '\'' },
  136. { "left-parenthesis", '(' },
  137. { "right-parenthesis", ')' },
  138. { "asterisk", '*' },
  139. { "plus-sign", '+' },
  140. { "comma", ',' },
  141. { "hyphen", '-' },
  142. { "hyphen-minus", '-' },
  143. { "period", '.' },
  144. { "full-stop", '.' },
  145. { "slash", '/' },
  146. { "solidus", '/' },
  147. { "zero", '0' },
  148. { "one", '1' },
  149. { "two", '2' },
  150. { "three", '3' },
  151. { "four", '4' },
  152. { "five", '5' },
  153. { "six", '6' },
  154. { "seven", '7' },
  155. { "eight", '8' },
  156. { "nine", '9' },
  157. { "colon", ':' },
  158. { "semicolon", ';' },
  159. { "less-than-sign", '<' },
  160. { "equals-sign", '=' },
  161. { "greater-than-sign", '>' },
  162. { "question-mark", '?' },
  163. { "commercial-at", '@' },
  164. { "left-square-bracket", '[' },
  165. { "backslash", '\\' },
  166. { "reverse-solidus", '\\' },
  167. { "right-square-bracket", ']' },
  168. { "circumflex", '^' },
  169. { "circumflex-accent", '^' },
  170. { "underscore", '_' },
  171. { "low-line", '_' },
  172. { "grave-accent", '`' },
  173. { "left-brace", '{' },
  174. { "left-curly-bracket", '{' },
  175. { "vertical-line", '|' },
  176. { "right-brace", '}' },
  177. { "right-curly-bracket", '}' },
  178. { "tilde", '~' },
  179. { "DEL", '\177' },
  180. { NULL, 0 }
  181. };
  182. /*
  183. * parse structure, passed up and down to avoid global variables and
  184. * other clumsinesses
  185. */
  186. struct parse {
  187. char *next; /* next character in RE */
  188. char *end; /* end of string (-> NUL normally) */
  189. int error; /* has an error been seen? */
  190. sop *strip; /* malloced strip */
  191. sopno ssize; /* malloced strip size (allocated) */
  192. sopno slen; /* malloced strip length (used) */
  193. int ncsalloc; /* number of csets allocated */
  194. struct re_guts *g;
  195. # define NPAREN 10 /* we need to remember () 1-9 for back refs */
  196. sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
  197. sopno pend[NPAREN]; /* -> ) ([0] unused) */
  198. };
  199. static void p_ere(struct parse *, int);
  200. static void p_ere_exp(struct parse *);
  201. static void p_str(struct parse *);
  202. static void p_bre(struct parse *, int, int);
  203. static int p_simp_re(struct parse *, int);
  204. static int p_count(struct parse *);
  205. static void p_bracket(struct parse *);
  206. static void p_b_term(struct parse *, cset *);
  207. static void p_b_cclass(struct parse *, cset *);
  208. static void p_b_eclass(struct parse *, cset *);
  209. static char p_b_symbol(struct parse *);
  210. static char p_b_coll_elem(struct parse *, int);
  211. static char othercase(int);
  212. static void bothcases(struct parse *, int);
  213. static void ordinary(struct parse *, int);
  214. static void nonnewline(struct parse *);
  215. static void repeat(struct parse *, sopno, int, int);
  216. static int seterr(struct parse *, int);
  217. static cset *allocset(struct parse *);
  218. static void freeset(struct parse *, cset *);
  219. static int freezeset(struct parse *, cset *);
  220. static int firstch(struct parse *, cset *);
  221. static int nch(struct parse *, cset *);
  222. static void mcadd(struct parse *, cset *, const char *);
  223. static void mcinvert(struct parse *, cset *);
  224. static void mccase(struct parse *, cset *);
  225. static int isinsets(struct re_guts *, int);
  226. static int samesets(struct re_guts *, int, int);
  227. static void categorize(struct parse *, struct re_guts *);
  228. static sopno dupl(struct parse *, sopno, sopno);
  229. static void doemit(struct parse *, sop, size_t);
  230. static void doinsert(struct parse *, sop, size_t, sopno);
  231. static void dofwd(struct parse *, sopno, sop);
  232. static void enlarge(struct parse *, sopno);
  233. static void stripsnug(struct parse *, struct re_guts *);
  234. static void findmust(struct parse *, struct re_guts *);
  235. static sopno pluscount(struct parse *, struct re_guts *);
  236. static char nuls[10]; /* place to point scanner in event of error */
  237. /*
  238. * macros for use with parse structure
  239. * BEWARE: these know that the parse structure is named `p' !!!
  240. */
  241. #define PEEK() (*p->next)
  242. #define PEEK2() (*(p->next+1))
  243. #define MORE() (p->next < p->end)
  244. #define MORE2() (p->next+1 < p->end)
  245. #define SEE(c) (MORE() && PEEK() == (c))
  246. #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
  247. #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
  248. #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
  249. #define NEXT() (p->next++)
  250. #define NEXT2() (p->next += 2)
  251. #define NEXTn(n) (p->next += (n))
  252. #define GETNEXT() (*p->next++)
  253. #define SETERROR(e) seterr(p, (e))
  254. #define REQUIRE(co, e) (void)((co) || SETERROR(e))
  255. #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
  256. #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
  257. #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
  258. #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
  259. #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
  260. #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
  261. #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
  262. #define HERE() (p->slen)
  263. #define THERE() (p->slen - 1)
  264. #define THERETHERE() (p->slen - 2)
  265. #define DROP(n) (p->slen -= (n))
  266. #ifdef _POSIX2_RE_DUP_MAX
  267. #define DUPMAX _POSIX2_RE_DUP_MAX
  268. #else
  269. #define DUPMAX 255
  270. #endif
  271. #define INFINITY (DUPMAX + 1)
  272. #ifndef NDEBUG
  273. static int never = 0; /* for use in asserts; shuts lint up */
  274. #else
  275. #define never 0 /* some <assert.h>s have bugs too */
  276. #endif
  277. /*
  278. - llvm_regcomp - interface for parser and compilation
  279. */
  280. int /* 0 success, otherwise REG_something */
  281. llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
  282. {
  283. struct parse pa;
  284. struct re_guts *g;
  285. struct parse *p = &pa;
  286. int i;
  287. size_t len;
  288. #ifdef REDEBUG
  289. # define GOODFLAGS(f) (f)
  290. #else
  291. # define GOODFLAGS(f) ((f)&~REG_DUMP)
  292. #endif
  293. cflags = GOODFLAGS(cflags);
  294. if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
  295. return(REG_INVARG);
  296. if (cflags&REG_PEND) {
  297. if (preg->re_endp < pattern)
  298. return(REG_INVARG);
  299. len = preg->re_endp - pattern;
  300. } else
  301. len = strlen((const char *)pattern);
  302. /* do the mallocs early so failure handling is easy */
  303. g = (struct re_guts *)malloc(sizeof(struct re_guts) +
  304. (NC-1)*sizeof(cat_t));
  305. if (g == NULL)
  306. return(REG_ESPACE);
  307. p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
  308. p->strip = (sop *)calloc(p->ssize, sizeof(sop));
  309. p->slen = 0;
  310. if (p->strip == NULL) {
  311. free((char *)g);
  312. return(REG_ESPACE);
  313. }
  314. /* set things up */
  315. p->g = g;
  316. p->next = (char *)pattern; /* convenience; we do not modify it */
  317. p->end = p->next + len;
  318. p->error = 0;
  319. p->ncsalloc = 0;
  320. for (i = 0; i < NPAREN; i++) {
  321. p->pbegin[i] = 0;
  322. p->pend[i] = 0;
  323. }
  324. g->csetsize = NC;
  325. g->sets = NULL;
  326. g->setbits = NULL;
  327. g->ncsets = 0;
  328. g->cflags = cflags;
  329. g->iflags = 0;
  330. g->nbol = 0;
  331. g->neol = 0;
  332. g->must = NULL;
  333. g->mlen = 0;
  334. g->nsub = 0;
  335. g->ncategories = 1; /* category 0 is "everything else" */
  336. g->categories = &g->catspace[-(CHAR_MIN)];
  337. (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
  338. g->backrefs = 0;
  339. /* do it */
  340. EMIT(OEND, 0);
  341. g->firststate = THERE();
  342. if (cflags&REG_EXTENDED)
  343. p_ere(p, OUT);
  344. else if (cflags&REG_NOSPEC)
  345. p_str(p);
  346. else
  347. p_bre(p, OUT, OUT);
  348. EMIT(OEND, 0);
  349. g->laststate = THERE();
  350. /* tidy up loose ends and fill things in */
  351. categorize(p, g);
  352. stripsnug(p, g);
  353. findmust(p, g);
  354. g->nplus = pluscount(p, g);
  355. g->magic = MAGIC2;
  356. preg->re_nsub = g->nsub;
  357. preg->re_g = g;
  358. preg->re_magic = MAGIC1;
  359. #ifndef REDEBUG
  360. /* not debugging, so can't rely on the assert() in llvm_regexec() */
  361. if (g->iflags&REGEX_BAD)
  362. SETERROR(REG_ASSERT);
  363. #endif
  364. /* win or lose, we're done */
  365. if (p->error != 0) /* lose */
  366. llvm_regfree(preg);
  367. return(p->error);
  368. }
  369. /*
  370. - p_ere - ERE parser top level, concatenation and alternation
  371. */
  372. static void
  373. p_ere(struct parse *p, int stop) /* character this ERE should end at */
  374. {
  375. char c;
  376. sopno prevback = 0;
  377. sopno prevfwd = 0;
  378. sopno conc;
  379. int first = 1; /* is this the first alternative? */
  380. for (;;) {
  381. /* do a bunch of concatenated expressions */
  382. conc = HERE();
  383. while (MORE() && (c = PEEK()) != '|' && c != stop)
  384. p_ere_exp(p);
  385. REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
  386. if (!EAT('|'))
  387. break; /* NOTE BREAK OUT */
  388. if (first) {
  389. INSERT(OCH_, conc); /* offset is wrong */
  390. prevfwd = conc;
  391. prevback = conc;
  392. first = 0;
  393. }
  394. ASTERN(OOR1, prevback);
  395. prevback = THERE();
  396. AHEAD(prevfwd); /* fix previous offset */
  397. prevfwd = HERE();
  398. EMIT(OOR2, 0); /* offset is very wrong */
  399. }
  400. if (!first) { /* tail-end fixups */
  401. AHEAD(prevfwd);
  402. ASTERN(O_CH, prevback);
  403. }
  404. assert(!MORE() || SEE(stop));
  405. }
  406. /*
  407. - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
  408. */
  409. static void
  410. p_ere_exp(struct parse *p)
  411. {
  412. char c;
  413. sopno pos;
  414. int count;
  415. int count2;
  416. int backrefnum;
  417. sopno subno;
  418. int wascaret = 0;
  419. assert(MORE()); /* caller should have ensured this */
  420. c = GETNEXT();
  421. pos = HERE();
  422. switch (c) {
  423. case '(':
  424. REQUIRE(MORE(), REG_EPAREN);
  425. p->g->nsub++;
  426. subno = p->g->nsub;
  427. if (subno < NPAREN)
  428. p->pbegin[subno] = HERE();
  429. EMIT(OLPAREN, subno);
  430. if (!SEE(')'))
  431. p_ere(p, ')');
  432. if (subno < NPAREN) {
  433. p->pend[subno] = HERE();
  434. assert(p->pend[subno] != 0);
  435. }
  436. EMIT(ORPAREN, subno);
  437. MUSTEAT(')', REG_EPAREN);
  438. break;
  439. #ifndef POSIX_MISTAKE
  440. case ')': /* happens only if no current unmatched ( */
  441. /*
  442. * You may ask, why the ifndef? Because I didn't notice
  443. * this until slightly too late for 1003.2, and none of the
  444. * other 1003.2 regular-expression reviewers noticed it at
  445. * all. So an unmatched ) is legal POSIX, at least until
  446. * we can get it fixed.
  447. */
  448. SETERROR(REG_EPAREN);
  449. break;
  450. #endif
  451. case '^':
  452. EMIT(OBOL, 0);
  453. p->g->iflags |= USEBOL;
  454. p->g->nbol++;
  455. wascaret = 1;
  456. break;
  457. case '$':
  458. EMIT(OEOL, 0);
  459. p->g->iflags |= USEEOL;
  460. p->g->neol++;
  461. break;
  462. case '|':
  463. SETERROR(REG_EMPTY);
  464. break;
  465. case '*':
  466. case '+':
  467. case '?':
  468. SETERROR(REG_BADRPT);
  469. break;
  470. case '.':
  471. if (p->g->cflags&REG_NEWLINE)
  472. nonnewline(p);
  473. else
  474. EMIT(OANY, 0);
  475. break;
  476. case '[':
  477. p_bracket(p);
  478. break;
  479. case '\\':
  480. REQUIRE(MORE(), REG_EESCAPE);
  481. c = GETNEXT();
  482. if (c >= '1' && c <= '9') {
  483. /* \[0-9] is taken to be a back-reference to a previously specified
  484. * matching group. backrefnum will hold the number. The matching
  485. * group must exist (i.e. if \4 is found there must have been at
  486. * least 4 matching groups specified in the pattern previously).
  487. */
  488. backrefnum = c - '0';
  489. if (p->pend[backrefnum] == 0) {
  490. SETERROR(REG_ESUBREG);
  491. break;
  492. }
  493. /* Make sure everything checks out and emit the sequence
  494. * that marks a back-reference to the parse structure.
  495. */
  496. assert(backrefnum <= p->g->nsub);
  497. EMIT(OBACK_, backrefnum);
  498. assert(p->pbegin[backrefnum] != 0);
  499. assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
  500. assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
  501. (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
  502. EMIT(O_BACK, backrefnum);
  503. p->g->backrefs = 1;
  504. } else {
  505. /* Other chars are simply themselves when escaped with a backslash.
  506. */
  507. ordinary(p, c);
  508. }
  509. break;
  510. case '{': /* okay as ordinary except if digit follows */
  511. REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
  512. LLVM_FALLTHROUGH;
  513. default:
  514. ordinary(p, c);
  515. break;
  516. }
  517. if (!MORE())
  518. return;
  519. c = PEEK();
  520. /* we call { a repetition if followed by a digit */
  521. if (!( c == '*' || c == '+' || c == '?' ||
  522. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
  523. return; /* no repetition, we're done */
  524. NEXT();
  525. REQUIRE(!wascaret, REG_BADRPT);
  526. switch (c) {
  527. case '*': /* implemented as +? */
  528. /* this case does not require the (y|) trick, noKLUDGE */
  529. INSERT(OPLUS_, pos);
  530. ASTERN(O_PLUS, pos);
  531. INSERT(OQUEST_, pos);
  532. ASTERN(O_QUEST, pos);
  533. break;
  534. case '+':
  535. INSERT(OPLUS_, pos);
  536. ASTERN(O_PLUS, pos);
  537. break;
  538. case '?':
  539. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  540. INSERT(OCH_, pos); /* offset slightly wrong */
  541. ASTERN(OOR1, pos); /* this one's right */
  542. AHEAD(pos); /* fix the OCH_ */
  543. EMIT(OOR2, 0); /* offset very wrong... */
  544. AHEAD(THERE()); /* ...so fix it */
  545. ASTERN(O_CH, THERETHERE());
  546. break;
  547. case '{':
  548. count = p_count(p);
  549. if (EAT(',')) {
  550. if (isdigit((uch)PEEK())) {
  551. count2 = p_count(p);
  552. REQUIRE(count <= count2, REG_BADBR);
  553. } else /* single number with comma */
  554. count2 = INFINITY;
  555. } else /* just a single number */
  556. count2 = count;
  557. repeat(p, pos, count, count2);
  558. if (!EAT('}')) { /* error heuristics */
  559. while (MORE() && PEEK() != '}')
  560. NEXT();
  561. REQUIRE(MORE(), REG_EBRACE);
  562. SETERROR(REG_BADBR);
  563. }
  564. break;
  565. }
  566. if (!MORE())
  567. return;
  568. c = PEEK();
  569. if (!( c == '*' || c == '+' || c == '?' ||
  570. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
  571. return;
  572. SETERROR(REG_BADRPT);
  573. }
  574. /*
  575. - p_str - string (no metacharacters) "parser"
  576. */
  577. static void
  578. p_str(struct parse *p)
  579. {
  580. REQUIRE(MORE(), REG_EMPTY);
  581. while (MORE())
  582. ordinary(p, GETNEXT());
  583. }
  584. /*
  585. - p_bre - BRE parser top level, anchoring and concatenation
  586. * Giving end1 as OUT essentially eliminates the end1/end2 check.
  587. *
  588. * This implementation is a bit of a kludge, in that a trailing $ is first
  589. * taken as an ordinary character and then revised to be an anchor. The
  590. * only undesirable side effect is that '$' gets included as a character
  591. * category in such cases. This is fairly harmless; not worth fixing.
  592. * The amount of lookahead needed to avoid this kludge is excessive.
  593. */
  594. static void
  595. p_bre(struct parse *p,
  596. int end1, /* first terminating character */
  597. int end2) /* second terminating character */
  598. {
  599. sopno start = HERE();
  600. int first = 1; /* first subexpression? */
  601. int wasdollar = 0;
  602. if (EAT('^')) {
  603. EMIT(OBOL, 0);
  604. p->g->iflags |= USEBOL;
  605. p->g->nbol++;
  606. }
  607. while (MORE() && !SEETWO(end1, end2)) {
  608. wasdollar = p_simp_re(p, first);
  609. first = 0;
  610. }
  611. if (wasdollar) { /* oops, that was a trailing anchor */
  612. DROP(1);
  613. EMIT(OEOL, 0);
  614. p->g->iflags |= USEEOL;
  615. p->g->neol++;
  616. }
  617. REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
  618. }
  619. /*
  620. - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
  621. */
  622. static int /* was the simple RE an unbackslashed $? */
  623. p_simp_re(struct parse *p,
  624. int starordinary) /* is a leading * an ordinary character? */
  625. {
  626. int c;
  627. int count;
  628. int count2;
  629. sopno pos;
  630. int i;
  631. sopno subno;
  632. # define BACKSL (1<<CHAR_BIT)
  633. pos = HERE(); /* repetition op, if any, covers from here */
  634. assert(MORE()); /* caller should have ensured this */
  635. c = GETNEXT();
  636. if (c == '\\') {
  637. REQUIRE(MORE(), REG_EESCAPE);
  638. c = BACKSL | GETNEXT();
  639. }
  640. switch (c) {
  641. case '.':
  642. if (p->g->cflags&REG_NEWLINE)
  643. nonnewline(p);
  644. else
  645. EMIT(OANY, 0);
  646. break;
  647. case '[':
  648. p_bracket(p);
  649. break;
  650. case BACKSL|'{':
  651. SETERROR(REG_BADRPT);
  652. break;
  653. case BACKSL|'(':
  654. p->g->nsub++;
  655. subno = p->g->nsub;
  656. if (subno < NPAREN)
  657. p->pbegin[subno] = HERE();
  658. EMIT(OLPAREN, subno);
  659. /* the MORE here is an error heuristic */
  660. if (MORE() && !SEETWO('\\', ')'))
  661. p_bre(p, '\\', ')');
  662. if (subno < NPAREN) {
  663. p->pend[subno] = HERE();
  664. assert(p->pend[subno] != 0);
  665. }
  666. EMIT(ORPAREN, subno);
  667. REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
  668. break;
  669. case BACKSL|')': /* should not get here -- must be user */
  670. case BACKSL|'}':
  671. SETERROR(REG_EPAREN);
  672. break;
  673. case BACKSL|'1':
  674. case BACKSL|'2':
  675. case BACKSL|'3':
  676. case BACKSL|'4':
  677. case BACKSL|'5':
  678. case BACKSL|'6':
  679. case BACKSL|'7':
  680. case BACKSL|'8':
  681. case BACKSL|'9':
  682. i = (c&~BACKSL) - '0';
  683. assert(i < NPAREN);
  684. if (p->pend[i] != 0) {
  685. assert(i <= p->g->nsub);
  686. EMIT(OBACK_, i);
  687. assert(p->pbegin[i] != 0);
  688. assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
  689. assert(OP(p->strip[p->pend[i]]) == ORPAREN);
  690. (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
  691. EMIT(O_BACK, i);
  692. } else
  693. SETERROR(REG_ESUBREG);
  694. p->g->backrefs = 1;
  695. break;
  696. case '*':
  697. REQUIRE(starordinary, REG_BADRPT);
  698. LLVM_FALLTHROUGH;
  699. default:
  700. ordinary(p, (char)c);
  701. break;
  702. }
  703. if (EAT('*')) { /* implemented as +? */
  704. /* this case does not require the (y|) trick, noKLUDGE */
  705. INSERT(OPLUS_, pos);
  706. ASTERN(O_PLUS, pos);
  707. INSERT(OQUEST_, pos);
  708. ASTERN(O_QUEST, pos);
  709. } else if (EATTWO('\\', '{')) {
  710. count = p_count(p);
  711. if (EAT(',')) {
  712. if (MORE() && isdigit((uch)PEEK())) {
  713. count2 = p_count(p);
  714. REQUIRE(count <= count2, REG_BADBR);
  715. } else /* single number with comma */
  716. count2 = INFINITY;
  717. } else /* just a single number */
  718. count2 = count;
  719. repeat(p, pos, count, count2);
  720. if (!EATTWO('\\', '}')) { /* error heuristics */
  721. while (MORE() && !SEETWO('\\', '}'))
  722. NEXT();
  723. REQUIRE(MORE(), REG_EBRACE);
  724. SETERROR(REG_BADBR);
  725. }
  726. } else if (c == '$') /* $ (but not \$) ends it */
  727. return(1);
  728. return(0);
  729. }
  730. /*
  731. - p_count - parse a repetition count
  732. */
  733. static int /* the value */
  734. p_count(struct parse *p)
  735. {
  736. int count = 0;
  737. int ndigits = 0;
  738. while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
  739. count = count*10 + (GETNEXT() - '0');
  740. ndigits++;
  741. }
  742. REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
  743. return(count);
  744. }
  745. /*
  746. - p_bracket - parse a bracketed character list
  747. *
  748. * Note a significant property of this code: if the allocset() did SETERROR,
  749. * no set operations are done.
  750. */
  751. static void
  752. p_bracket(struct parse *p)
  753. {
  754. cset *cs;
  755. int invert = 0;
  756. /* Dept of Truly Sickening Special-Case Kludges */
  757. if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
  758. EMIT(OBOW, 0);
  759. NEXTn(6);
  760. return;
  761. }
  762. if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
  763. EMIT(OEOW, 0);
  764. NEXTn(6);
  765. return;
  766. }
  767. if ((cs = allocset(p)) == NULL) {
  768. /* allocset did set error status in p */
  769. return;
  770. }
  771. if (EAT('^'))
  772. invert++; /* make note to invert set at end */
  773. if (EAT(']'))
  774. CHadd(cs, ']');
  775. else if (EAT('-'))
  776. CHadd(cs, '-');
  777. while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
  778. p_b_term(p, cs);
  779. if (EAT('-'))
  780. CHadd(cs, '-');
  781. MUSTEAT(']', REG_EBRACK);
  782. if (p->error != 0) { /* don't mess things up further */
  783. freeset(p, cs);
  784. return;
  785. }
  786. if (p->g->cflags&REG_ICASE) {
  787. int i;
  788. int ci;
  789. for (i = p->g->csetsize - 1; i >= 0; i--)
  790. if (CHIN(cs, i) && isalpha(i)) {
  791. ci = othercase(i);
  792. if (ci != i)
  793. CHadd(cs, ci);
  794. }
  795. if (cs->multis != NULL)
  796. mccase(p, cs);
  797. }
  798. if (invert) {
  799. int i;
  800. for (i = p->g->csetsize - 1; i >= 0; i--)
  801. if (CHIN(cs, i))
  802. CHsub(cs, i);
  803. else
  804. CHadd(cs, i);
  805. if (p->g->cflags&REG_NEWLINE)
  806. CHsub(cs, '\n');
  807. if (cs->multis != NULL)
  808. mcinvert(p, cs);
  809. }
  810. assert(cs->multis == NULL); /* xxx */
  811. if (nch(p, cs) == 1) { /* optimize singleton sets */
  812. ordinary(p, firstch(p, cs));
  813. freeset(p, cs);
  814. } else
  815. EMIT(OANYOF, freezeset(p, cs));
  816. }
  817. /*
  818. - p_b_term - parse one term of a bracketed character list
  819. */
  820. static void
  821. p_b_term(struct parse *p, cset *cs)
  822. {
  823. char c;
  824. char start, finish;
  825. int i;
  826. /* classify what we've got */
  827. switch ((MORE()) ? PEEK() : '\0') {
  828. case '[':
  829. c = (MORE2()) ? PEEK2() : '\0';
  830. break;
  831. case '-':
  832. SETERROR(REG_ERANGE);
  833. return; /* NOTE RETURN */
  834. break;
  835. default:
  836. c = '\0';
  837. break;
  838. }
  839. switch (c) {
  840. case ':': /* character class */
  841. NEXT2();
  842. REQUIRE(MORE(), REG_EBRACK);
  843. c = PEEK();
  844. REQUIRE(c != '-' && c != ']', REG_ECTYPE);
  845. p_b_cclass(p, cs);
  846. REQUIRE(MORE(), REG_EBRACK);
  847. REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
  848. break;
  849. case '=': /* equivalence class */
  850. NEXT2();
  851. REQUIRE(MORE(), REG_EBRACK);
  852. c = PEEK();
  853. REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
  854. p_b_eclass(p, cs);
  855. REQUIRE(MORE(), REG_EBRACK);
  856. REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
  857. break;
  858. default: /* symbol, ordinary character, or range */
  859. /* xxx revision needed for multichar stuff */
  860. start = p_b_symbol(p);
  861. if (SEE('-') && MORE2() && PEEK2() != ']') {
  862. /* range */
  863. NEXT();
  864. if (EAT('-'))
  865. finish = '-';
  866. else
  867. finish = p_b_symbol(p);
  868. } else
  869. finish = start;
  870. /* xxx what about signed chars here... */
  871. REQUIRE(start <= finish, REG_ERANGE);
  872. for (i = start; i <= finish; i++)
  873. CHadd(cs, i);
  874. break;
  875. }
  876. }
  877. /*
  878. - p_b_cclass - parse a character-class name and deal with it
  879. */
  880. static void
  881. p_b_cclass(struct parse *p, cset *cs)
  882. {
  883. char *sp = p->next;
  884. struct cclass *cp;
  885. size_t len;
  886. const char *u;
  887. char c;
  888. while (MORE() && isalpha((uch)PEEK()))
  889. NEXT();
  890. len = p->next - sp;
  891. for (cp = cclasses; cp->name != NULL; cp++)
  892. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  893. break;
  894. if (cp->name == NULL) {
  895. /* oops, didn't find it */
  896. SETERROR(REG_ECTYPE);
  897. return;
  898. }
  899. u = cp->chars;
  900. while ((c = *u++) != '\0')
  901. CHadd(cs, c);
  902. for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
  903. MCadd(p, cs, u);
  904. }
  905. /*
  906. - p_b_eclass - parse an equivalence-class name and deal with it
  907. *
  908. * This implementation is incomplete. xxx
  909. */
  910. static void
  911. p_b_eclass(struct parse *p, cset *cs)
  912. {
  913. char c;
  914. c = p_b_coll_elem(p, '=');
  915. CHadd(cs, c);
  916. }
  917. /*
  918. - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
  919. */
  920. static char /* value of symbol */
  921. p_b_symbol(struct parse *p)
  922. {
  923. char value;
  924. REQUIRE(MORE(), REG_EBRACK);
  925. if (!EATTWO('[', '.'))
  926. return(GETNEXT());
  927. /* collating symbol */
  928. value = p_b_coll_elem(p, '.');
  929. REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
  930. return(value);
  931. }
  932. /*
  933. - p_b_coll_elem - parse a collating-element name and look it up
  934. */
  935. static char /* value of collating element */
  936. p_b_coll_elem(struct parse *p,
  937. int endc) /* name ended by endc,']' */
  938. {
  939. char *sp = p->next;
  940. struct cname *cp;
  941. size_t len;
  942. while (MORE() && !SEETWO(endc, ']'))
  943. NEXT();
  944. if (!MORE()) {
  945. SETERROR(REG_EBRACK);
  946. return(0);
  947. }
  948. len = p->next - sp;
  949. for (cp = cnames; cp->name != NULL; cp++)
  950. if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
  951. return(cp->code); /* known name */
  952. if (len == 1)
  953. return(*sp); /* single character */
  954. SETERROR(REG_ECOLLATE); /* neither */
  955. return(0);
  956. }
  957. /*
  958. - othercase - return the case counterpart of an alphabetic
  959. */
  960. static char /* if no counterpart, return ch */
  961. othercase(int ch)
  962. {
  963. ch = (uch)ch;
  964. assert(isalpha(ch));
  965. if (isupper(ch))
  966. return ((uch)tolower(ch));
  967. else if (islower(ch))
  968. return ((uch)toupper(ch));
  969. else /* peculiar, but could happen */
  970. return(ch);
  971. }
  972. /*
  973. - bothcases - emit a dualcase version of a two-case character
  974. *
  975. * Boy, is this implementation ever a kludge...
  976. */
  977. static void
  978. bothcases(struct parse *p, int ch)
  979. {
  980. char *oldnext = p->next;
  981. char *oldend = p->end;
  982. char bracket[3];
  983. ch = (uch)ch;
  984. assert(othercase(ch) != ch); /* p_bracket() would recurse */
  985. p->next = bracket;
  986. p->end = bracket+2;
  987. bracket[0] = ch;
  988. bracket[1] = ']';
  989. bracket[2] = '\0';
  990. p_bracket(p);
  991. assert(p->next == bracket+2);
  992. p->next = oldnext;
  993. p->end = oldend;
  994. }
  995. /*
  996. - ordinary - emit an ordinary character
  997. */
  998. static void
  999. ordinary(struct parse *p, int ch)
  1000. {
  1001. cat_t *cap = p->g->categories;
  1002. if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
  1003. bothcases(p, ch);
  1004. else {
  1005. EMIT(OCHAR, (uch)ch);
  1006. if (cap[ch] == 0)
  1007. cap[ch] = p->g->ncategories++;
  1008. }
  1009. }
  1010. /*
  1011. - nonnewline - emit REG_NEWLINE version of OANY
  1012. *
  1013. * Boy, is this implementation ever a kludge...
  1014. */
  1015. static void
  1016. nonnewline(struct parse *p)
  1017. {
  1018. char *oldnext = p->next;
  1019. char *oldend = p->end;
  1020. char bracket[4];
  1021. p->next = bracket;
  1022. p->end = bracket+3;
  1023. bracket[0] = '^';
  1024. bracket[1] = '\n';
  1025. bracket[2] = ']';
  1026. bracket[3] = '\0';
  1027. p_bracket(p);
  1028. assert(p->next == bracket+3);
  1029. p->next = oldnext;
  1030. p->end = oldend;
  1031. }
  1032. /*
  1033. - repeat - generate code for a bounded repetition, recursively if needed
  1034. */
  1035. static void
  1036. repeat(struct parse *p,
  1037. sopno start, /* operand from here to end of strip */
  1038. int from, /* repeated from this number */
  1039. int to) /* to this number of times (maybe INFINITY) */
  1040. {
  1041. sopno finish = HERE();
  1042. # define N 2
  1043. # define INF 3
  1044. # define REP(f, t) ((f)*8 + (t))
  1045. # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
  1046. sopno copy;
  1047. if (p->error != 0) /* head off possible runaway recursion */
  1048. return;
  1049. assert(from <= to);
  1050. switch (REP(MAP(from), MAP(to))) {
  1051. case REP(0, 0): /* must be user doing this */
  1052. DROP(finish-start); /* drop the operand */
  1053. break;
  1054. case REP(0, 1): /* as x{1,1}? */
  1055. case REP(0, N): /* as x{1,n}? */
  1056. case REP(0, INF): /* as x{1,}? */
  1057. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  1058. INSERT(OCH_, start); /* offset is wrong... */
  1059. repeat(p, start+1, 1, to);
  1060. ASTERN(OOR1, start);
  1061. AHEAD(start); /* ... fix it */
  1062. EMIT(OOR2, 0);
  1063. AHEAD(THERE());
  1064. ASTERN(O_CH, THERETHERE());
  1065. break;
  1066. case REP(1, 1): /* trivial case */
  1067. /* done */
  1068. break;
  1069. case REP(1, N): /* as x?x{1,n-1} */
  1070. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  1071. INSERT(OCH_, start);
  1072. ASTERN(OOR1, start);
  1073. AHEAD(start);
  1074. EMIT(OOR2, 0); /* offset very wrong... */
  1075. AHEAD(THERE()); /* ...so fix it */
  1076. ASTERN(O_CH, THERETHERE());
  1077. copy = dupl(p, start+1, finish+1);
  1078. assert(copy == finish+4);
  1079. repeat(p, copy, 1, to-1);
  1080. break;
  1081. case REP(1, INF): /* as x+ */
  1082. INSERT(OPLUS_, start);
  1083. ASTERN(O_PLUS, start);
  1084. break;
  1085. case REP(N, N): /* as xx{m-1,n-1} */
  1086. copy = dupl(p, start, finish);
  1087. repeat(p, copy, from-1, to-1);
  1088. break;
  1089. case REP(N, INF): /* as xx{n-1,INF} */
  1090. copy = dupl(p, start, finish);
  1091. repeat(p, copy, from-1, to);
  1092. break;
  1093. default: /* "can't happen" */
  1094. SETERROR(REG_ASSERT); /* just in case */
  1095. break;
  1096. }
  1097. }
  1098. /*
  1099. - seterr - set an error condition
  1100. */
  1101. static int /* useless but makes type checking happy */
  1102. seterr(struct parse *p, int e)
  1103. {
  1104. if (p->error == 0) /* keep earliest error condition */
  1105. p->error = e;
  1106. p->next = nuls; /* try to bring things to a halt */
  1107. p->end = nuls;
  1108. return(0); /* make the return value well-defined */
  1109. }
  1110. /*
  1111. - allocset - allocate a set of characters for []
  1112. */
  1113. static cset *
  1114. allocset(struct parse *p)
  1115. {
  1116. int no = p->g->ncsets++;
  1117. size_t nc;
  1118. size_t nbytes;
  1119. cset *cs;
  1120. size_t css = (size_t)p->g->csetsize;
  1121. int i;
  1122. if (no >= p->ncsalloc) { /* need another column of space */
  1123. void *ptr;
  1124. p->ncsalloc += CHAR_BIT;
  1125. nc = p->ncsalloc;
  1126. if (nc > SIZE_MAX / sizeof(cset))
  1127. goto nomem;
  1128. assert(nc % CHAR_BIT == 0);
  1129. nbytes = nc / CHAR_BIT * css;
  1130. ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
  1131. if (ptr == NULL)
  1132. goto nomem;
  1133. p->g->sets = ptr;
  1134. ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
  1135. if (ptr == NULL)
  1136. goto nomem;
  1137. p->g->setbits = ptr;
  1138. for (i = 0; i < no; i++)
  1139. p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
  1140. (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
  1141. }
  1142. /* XXX should not happen */
  1143. if (p->g->sets == NULL || p->g->setbits == NULL)
  1144. goto nomem;
  1145. cs = &p->g->sets[no];
  1146. cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
  1147. cs->mask = 1 << ((no) % CHAR_BIT);
  1148. cs->hash = 0;
  1149. cs->smultis = 0;
  1150. cs->multis = NULL;
  1151. return(cs);
  1152. nomem:
  1153. free(p->g->sets);
  1154. p->g->sets = NULL;
  1155. free(p->g->setbits);
  1156. p->g->setbits = NULL;
  1157. SETERROR(REG_ESPACE);
  1158. /* caller's responsibility not to do set ops */
  1159. return(NULL);
  1160. }
  1161. /*
  1162. - freeset - free a now-unused set
  1163. */
  1164. static void
  1165. freeset(struct parse *p, cset *cs)
  1166. {
  1167. size_t i;
  1168. cset *top = &p->g->sets[p->g->ncsets];
  1169. size_t css = (size_t)p->g->csetsize;
  1170. for (i = 0; i < css; i++)
  1171. CHsub(cs, i);
  1172. if (cs == top-1) /* recover only the easy case */
  1173. p->g->ncsets--;
  1174. }
  1175. /*
  1176. - freezeset - final processing on a set of characters
  1177. *
  1178. * The main task here is merging identical sets. This is usually a waste
  1179. * of time (although the hash code minimizes the overhead), but can win
  1180. * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
  1181. * is done using addition rather than xor -- all ASCII [aA] sets xor to
  1182. * the same value!
  1183. */
  1184. static int /* set number */
  1185. freezeset(struct parse *p, cset *cs)
  1186. {
  1187. uch h = cs->hash;
  1188. size_t i;
  1189. cset *top = &p->g->sets[p->g->ncsets];
  1190. cset *cs2;
  1191. size_t css = (size_t)p->g->csetsize;
  1192. /* look for an earlier one which is the same */
  1193. for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
  1194. if (cs2->hash == h && cs2 != cs) {
  1195. /* maybe */
  1196. for (i = 0; i < css; i++)
  1197. if (!!CHIN(cs2, i) != !!CHIN(cs, i))
  1198. break; /* no */
  1199. if (i == css)
  1200. break; /* yes */
  1201. }
  1202. if (cs2 < top) { /* found one */
  1203. freeset(p, cs);
  1204. cs = cs2;
  1205. }
  1206. return((int)(cs - p->g->sets));
  1207. }
  1208. /*
  1209. - firstch - return first character in a set (which must have at least one)
  1210. */
  1211. static int /* character; there is no "none" value */
  1212. firstch(struct parse *p, cset *cs)
  1213. {
  1214. size_t i;
  1215. size_t css = (size_t)p->g->csetsize;
  1216. for (i = 0; i < css; i++)
  1217. if (CHIN(cs, i))
  1218. return((char)i);
  1219. assert(never);
  1220. return(0); /* arbitrary */
  1221. }
  1222. /*
  1223. - nch - number of characters in a set
  1224. */
  1225. static int
  1226. nch(struct parse *p, cset *cs)
  1227. {
  1228. size_t i;
  1229. size_t css = (size_t)p->g->csetsize;
  1230. int n = 0;
  1231. for (i = 0; i < css; i++)
  1232. if (CHIN(cs, i))
  1233. n++;
  1234. return(n);
  1235. }
  1236. /*
  1237. - mcadd - add a collating element to a cset
  1238. */
  1239. static void
  1240. mcadd( struct parse *p, cset *cs, const char *cp)
  1241. {
  1242. size_t oldend = cs->smultis;
  1243. void *np;
  1244. cs->smultis += strlen(cp) + 1;
  1245. np = realloc(cs->multis, cs->smultis);
  1246. if (np == NULL) {
  1247. if (cs->multis)
  1248. free(cs->multis);
  1249. cs->multis = NULL;
  1250. SETERROR(REG_ESPACE);
  1251. return;
  1252. }
  1253. cs->multis = np;
  1254. llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
  1255. }
  1256. /*
  1257. - mcinvert - invert the list of collating elements in a cset
  1258. *
  1259. * This would have to know the set of possibilities. Implementation
  1260. * is deferred.
  1261. */
  1262. /* ARGSUSED */
  1263. static void
  1264. mcinvert(struct parse *p, cset *cs)
  1265. {
  1266. assert(cs->multis == NULL); /* xxx */
  1267. }
  1268. /*
  1269. - mccase - add case counterparts of the list of collating elements in a cset
  1270. *
  1271. * This would have to know the set of possibilities. Implementation
  1272. * is deferred.
  1273. */
  1274. /* ARGSUSED */
  1275. static void
  1276. mccase(struct parse *p, cset *cs)
  1277. {
  1278. assert(cs->multis == NULL); /* xxx */
  1279. }
  1280. /*
  1281. - isinsets - is this character in any sets?
  1282. */
  1283. static int /* predicate */
  1284. isinsets(struct re_guts *g, int c)
  1285. {
  1286. uch *col;
  1287. int i;
  1288. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1289. unsigned uc = (uch)c;
  1290. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1291. if (col[uc] != 0)
  1292. return(1);
  1293. return(0);
  1294. }
  1295. /*
  1296. - samesets - are these two characters in exactly the same sets?
  1297. */
  1298. static int /* predicate */
  1299. samesets(struct re_guts *g, int c1, int c2)
  1300. {
  1301. uch *col;
  1302. int i;
  1303. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1304. unsigned uc1 = (uch)c1;
  1305. unsigned uc2 = (uch)c2;
  1306. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1307. if (col[uc1] != col[uc2])
  1308. return(0);
  1309. return(1);
  1310. }
  1311. /*
  1312. - categorize - sort out character categories
  1313. */
  1314. static void
  1315. categorize(struct parse *p, struct re_guts *g)
  1316. {
  1317. cat_t *cats = g->categories;
  1318. int c;
  1319. int c2;
  1320. cat_t cat;
  1321. /* avoid making error situations worse */
  1322. if (p->error != 0)
  1323. return;
  1324. for (c = CHAR_MIN; c <= CHAR_MAX; c++)
  1325. if (cats[c] == 0 && isinsets(g, c)) {
  1326. cat = g->ncategories++;
  1327. cats[c] = cat;
  1328. for (c2 = c+1; c2 <= CHAR_MAX; c2++)
  1329. if (cats[c2] == 0 && samesets(g, c, c2))
  1330. cats[c2] = cat;
  1331. }
  1332. }
  1333. /*
  1334. - dupl - emit a duplicate of a bunch of sops
  1335. */
  1336. static sopno /* start of duplicate */
  1337. dupl(struct parse *p,
  1338. sopno start, /* from here */
  1339. sopno finish) /* to this less one */
  1340. {
  1341. sopno ret = HERE();
  1342. sopno len = finish - start;
  1343. assert(finish >= start);
  1344. if (len == 0)
  1345. return(ret);
  1346. enlarge(p, p->ssize + len); /* this many unexpected additions */
  1347. assert(p->ssize >= p->slen + len);
  1348. (void) memmove((char *)(p->strip + p->slen),
  1349. (char *)(p->strip + start), (size_t)len*sizeof(sop));
  1350. p->slen += len;
  1351. return(ret);
  1352. }
  1353. /*
  1354. - doemit - emit a strip operator
  1355. *
  1356. * It might seem better to implement this as a macro with a function as
  1357. * hard-case backup, but it's just too big and messy unless there are
  1358. * some changes to the data structures. Maybe later.
  1359. */
  1360. static void
  1361. doemit(struct parse *p, sop op, size_t opnd)
  1362. {
  1363. /* avoid making error situations worse */
  1364. if (p->error != 0)
  1365. return;
  1366. /* deal with oversize operands ("can't happen", more or less) */
  1367. assert(opnd < 1<<OPSHIFT);
  1368. /* deal with undersized strip */
  1369. if (p->slen >= p->ssize)
  1370. enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
  1371. assert(p->slen < p->ssize);
  1372. /* finally, it's all reduced to the easy case */
  1373. p->strip[p->slen++] = SOP(op, opnd);
  1374. }
  1375. /*
  1376. - doinsert - insert a sop into the strip
  1377. */
  1378. static void
  1379. doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
  1380. {
  1381. sopno sn;
  1382. sop s;
  1383. int i;
  1384. /* avoid making error situations worse */
  1385. if (p->error != 0)
  1386. return;
  1387. sn = HERE();
  1388. EMIT(op, opnd); /* do checks, ensure space */
  1389. assert(HERE() == sn+1);
  1390. s = p->strip[sn];
  1391. /* adjust paren pointers */
  1392. assert(pos > 0);
  1393. for (i = 1; i < NPAREN; i++) {
  1394. if (p->pbegin[i] >= pos) {
  1395. p->pbegin[i]++;
  1396. }
  1397. if (p->pend[i] >= pos) {
  1398. p->pend[i]++;
  1399. }
  1400. }
  1401. memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
  1402. (HERE()-pos-1)*sizeof(sop));
  1403. p->strip[pos] = s;
  1404. }
  1405. /*
  1406. - dofwd - complete a forward reference
  1407. */
  1408. static void
  1409. dofwd(struct parse *p, sopno pos, sop value)
  1410. {
  1411. /* avoid making error situations worse */
  1412. if (p->error != 0)
  1413. return;
  1414. assert(value < 1<<OPSHIFT);
  1415. p->strip[pos] = OP(p->strip[pos]) | value;
  1416. }
  1417. /*
  1418. - enlarge - enlarge the strip
  1419. */
  1420. static void
  1421. enlarge(struct parse *p, sopno size)
  1422. {
  1423. sop *sp;
  1424. if (p->ssize >= size)
  1425. return;
  1426. if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
  1427. SETERROR(REG_ESPACE);
  1428. return;
  1429. }
  1430. sp = (sop *)realloc(p->strip, size*sizeof(sop));
  1431. if (sp == NULL) {
  1432. SETERROR(REG_ESPACE);
  1433. return;
  1434. }
  1435. p->strip = sp;
  1436. p->ssize = size;
  1437. }
  1438. /*
  1439. - stripsnug - compact the strip
  1440. */
  1441. static void
  1442. stripsnug(struct parse *p, struct re_guts *g)
  1443. {
  1444. g->nstates = p->slen;
  1445. if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
  1446. g->strip = p->strip;
  1447. SETERROR(REG_ESPACE);
  1448. return;
  1449. }
  1450. g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
  1451. if (g->strip == NULL) {
  1452. SETERROR(REG_ESPACE);
  1453. g->strip = p->strip;
  1454. }
  1455. }
  1456. /*
  1457. - findmust - fill in must and mlen with longest mandatory literal string
  1458. *
  1459. * This algorithm could do fancy things like analyzing the operands of |
  1460. * for common subsequences. Someday. This code is simple and finds most
  1461. * of the interesting cases.
  1462. *
  1463. * Note that must and mlen got initialized during setup.
  1464. */
  1465. static void
  1466. findmust(struct parse *p, struct re_guts *g)
  1467. {
  1468. sop *scan;
  1469. sop *start = 0; /* start initialized in the default case, after that */
  1470. sop *newstart = 0; /* newstart was initialized in the OCHAR case */
  1471. sopno newlen;
  1472. sop s;
  1473. char *cp;
  1474. sopno i;
  1475. /* avoid making error situations worse */
  1476. if (p->error != 0)
  1477. return;
  1478. /* find the longest OCHAR sequence in strip */
  1479. newlen = 0;
  1480. scan = g->strip + 1;
  1481. do {
  1482. s = *scan++;
  1483. switch (OP(s)) {
  1484. case OCHAR: /* sequence member */
  1485. if (newlen == 0) /* new sequence */
  1486. newstart = scan - 1;
  1487. newlen++;
  1488. break;
  1489. case OPLUS_: /* things that don't break one */
  1490. case OLPAREN:
  1491. case ORPAREN:
  1492. break;
  1493. case OQUEST_: /* things that must be skipped */
  1494. case OCH_:
  1495. scan--;
  1496. do {
  1497. scan += OPND(s);
  1498. s = *scan;
  1499. /* assert() interferes w debug printouts */
  1500. if (OP(s) != O_QUEST && OP(s) != O_CH &&
  1501. OP(s) != OOR2) {
  1502. g->iflags |= REGEX_BAD;
  1503. return;
  1504. }
  1505. } while (OP(s) != O_QUEST && OP(s) != O_CH);
  1506. LLVM_FALLTHROUGH;
  1507. default: /* things that break a sequence */
  1508. if (newlen > g->mlen) { /* ends one */
  1509. start = newstart;
  1510. g->mlen = newlen;
  1511. }
  1512. newlen = 0;
  1513. break;
  1514. }
  1515. } while (OP(s) != OEND);
  1516. if (g->mlen == 0) /* there isn't one */
  1517. return;
  1518. /* turn it into a character string */
  1519. g->must = malloc((size_t)g->mlen + 1);
  1520. if (g->must == NULL) { /* argh; just forget it */
  1521. g->mlen = 0;
  1522. return;
  1523. }
  1524. cp = g->must;
  1525. scan = start;
  1526. for (i = g->mlen; i > 0; i--) {
  1527. while (OP(s = *scan++) != OCHAR)
  1528. continue;
  1529. assert(cp < g->must + g->mlen);
  1530. *cp++ = (char)OPND(s);
  1531. }
  1532. assert(cp == g->must + g->mlen);
  1533. *cp++ = '\0'; /* just on general principles */
  1534. }
  1535. /*
  1536. - pluscount - count + nesting
  1537. */
  1538. static sopno /* nesting depth */
  1539. pluscount(struct parse *p, struct re_guts *g)
  1540. {
  1541. sop *scan;
  1542. sop s;
  1543. sopno plusnest = 0;
  1544. sopno maxnest = 0;
  1545. if (p->error != 0)
  1546. return(0); /* there may not be an OEND */
  1547. scan = g->strip + 1;
  1548. do {
  1549. s = *scan++;
  1550. switch (OP(s)) {
  1551. case OPLUS_:
  1552. plusnest++;
  1553. break;
  1554. case O_PLUS:
  1555. if (plusnest > maxnest)
  1556. maxnest = plusnest;
  1557. plusnest--;
  1558. break;
  1559. }
  1560. } while (OP(s) != OEND);
  1561. if (plusnest != 0)
  1562. g->iflags |= REGEX_BAD;
  1563. return(maxnest);
  1564. }