regcomp.c 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704
  1. /*-
  2. * This code is derived from OpenBSD's libc/regex, original license follows:
  3. *
  4. * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  5. * Copyright (c) 1992, 1993, 1994
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This code is derived from software contributed to Berkeley by
  9. * Henry Spencer.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. *
  35. * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
  36. */
  37. #include <sys/types.h>
  38. #include <stdint.h>
  39. #include <stdio.h>
  40. #include <string.h>
  41. #include <ctype.h>
  42. #include <limits.h>
  43. #include <stdlib.h>
  44. #include "regex_impl.h"
  45. #include "regutils.h"
  46. #include "regex2.h"
  47. #include "llvm/Config/config.h"
  48. #include "llvm/Support/Compiler.h"
  49. /* character-class table */
  50. static struct cclass {
  51. const char *name;
  52. const char *chars;
  53. const char *multis;
  54. } cclasses[] = {
  55. { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  56. 0123456789", ""} ,
  57. { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
  58. ""} ,
  59. { "blank", " \t", ""} ,
  60. { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
  61. \25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
  62. { "digit", "0123456789", ""} ,
  63. { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  64. 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  65. ""} ,
  66. { "lower", "abcdefghijklmnopqrstuvwxyz",
  67. ""} ,
  68. { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
  69. 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
  70. ""} ,
  71. { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
  72. ""} ,
  73. { "space", "\t\n\v\f\r ", ""} ,
  74. { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  75. ""} ,
  76. { "xdigit", "0123456789ABCDEFabcdef",
  77. ""} ,
  78. { NULL, 0, "" }
  79. };
  80. /* character-name table */
  81. static struct cname {
  82. const char *name;
  83. char code;
  84. } cnames[] = {
  85. { "NUL", '\0' },
  86. { "SOH", '\001' },
  87. { "STX", '\002' },
  88. { "ETX", '\003' },
  89. { "EOT", '\004' },
  90. { "ENQ", '\005' },
  91. { "ACK", '\006' },
  92. { "BEL", '\007' },
  93. { "alert", '\007' },
  94. { "BS", '\010' },
  95. { "backspace", '\b' },
  96. { "HT", '\011' },
  97. { "tab", '\t' },
  98. { "LF", '\012' },
  99. { "newline", '\n' },
  100. { "VT", '\013' },
  101. { "vertical-tab", '\v' },
  102. { "FF", '\014' },
  103. { "form-feed", '\f' },
  104. { "CR", '\015' },
  105. { "carriage-return", '\r' },
  106. { "SO", '\016' },
  107. { "SI", '\017' },
  108. { "DLE", '\020' },
  109. { "DC1", '\021' },
  110. { "DC2", '\022' },
  111. { "DC3", '\023' },
  112. { "DC4", '\024' },
  113. { "NAK", '\025' },
  114. { "SYN", '\026' },
  115. { "ETB", '\027' },
  116. { "CAN", '\030' },
  117. { "EM", '\031' },
  118. { "SUB", '\032' },
  119. { "ESC", '\033' },
  120. { "IS4", '\034' },
  121. { "FS", '\034' },
  122. { "IS3", '\035' },
  123. { "GS", '\035' },
  124. { "IS2", '\036' },
  125. { "RS", '\036' },
  126. { "IS1", '\037' },
  127. { "US", '\037' },
  128. { "space", ' ' },
  129. { "exclamation-mark", '!' },
  130. { "quotation-mark", '"' },
  131. { "number-sign", '#' },
  132. { "dollar-sign", '$' },
  133. { "percent-sign", '%' },
  134. { "ampersand", '&' },
  135. { "apostrophe", '\'' },
  136. { "left-parenthesis", '(' },
  137. { "right-parenthesis", ')' },
  138. { "asterisk", '*' },
  139. { "plus-sign", '+' },
  140. { "comma", ',' },
  141. { "hyphen", '-' },
  142. { "hyphen-minus", '-' },
  143. { "period", '.' },
  144. { "full-stop", '.' },
  145. { "slash", '/' },
  146. { "solidus", '/' },
  147. { "zero", '0' },
  148. { "one", '1' },
  149. { "two", '2' },
  150. { "three", '3' },
  151. { "four", '4' },
  152. { "five", '5' },
  153. { "six", '6' },
  154. { "seven", '7' },
  155. { "eight", '8' },
  156. { "nine", '9' },
  157. { "colon", ':' },
  158. { "semicolon", ';' },
  159. { "less-than-sign", '<' },
  160. { "equals-sign", '=' },
  161. { "greater-than-sign", '>' },
  162. { "question-mark", '?' },
  163. { "commercial-at", '@' },
  164. { "left-square-bracket", '[' },
  165. { "backslash", '\\' },
  166. { "reverse-solidus", '\\' },
  167. { "right-square-bracket", ']' },
  168. { "circumflex", '^' },
  169. { "circumflex-accent", '^' },
  170. { "underscore", '_' },
  171. { "low-line", '_' },
  172. { "grave-accent", '`' },
  173. { "left-brace", '{' },
  174. { "left-curly-bracket", '{' },
  175. { "vertical-line", '|' },
  176. { "right-brace", '}' },
  177. { "right-curly-bracket", '}' },
  178. { "tilde", '~' },
  179. { "DEL", '\177' },
  180. { NULL, 0 }
  181. };
  182. /*
  183. * parse structure, passed up and down to avoid global variables and
  184. * other clumsinesses
  185. */
  186. struct parse {
  187. char *next; /* next character in RE */
  188. char *end; /* end of string (-> NUL normally) */
  189. int error; /* has an error been seen? */
  190. sop *strip; /* malloced strip */
  191. sopno ssize; /* malloced strip size (allocated) */
  192. sopno slen; /* malloced strip length (used) */
  193. int ncsalloc; /* number of csets allocated */
  194. struct re_guts *g;
  195. # define NPAREN 10 /* we need to remember () 1-9 for back refs */
  196. sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
  197. sopno pend[NPAREN]; /* -> ) ([0] unused) */
  198. };
  199. static void p_ere(struct parse *, int);
  200. static void p_ere_exp(struct parse *);
  201. static void p_str(struct parse *);
  202. static void p_bre(struct parse *, int, int);
  203. static int p_simp_re(struct parse *, int);
  204. static int p_count(struct parse *);
  205. static void p_bracket(struct parse *);
  206. static void p_b_term(struct parse *, cset *);
  207. static void p_b_cclass(struct parse *, cset *);
  208. static void p_b_eclass(struct parse *, cset *);
  209. static char p_b_symbol(struct parse *);
  210. static char p_b_coll_elem(struct parse *, int);
  211. static char othercase(int);
  212. static void bothcases(struct parse *, int);
  213. static void ordinary(struct parse *, int);
  214. static void nonnewline(struct parse *);
  215. static void repeat(struct parse *, sopno, int, int);
  216. static int seterr(struct parse *, int);
  217. static cset *allocset(struct parse *);
  218. static void freeset(struct parse *, cset *);
  219. static int freezeset(struct parse *, cset *);
  220. static int firstch(struct parse *, cset *);
  221. static int nch(struct parse *, cset *);
  222. static void mcadd(struct parse *, cset *, const char *);
  223. static void mcinvert(struct parse *, cset *);
  224. static void mccase(struct parse *, cset *);
  225. static int isinsets(struct re_guts *, int);
  226. static int samesets(struct re_guts *, int, int);
  227. static void categorize(struct parse *, struct re_guts *);
  228. static sopno dupl(struct parse *, sopno, sopno);
  229. static void doemit(struct parse *, sop, size_t);
  230. static void doinsert(struct parse *, sop, size_t, sopno);
  231. static void dofwd(struct parse *, sopno, sop);
  232. static void enlarge(struct parse *, sopno);
  233. static void stripsnug(struct parse *, struct re_guts *);
  234. static void findmust(struct parse *, struct re_guts *);
  235. static sopno pluscount(struct parse *, struct re_guts *);
  236. static char nuls[10]; /* place to point scanner in event of error */
  237. /*
  238. * macros for use with parse structure
  239. * BEWARE: these know that the parse structure is named `p' !!!
  240. */
  241. #define PEEK() (*p->next)
  242. #define PEEK2() (*(p->next+1))
  243. #define MORE() (p->end - p->next > 0)
  244. #define MORE2() (p->end - p->next > 1)
  245. #define SEE(c) (MORE() && PEEK() == (c))
  246. #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b))
  247. #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
  248. #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
  249. #define NEXT() (p->next++)
  250. #define NEXT2() (p->next += 2)
  251. #define NEXTn(n) (p->next += (n))
  252. #define GETNEXT() (*p->next++)
  253. #define SETERROR(e) seterr(p, (e))
  254. #define REQUIRE(co, e) (void)((co) || SETERROR(e))
  255. #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
  256. #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
  257. #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
  258. #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
  259. #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
  260. #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
  261. #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
  262. #define HERE() (p->slen)
  263. #define THERE() (p->slen - 1)
  264. #define THERETHERE() (p->slen - 2)
  265. #define DROP(n) (p->slen -= (n))
  266. #ifdef _POSIX2_RE_DUP_MAX
  267. #define DUPMAX _POSIX2_RE_DUP_MAX
  268. #else
  269. #define DUPMAX 255
  270. #endif
  271. #define INFINITY (DUPMAX + 1)
  272. #ifndef NDEBUG
  273. static int never = 0; /* for use in asserts; shuts lint up */
  274. #else
  275. #define never 0 /* some <assert.h>s have bugs too */
  276. #endif
  277. /*
  278. - llvm_regcomp - interface for parser and compilation
  279. */
  280. int /* 0 success, otherwise REG_something */
  281. llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
  282. {
  283. struct parse pa;
  284. struct re_guts *g;
  285. struct parse *p = &pa;
  286. int i;
  287. size_t len;
  288. #ifdef REDEBUG
  289. # define GOODFLAGS(f) (f)
  290. #else
  291. # define GOODFLAGS(f) ((f)&~REG_DUMP)
  292. #endif
  293. cflags = GOODFLAGS(cflags);
  294. if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
  295. return(REG_INVARG);
  296. if (cflags&REG_PEND) {
  297. if (preg->re_endp < pattern)
  298. return(REG_INVARG);
  299. len = preg->re_endp - pattern;
  300. } else
  301. len = strlen((const char *)pattern);
  302. /* do the mallocs early so failure handling is easy */
  303. g = (struct re_guts *)malloc(sizeof(struct re_guts) +
  304. (NC-1)*sizeof(cat_t));
  305. if (g == NULL)
  306. return(REG_ESPACE);
  307. p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
  308. p->strip = (sop *)calloc(p->ssize, sizeof(sop));
  309. p->slen = 0;
  310. if (p->strip == NULL) {
  311. free((char *)g);
  312. return(REG_ESPACE);
  313. }
  314. /* set things up */
  315. p->g = g;
  316. p->next = (char *)pattern; /* convenience; we do not modify it */
  317. p->end = p->next + len;
  318. p->error = 0;
  319. p->ncsalloc = 0;
  320. for (i = 0; i < NPAREN; i++) {
  321. p->pbegin[i] = 0;
  322. p->pend[i] = 0;
  323. }
  324. g->csetsize = NC;
  325. g->sets = NULL;
  326. g->setbits = NULL;
  327. g->ncsets = 0;
  328. g->cflags = cflags;
  329. g->iflags = 0;
  330. g->nbol = 0;
  331. g->neol = 0;
  332. g->must = NULL;
  333. g->mlen = 0;
  334. g->nsub = 0;
  335. g->ncategories = 1; /* category 0 is "everything else" */
  336. g->categories = &g->catspace[-(CHAR_MIN)];
  337. (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
  338. g->backrefs = 0;
  339. /* do it */
  340. EMIT(OEND, 0);
  341. g->firststate = THERE();
  342. if (cflags&REG_EXTENDED)
  343. p_ere(p, OUT);
  344. else if (cflags&REG_NOSPEC)
  345. p_str(p);
  346. else
  347. p_bre(p, OUT, OUT);
  348. EMIT(OEND, 0);
  349. g->laststate = THERE();
  350. /* tidy up loose ends and fill things in */
  351. categorize(p, g);
  352. stripsnug(p, g);
  353. findmust(p, g);
  354. g->nplus = pluscount(p, g);
  355. g->magic = MAGIC2;
  356. preg->re_nsub = g->nsub;
  357. preg->re_g = g;
  358. preg->re_magic = MAGIC1;
  359. #ifndef REDEBUG
  360. /* not debugging, so can't rely on the assert() in llvm_regexec() */
  361. if (g->iflags&REGEX_BAD)
  362. SETERROR(REG_ASSERT);
  363. #endif
  364. /* win or lose, we're done */
  365. if (p->error != 0) /* lose */
  366. llvm_regfree(preg);
  367. return(p->error);
  368. }
  369. /*
  370. - p_ere - ERE parser top level, concatenation and alternation
  371. */
  372. static void
  373. p_ere(struct parse *p, int stop) /* character this ERE should end at */
  374. {
  375. char c;
  376. sopno prevback = 0;
  377. sopno prevfwd = 0;
  378. sopno conc;
  379. int first = 1; /* is this the first alternative? */
  380. for (;;) {
  381. /* do a bunch of concatenated expressions */
  382. conc = HERE();
  383. while (MORE() && (c = PEEK()) != '|' && c != stop)
  384. p_ere_exp(p);
  385. REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
  386. if (!EAT('|'))
  387. break; /* NOTE BREAK OUT */
  388. if (first) {
  389. INSERT(OCH_, conc); /* offset is wrong */
  390. prevfwd = conc;
  391. prevback = conc;
  392. first = 0;
  393. }
  394. ASTERN(OOR1, prevback);
  395. prevback = THERE();
  396. AHEAD(prevfwd); /* fix previous offset */
  397. prevfwd = HERE();
  398. EMIT(OOR2, 0); /* offset is very wrong */
  399. }
  400. if (!first) { /* tail-end fixups */
  401. AHEAD(prevfwd);
  402. ASTERN(O_CH, prevback);
  403. }
  404. assert(!MORE() || SEE(stop));
  405. }
  406. /*
  407. - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
  408. */
  409. static void
  410. p_ere_exp(struct parse *p)
  411. {
  412. char c;
  413. sopno pos;
  414. int count;
  415. int count2;
  416. int backrefnum;
  417. sopno subno;
  418. int wascaret = 0;
  419. assert(MORE()); /* caller should have ensured this */
  420. c = GETNEXT();
  421. pos = HERE();
  422. switch (c) {
  423. case '(':
  424. REQUIRE(MORE(), REG_EPAREN);
  425. p->g->nsub++;
  426. subno = p->g->nsub;
  427. if (subno < NPAREN)
  428. p->pbegin[subno] = HERE();
  429. EMIT(OLPAREN, subno);
  430. if (!SEE(')'))
  431. p_ere(p, ')');
  432. if (subno < NPAREN) {
  433. p->pend[subno] = HERE();
  434. assert(p->pend[subno] != 0);
  435. }
  436. EMIT(ORPAREN, subno);
  437. MUSTEAT(')', REG_EPAREN);
  438. break;
  439. #ifndef POSIX_MISTAKE
  440. case ')': /* happens only if no current unmatched ( */
  441. /*
  442. * You may ask, why the ifndef? Because I didn't notice
  443. * this until slightly too late for 1003.2, and none of the
  444. * other 1003.2 regular-expression reviewers noticed it at
  445. * all. So an unmatched ) is legal POSIX, at least until
  446. * we can get it fixed.
  447. */
  448. SETERROR(REG_EPAREN);
  449. break;
  450. #endif
  451. case '^':
  452. EMIT(OBOL, 0);
  453. p->g->iflags |= USEBOL;
  454. p->g->nbol++;
  455. wascaret = 1;
  456. break;
  457. case '$':
  458. EMIT(OEOL, 0);
  459. p->g->iflags |= USEEOL;
  460. p->g->neol++;
  461. break;
  462. case '|':
  463. SETERROR(REG_EMPTY);
  464. break;
  465. case '*':
  466. case '+':
  467. case '?':
  468. SETERROR(REG_BADRPT);
  469. break;
  470. case '.':
  471. if (p->g->cflags&REG_NEWLINE)
  472. nonnewline(p);
  473. else
  474. EMIT(OANY, 0);
  475. break;
  476. case '[':
  477. p_bracket(p);
  478. break;
  479. case '\\':
  480. REQUIRE(MORE(), REG_EESCAPE);
  481. c = GETNEXT();
  482. if (c >= '1' && c <= '9') {
  483. /* \[0-9] is taken to be a back-reference to a previously specified
  484. * matching group. backrefnum will hold the number. The matching
  485. * group must exist (i.e. if \4 is found there must have been at
  486. * least 4 matching groups specified in the pattern previously).
  487. */
  488. backrefnum = c - '0';
  489. if (p->pend[backrefnum] == 0) {
  490. SETERROR(REG_ESUBREG);
  491. break;
  492. }
  493. /* Make sure everything checks out and emit the sequence
  494. * that marks a back-reference to the parse structure.
  495. */
  496. assert(backrefnum <= p->g->nsub);
  497. EMIT(OBACK_, backrefnum);
  498. assert(p->pbegin[backrefnum] != 0);
  499. assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
  500. assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
  501. (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
  502. EMIT(O_BACK, backrefnum);
  503. p->g->backrefs = 1;
  504. } else {
  505. /* Other chars are simply themselves when escaped with a backslash.
  506. */
  507. ordinary(p, c);
  508. }
  509. break;
  510. case '{': /* okay as ordinary except if digit follows */
  511. REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
  512. LLVM_FALLTHROUGH;
  513. default:
  514. ordinary(p, c);
  515. break;
  516. }
  517. if (!MORE())
  518. return;
  519. c = PEEK();
  520. /* we call { a repetition if followed by a digit */
  521. if (!( c == '*' || c == '+' || c == '?' ||
  522. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
  523. return; /* no repetition, we're done */
  524. NEXT();
  525. REQUIRE(!wascaret, REG_BADRPT);
  526. switch (c) {
  527. case '*': /* implemented as +? */
  528. /* this case does not require the (y|) trick, noKLUDGE */
  529. INSERT(OPLUS_, pos);
  530. ASTERN(O_PLUS, pos);
  531. INSERT(OQUEST_, pos);
  532. ASTERN(O_QUEST, pos);
  533. break;
  534. case '+':
  535. INSERT(OPLUS_, pos);
  536. ASTERN(O_PLUS, pos);
  537. break;
  538. case '?':
  539. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  540. INSERT(OCH_, pos); /* offset slightly wrong */
  541. ASTERN(OOR1, pos); /* this one's right */
  542. AHEAD(pos); /* fix the OCH_ */
  543. EMIT(OOR2, 0); /* offset very wrong... */
  544. AHEAD(THERE()); /* ...so fix it */
  545. ASTERN(O_CH, THERETHERE());
  546. break;
  547. case '{':
  548. count = p_count(p);
  549. if (EAT(',')) {
  550. if (isdigit((uch)PEEK())) {
  551. count2 = p_count(p);
  552. REQUIRE(count <= count2, REG_BADBR);
  553. } else /* single number with comma */
  554. count2 = INFINITY;
  555. } else /* just a single number */
  556. count2 = count;
  557. repeat(p, pos, count, count2);
  558. if (!EAT('}')) { /* error heuristics */
  559. while (MORE() && PEEK() != '}')
  560. NEXT();
  561. REQUIRE(MORE(), REG_EBRACE);
  562. SETERROR(REG_BADBR);
  563. }
  564. break;
  565. }
  566. if (!MORE())
  567. return;
  568. c = PEEK();
  569. if (!( c == '*' || c == '+' || c == '?' ||
  570. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
  571. return;
  572. SETERROR(REG_BADRPT);
  573. }
  574. /*
  575. - p_str - string (no metacharacters) "parser"
  576. */
  577. static void
  578. p_str(struct parse *p)
  579. {
  580. REQUIRE(MORE(), REG_EMPTY);
  581. while (MORE())
  582. ordinary(p, GETNEXT());
  583. }
  584. /*
  585. - p_bre - BRE parser top level, anchoring and concatenation
  586. * Giving end1 as OUT essentially eliminates the end1/end2 check.
  587. *
  588. * This implementation is a bit of a kludge, in that a trailing $ is first
  589. * taken as an ordinary character and then revised to be an anchor. The
  590. * only undesirable side effect is that '$' gets included as a character
  591. * category in such cases. This is fairly harmless; not worth fixing.
  592. * The amount of lookahead needed to avoid this kludge is excessive.
  593. */
  594. static void
  595. p_bre(struct parse *p,
  596. int end1, /* first terminating character */
  597. int end2) /* second terminating character */
  598. {
  599. sopno start = HERE();
  600. int first = 1; /* first subexpression? */
  601. int wasdollar = 0;
  602. if (EAT('^')) {
  603. EMIT(OBOL, 0);
  604. p->g->iflags |= USEBOL;
  605. p->g->nbol++;
  606. }
  607. while (MORE() && !SEETWO(end1, end2)) {
  608. wasdollar = p_simp_re(p, first);
  609. first = 0;
  610. }
  611. if (wasdollar) { /* oops, that was a trailing anchor */
  612. DROP(1);
  613. EMIT(OEOL, 0);
  614. p->g->iflags |= USEEOL;
  615. p->g->neol++;
  616. }
  617. REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
  618. }
  619. /*
  620. - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
  621. */
  622. static int /* was the simple RE an unbackslashed $? */
  623. p_simp_re(struct parse *p,
  624. int starordinary) /* is a leading * an ordinary character? */
  625. {
  626. int c;
  627. int count;
  628. int count2;
  629. sopno pos;
  630. int i;
  631. sopno subno;
  632. # define BACKSL (1<<CHAR_BIT)
  633. pos = HERE(); /* repetition op, if any, covers from here */
  634. assert(MORE()); /* caller should have ensured this */
  635. c = GETNEXT();
  636. if (c == '\\') {
  637. REQUIRE(MORE(), REG_EESCAPE);
  638. c = BACKSL | GETNEXT();
  639. }
  640. switch (c) {
  641. case '.':
  642. if (p->g->cflags&REG_NEWLINE)
  643. nonnewline(p);
  644. else
  645. EMIT(OANY, 0);
  646. break;
  647. case '[':
  648. p_bracket(p);
  649. break;
  650. case BACKSL|'{':
  651. SETERROR(REG_BADRPT);
  652. break;
  653. case BACKSL|'(':
  654. p->g->nsub++;
  655. subno = p->g->nsub;
  656. if (subno < NPAREN)
  657. p->pbegin[subno] = HERE();
  658. EMIT(OLPAREN, subno);
  659. /* the MORE here is an error heuristic */
  660. if (MORE() && !SEETWO('\\', ')'))
  661. p_bre(p, '\\', ')');
  662. if (subno < NPAREN) {
  663. p->pend[subno] = HERE();
  664. assert(p->pend[subno] != 0);
  665. }
  666. EMIT(ORPAREN, subno);
  667. REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
  668. break;
  669. case BACKSL|')': /* should not get here -- must be user */
  670. case BACKSL|'}':
  671. SETERROR(REG_EPAREN);
  672. break;
  673. case BACKSL|'1':
  674. case BACKSL|'2':
  675. case BACKSL|'3':
  676. case BACKSL|'4':
  677. case BACKSL|'5':
  678. case BACKSL|'6':
  679. case BACKSL|'7':
  680. case BACKSL|'8':
  681. case BACKSL|'9':
  682. i = (c&~BACKSL) - '0';
  683. assert(i < NPAREN);
  684. if (p->pend[i] != 0) {
  685. assert(i <= p->g->nsub);
  686. EMIT(OBACK_, i);
  687. assert(p->pbegin[i] != 0);
  688. assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
  689. assert(OP(p->strip[p->pend[i]]) == ORPAREN);
  690. (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
  691. EMIT(O_BACK, i);
  692. } else
  693. SETERROR(REG_ESUBREG);
  694. p->g->backrefs = 1;
  695. break;
  696. case '*':
  697. REQUIRE(starordinary, REG_BADRPT);
  698. LLVM_FALLTHROUGH;
  699. default:
  700. ordinary(p, (char)c);
  701. break;
  702. }
  703. if (EAT('*')) { /* implemented as +? */
  704. /* this case does not require the (y|) trick, noKLUDGE */
  705. INSERT(OPLUS_, pos);
  706. ASTERN(O_PLUS, pos);
  707. INSERT(OQUEST_, pos);
  708. ASTERN(O_QUEST, pos);
  709. } else if (EATTWO('\\', '{')) {
  710. count = p_count(p);
  711. if (EAT(',')) {
  712. if (MORE() && isdigit((uch)PEEK())) {
  713. count2 = p_count(p);
  714. REQUIRE(count <= count2, REG_BADBR);
  715. } else /* single number with comma */
  716. count2 = INFINITY;
  717. } else /* just a single number */
  718. count2 = count;
  719. repeat(p, pos, count, count2);
  720. if (!EATTWO('\\', '}')) { /* error heuristics */
  721. while (MORE() && !SEETWO('\\', '}'))
  722. NEXT();
  723. REQUIRE(MORE(), REG_EBRACE);
  724. SETERROR(REG_BADBR);
  725. }
  726. } else if (c == '$') /* $ (but not \$) ends it */
  727. return(1);
  728. return(0);
  729. }
  730. /*
  731. - p_count - parse a repetition count
  732. */
  733. static int /* the value */
  734. p_count(struct parse *p)
  735. {
  736. int count = 0;
  737. int ndigits = 0;
  738. while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
  739. count = count*10 + (GETNEXT() - '0');
  740. ndigits++;
  741. }
  742. REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
  743. return(count);
  744. }
  745. /*
  746. - p_bracket - parse a bracketed character list
  747. *
  748. * Note a significant property of this code: if the allocset() did SETERROR,
  749. * no set operations are done.
  750. */
  751. static void
  752. p_bracket(struct parse *p)
  753. {
  754. cset *cs;
  755. int invert = 0;
  756. /* Dept of Truly Sickening Special-Case Kludges */
  757. if (p->end - p->next > 5) {
  758. if (strncmp(p->next, "[:<:]]", 6) == 0) {
  759. EMIT(OBOW, 0);
  760. NEXTn(6);
  761. return;
  762. }
  763. if (strncmp(p->next, "[:>:]]", 6) == 0) {
  764. EMIT(OEOW, 0);
  765. NEXTn(6);
  766. return;
  767. }
  768. }
  769. if ((cs = allocset(p)) == NULL) {
  770. /* allocset did set error status in p */
  771. return;
  772. }
  773. if (EAT('^'))
  774. invert++; /* make note to invert set at end */
  775. if (EAT(']'))
  776. CHadd(cs, ']');
  777. else if (EAT('-'))
  778. CHadd(cs, '-');
  779. while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
  780. p_b_term(p, cs);
  781. if (EAT('-'))
  782. CHadd(cs, '-');
  783. MUSTEAT(']', REG_EBRACK);
  784. if (p->error != 0) { /* don't mess things up further */
  785. freeset(p, cs);
  786. return;
  787. }
  788. if (p->g->cflags&REG_ICASE) {
  789. int i;
  790. int ci;
  791. for (i = p->g->csetsize - 1; i >= 0; i--)
  792. if (CHIN(cs, i) && isalpha(i)) {
  793. ci = othercase(i);
  794. if (ci != i)
  795. CHadd(cs, ci);
  796. }
  797. if (cs->multis != NULL)
  798. mccase(p, cs);
  799. }
  800. if (invert) {
  801. int i;
  802. for (i = p->g->csetsize - 1; i >= 0; i--)
  803. if (CHIN(cs, i))
  804. CHsub(cs, i);
  805. else
  806. CHadd(cs, i);
  807. if (p->g->cflags&REG_NEWLINE)
  808. CHsub(cs, '\n');
  809. if (cs->multis != NULL)
  810. mcinvert(p, cs);
  811. }
  812. assert(cs->multis == NULL); /* xxx */
  813. if (nch(p, cs) == 1) { /* optimize singleton sets */
  814. ordinary(p, firstch(p, cs));
  815. freeset(p, cs);
  816. } else
  817. EMIT(OANYOF, freezeset(p, cs));
  818. }
  819. /*
  820. - p_b_term - parse one term of a bracketed character list
  821. */
  822. static void
  823. p_b_term(struct parse *p, cset *cs)
  824. {
  825. char c;
  826. char start, finish;
  827. int i;
  828. /* classify what we've got */
  829. switch ((MORE()) ? PEEK() : '\0') {
  830. case '[':
  831. c = (MORE2()) ? PEEK2() : '\0';
  832. break;
  833. case '-':
  834. SETERROR(REG_ERANGE);
  835. return; /* NOTE RETURN */
  836. break;
  837. default:
  838. c = '\0';
  839. break;
  840. }
  841. switch (c) {
  842. case ':': /* character class */
  843. NEXT2();
  844. REQUIRE(MORE(), REG_EBRACK);
  845. c = PEEK();
  846. REQUIRE(c != '-' && c != ']', REG_ECTYPE);
  847. p_b_cclass(p, cs);
  848. REQUIRE(MORE(), REG_EBRACK);
  849. REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
  850. break;
  851. case '=': /* equivalence class */
  852. NEXT2();
  853. REQUIRE(MORE(), REG_EBRACK);
  854. c = PEEK();
  855. REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
  856. p_b_eclass(p, cs);
  857. REQUIRE(MORE(), REG_EBRACK);
  858. REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
  859. break;
  860. default: /* symbol, ordinary character, or range */
  861. /* xxx revision needed for multichar stuff */
  862. start = p_b_symbol(p);
  863. if (SEE('-') && MORE2() && PEEK2() != ']') {
  864. /* range */
  865. NEXT();
  866. if (EAT('-'))
  867. finish = '-';
  868. else
  869. finish = p_b_symbol(p);
  870. } else
  871. finish = start;
  872. /* xxx what about signed chars here... */
  873. REQUIRE(start <= finish, REG_ERANGE);
  874. for (i = start; i <= finish; i++)
  875. CHadd(cs, i);
  876. break;
  877. }
  878. }
  879. /*
  880. - p_b_cclass - parse a character-class name and deal with it
  881. */
  882. static void
  883. p_b_cclass(struct parse *p, cset *cs)
  884. {
  885. char *sp = p->next;
  886. struct cclass *cp;
  887. size_t len;
  888. const char *u;
  889. char c;
  890. while (MORE() && isalpha((uch)PEEK()))
  891. NEXT();
  892. len = p->next - sp;
  893. for (cp = cclasses; cp->name != NULL; cp++)
  894. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  895. break;
  896. if (cp->name == NULL) {
  897. /* oops, didn't find it */
  898. SETERROR(REG_ECTYPE);
  899. return;
  900. }
  901. u = cp->chars;
  902. while ((c = *u++) != '\0')
  903. CHadd(cs, c);
  904. for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
  905. MCadd(p, cs, u);
  906. }
  907. /*
  908. - p_b_eclass - parse an equivalence-class name and deal with it
  909. *
  910. * This implementation is incomplete. xxx
  911. */
  912. static void
  913. p_b_eclass(struct parse *p, cset *cs)
  914. {
  915. char c;
  916. c = p_b_coll_elem(p, '=');
  917. CHadd(cs, c);
  918. }
  919. /*
  920. - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
  921. */
  922. static char /* value of symbol */
  923. p_b_symbol(struct parse *p)
  924. {
  925. char value;
  926. REQUIRE(MORE(), REG_EBRACK);
  927. if (!EATTWO('[', '.'))
  928. return(GETNEXT());
  929. /* collating symbol */
  930. value = p_b_coll_elem(p, '.');
  931. REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
  932. return(value);
  933. }
  934. /*
  935. - p_b_coll_elem - parse a collating-element name and look it up
  936. */
  937. static char /* value of collating element */
  938. p_b_coll_elem(struct parse *p,
  939. int endc) /* name ended by endc,']' */
  940. {
  941. char *sp = p->next;
  942. struct cname *cp;
  943. size_t len;
  944. while (MORE() && !SEETWO(endc, ']'))
  945. NEXT();
  946. if (!MORE()) {
  947. SETERROR(REG_EBRACK);
  948. return(0);
  949. }
  950. len = p->next - sp;
  951. for (cp = cnames; cp->name != NULL; cp++)
  952. if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
  953. return(cp->code); /* known name */
  954. if (len == 1)
  955. return(*sp); /* single character */
  956. SETERROR(REG_ECOLLATE); /* neither */
  957. return(0);
  958. }
  959. /*
  960. - othercase - return the case counterpart of an alphabetic
  961. */
  962. static char /* if no counterpart, return ch */
  963. othercase(int ch)
  964. {
  965. ch = (uch)ch;
  966. assert(isalpha(ch));
  967. if (isupper(ch))
  968. return ((uch)tolower(ch));
  969. else if (islower(ch))
  970. return ((uch)toupper(ch));
  971. else /* peculiar, but could happen */
  972. return(ch);
  973. }
  974. /*
  975. - bothcases - emit a dualcase version of a two-case character
  976. *
  977. * Boy, is this implementation ever a kludge...
  978. */
  979. static void
  980. bothcases(struct parse *p, int ch)
  981. {
  982. char *oldnext = p->next;
  983. char *oldend = p->end;
  984. char bracket[3];
  985. ch = (uch)ch;
  986. assert(othercase(ch) != ch); /* p_bracket() would recurse */
  987. p->next = bracket;
  988. p->end = bracket+2;
  989. bracket[0] = ch;
  990. bracket[1] = ']';
  991. bracket[2] = '\0';
  992. p_bracket(p);
  993. assert(p->next == bracket+2);
  994. p->next = oldnext;
  995. p->end = oldend;
  996. }
  997. /*
  998. - ordinary - emit an ordinary character
  999. */
  1000. static void
  1001. ordinary(struct parse *p, int ch)
  1002. {
  1003. cat_t *cap = p->g->categories;
  1004. if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
  1005. bothcases(p, ch);
  1006. else {
  1007. EMIT(OCHAR, (uch)ch);
  1008. if (cap[ch] == 0)
  1009. cap[ch] = p->g->ncategories++;
  1010. }
  1011. }
  1012. /*
  1013. - nonnewline - emit REG_NEWLINE version of OANY
  1014. *
  1015. * Boy, is this implementation ever a kludge...
  1016. */
  1017. static void
  1018. nonnewline(struct parse *p)
  1019. {
  1020. char *oldnext = p->next;
  1021. char *oldend = p->end;
  1022. char bracket[4];
  1023. p->next = bracket;
  1024. p->end = bracket+3;
  1025. bracket[0] = '^';
  1026. bracket[1] = '\n';
  1027. bracket[2] = ']';
  1028. bracket[3] = '\0';
  1029. p_bracket(p);
  1030. assert(p->next == bracket+3);
  1031. p->next = oldnext;
  1032. p->end = oldend;
  1033. }
  1034. /*
  1035. - repeat - generate code for a bounded repetition, recursively if needed
  1036. */
  1037. static void
  1038. repeat(struct parse *p,
  1039. sopno start, /* operand from here to end of strip */
  1040. int from, /* repeated from this number */
  1041. int to) /* to this number of times (maybe INFINITY) */
  1042. {
  1043. sopno finish = HERE();
  1044. # define N 2
  1045. # define INF 3
  1046. # define REP(f, t) ((f)*8 + (t))
  1047. # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
  1048. sopno copy;
  1049. if (p->error != 0) /* head off possible runaway recursion */
  1050. return;
  1051. assert(from <= to);
  1052. switch (REP(MAP(from), MAP(to))) {
  1053. case REP(0, 0): /* must be user doing this */
  1054. DROP(finish-start); /* drop the operand */
  1055. break;
  1056. case REP(0, 1): /* as x{1,1}? */
  1057. case REP(0, N): /* as x{1,n}? */
  1058. case REP(0, INF): /* as x{1,}? */
  1059. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  1060. INSERT(OCH_, start); /* offset is wrong... */
  1061. repeat(p, start+1, 1, to);
  1062. ASTERN(OOR1, start);
  1063. AHEAD(start); /* ... fix it */
  1064. EMIT(OOR2, 0);
  1065. AHEAD(THERE());
  1066. ASTERN(O_CH, THERETHERE());
  1067. break;
  1068. case REP(1, 1): /* trivial case */
  1069. /* done */
  1070. break;
  1071. case REP(1, N): /* as x?x{1,n-1} */
  1072. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  1073. INSERT(OCH_, start);
  1074. ASTERN(OOR1, start);
  1075. AHEAD(start);
  1076. EMIT(OOR2, 0); /* offset very wrong... */
  1077. AHEAD(THERE()); /* ...so fix it */
  1078. ASTERN(O_CH, THERETHERE());
  1079. copy = dupl(p, start+1, finish+1);
  1080. assert(copy == finish+4);
  1081. repeat(p, copy, 1, to-1);
  1082. break;
  1083. case REP(1, INF): /* as x+ */
  1084. INSERT(OPLUS_, start);
  1085. ASTERN(O_PLUS, start);
  1086. break;
  1087. case REP(N, N): /* as xx{m-1,n-1} */
  1088. copy = dupl(p, start, finish);
  1089. repeat(p, copy, from-1, to-1);
  1090. break;
  1091. case REP(N, INF): /* as xx{n-1,INF} */
  1092. copy = dupl(p, start, finish);
  1093. repeat(p, copy, from-1, to);
  1094. break;
  1095. default: /* "can't happen" */
  1096. SETERROR(REG_ASSERT); /* just in case */
  1097. break;
  1098. }
  1099. }
  1100. /*
  1101. - seterr - set an error condition
  1102. */
  1103. static int /* useless but makes type checking happy */
  1104. seterr(struct parse *p, int e)
  1105. {
  1106. if (p->error == 0) /* keep earliest error condition */
  1107. p->error = e;
  1108. p->next = nuls; /* try to bring things to a halt */
  1109. p->end = nuls;
  1110. return(0); /* make the return value well-defined */
  1111. }
  1112. /*
  1113. - allocset - allocate a set of characters for []
  1114. */
  1115. static cset *
  1116. allocset(struct parse *p)
  1117. {
  1118. int no = p->g->ncsets++;
  1119. size_t nc;
  1120. size_t nbytes;
  1121. cset *cs;
  1122. size_t css = (size_t)p->g->csetsize;
  1123. int i;
  1124. if (no >= p->ncsalloc) { /* need another column of space */
  1125. void *ptr;
  1126. p->ncsalloc += CHAR_BIT;
  1127. nc = p->ncsalloc;
  1128. if (nc > SIZE_MAX / sizeof(cset))
  1129. goto nomem;
  1130. assert(nc % CHAR_BIT == 0);
  1131. nbytes = nc / CHAR_BIT * css;
  1132. ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
  1133. if (ptr == NULL)
  1134. goto nomem;
  1135. p->g->sets = ptr;
  1136. ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
  1137. if (ptr == NULL)
  1138. goto nomem;
  1139. p->g->setbits = ptr;
  1140. for (i = 0; i < no; i++)
  1141. p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
  1142. (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
  1143. }
  1144. /* XXX should not happen */
  1145. if (p->g->sets == NULL || p->g->setbits == NULL)
  1146. goto nomem;
  1147. cs = &p->g->sets[no];
  1148. cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
  1149. cs->mask = 1 << ((no) % CHAR_BIT);
  1150. cs->hash = 0;
  1151. cs->smultis = 0;
  1152. cs->multis = NULL;
  1153. return(cs);
  1154. nomem:
  1155. free(p->g->sets);
  1156. p->g->sets = NULL;
  1157. free(p->g->setbits);
  1158. p->g->setbits = NULL;
  1159. SETERROR(REG_ESPACE);
  1160. /* caller's responsibility not to do set ops */
  1161. return(NULL);
  1162. }
  1163. /*
  1164. - freeset - free a now-unused set
  1165. */
  1166. static void
  1167. freeset(struct parse *p, cset *cs)
  1168. {
  1169. size_t i;
  1170. cset *top = &p->g->sets[p->g->ncsets];
  1171. size_t css = (size_t)p->g->csetsize;
  1172. for (i = 0; i < css; i++)
  1173. CHsub(cs, i);
  1174. if (cs == top-1) /* recover only the easy case */
  1175. p->g->ncsets--;
  1176. }
  1177. /*
  1178. - freezeset - final processing on a set of characters
  1179. *
  1180. * The main task here is merging identical sets. This is usually a waste
  1181. * of time (although the hash code minimizes the overhead), but can win
  1182. * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
  1183. * is done using addition rather than xor -- all ASCII [aA] sets xor to
  1184. * the same value!
  1185. */
  1186. static int /* set number */
  1187. freezeset(struct parse *p, cset *cs)
  1188. {
  1189. uch h = cs->hash;
  1190. size_t i;
  1191. cset *top = &p->g->sets[p->g->ncsets];
  1192. cset *cs2;
  1193. size_t css = (size_t)p->g->csetsize;
  1194. /* look for an earlier one which is the same */
  1195. for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
  1196. if (cs2->hash == h && cs2 != cs) {
  1197. /* maybe */
  1198. for (i = 0; i < css; i++)
  1199. if (!!CHIN(cs2, i) != !!CHIN(cs, i))
  1200. break; /* no */
  1201. if (i == css)
  1202. break; /* yes */
  1203. }
  1204. if (cs2 < top) { /* found one */
  1205. freeset(p, cs);
  1206. cs = cs2;
  1207. }
  1208. return((int)(cs - p->g->sets));
  1209. }
  1210. /*
  1211. - firstch - return first character in a set (which must have at least one)
  1212. */
  1213. static int /* character; there is no "none" value */
  1214. firstch(struct parse *p, cset *cs)
  1215. {
  1216. size_t i;
  1217. size_t css = (size_t)p->g->csetsize;
  1218. for (i = 0; i < css; i++)
  1219. if (CHIN(cs, i))
  1220. return((char)i);
  1221. assert(never);
  1222. return(0); /* arbitrary */
  1223. }
  1224. /*
  1225. - nch - number of characters in a set
  1226. */
  1227. static int
  1228. nch(struct parse *p, cset *cs)
  1229. {
  1230. size_t i;
  1231. size_t css = (size_t)p->g->csetsize;
  1232. int n = 0;
  1233. for (i = 0; i < css; i++)
  1234. if (CHIN(cs, i))
  1235. n++;
  1236. return(n);
  1237. }
  1238. /*
  1239. - mcadd - add a collating element to a cset
  1240. */
  1241. static void
  1242. mcadd( struct parse *p, cset *cs, const char *cp)
  1243. {
  1244. size_t oldend = cs->smultis;
  1245. void *np;
  1246. cs->smultis += strlen(cp) + 1;
  1247. np = realloc(cs->multis, cs->smultis);
  1248. if (np == NULL) {
  1249. if (cs->multis)
  1250. free(cs->multis);
  1251. cs->multis = NULL;
  1252. SETERROR(REG_ESPACE);
  1253. return;
  1254. }
  1255. cs->multis = np;
  1256. llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
  1257. }
  1258. /*
  1259. - mcinvert - invert the list of collating elements in a cset
  1260. *
  1261. * This would have to know the set of possibilities. Implementation
  1262. * is deferred.
  1263. */
  1264. /* ARGSUSED */
  1265. static void
  1266. mcinvert(struct parse *p, cset *cs)
  1267. {
  1268. assert(cs->multis == NULL); /* xxx */
  1269. }
  1270. /*
  1271. - mccase - add case counterparts of the list of collating elements in a cset
  1272. *
  1273. * This would have to know the set of possibilities. Implementation
  1274. * is deferred.
  1275. */
  1276. /* ARGSUSED */
  1277. static void
  1278. mccase(struct parse *p, cset *cs)
  1279. {
  1280. assert(cs->multis == NULL); /* xxx */
  1281. }
  1282. /*
  1283. - isinsets - is this character in any sets?
  1284. */
  1285. static int /* predicate */
  1286. isinsets(struct re_guts *g, int c)
  1287. {
  1288. uch *col;
  1289. int i;
  1290. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1291. unsigned uc = (uch)c;
  1292. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1293. if (col[uc] != 0)
  1294. return(1);
  1295. return(0);
  1296. }
  1297. /*
  1298. - samesets - are these two characters in exactly the same sets?
  1299. */
  1300. static int /* predicate */
  1301. samesets(struct re_guts *g, int c1, int c2)
  1302. {
  1303. uch *col;
  1304. int i;
  1305. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1306. unsigned uc1 = (uch)c1;
  1307. unsigned uc2 = (uch)c2;
  1308. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1309. if (col[uc1] != col[uc2])
  1310. return(0);
  1311. return(1);
  1312. }
  1313. /*
  1314. - categorize - sort out character categories
  1315. */
  1316. static void
  1317. categorize(struct parse *p, struct re_guts *g)
  1318. {
  1319. cat_t *cats = g->categories;
  1320. int c;
  1321. int c2;
  1322. cat_t cat;
  1323. /* avoid making error situations worse */
  1324. if (p->error != 0)
  1325. return;
  1326. for (c = CHAR_MIN; c <= CHAR_MAX; c++)
  1327. if (cats[c] == 0 && isinsets(g, c)) {
  1328. cat = g->ncategories++;
  1329. cats[c] = cat;
  1330. for (c2 = c+1; c2 <= CHAR_MAX; c2++)
  1331. if (cats[c2] == 0 && samesets(g, c, c2))
  1332. cats[c2] = cat;
  1333. }
  1334. }
  1335. /*
  1336. - dupl - emit a duplicate of a bunch of sops
  1337. */
  1338. static sopno /* start of duplicate */
  1339. dupl(struct parse *p,
  1340. sopno start, /* from here */
  1341. sopno finish) /* to this less one */
  1342. {
  1343. sopno ret = HERE();
  1344. sopno len = finish - start;
  1345. assert(finish >= start);
  1346. if (len == 0)
  1347. return(ret);
  1348. enlarge(p, p->ssize + len); /* this many unexpected additions */
  1349. assert(p->ssize >= p->slen + len);
  1350. (void) memmove((char *)(p->strip + p->slen),
  1351. (char *)(p->strip + start), (size_t)len*sizeof(sop));
  1352. p->slen += len;
  1353. return(ret);
  1354. }
  1355. /*
  1356. - doemit - emit a strip operator
  1357. *
  1358. * It might seem better to implement this as a macro with a function as
  1359. * hard-case backup, but it's just too big and messy unless there are
  1360. * some changes to the data structures. Maybe later.
  1361. */
  1362. static void
  1363. doemit(struct parse *p, sop op, size_t opnd)
  1364. {
  1365. /* avoid making error situations worse */
  1366. if (p->error != 0)
  1367. return;
  1368. /* deal with oversize operands ("can't happen", more or less) */
  1369. assert(opnd < 1<<OPSHIFT);
  1370. /* deal with undersized strip */
  1371. if (p->slen >= p->ssize)
  1372. enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
  1373. assert(p->slen < p->ssize);
  1374. /* finally, it's all reduced to the easy case */
  1375. p->strip[p->slen++] = SOP(op, opnd);
  1376. }
  1377. /*
  1378. - doinsert - insert a sop into the strip
  1379. */
  1380. static void
  1381. doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
  1382. {
  1383. sopno sn;
  1384. sop s;
  1385. int i;
  1386. /* avoid making error situations worse */
  1387. if (p->error != 0)
  1388. return;
  1389. sn = HERE();
  1390. EMIT(op, opnd); /* do checks, ensure space */
  1391. assert(HERE() == sn+1);
  1392. s = p->strip[sn];
  1393. /* adjust paren pointers */
  1394. assert(pos > 0);
  1395. for (i = 1; i < NPAREN; i++) {
  1396. if (p->pbegin[i] >= pos) {
  1397. p->pbegin[i]++;
  1398. }
  1399. if (p->pend[i] >= pos) {
  1400. p->pend[i]++;
  1401. }
  1402. }
  1403. memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
  1404. (HERE()-pos-1)*sizeof(sop));
  1405. p->strip[pos] = s;
  1406. }
  1407. /*
  1408. - dofwd - complete a forward reference
  1409. */
  1410. static void
  1411. dofwd(struct parse *p, sopno pos, sop value)
  1412. {
  1413. /* avoid making error situations worse */
  1414. if (p->error != 0)
  1415. return;
  1416. assert(value < 1<<OPSHIFT);
  1417. p->strip[pos] = OP(p->strip[pos]) | value;
  1418. }
  1419. /*
  1420. - enlarge - enlarge the strip
  1421. */
  1422. static void
  1423. enlarge(struct parse *p, sopno size)
  1424. {
  1425. sop *sp;
  1426. if (p->ssize >= size)
  1427. return;
  1428. if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
  1429. SETERROR(REG_ESPACE);
  1430. return;
  1431. }
  1432. sp = (sop *)realloc(p->strip, size*sizeof(sop));
  1433. if (sp == NULL) {
  1434. SETERROR(REG_ESPACE);
  1435. return;
  1436. }
  1437. p->strip = sp;
  1438. p->ssize = size;
  1439. }
  1440. /*
  1441. - stripsnug - compact the strip
  1442. */
  1443. static void
  1444. stripsnug(struct parse *p, struct re_guts *g)
  1445. {
  1446. g->nstates = p->slen;
  1447. if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
  1448. g->strip = p->strip;
  1449. SETERROR(REG_ESPACE);
  1450. return;
  1451. }
  1452. g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
  1453. if (g->strip == NULL) {
  1454. SETERROR(REG_ESPACE);
  1455. g->strip = p->strip;
  1456. }
  1457. }
  1458. /*
  1459. - findmust - fill in must and mlen with longest mandatory literal string
  1460. *
  1461. * This algorithm could do fancy things like analyzing the operands of |
  1462. * for common subsequences. Someday. This code is simple and finds most
  1463. * of the interesting cases.
  1464. *
  1465. * Note that must and mlen got initialized during setup.
  1466. */
  1467. static void
  1468. findmust(struct parse *p, struct re_guts *g)
  1469. {
  1470. sop *scan;
  1471. sop *start = 0; /* start initialized in the default case, after that */
  1472. sop *newstart = 0; /* newstart was initialized in the OCHAR case */
  1473. sopno newlen;
  1474. sop s;
  1475. char *cp;
  1476. sopno i;
  1477. /* avoid making error situations worse */
  1478. if (p->error != 0)
  1479. return;
  1480. /* find the longest OCHAR sequence in strip */
  1481. newlen = 0;
  1482. scan = g->strip + 1;
  1483. do {
  1484. s = *scan++;
  1485. switch (OP(s)) {
  1486. case OCHAR: /* sequence member */
  1487. if (newlen == 0) /* new sequence */
  1488. newstart = scan - 1;
  1489. newlen++;
  1490. break;
  1491. case OPLUS_: /* things that don't break one */
  1492. case OLPAREN:
  1493. case ORPAREN:
  1494. break;
  1495. case OQUEST_: /* things that must be skipped */
  1496. case OCH_:
  1497. scan--;
  1498. do {
  1499. scan += OPND(s);
  1500. s = *scan;
  1501. /* assert() interferes w debug printouts */
  1502. if (OP(s) != O_QUEST && OP(s) != O_CH &&
  1503. OP(s) != OOR2) {
  1504. g->iflags |= REGEX_BAD;
  1505. return;
  1506. }
  1507. } while (OP(s) != O_QUEST && OP(s) != O_CH);
  1508. LLVM_FALLTHROUGH;
  1509. default: /* things that break a sequence */
  1510. if (newlen > g->mlen) { /* ends one */
  1511. start = newstart;
  1512. g->mlen = newlen;
  1513. }
  1514. newlen = 0;
  1515. break;
  1516. }
  1517. } while (OP(s) != OEND);
  1518. if (g->mlen == 0) /* there isn't one */
  1519. return;
  1520. /* turn it into a character string */
  1521. g->must = malloc((size_t)g->mlen + 1);
  1522. if (g->must == NULL) { /* argh; just forget it */
  1523. g->mlen = 0;
  1524. return;
  1525. }
  1526. cp = g->must;
  1527. scan = start;
  1528. for (i = g->mlen; i > 0; i--) {
  1529. while (OP(s = *scan++) != OCHAR)
  1530. continue;
  1531. assert(cp < g->must + g->mlen);
  1532. *cp++ = (char)OPND(s);
  1533. }
  1534. assert(cp == g->must + g->mlen);
  1535. *cp++ = '\0'; /* just on general principles */
  1536. }
  1537. /*
  1538. - pluscount - count + nesting
  1539. */
  1540. static sopno /* nesting depth */
  1541. pluscount(struct parse *p, struct re_guts *g)
  1542. {
  1543. sop *scan;
  1544. sop s;
  1545. sopno plusnest = 0;
  1546. sopno maxnest = 0;
  1547. if (p->error != 0)
  1548. return(0); /* there may not be an OEND */
  1549. scan = g->strip + 1;
  1550. do {
  1551. s = *scan++;
  1552. switch (OP(s)) {
  1553. case OPLUS_:
  1554. plusnest++;
  1555. break;
  1556. case O_PLUS:
  1557. if (plusnest > maxnest)
  1558. maxnest = plusnest;
  1559. plusnest--;
  1560. break;
  1561. }
  1562. } while (OP(s) != OEND);
  1563. if (plusnest != 0)
  1564. g->iflags |= REGEX_BAD;
  1565. return(maxnest);
  1566. }