pcre.cc 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957
  1. // Copyright 2003-2009 Google Inc. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
  5. // The main changes are the addition of the HitLimit method and
  6. // compilation as PCRE in namespace re2.
  7. #include <assert.h>
  8. #include <ctype.h>
  9. #include <errno.h>
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #include <limits>
  13. #include <string>
  14. #include <utility>
  15. #include "absl/flags/flag.h"
  16. #include "absl/log/absl_check.h"
  17. #include "absl/log/absl_log.h"
  18. #include "absl/strings/str_format.h"
  19. #include "util/pcre.h"
  20. // Silence warnings about the wacky formatting in the operator() functions.
  21. #if defined(__GNUC__)
  22. #pragma GCC diagnostic ignored "-Wmisleading-indentation"
  23. #endif
  24. #define PCREPORT(level) ABSL_LOG(level)
  25. // Default PCRE limits.
  26. // Defaults chosen to allow a plausible amount of CPU and
  27. // not exceed main thread stacks. Note that other threads
  28. // often have smaller stacks, and therefore tightening
  29. // regexp_stack_limit may frequently be necessary.
  30. ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
  31. "default PCRE stack limit (bytes)");
  32. ABSL_FLAG(int, regexp_match_limit, 1000000,
  33. "default PCRE match limit (function calls)");
  34. #ifndef USEPCRE
  35. // Fake just enough of the PCRE API to allow this file to build. :)
  36. struct pcre_extra {
  37. int flags;
  38. int match_limit;
  39. int match_limit_recursion;
  40. };
  41. #define PCRE_EXTRA_MATCH_LIMIT 0
  42. #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
  43. #define PCRE_ANCHORED 0
  44. #define PCRE_NOTEMPTY 0
  45. #define PCRE_ERROR_NOMATCH 1
  46. #define PCRE_ERROR_MATCHLIMIT 2
  47. #define PCRE_ERROR_RECURSIONLIMIT 3
  48. #define PCRE_INFO_CAPTURECOUNT 0
  49. void pcre_free(void*) {
  50. }
  51. pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
  52. return NULL;
  53. }
  54. int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
  55. return 0;
  56. }
  57. int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
  58. return 0;
  59. }
  60. #endif
  61. namespace re2 {
  62. // Maximum number of args we can set
  63. static const int kMaxArgs = 16;
  64. static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
  65. // Approximate size of a recursive invocation of PCRE's
  66. // internal "match()" frame. This varies depending on the
  67. // compiler and architecture, of course, so the constant is
  68. // just a conservative estimate. To find the exact number,
  69. // run regexp_unittest with --regexp_stack_limit=0 under
  70. // a debugger and look at the frames when it crashes.
  71. // The exact frame size was 656 in production on 2008/02/03.
  72. static const int kPCREFrameSize = 700;
  73. // Special name for missing C++ arguments.
  74. PCRE::Arg PCRE::no_more_args((void*)NULL);
  75. const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
  76. const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
  77. const PCRE::ConsumeFunctor PCRE::Consume = { };
  78. const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
  79. // If a regular expression has no error, its error_ field points here
  80. static const std::string empty_string;
  81. void PCRE::Init(const char* pattern, Option options, int match_limit,
  82. int stack_limit, bool report_errors) {
  83. pattern_ = pattern;
  84. options_ = options;
  85. match_limit_ = match_limit;
  86. stack_limit_ = stack_limit;
  87. hit_limit_ = false;
  88. error_ = &empty_string;
  89. report_errors_ = report_errors;
  90. re_full_ = NULL;
  91. re_partial_ = NULL;
  92. if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
  93. error_ = new std::string("illegal regexp option");
  94. PCREPORT(ERROR)
  95. << "Error compiling '" << pattern << "': illegal regexp option";
  96. } else {
  97. re_partial_ = Compile(UNANCHORED);
  98. if (re_partial_ != NULL) {
  99. re_full_ = Compile(ANCHOR_BOTH);
  100. }
  101. }
  102. }
  103. PCRE::PCRE(const char* pattern) {
  104. Init(pattern, None, 0, 0, true);
  105. }
  106. PCRE::PCRE(const char* pattern, Option option) {
  107. Init(pattern, option, 0, 0, true);
  108. }
  109. PCRE::PCRE(const std::string& pattern) {
  110. Init(pattern.c_str(), None, 0, 0, true);
  111. }
  112. PCRE::PCRE(const std::string& pattern, Option option) {
  113. Init(pattern.c_str(), option, 0, 0, true);
  114. }
  115. PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
  116. Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
  117. re_option.stack_limit(), re_option.report_errors());
  118. }
  119. PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
  120. Init(pattern, re_option.option(), re_option.match_limit(),
  121. re_option.stack_limit(), re_option.report_errors());
  122. }
  123. PCRE::~PCRE() {
  124. if (re_full_ != NULL) pcre_free(re_full_);
  125. if (re_partial_ != NULL) pcre_free(re_partial_);
  126. if (error_ != &empty_string) delete error_;
  127. }
  128. pcre* PCRE::Compile(Anchor anchor) {
  129. // Special treatment for anchoring. This is needed because at
  130. // runtime pcre only provides an option for anchoring at the
  131. // beginning of a string.
  132. //
  133. // There are three types of anchoring we want:
  134. // UNANCHORED Compile the original pattern, and use
  135. // a pcre unanchored match.
  136. // ANCHOR_START Compile the original pattern, and use
  137. // a pcre anchored match.
  138. // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
  139. // and use a pcre anchored match.
  140. const char* error = "";
  141. int eoffset;
  142. pcre* re;
  143. if (anchor != ANCHOR_BOTH) {
  144. re = pcre_compile(pattern_.c_str(),
  145. (options_ & EnabledCompileOptions),
  146. &error, &eoffset, NULL);
  147. } else {
  148. // Tack a '\z' at the end of PCRE. Parenthesize it first so that
  149. // the '\z' applies to all top-level alternatives in the regexp.
  150. std::string wrapped = "(?:"; // A non-counting grouping operator
  151. wrapped += pattern_;
  152. wrapped += ")\\z";
  153. re = pcre_compile(wrapped.c_str(),
  154. (options_ & EnabledCompileOptions),
  155. &error, &eoffset, NULL);
  156. }
  157. if (re == NULL) {
  158. if (error_ == &empty_string) error_ = new std::string(error);
  159. PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
  160. }
  161. return re;
  162. }
  163. /***** Convenience interfaces *****/
  164. bool PCRE::FullMatchFunctor::operator()(
  165. absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
  166. const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
  167. const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
  168. const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
  169. const Arg* args[kMaxArgs];
  170. int n = 0;
  171. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  172. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  173. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  174. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  175. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  176. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  177. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  178. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  179. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  180. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  181. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  182. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  183. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  184. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  185. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  186. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  187. done:
  188. size_t consumed;
  189. int vec[kVecSize] = {};
  190. return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  191. }
  192. bool PCRE::PartialMatchFunctor::operator()(
  193. absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
  194. const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
  195. const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
  196. const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
  197. const Arg* args[kMaxArgs];
  198. int n = 0;
  199. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  200. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  201. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  202. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  203. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  204. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  205. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  206. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  207. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  208. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  209. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  210. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  211. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  212. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  213. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  214. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  215. done:
  216. size_t consumed;
  217. int vec[kVecSize] = {};
  218. return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  219. }
  220. bool PCRE::ConsumeFunctor::operator()(
  221. absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
  222. const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
  223. const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
  224. const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
  225. const Arg* args[kMaxArgs];
  226. int n = 0;
  227. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  228. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  229. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  230. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  231. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  232. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  233. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  234. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  235. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  236. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  237. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  238. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  239. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  240. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  241. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  242. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  243. done:
  244. size_t consumed;
  245. int vec[kVecSize] = {};
  246. if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
  247. args, n, vec, kVecSize)) {
  248. input->remove_prefix(consumed);
  249. return true;
  250. } else {
  251. return false;
  252. }
  253. }
  254. bool PCRE::FindAndConsumeFunctor::operator()(
  255. absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
  256. const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
  257. const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
  258. const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
  259. const Arg* args[kMaxArgs];
  260. int n = 0;
  261. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  262. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  263. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  264. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  265. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  266. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  267. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  268. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  269. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  270. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  271. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  272. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  273. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  274. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  275. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  276. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  277. done:
  278. size_t consumed;
  279. int vec[kVecSize] = {};
  280. if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
  281. args, n, vec, kVecSize)) {
  282. input->remove_prefix(consumed);
  283. return true;
  284. } else {
  285. return false;
  286. }
  287. }
  288. bool PCRE::Replace(std::string* str, const PCRE& pattern,
  289. absl::string_view rewrite) {
  290. int vec[kVecSize] = {};
  291. int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
  292. if (matches == 0)
  293. return false;
  294. std::string s;
  295. if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
  296. return false;
  297. assert(vec[0] >= 0);
  298. assert(vec[1] >= 0);
  299. str->replace(vec[0], vec[1] - vec[0], s);
  300. return true;
  301. }
  302. int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
  303. absl::string_view rewrite) {
  304. int count = 0;
  305. int vec[kVecSize] = {};
  306. std::string out;
  307. size_t start = 0;
  308. bool last_match_was_empty_string = false;
  309. while (start <= str->size()) {
  310. // If the previous match was for the empty string, we shouldn't
  311. // just match again: we'll match in the same way and get an
  312. // infinite loop. Instead, we do the match in a special way:
  313. // anchored -- to force another try at the same position --
  314. // and with a flag saying that this time, ignore empty matches.
  315. // If this special match returns, that means there's a non-empty
  316. // match at this position as well, and we can continue. If not,
  317. // we do what perl does, and just advance by one.
  318. // Notice that perl prints '@@@' for this;
  319. // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
  320. int matches;
  321. if (last_match_was_empty_string) {
  322. matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
  323. vec, kVecSize);
  324. if (matches <= 0) {
  325. if (start < str->size())
  326. out.push_back((*str)[start]);
  327. start++;
  328. last_match_was_empty_string = false;
  329. continue;
  330. }
  331. } else {
  332. matches = pattern.TryMatch(*str, start, UNANCHORED, true,
  333. vec, kVecSize);
  334. if (matches <= 0)
  335. break;
  336. }
  337. size_t matchstart = vec[0], matchend = vec[1];
  338. assert(matchstart >= start);
  339. assert(matchend >= matchstart);
  340. out.append(*str, start, matchstart - start);
  341. pattern.Rewrite(&out, rewrite, *str, vec, matches);
  342. start = matchend;
  343. count++;
  344. last_match_was_empty_string = (matchstart == matchend);
  345. }
  346. if (count == 0)
  347. return 0;
  348. if (start < str->size())
  349. out.append(*str, start, str->size() - start);
  350. using std::swap;
  351. swap(out, *str);
  352. return count;
  353. }
  354. bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
  355. absl::string_view rewrite, std::string* out) {
  356. int vec[kVecSize] = {};
  357. int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
  358. if (matches == 0)
  359. return false;
  360. out->clear();
  361. return pattern.Rewrite(out, rewrite, text, vec, matches);
  362. }
  363. std::string PCRE::QuoteMeta(absl::string_view unquoted) {
  364. std::string result;
  365. result.reserve(unquoted.size() << 1);
  366. // Escape any ascii character not in [A-Za-z_0-9].
  367. //
  368. // Note that it's legal to escape a character even if it has no
  369. // special meaning in a regular expression -- so this function does
  370. // that. (This also makes it identical to the perl function of the
  371. // same name except for the null-character special case;
  372. // see `perldoc -f quotemeta`.)
  373. for (size_t ii = 0; ii < unquoted.size(); ++ii) {
  374. // Note that using 'isalnum' here raises the benchmark time from
  375. // 32ns to 58ns:
  376. if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  377. (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  378. (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  379. unquoted[ii] != '_' &&
  380. // If this is the part of a UTF8 or Latin1 character, we need
  381. // to copy this byte without escaping. Experimentally this is
  382. // what works correctly with the regexp library.
  383. !(unquoted[ii] & 128)) {
  384. if (unquoted[ii] == '\0') { // Special handling for null chars.
  385. // Can't use "\\0" since the next character might be a digit.
  386. result += "\\x00";
  387. continue;
  388. }
  389. result += '\\';
  390. }
  391. result += unquoted[ii];
  392. }
  393. return result;
  394. }
  395. /***** Actual matching and rewriting code *****/
  396. bool PCRE::HitLimit() {
  397. return hit_limit_ != 0;
  398. }
  399. void PCRE::ClearHitLimit() {
  400. hit_limit_ = 0;
  401. }
  402. int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
  403. bool empty_ok, int* vec, int vecsize) const {
  404. pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  405. if (re == NULL) {
  406. PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
  407. return 0;
  408. }
  409. int match_limit = match_limit_;
  410. if (match_limit <= 0) {
  411. match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
  412. }
  413. int stack_limit = stack_limit_;
  414. if (stack_limit <= 0) {
  415. stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
  416. }
  417. pcre_extra extra = { 0 };
  418. if (match_limit > 0) {
  419. extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  420. extra.match_limit = match_limit;
  421. }
  422. if (stack_limit > 0) {
  423. extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  424. extra.match_limit_recursion = stack_limit / kPCREFrameSize;
  425. }
  426. int options = 0;
  427. if (anchor != UNANCHORED)
  428. options |= PCRE_ANCHORED;
  429. if (!empty_ok)
  430. options |= PCRE_NOTEMPTY;
  431. int rc = pcre_exec(re, // The regular expression object
  432. &extra,
  433. (text.data() == NULL) ? "" : text.data(),
  434. static_cast<int>(text.size()),
  435. static_cast<int>(startpos),
  436. options,
  437. vec,
  438. vecsize);
  439. // Handle errors
  440. if (rc == 0) {
  441. // pcre_exec() returns 0 as a special case when the number of
  442. // capturing subpatterns exceeds the size of the vector.
  443. // When this happens, there is a match and the output vector
  444. // is filled, but we miss out on the positions of the extra subpatterns.
  445. rc = vecsize / 2;
  446. } else if (rc < 0) {
  447. switch (rc) {
  448. case PCRE_ERROR_NOMATCH:
  449. return 0;
  450. case PCRE_ERROR_MATCHLIMIT:
  451. // Writing to hit_limit is not safe if multiple threads
  452. // are using the PCRE, but the flag is only intended
  453. // for use by unit tests anyway, so we let it go.
  454. hit_limit_ = true;
  455. PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
  456. << " when matching '" << pattern_ << "'"
  457. << " against text that is " << text.size() << " bytes.";
  458. return 0;
  459. case PCRE_ERROR_RECURSIONLIMIT:
  460. // See comment about hit_limit above.
  461. hit_limit_ = true;
  462. PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
  463. << " when matching '" << pattern_ << "'"
  464. << " against text that is " << text.size() << " bytes.";
  465. return 0;
  466. default:
  467. // There are other return codes from pcre.h :
  468. // PCRE_ERROR_NULL (-2)
  469. // PCRE_ERROR_BADOPTION (-3)
  470. // PCRE_ERROR_BADMAGIC (-4)
  471. // PCRE_ERROR_UNKNOWN_NODE (-5)
  472. // PCRE_ERROR_NOMEMORY (-6)
  473. // PCRE_ERROR_NOSUBSTRING (-7)
  474. // ...
  475. PCREPORT(ERROR) << "Unexpected return code: " << rc
  476. << " when matching '" << pattern_ << "'"
  477. << ", re=" << re
  478. << ", text=" << text
  479. << ", vec=" << vec
  480. << ", vecsize=" << vecsize;
  481. return 0;
  482. }
  483. }
  484. return rc;
  485. }
  486. bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
  487. const Arg* const* args, int n, int* vec,
  488. int vecsize) const {
  489. assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
  490. if (NumberOfCapturingGroups() < n) {
  491. // RE has fewer capturing groups than number of Arg pointers passed in.
  492. return false;
  493. }
  494. int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
  495. assert(matches >= 0); // TryMatch never returns negatives
  496. if (matches == 0)
  497. return false;
  498. *consumed = vec[1];
  499. if (n == 0 || args == NULL) {
  500. // We are not interested in results
  501. return true;
  502. }
  503. // If we got here, we must have matched the whole pattern.
  504. // We do not need (can not do) any more checks on the value of 'matches' here
  505. // -- see the comment for TryMatch.
  506. for (int i = 0; i < n; i++) {
  507. const int start = vec[2*(i+1)];
  508. const int limit = vec[2*(i+1)+1];
  509. // Avoid invoking undefined behavior when text.data() happens
  510. // to be null and start happens to be -1, the latter being the
  511. // case for an unmatched subexpression. Even if text.data() is
  512. // not null, pointing one byte before was a longstanding bug.
  513. const char* addr = NULL;
  514. if (start != -1) {
  515. addr = text.data() + start;
  516. }
  517. if (!args[i]->Parse(addr, limit-start)) {
  518. // TODO: Should we indicate what the error was?
  519. return false;
  520. }
  521. }
  522. return true;
  523. }
  524. bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
  525. const Arg* const args[], int n) const {
  526. assert(n >= 0);
  527. const int vecsize = (1 + n) * 3; // results + PCRE workspace
  528. // (as for kVecSize)
  529. int* vec = new int[vecsize];
  530. bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
  531. delete[] vec;
  532. return b;
  533. }
  534. bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
  535. absl::string_view text, int* vec, int veclen) const {
  536. int number_of_capturing_groups = NumberOfCapturingGroups();
  537. for (const char *s = rewrite.data(), *end = s + rewrite.size();
  538. s < end; s++) {
  539. int c = *s;
  540. if (c == '\\') {
  541. c = *++s;
  542. if (isdigit(c)) {
  543. int n = (c - '0');
  544. if (n >= veclen) {
  545. if (n <= number_of_capturing_groups) {
  546. // unmatched optional capturing group. treat
  547. // its value as empty string; i.e., nothing to append.
  548. } else {
  549. PCREPORT(ERROR) << "requested group " << n
  550. << " in regexp " << rewrite.data();
  551. return false;
  552. }
  553. }
  554. int start = vec[2 * n];
  555. if (start >= 0)
  556. out->append(text.data() + start, vec[2 * n + 1] - start);
  557. } else if (c == '\\') {
  558. out->push_back('\\');
  559. } else {
  560. PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
  561. return false;
  562. }
  563. } else {
  564. out->push_back(c);
  565. }
  566. }
  567. return true;
  568. }
  569. bool PCRE::CheckRewriteString(absl::string_view rewrite,
  570. std::string* error) const {
  571. int max_token = -1;
  572. for (const char *s = rewrite.data(), *end = s + rewrite.size();
  573. s < end; s++) {
  574. int c = *s;
  575. if (c != '\\') {
  576. continue;
  577. }
  578. if (++s == end) {
  579. *error = "Rewrite schema error: '\\' not allowed at end.";
  580. return false;
  581. }
  582. c = *s;
  583. if (c == '\\') {
  584. continue;
  585. }
  586. if (!isdigit(c)) {
  587. *error = "Rewrite schema error: "
  588. "'\\' must be followed by a digit or '\\'.";
  589. return false;
  590. }
  591. int n = (c - '0');
  592. if (max_token < n) {
  593. max_token = n;
  594. }
  595. }
  596. if (max_token > NumberOfCapturingGroups()) {
  597. *error = absl::StrFormat(
  598. "Rewrite schema requests %d matches, but the regexp only has %d "
  599. "parenthesized subexpressions.",
  600. max_token, NumberOfCapturingGroups());
  601. return false;
  602. }
  603. return true;
  604. }
  605. // Return the number of capturing subpatterns, or -1 if the
  606. // regexp wasn't valid on construction.
  607. int PCRE::NumberOfCapturingGroups() const {
  608. if (re_partial_ == NULL) return -1;
  609. int result;
  610. int rc = pcre_fullinfo(re_partial_, // The regular expression object
  611. NULL, // We did not study the pattern
  612. PCRE_INFO_CAPTURECOUNT,
  613. &result);
  614. if (rc != 0) {
  615. PCREPORT(ERROR) << "Unexpected return code: " << rc;
  616. return -1;
  617. }
  618. return result;
  619. }
  620. /***** Parsers for various types *****/
  621. bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
  622. // We fail if somebody asked us to store into a non-NULL void* pointer
  623. return (dest == NULL);
  624. }
  625. bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
  626. if (dest == NULL) return true;
  627. reinterpret_cast<std::string*>(dest)->assign(str, n);
  628. return true;
  629. }
  630. bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
  631. if (dest == NULL) return true;
  632. *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n);
  633. return true;
  634. }
  635. bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
  636. if (n != 1) return false;
  637. if (dest == NULL) return true;
  638. *(reinterpret_cast<char*>(dest)) = str[0];
  639. return true;
  640. }
  641. bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
  642. if (n != 1) return false;
  643. if (dest == NULL) return true;
  644. *(reinterpret_cast<signed char*>(dest)) = str[0];
  645. return true;
  646. }
  647. bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
  648. if (n != 1) return false;
  649. if (dest == NULL) return true;
  650. *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  651. return true;
  652. }
  653. // Largest number spec that we are willing to parse
  654. static const int kMaxNumberLength = 32;
  655. // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
  656. // PCREQUIPCRES "n > 0"
  657. // Copies "str" into "buf" and null-terminates if necessary.
  658. // Returns one of:
  659. // a. "str" if no termination is needed
  660. // b. "buf" if the string was copied and null-terminated
  661. // c. "" if the input was invalid and has no hope of being parsed
  662. static const char* TerminateNumber(char* buf, const char* str, size_t n) {
  663. if ((n > 0) && isspace(*str)) {
  664. // We are less forgiving than the strtoxxx() routines and do not
  665. // allow leading spaces.
  666. return "";
  667. }
  668. // See if the character right after the input text may potentially
  669. // look like a digit.
  670. if (isdigit(str[n]) ||
  671. ((str[n] >= 'a') && (str[n] <= 'f')) ||
  672. ((str[n] >= 'A') && (str[n] <= 'F'))) {
  673. if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  674. memcpy(buf, str, n);
  675. buf[n] = '\0';
  676. return buf;
  677. } else {
  678. // We can parse right out of the supplied string, so return it.
  679. return str;
  680. }
  681. }
  682. bool PCRE::Arg::parse_long_radix(const char* str,
  683. size_t n,
  684. void* dest,
  685. int radix) {
  686. if (n == 0) return false;
  687. char buf[kMaxNumberLength+1];
  688. str = TerminateNumber(buf, str, n);
  689. char* end;
  690. errno = 0;
  691. long r = strtol(str, &end, radix);
  692. if (end != str + n) return false; // Leftover junk
  693. if (errno) return false;
  694. if (dest == NULL) return true;
  695. *(reinterpret_cast<long*>(dest)) = r;
  696. return true;
  697. }
  698. bool PCRE::Arg::parse_ulong_radix(const char* str,
  699. size_t n,
  700. void* dest,
  701. int radix) {
  702. if (n == 0) return false;
  703. char buf[kMaxNumberLength+1];
  704. str = TerminateNumber(buf, str, n);
  705. if (str[0] == '-') {
  706. // strtoul() will silently accept negative numbers and parse
  707. // them. This module is more strict and treats them as errors.
  708. return false;
  709. }
  710. char* end;
  711. errno = 0;
  712. unsigned long r = strtoul(str, &end, radix);
  713. if (end != str + n) return false; // Leftover junk
  714. if (errno) return false;
  715. if (dest == NULL) return true;
  716. *(reinterpret_cast<unsigned long*>(dest)) = r;
  717. return true;
  718. }
  719. bool PCRE::Arg::parse_short_radix(const char* str,
  720. size_t n,
  721. void* dest,
  722. int radix) {
  723. long r;
  724. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  725. if ((short)r != r) return false; // Out of range
  726. if (dest == NULL) return true;
  727. *(reinterpret_cast<short*>(dest)) = (short)r;
  728. return true;
  729. }
  730. bool PCRE::Arg::parse_ushort_radix(const char* str,
  731. size_t n,
  732. void* dest,
  733. int radix) {
  734. unsigned long r;
  735. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  736. if ((unsigned short)r != r) return false; // Out of range
  737. if (dest == NULL) return true;
  738. *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
  739. return true;
  740. }
  741. bool PCRE::Arg::parse_int_radix(const char* str,
  742. size_t n,
  743. void* dest,
  744. int radix) {
  745. long r;
  746. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  747. if ((int)r != r) return false; // Out of range
  748. if (dest == NULL) return true;
  749. *(reinterpret_cast<int*>(dest)) = (int)r;
  750. return true;
  751. }
  752. bool PCRE::Arg::parse_uint_radix(const char* str,
  753. size_t n,
  754. void* dest,
  755. int radix) {
  756. unsigned long r;
  757. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  758. if ((unsigned int)r != r) return false; // Out of range
  759. if (dest == NULL) return true;
  760. *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
  761. return true;
  762. }
  763. bool PCRE::Arg::parse_longlong_radix(const char* str,
  764. size_t n,
  765. void* dest,
  766. int radix) {
  767. if (n == 0) return false;
  768. char buf[kMaxNumberLength+1];
  769. str = TerminateNumber(buf, str, n);
  770. char* end;
  771. errno = 0;
  772. long long r = strtoll(str, &end, radix);
  773. if (end != str + n) return false; // Leftover junk
  774. if (errno) return false;
  775. if (dest == NULL) return true;
  776. *(reinterpret_cast<long long*>(dest)) = r;
  777. return true;
  778. }
  779. bool PCRE::Arg::parse_ulonglong_radix(const char* str,
  780. size_t n,
  781. void* dest,
  782. int radix) {
  783. if (n == 0) return false;
  784. char buf[kMaxNumberLength+1];
  785. str = TerminateNumber(buf, str, n);
  786. if (str[0] == '-') {
  787. // strtoull() will silently accept negative numbers and parse
  788. // them. This module is more strict and treats them as errors.
  789. return false;
  790. }
  791. char* end;
  792. errno = 0;
  793. unsigned long long r = strtoull(str, &end, radix);
  794. if (end != str + n) return false; // Leftover junk
  795. if (errno) return false;
  796. if (dest == NULL) return true;
  797. *(reinterpret_cast<unsigned long long*>(dest)) = r;
  798. return true;
  799. }
  800. static bool parse_double_float(const char* str, size_t n, bool isfloat,
  801. void* dest) {
  802. if (n == 0) return false;
  803. static const int kMaxLength = 200;
  804. char buf[kMaxLength];
  805. if (n >= kMaxLength) return false;
  806. memcpy(buf, str, n);
  807. buf[n] = '\0';
  808. char* end;
  809. errno = 0;
  810. double r;
  811. if (isfloat) {
  812. r = strtof(buf, &end);
  813. } else {
  814. r = strtod(buf, &end);
  815. }
  816. if (end != buf + n) return false; // Leftover junk
  817. if (errno) return false;
  818. if (dest == NULL) return true;
  819. if (isfloat) {
  820. *(reinterpret_cast<float*>(dest)) = (float)r;
  821. } else {
  822. *(reinterpret_cast<double*>(dest)) = r;
  823. }
  824. return true;
  825. }
  826. bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
  827. return parse_double_float(str, n, false, dest);
  828. }
  829. bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
  830. return parse_double_float(str, n, true, dest);
  831. }
  832. #define DEFINE_INTEGER_PARSER(name) \
  833. bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
  834. return parse_##name##_radix(str, n, dest, 10); \
  835. } \
  836. bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
  837. return parse_##name##_radix(str, n, dest, 16); \
  838. } \
  839. bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
  840. void* dest) { \
  841. return parse_##name##_radix(str, n, dest, 8); \
  842. } \
  843. bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
  844. void* dest) { \
  845. return parse_##name##_radix(str, n, dest, 0); \
  846. }
  847. DEFINE_INTEGER_PARSER(short);
  848. DEFINE_INTEGER_PARSER(ushort);
  849. DEFINE_INTEGER_PARSER(int);
  850. DEFINE_INTEGER_PARSER(uint);
  851. DEFINE_INTEGER_PARSER(long);
  852. DEFINE_INTEGER_PARSER(ulong);
  853. DEFINE_INTEGER_PARSER(longlong);
  854. DEFINE_INTEGER_PARSER(ulonglong);
  855. #undef DEFINE_INTEGER_PARSER
  856. } // namespace re2