pcre.cc 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025
  1. // Copyright 2003-2009 Google Inc. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
  5. // The main changes are the addition of the HitLimit method and
  6. // compilation as PCRE in namespace re2.
  7. #include <assert.h>
  8. #include <ctype.h>
  9. #include <errno.h>
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #include <limits>
  13. #include <string>
  14. #include <utility>
  15. #include "util/util.h"
  16. #include "util/flags.h"
  17. #include "util/logging.h"
  18. #include "util/pcre.h"
  19. #include "util/strutil.h"
  20. // Silence warnings about the wacky formatting in the operator() functions.
  21. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
  22. #pragma GCC diagnostic ignored "-Wmisleading-indentation"
  23. #endif
  24. #define PCREPORT(level) LOG(level)
  25. // Default PCRE limits.
  26. // Defaults chosen to allow a plausible amount of CPU and
  27. // not exceed main thread stacks. Note that other threads
  28. // often have smaller stacks, and therefore tightening
  29. // regexp_stack_limit may frequently be necessary.
  30. DEFINE_FLAG(int, regexp_stack_limit, 256 << 10,
  31. "default PCRE stack limit (bytes)");
  32. DEFINE_FLAG(int, regexp_match_limit, 1000000,
  33. "default PCRE match limit (function calls)");
  34. #ifndef USEPCRE
  35. // Fake just enough of the PCRE API to allow this file to build. :)
  36. struct pcre_extra {
  37. int flags;
  38. int match_limit;
  39. int match_limit_recursion;
  40. };
  41. #define PCRE_EXTRA_MATCH_LIMIT 0
  42. #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
  43. #define PCRE_ANCHORED 0
  44. #define PCRE_NOTEMPTY 0
  45. #define PCRE_ERROR_NOMATCH 1
  46. #define PCRE_ERROR_MATCHLIMIT 2
  47. #define PCRE_ERROR_RECURSIONLIMIT 3
  48. #define PCRE_INFO_CAPTURECOUNT 0
  49. void pcre_free(void*) {
  50. }
  51. pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
  52. return NULL;
  53. }
  54. int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
  55. return 0;
  56. }
  57. int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
  58. return 0;
  59. }
  60. #endif
  61. namespace re2 {
  62. // Maximum number of args we can set
  63. static const int kMaxArgs = 16;
  64. static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
  65. // Approximate size of a recursive invocation of PCRE's
  66. // internal "match()" frame. This varies depending on the
  67. // compiler and architecture, of course, so the constant is
  68. // just a conservative estimate. To find the exact number,
  69. // run regexp_unittest with --regexp_stack_limit=0 under
  70. // a debugger and look at the frames when it crashes.
  71. // The exact frame size was 656 in production on 2008/02/03.
  72. static const int kPCREFrameSize = 700;
  73. // Special name for missing C++ arguments.
  74. PCRE::Arg PCRE::no_more_args((void*)NULL);
  75. const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
  76. const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
  77. const PCRE::ConsumeFunctor PCRE::Consume = { };
  78. const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
  79. // If a regular expression has no error, its error_ field points here
  80. static const std::string empty_string;
  81. void PCRE::Init(const char* pattern, Option options, int match_limit,
  82. int stack_limit, bool report_errors) {
  83. pattern_ = pattern;
  84. options_ = options;
  85. match_limit_ = match_limit;
  86. stack_limit_ = stack_limit;
  87. hit_limit_ = false;
  88. error_ = &empty_string;
  89. report_errors_ = report_errors;
  90. re_full_ = NULL;
  91. re_partial_ = NULL;
  92. if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
  93. error_ = new std::string("illegal regexp option");
  94. PCREPORT(ERROR)
  95. << "Error compiling '" << pattern << "': illegal regexp option";
  96. } else {
  97. re_partial_ = Compile(UNANCHORED);
  98. if (re_partial_ != NULL) {
  99. re_full_ = Compile(ANCHOR_BOTH);
  100. }
  101. }
  102. }
  103. PCRE::PCRE(const char* pattern) {
  104. Init(pattern, None, 0, 0, true);
  105. }
  106. PCRE::PCRE(const char* pattern, Option option) {
  107. Init(pattern, option, 0, 0, true);
  108. }
  109. PCRE::PCRE(const std::string& pattern) {
  110. Init(pattern.c_str(), None, 0, 0, true);
  111. }
  112. PCRE::PCRE(const std::string& pattern, Option option) {
  113. Init(pattern.c_str(), option, 0, 0, true);
  114. }
  115. PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
  116. Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
  117. re_option.stack_limit(), re_option.report_errors());
  118. }
  119. PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
  120. Init(pattern, re_option.option(), re_option.match_limit(),
  121. re_option.stack_limit(), re_option.report_errors());
  122. }
  123. PCRE::~PCRE() {
  124. if (re_full_ != NULL) pcre_free(re_full_);
  125. if (re_partial_ != NULL) pcre_free(re_partial_);
  126. if (error_ != &empty_string) delete error_;
  127. }
  128. pcre* PCRE::Compile(Anchor anchor) {
  129. // Special treatment for anchoring. This is needed because at
  130. // runtime pcre only provides an option for anchoring at the
  131. // beginning of a string.
  132. //
  133. // There are three types of anchoring we want:
  134. // UNANCHORED Compile the original pattern, and use
  135. // a pcre unanchored match.
  136. // ANCHOR_START Compile the original pattern, and use
  137. // a pcre anchored match.
  138. // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
  139. // and use a pcre anchored match.
  140. const char* error = "";
  141. int eoffset;
  142. pcre* re;
  143. if (anchor != ANCHOR_BOTH) {
  144. re = pcre_compile(pattern_.c_str(),
  145. (options_ & EnabledCompileOptions),
  146. &error, &eoffset, NULL);
  147. } else {
  148. // Tack a '\z' at the end of PCRE. Parenthesize it first so that
  149. // the '\z' applies to all top-level alternatives in the regexp.
  150. std::string wrapped = "(?:"; // A non-counting grouping operator
  151. wrapped += pattern_;
  152. wrapped += ")\\z";
  153. re = pcre_compile(wrapped.c_str(),
  154. (options_ & EnabledCompileOptions),
  155. &error, &eoffset, NULL);
  156. }
  157. if (re == NULL) {
  158. if (error_ == &empty_string) error_ = new std::string(error);
  159. PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
  160. }
  161. return re;
  162. }
  163. /***** Convenience interfaces *****/
  164. bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
  165. const PCRE& re,
  166. const Arg& a0,
  167. const Arg& a1,
  168. const Arg& a2,
  169. const Arg& a3,
  170. const Arg& a4,
  171. const Arg& a5,
  172. const Arg& a6,
  173. const Arg& a7,
  174. const Arg& a8,
  175. const Arg& a9,
  176. const Arg& a10,
  177. const Arg& a11,
  178. const Arg& a12,
  179. const Arg& a13,
  180. const Arg& a14,
  181. const Arg& a15) const {
  182. const Arg* args[kMaxArgs];
  183. int n = 0;
  184. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  185. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  186. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  187. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  188. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  189. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  190. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  191. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  192. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  193. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  194. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  195. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  196. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  197. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  198. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  199. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  200. done:
  201. size_t consumed;
  202. int vec[kVecSize] = {};
  203. return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  204. }
  205. bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
  206. const PCRE& re,
  207. const Arg& a0,
  208. const Arg& a1,
  209. const Arg& a2,
  210. const Arg& a3,
  211. const Arg& a4,
  212. const Arg& a5,
  213. const Arg& a6,
  214. const Arg& a7,
  215. const Arg& a8,
  216. const Arg& a9,
  217. const Arg& a10,
  218. const Arg& a11,
  219. const Arg& a12,
  220. const Arg& a13,
  221. const Arg& a14,
  222. const Arg& a15) const {
  223. const Arg* args[kMaxArgs];
  224. int n = 0;
  225. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  226. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  227. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  228. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  229. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  230. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  231. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  232. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  233. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  234. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  235. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  236. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  237. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  238. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  239. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  240. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  241. done:
  242. size_t consumed;
  243. int vec[kVecSize] = {};
  244. return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  245. }
  246. bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
  247. const PCRE& pattern,
  248. const Arg& a0,
  249. const Arg& a1,
  250. const Arg& a2,
  251. const Arg& a3,
  252. const Arg& a4,
  253. const Arg& a5,
  254. const Arg& a6,
  255. const Arg& a7,
  256. const Arg& a8,
  257. const Arg& a9,
  258. const Arg& a10,
  259. const Arg& a11,
  260. const Arg& a12,
  261. const Arg& a13,
  262. const Arg& a14,
  263. const Arg& a15) const {
  264. const Arg* args[kMaxArgs];
  265. int n = 0;
  266. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  267. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  268. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  269. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  270. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  271. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  272. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  273. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  274. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  275. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  276. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  277. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  278. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  279. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  280. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  281. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  282. done:
  283. size_t consumed;
  284. int vec[kVecSize] = {};
  285. if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
  286. args, n, vec, kVecSize)) {
  287. input->remove_prefix(consumed);
  288. return true;
  289. } else {
  290. return false;
  291. }
  292. }
  293. bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
  294. const PCRE& pattern,
  295. const Arg& a0,
  296. const Arg& a1,
  297. const Arg& a2,
  298. const Arg& a3,
  299. const Arg& a4,
  300. const Arg& a5,
  301. const Arg& a6,
  302. const Arg& a7,
  303. const Arg& a8,
  304. const Arg& a9,
  305. const Arg& a10,
  306. const Arg& a11,
  307. const Arg& a12,
  308. const Arg& a13,
  309. const Arg& a14,
  310. const Arg& a15) const {
  311. const Arg* args[kMaxArgs];
  312. int n = 0;
  313. if (&a0 == &no_more_args) goto done; args[n++] = &a0;
  314. if (&a1 == &no_more_args) goto done; args[n++] = &a1;
  315. if (&a2 == &no_more_args) goto done; args[n++] = &a2;
  316. if (&a3 == &no_more_args) goto done; args[n++] = &a3;
  317. if (&a4 == &no_more_args) goto done; args[n++] = &a4;
  318. if (&a5 == &no_more_args) goto done; args[n++] = &a5;
  319. if (&a6 == &no_more_args) goto done; args[n++] = &a6;
  320. if (&a7 == &no_more_args) goto done; args[n++] = &a7;
  321. if (&a8 == &no_more_args) goto done; args[n++] = &a8;
  322. if (&a9 == &no_more_args) goto done; args[n++] = &a9;
  323. if (&a10 == &no_more_args) goto done; args[n++] = &a10;
  324. if (&a11 == &no_more_args) goto done; args[n++] = &a11;
  325. if (&a12 == &no_more_args) goto done; args[n++] = &a12;
  326. if (&a13 == &no_more_args) goto done; args[n++] = &a13;
  327. if (&a14 == &no_more_args) goto done; args[n++] = &a14;
  328. if (&a15 == &no_more_args) goto done; args[n++] = &a15;
  329. done:
  330. size_t consumed;
  331. int vec[kVecSize] = {};
  332. if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
  333. args, n, vec, kVecSize)) {
  334. input->remove_prefix(consumed);
  335. return true;
  336. } else {
  337. return false;
  338. }
  339. }
  340. bool PCRE::Replace(std::string *str,
  341. const PCRE& pattern,
  342. const StringPiece& rewrite) {
  343. int vec[kVecSize] = {};
  344. int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
  345. if (matches == 0)
  346. return false;
  347. std::string s;
  348. if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
  349. return false;
  350. assert(vec[0] >= 0);
  351. assert(vec[1] >= 0);
  352. str->replace(vec[0], vec[1] - vec[0], s);
  353. return true;
  354. }
  355. int PCRE::GlobalReplace(std::string *str,
  356. const PCRE& pattern,
  357. const StringPiece& rewrite) {
  358. int count = 0;
  359. int vec[kVecSize] = {};
  360. std::string out;
  361. size_t start = 0;
  362. bool last_match_was_empty_string = false;
  363. while (start <= str->size()) {
  364. // If the previous match was for the empty string, we shouldn't
  365. // just match again: we'll match in the same way and get an
  366. // infinite loop. Instead, we do the match in a special way:
  367. // anchored -- to force another try at the same position --
  368. // and with a flag saying that this time, ignore empty matches.
  369. // If this special match returns, that means there's a non-empty
  370. // match at this position as well, and we can continue. If not,
  371. // we do what perl does, and just advance by one.
  372. // Notice that perl prints '@@@' for this;
  373. // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
  374. int matches;
  375. if (last_match_was_empty_string) {
  376. matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
  377. vec, kVecSize);
  378. if (matches <= 0) {
  379. if (start < str->size())
  380. out.push_back((*str)[start]);
  381. start++;
  382. last_match_was_empty_string = false;
  383. continue;
  384. }
  385. } else {
  386. matches = pattern.TryMatch(*str, start, UNANCHORED, true,
  387. vec, kVecSize);
  388. if (matches <= 0)
  389. break;
  390. }
  391. size_t matchstart = vec[0], matchend = vec[1];
  392. assert(matchstart >= start);
  393. assert(matchend >= matchstart);
  394. out.append(*str, start, matchstart - start);
  395. pattern.Rewrite(&out, rewrite, *str, vec, matches);
  396. start = matchend;
  397. count++;
  398. last_match_was_empty_string = (matchstart == matchend);
  399. }
  400. if (count == 0)
  401. return 0;
  402. if (start < str->size())
  403. out.append(*str, start, str->size() - start);
  404. using std::swap;
  405. swap(out, *str);
  406. return count;
  407. }
  408. bool PCRE::Extract(const StringPiece &text,
  409. const PCRE& pattern,
  410. const StringPiece &rewrite,
  411. std::string *out) {
  412. int vec[kVecSize] = {};
  413. int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
  414. if (matches == 0)
  415. return false;
  416. out->clear();
  417. return pattern.Rewrite(out, rewrite, text, vec, matches);
  418. }
  419. std::string PCRE::QuoteMeta(const StringPiece& unquoted) {
  420. std::string result;
  421. result.reserve(unquoted.size() << 1);
  422. // Escape any ascii character not in [A-Za-z_0-9].
  423. //
  424. // Note that it's legal to escape a character even if it has no
  425. // special meaning in a regular expression -- so this function does
  426. // that. (This also makes it identical to the perl function of the
  427. // same name except for the null-character special case;
  428. // see `perldoc -f quotemeta`.)
  429. for (size_t ii = 0; ii < unquoted.size(); ++ii) {
  430. // Note that using 'isalnum' here raises the benchmark time from
  431. // 32ns to 58ns:
  432. if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  433. (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  434. (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  435. unquoted[ii] != '_' &&
  436. // If this is the part of a UTF8 or Latin1 character, we need
  437. // to copy this byte without escaping. Experimentally this is
  438. // what works correctly with the regexp library.
  439. !(unquoted[ii] & 128)) {
  440. if (unquoted[ii] == '\0') { // Special handling for null chars.
  441. // Can't use "\\0" since the next character might be a digit.
  442. result += "\\x00";
  443. continue;
  444. }
  445. result += '\\';
  446. }
  447. result += unquoted[ii];
  448. }
  449. return result;
  450. }
  451. /***** Actual matching and rewriting code *****/
  452. bool PCRE::HitLimit() {
  453. return hit_limit_ != 0;
  454. }
  455. void PCRE::ClearHitLimit() {
  456. hit_limit_ = 0;
  457. }
  458. int PCRE::TryMatch(const StringPiece& text,
  459. size_t startpos,
  460. Anchor anchor,
  461. bool empty_ok,
  462. int *vec,
  463. int vecsize) const {
  464. pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  465. if (re == NULL) {
  466. PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
  467. return 0;
  468. }
  469. int match_limit = match_limit_;
  470. if (match_limit <= 0) {
  471. match_limit = GetFlag(FLAGS_regexp_match_limit);
  472. }
  473. int stack_limit = stack_limit_;
  474. if (stack_limit <= 0) {
  475. stack_limit = GetFlag(FLAGS_regexp_stack_limit);
  476. }
  477. pcre_extra extra = { 0 };
  478. if (match_limit > 0) {
  479. extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  480. extra.match_limit = match_limit;
  481. }
  482. if (stack_limit > 0) {
  483. extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  484. extra.match_limit_recursion = stack_limit / kPCREFrameSize;
  485. }
  486. int options = 0;
  487. if (anchor != UNANCHORED)
  488. options |= PCRE_ANCHORED;
  489. if (!empty_ok)
  490. options |= PCRE_NOTEMPTY;
  491. int rc = pcre_exec(re, // The regular expression object
  492. &extra,
  493. (text.data() == NULL) ? "" : text.data(),
  494. static_cast<int>(text.size()),
  495. static_cast<int>(startpos),
  496. options,
  497. vec,
  498. vecsize);
  499. // Handle errors
  500. if (rc == 0) {
  501. // pcre_exec() returns 0 as a special case when the number of
  502. // capturing subpatterns exceeds the size of the vector.
  503. // When this happens, there is a match and the output vector
  504. // is filled, but we miss out on the positions of the extra subpatterns.
  505. rc = vecsize / 2;
  506. } else if (rc < 0) {
  507. switch (rc) {
  508. case PCRE_ERROR_NOMATCH:
  509. return 0;
  510. case PCRE_ERROR_MATCHLIMIT:
  511. // Writing to hit_limit is not safe if multiple threads
  512. // are using the PCRE, but the flag is only intended
  513. // for use by unit tests anyway, so we let it go.
  514. hit_limit_ = true;
  515. PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
  516. << " when matching '" << pattern_ << "'"
  517. << " against text that is " << text.size() << " bytes.";
  518. return 0;
  519. case PCRE_ERROR_RECURSIONLIMIT:
  520. // See comment about hit_limit above.
  521. hit_limit_ = true;
  522. PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
  523. << " when matching '" << pattern_ << "'"
  524. << " against text that is " << text.size() << " bytes.";
  525. return 0;
  526. default:
  527. // There are other return codes from pcre.h :
  528. // PCRE_ERROR_NULL (-2)
  529. // PCRE_ERROR_BADOPTION (-3)
  530. // PCRE_ERROR_BADMAGIC (-4)
  531. // PCRE_ERROR_UNKNOWN_NODE (-5)
  532. // PCRE_ERROR_NOMEMORY (-6)
  533. // PCRE_ERROR_NOSUBSTRING (-7)
  534. // ...
  535. PCREPORT(ERROR) << "Unexpected return code: " << rc
  536. << " when matching '" << pattern_ << "'"
  537. << ", re=" << re
  538. << ", text=" << text
  539. << ", vec=" << vec
  540. << ", vecsize=" << vecsize;
  541. return 0;
  542. }
  543. }
  544. return rc;
  545. }
  546. bool PCRE::DoMatchImpl(const StringPiece& text,
  547. Anchor anchor,
  548. size_t* consumed,
  549. const Arg* const* args,
  550. int n,
  551. int* vec,
  552. int vecsize) const {
  553. assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
  554. if (NumberOfCapturingGroups() < n) {
  555. // RE has fewer capturing groups than number of Arg pointers passed in.
  556. return false;
  557. }
  558. int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
  559. assert(matches >= 0); // TryMatch never returns negatives
  560. if (matches == 0)
  561. return false;
  562. *consumed = vec[1];
  563. if (n == 0 || args == NULL) {
  564. // We are not interested in results
  565. return true;
  566. }
  567. // If we got here, we must have matched the whole pattern.
  568. // We do not need (can not do) any more checks on the value of 'matches' here
  569. // -- see the comment for TryMatch.
  570. for (int i = 0; i < n; i++) {
  571. const int start = vec[2*(i+1)];
  572. const int limit = vec[2*(i+1)+1];
  573. // Avoid invoking undefined behavior when text.data() happens
  574. // to be null and start happens to be -1, the latter being the
  575. // case for an unmatched subexpression. Even if text.data() is
  576. // not null, pointing one byte before was a longstanding bug.
  577. const char* addr = NULL;
  578. if (start != -1) {
  579. addr = text.data() + start;
  580. }
  581. if (!args[i]->Parse(addr, limit-start)) {
  582. // TODO: Should we indicate what the error was?
  583. return false;
  584. }
  585. }
  586. return true;
  587. }
  588. bool PCRE::DoMatch(const StringPiece& text,
  589. Anchor anchor,
  590. size_t* consumed,
  591. const Arg* const args[],
  592. int n) const {
  593. assert(n >= 0);
  594. const int vecsize = (1 + n) * 3; // results + PCRE workspace
  595. // (as for kVecSize)
  596. int* vec = new int[vecsize];
  597. bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
  598. delete[] vec;
  599. return b;
  600. }
  601. bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite,
  602. const StringPiece &text, int *vec, int veclen) const {
  603. int number_of_capturing_groups = NumberOfCapturingGroups();
  604. for (const char *s = rewrite.data(), *end = s + rewrite.size();
  605. s < end; s++) {
  606. int c = *s;
  607. if (c == '\\') {
  608. c = *++s;
  609. if (isdigit(c)) {
  610. int n = (c - '0');
  611. if (n >= veclen) {
  612. if (n <= number_of_capturing_groups) {
  613. // unmatched optional capturing group. treat
  614. // its value as empty string; i.e., nothing to append.
  615. } else {
  616. PCREPORT(ERROR) << "requested group " << n
  617. << " in regexp " << rewrite.data();
  618. return false;
  619. }
  620. }
  621. int start = vec[2 * n];
  622. if (start >= 0)
  623. out->append(text.data() + start, vec[2 * n + 1] - start);
  624. } else if (c == '\\') {
  625. out->push_back('\\');
  626. } else {
  627. PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
  628. return false;
  629. }
  630. } else {
  631. out->push_back(c);
  632. }
  633. }
  634. return true;
  635. }
  636. bool PCRE::CheckRewriteString(const StringPiece& rewrite,
  637. std::string* error) const {
  638. int max_token = -1;
  639. for (const char *s = rewrite.data(), *end = s + rewrite.size();
  640. s < end; s++) {
  641. int c = *s;
  642. if (c != '\\') {
  643. continue;
  644. }
  645. if (++s == end) {
  646. *error = "Rewrite schema error: '\\' not allowed at end.";
  647. return false;
  648. }
  649. c = *s;
  650. if (c == '\\') {
  651. continue;
  652. }
  653. if (!isdigit(c)) {
  654. *error = "Rewrite schema error: "
  655. "'\\' must be followed by a digit or '\\'.";
  656. return false;
  657. }
  658. int n = (c - '0');
  659. if (max_token < n) {
  660. max_token = n;
  661. }
  662. }
  663. if (max_token > NumberOfCapturingGroups()) {
  664. *error = StringPrintf(
  665. "Rewrite schema requests %d matches, but the regexp only has %d "
  666. "parenthesized subexpressions.",
  667. max_token, NumberOfCapturingGroups());
  668. return false;
  669. }
  670. return true;
  671. }
  672. // Return the number of capturing subpatterns, or -1 if the
  673. // regexp wasn't valid on construction.
  674. int PCRE::NumberOfCapturingGroups() const {
  675. if (re_partial_ == NULL) return -1;
  676. int result;
  677. int rc = pcre_fullinfo(re_partial_, // The regular expression object
  678. NULL, // We did not study the pattern
  679. PCRE_INFO_CAPTURECOUNT,
  680. &result);
  681. if (rc != 0) {
  682. PCREPORT(ERROR) << "Unexpected return code: " << rc;
  683. return -1;
  684. }
  685. return result;
  686. }
  687. /***** Parsers for various types *****/
  688. bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
  689. // We fail if somebody asked us to store into a non-NULL void* pointer
  690. return (dest == NULL);
  691. }
  692. bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
  693. if (dest == NULL) return true;
  694. reinterpret_cast<std::string*>(dest)->assign(str, n);
  695. return true;
  696. }
  697. bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
  698. if (dest == NULL) return true;
  699. *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
  700. return true;
  701. }
  702. bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
  703. if (n != 1) return false;
  704. if (dest == NULL) return true;
  705. *(reinterpret_cast<char*>(dest)) = str[0];
  706. return true;
  707. }
  708. bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
  709. if (n != 1) return false;
  710. if (dest == NULL) return true;
  711. *(reinterpret_cast<signed char*>(dest)) = str[0];
  712. return true;
  713. }
  714. bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
  715. if (n != 1) return false;
  716. if (dest == NULL) return true;
  717. *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  718. return true;
  719. }
  720. // Largest number spec that we are willing to parse
  721. static const int kMaxNumberLength = 32;
  722. // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
  723. // PCREQUIPCRES "n > 0"
  724. // Copies "str" into "buf" and null-terminates if necessary.
  725. // Returns one of:
  726. // a. "str" if no termination is needed
  727. // b. "buf" if the string was copied and null-terminated
  728. // c. "" if the input was invalid and has no hope of being parsed
  729. static const char* TerminateNumber(char* buf, const char* str, size_t n) {
  730. if ((n > 0) && isspace(*str)) {
  731. // We are less forgiving than the strtoxxx() routines and do not
  732. // allow leading spaces.
  733. return "";
  734. }
  735. // See if the character right after the input text may potentially
  736. // look like a digit.
  737. if (isdigit(str[n]) ||
  738. ((str[n] >= 'a') && (str[n] <= 'f')) ||
  739. ((str[n] >= 'A') && (str[n] <= 'F'))) {
  740. if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  741. memcpy(buf, str, n);
  742. buf[n] = '\0';
  743. return buf;
  744. } else {
  745. // We can parse right out of the supplied string, so return it.
  746. return str;
  747. }
  748. }
  749. bool PCRE::Arg::parse_long_radix(const char* str,
  750. size_t n,
  751. void* dest,
  752. int radix) {
  753. if (n == 0) return false;
  754. char buf[kMaxNumberLength+1];
  755. str = TerminateNumber(buf, str, n);
  756. char* end;
  757. errno = 0;
  758. long r = strtol(str, &end, radix);
  759. if (end != str + n) return false; // Leftover junk
  760. if (errno) return false;
  761. if (dest == NULL) return true;
  762. *(reinterpret_cast<long*>(dest)) = r;
  763. return true;
  764. }
  765. bool PCRE::Arg::parse_ulong_radix(const char* str,
  766. size_t n,
  767. void* dest,
  768. int radix) {
  769. if (n == 0) return false;
  770. char buf[kMaxNumberLength+1];
  771. str = TerminateNumber(buf, str, n);
  772. if (str[0] == '-') {
  773. // strtoul() will silently accept negative numbers and parse
  774. // them. This module is more strict and treats them as errors.
  775. return false;
  776. }
  777. char* end;
  778. errno = 0;
  779. unsigned long r = strtoul(str, &end, radix);
  780. if (end != str + n) return false; // Leftover junk
  781. if (errno) return false;
  782. if (dest == NULL) return true;
  783. *(reinterpret_cast<unsigned long*>(dest)) = r;
  784. return true;
  785. }
  786. bool PCRE::Arg::parse_short_radix(const char* str,
  787. size_t n,
  788. void* dest,
  789. int radix) {
  790. long r;
  791. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  792. if ((short)r != r) return false; // Out of range
  793. if (dest == NULL) return true;
  794. *(reinterpret_cast<short*>(dest)) = (short)r;
  795. return true;
  796. }
  797. bool PCRE::Arg::parse_ushort_radix(const char* str,
  798. size_t n,
  799. void* dest,
  800. int radix) {
  801. unsigned long r;
  802. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  803. if ((unsigned short)r != r) return false; // Out of range
  804. if (dest == NULL) return true;
  805. *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
  806. return true;
  807. }
  808. bool PCRE::Arg::parse_int_radix(const char* str,
  809. size_t n,
  810. void* dest,
  811. int radix) {
  812. long r;
  813. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  814. if ((int)r != r) return false; // Out of range
  815. if (dest == NULL) return true;
  816. *(reinterpret_cast<int*>(dest)) = (int)r;
  817. return true;
  818. }
  819. bool PCRE::Arg::parse_uint_radix(const char* str,
  820. size_t n,
  821. void* dest,
  822. int radix) {
  823. unsigned long r;
  824. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  825. if ((unsigned int)r != r) return false; // Out of range
  826. if (dest == NULL) return true;
  827. *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
  828. return true;
  829. }
  830. bool PCRE::Arg::parse_longlong_radix(const char* str,
  831. size_t n,
  832. void* dest,
  833. int radix) {
  834. if (n == 0) return false;
  835. char buf[kMaxNumberLength+1];
  836. str = TerminateNumber(buf, str, n);
  837. char* end;
  838. errno = 0;
  839. long long r = strtoll(str, &end, radix);
  840. if (end != str + n) return false; // Leftover junk
  841. if (errno) return false;
  842. if (dest == NULL) return true;
  843. *(reinterpret_cast<long long*>(dest)) = r;
  844. return true;
  845. }
  846. bool PCRE::Arg::parse_ulonglong_radix(const char* str,
  847. size_t n,
  848. void* dest,
  849. int radix) {
  850. if (n == 0) return false;
  851. char buf[kMaxNumberLength+1];
  852. str = TerminateNumber(buf, str, n);
  853. if (str[0] == '-') {
  854. // strtoull() will silently accept negative numbers and parse
  855. // them. This module is more strict and treats them as errors.
  856. return false;
  857. }
  858. char* end;
  859. errno = 0;
  860. unsigned long long r = strtoull(str, &end, radix);
  861. if (end != str + n) return false; // Leftover junk
  862. if (errno) return false;
  863. if (dest == NULL) return true;
  864. *(reinterpret_cast<unsigned long long*>(dest)) = r;
  865. return true;
  866. }
  867. static bool parse_double_float(const char* str, size_t n, bool isfloat,
  868. void* dest) {
  869. if (n == 0) return false;
  870. static const int kMaxLength = 200;
  871. char buf[kMaxLength];
  872. if (n >= kMaxLength) return false;
  873. memcpy(buf, str, n);
  874. buf[n] = '\0';
  875. char* end;
  876. errno = 0;
  877. double r;
  878. if (isfloat) {
  879. r = strtof(buf, &end);
  880. } else {
  881. r = strtod(buf, &end);
  882. }
  883. if (end != buf + n) return false; // Leftover junk
  884. if (errno) return false;
  885. if (dest == NULL) return true;
  886. if (isfloat) {
  887. *(reinterpret_cast<float*>(dest)) = (float)r;
  888. } else {
  889. *(reinterpret_cast<double*>(dest)) = r;
  890. }
  891. return true;
  892. }
  893. bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
  894. return parse_double_float(str, n, false, dest);
  895. }
  896. bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
  897. return parse_double_float(str, n, true, dest);
  898. }
  899. #define DEFINE_INTEGER_PARSER(name) \
  900. bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
  901. return parse_##name##_radix(str, n, dest, 10); \
  902. } \
  903. bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
  904. return parse_##name##_radix(str, n, dest, 16); \
  905. } \
  906. bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
  907. void* dest) { \
  908. return parse_##name##_radix(str, n, dest, 8); \
  909. } \
  910. bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
  911. void* dest) { \
  912. return parse_##name##_radix(str, n, dest, 0); \
  913. }
  914. DEFINE_INTEGER_PARSER(short);
  915. DEFINE_INTEGER_PARSER(ushort);
  916. DEFINE_INTEGER_PARSER(int);
  917. DEFINE_INTEGER_PARSER(uint);
  918. DEFINE_INTEGER_PARSER(long);
  919. DEFINE_INTEGER_PARSER(ulong);
  920. DEFINE_INTEGER_PARSER(longlong);
  921. DEFINE_INTEGER_PARSER(ulonglong);
  922. #undef DEFINE_INTEGER_PARSER
  923. } // namespace re2