regexp.cpp 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. #include "regexp.h"
  2. #include <util/generic/string.h>
  3. #include <util/string/ascii.h>
  4. #include <util/system/defaults.h>
  5. #include <cstdlib>
  6. #include <util/generic/noncopyable.h>
  7. class TGlobalImpl : TNonCopyable {
  8. private:
  9. const char* Str;
  10. regmatch_t* Pmatch;
  11. int Options;
  12. int StrLen;
  13. int StartOffset, NotEmptyOpts, MatchPos;
  14. int MatchBuf[NMATCHES * 3];
  15. pcre* PregComp;
  16. enum StateCode {
  17. TGI_EXIT,
  18. TGI_CONTINUE,
  19. TGI_WALKTHROUGH
  20. };
  21. private:
  22. void CopyResults(int count) {
  23. for (int i = 0; i < count; i++) {
  24. Pmatch[MatchPos].rm_so = MatchBuf[2 * i];
  25. Pmatch[MatchPos].rm_eo = MatchBuf[2 * i + 1];
  26. MatchPos++;
  27. if (MatchPos >= NMATCHES) {
  28. ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
  29. }
  30. }
  31. }
  32. int DoPcreExec(int opts) {
  33. int rc = pcre_exec(
  34. PregComp, /* the compiled pattern */
  35. nullptr, /* no extra data - we didn't study the pattern */
  36. Str, /* the subject string */
  37. StrLen, /* the length of the subject */
  38. StartOffset, /* start at offset 0 in the subject */
  39. opts, /* default options */
  40. MatchBuf, /* output vector for substring information */
  41. NMATCHES); /* number of elements in the output vector */
  42. if (rc == 0) {
  43. ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
  44. }
  45. return rc;
  46. }
  47. StateCode CheckEmptyCase() {
  48. if (MatchBuf[0] == MatchBuf[1]) { // founded an empty string
  49. if (MatchBuf[0] == StrLen) { // at the end
  50. return TGI_EXIT;
  51. }
  52. NotEmptyOpts = PCRE_NOTEMPTY | PCRE_ANCHORED; // trying to find non empty string
  53. }
  54. return TGI_WALKTHROUGH;
  55. }
  56. StateCode CheckNoMatch(int rc) {
  57. if (rc == PCRE_ERROR_NOMATCH) {
  58. if (NotEmptyOpts == 0) {
  59. return TGI_EXIT;
  60. }
  61. MatchBuf[1] = StartOffset + 1; // we have failed to find non-empty-string. trying to find again shifting "previous match offset"
  62. return TGI_CONTINUE;
  63. }
  64. return TGI_WALKTHROUGH;
  65. }
  66. public:
  67. TGlobalImpl(const char* st, regmatch_t& pma, int opts, pcre* pc_re)
  68. : Str(st)
  69. , Pmatch(&pma)
  70. , Options(opts)
  71. , StartOffset(0)
  72. , NotEmptyOpts(0)
  73. , MatchPos(0)
  74. , PregComp(pc_re)
  75. {
  76. memset(Pmatch, -1, sizeof(regmatch_t) * NMATCHES);
  77. StrLen = strlen(Str);
  78. }
  79. int ExecGlobal() {
  80. StartOffset = 0;
  81. int rc = DoPcreExec(Options);
  82. if (rc < 0) {
  83. return rc;
  84. }
  85. CopyResults(rc);
  86. do {
  87. NotEmptyOpts = 0;
  88. StartOffset = MatchBuf[1];
  89. if (CheckEmptyCase() == TGI_EXIT) {
  90. return 0;
  91. }
  92. rc = DoPcreExec(NotEmptyOpts | Options);
  93. switch (CheckNoMatch(rc)) {
  94. case TGI_CONTINUE:
  95. continue;
  96. case TGI_EXIT:
  97. return 0;
  98. case TGI_WALKTHROUGH:
  99. default:
  100. break;
  101. }
  102. if (rc < 0) {
  103. return rc;
  104. }
  105. CopyResults(rc);
  106. } while (true);
  107. return 0;
  108. }
  109. private:
  110. };
  111. class TRegExBaseImpl: public TAtomicRefCount<TRegExBaseImpl> {
  112. friend class TRegExBase;
  113. protected:
  114. int CompileOptions;
  115. TString RegExpr;
  116. regex_t Preg;
  117. public:
  118. TRegExBaseImpl()
  119. : CompileOptions(0)
  120. {
  121. memset(&Preg, 0, sizeof(Preg));
  122. }
  123. TRegExBaseImpl(const TString& re, int cflags)
  124. : CompileOptions(cflags)
  125. , RegExpr(re)
  126. {
  127. int rc = regcomp(&Preg, re.data(), cflags);
  128. if (rc) {
  129. const size_t ERRBUF_SIZE = 100;
  130. char errbuf[ERRBUF_SIZE];
  131. regerror(rc, &Preg, errbuf, ERRBUF_SIZE);
  132. Error = "Error: regular expression " + re + " is wrong: " + errbuf;
  133. ythrow yexception() << "RegExp " << re << ": " << Error.data();
  134. }
  135. }
  136. int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
  137. if (!RegExpr) {
  138. ythrow yexception() << "Regular expression is not compiled";
  139. }
  140. if (!str) {
  141. ythrow yexception() << "Empty string is passed to TRegExBaseImpl::Exec";
  142. }
  143. if ((eflags & REGEXP_GLOBAL) == 0) {
  144. return regexec(&Preg, str, nmatches, pmatch, eflags);
  145. } else {
  146. int options = 0;
  147. if ((eflags & REG_NOTBOL) != 0)
  148. options |= PCRE_NOTBOL;
  149. if ((eflags & REG_NOTEOL) != 0)
  150. options |= PCRE_NOTEOL;
  151. return TGlobalImpl(str, pmatch[0], options, (pcre*)Preg.re_pcre).ExecGlobal();
  152. }
  153. }
  154. bool IsCompiled() {
  155. return Preg.re_pcre;
  156. }
  157. ~TRegExBaseImpl() {
  158. regfree(&Preg);
  159. }
  160. private:
  161. TString Error;
  162. };
  163. bool TRegExBase::IsCompiled() const {
  164. return Impl && Impl->IsCompiled();
  165. }
  166. TRegExBase::TRegExBase(const char* re, int cflags) {
  167. if (re) {
  168. Compile(re, cflags);
  169. }
  170. }
  171. TRegExBase::TRegExBase(const TString& re, int cflags) {
  172. Compile(re, cflags);
  173. }
  174. TRegExBase::~TRegExBase() {
  175. }
  176. void TRegExBase::Compile(const TString& re, int cflags) {
  177. Impl = new TRegExBaseImpl(re, cflags);
  178. }
  179. int TRegExBase::Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
  180. if (!Impl)
  181. ythrow yexception() << "!Regular expression is not compiled";
  182. return Impl->Exec(str, pmatch, eflags, nmatches);
  183. }
  184. int TRegExBase::GetCompileOptions() const {
  185. if (!Impl)
  186. ythrow yexception() << "!Regular expression is not compiled";
  187. return Impl->CompileOptions;
  188. }
  189. TString TRegExBase::GetRegExpr() const {
  190. if (!Impl)
  191. ythrow yexception() << "!Regular expression is not compiled";
  192. return Impl->RegExpr;
  193. }
  194. TRegExMatch::TRegExMatch(const char* re, int cflags)
  195. : TRegExBase(re, cflags)
  196. {
  197. }
  198. TRegExMatch::TRegExMatch(const TString& re, int cflags)
  199. : TRegExBase(re, cflags)
  200. {
  201. }
  202. bool TRegExMatch::Match(const char* str) const {
  203. return Exec(str, nullptr, 0, 0) == 0;
  204. }
  205. TRegExSubst::TRegExSubst(const char* re, int cflags)
  206. : TRegExBase(re, cflags)
  207. , Replacement(nullptr)
  208. {
  209. memset(Brfs, 0, sizeof(TBackReferences) * NMATCHES);
  210. }
  211. TString TRegExSubst::Replace(const char* str, int eflags) {
  212. TString s;
  213. if (BrfsCount) {
  214. if (Exec(str, PMatch, eflags) == 0) {
  215. int i;
  216. for (i = 0; i < BrfsCount; i++) {
  217. s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg);
  218. if (Brfs[i].Refer >= 0 && Brfs[i].Refer < NMATCHES)
  219. s += TString(str, PMatch[Brfs[i].Refer].rm_so, int(PMatch[Brfs[i].Refer].rm_eo - PMatch[Brfs[i].Refer].rm_so));
  220. }
  221. s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg);
  222. }
  223. } else {
  224. s = Replacement;
  225. }
  226. return s;
  227. }
//***
// For the replacement string aaa.$1.$$$$.$2.bbb.$$$ccc, Brfs will contain:
// {beg = 0, end = 4, Refer = 1} => "aaa." + $1_match
// {beg = 6, end = 8, Refer = -1} => ".$"
// {beg = 9, end = 10, Refer = -1} => "$"
// {beg = 11, end = 12, Refer = 2} => "." + $2_match
// {beg = 14, end = 20, Refer = -1} => ".bbb.$"
// {beg = 21, end = 22, Refer = -1} => "$"
// {beg = 22, end = 25, Refer = -1} => "ccc"
// {beg = 0, end = 0, Refer = -1} (terminating entry)
//***
  239. int TRegExSubst::ParseReplacement(const char* repl) {
  240. Replacement = repl;
  241. if (!Replacement || *Replacement == 0)
  242. return 0;
  243. char* pos = (char*)Replacement;
  244. char* pos1 = nullptr;
  245. char* pos2 = nullptr;
  246. int i = 0;
  247. while (pos && *pos && i < NMATCHES) {
  248. pos1 = strchr(pos, '$');
  249. Brfs[i].Refer = -1;
  250. pos2 = pos1;
  251. if (pos1) {
  252. pos2 = pos1 + 1;
  253. while (IsAsciiDigit(*pos2))
  254. pos2++;
  255. if (pos2 > pos1 + 1) {
  256. Brfs[i].Refer = atol(TString(Replacement, pos1 + 1 - Replacement, pos2 - (pos1 + 1)).data());
  257. } else {
  258. pos1++;
  259. if (*pos2 == '$')
  260. pos2++;
  261. Brfs[i].Refer = -1;
  262. }
  263. }
  264. Brfs[i].Beg = int(pos - (char*)Replacement);
  265. Brfs[i].End = (pos1 == nullptr ? (int)strlen(Replacement) : int(pos1 - Replacement));
  266. pos = pos2;
  267. i++;
  268. }
  269. Brfs[i].Beg = Brfs[i].End = 0;
  270. Brfs[i].Refer = -1;
  271. BrfsCount = i;
  272. return BrfsCount;
  273. }