regexp.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. #pragma once
  2. #include "pire.h"
  3. #include <library/cpp/charset/doccodes.h>
  4. #include <library/cpp/charset/recyr.hh>
  5. #include <util/generic/maybe.h>
  6. #include <util/generic/strbuf.h>
  7. #include <util/generic/string.h>
  8. #include <util/generic/vector.h>
  9. #include <util/generic/yexception.h>
  10. namespace NRegExp {
  11. struct TMatcher;
  12. struct TFsmBase {
  13. struct TOptions {
  14. inline TOptions& SetCaseInsensitive(bool v) noexcept {
  15. CaseInsensitive = v;
  16. return *this;
  17. }
  18. inline TOptions& SetSurround(bool v) noexcept {
  19. Surround = v;
  20. return *this;
  21. }
  22. inline TOptions& SetCapture(size_t pos) noexcept {
  23. CapturePos = pos;
  24. return *this;
  25. }
  26. inline TOptions& SetCharset(ECharset charset) noexcept {
  27. Charset = charset;
  28. return *this;
  29. }
  30. inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
  31. AndNotSupport = andNotSupport;
  32. return *this;
  33. }
  34. bool CaseInsensitive = false;
  35. bool Surround = false;
  36. TMaybe<size_t> CapturePos;
  37. ECharset Charset = CODES_UNKNOWN;
  38. bool AndNotSupport = false;
  39. };
  40. static inline NPire::TFsm Parse(const TStringBuf& regexp,
  41. const TOptions& opts, const bool needDetermine = true) {
  42. NPire::TLexer lexer;
  43. if (opts.Charset == CODES_UNKNOWN) {
  44. lexer.Assign(regexp.data(), regexp.data() + regexp.size());
  45. } else {
  46. TVector<wchar32> ucs4(regexp.size() + 1);
  47. size_t inRead = 0;
  48. size_t outWritten = 0;
  49. int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
  50. regexp.size(), regexp.size(), inRead, outWritten);
  51. Y_ASSERT(recodeRes == RECODE_OK);
  52. Y_ASSERT(outWritten < ucs4.size());
  53. ucs4[outWritten] = 0;
  54. lexer.Assign(ucs4.begin(),
  55. ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data()));
  56. }
  57. if (opts.CaseInsensitive) {
  58. lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
  59. }
  60. if (opts.CapturePos) {
  61. lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
  62. }
  63. if (opts.AndNotSupport) {
  64. lexer.AddFeature(NPire::NFeatures::AndNotSupport());
  65. }
  66. switch (opts.Charset) {
  67. case CODES_UNKNOWN:
  68. break;
  69. case CODES_UTF8:
  70. lexer.SetEncoding(NPire::NEncodings::Utf8());
  71. break;
  72. case CODES_KOI8:
  73. lexer.SetEncoding(NPire::NEncodings::Koi8r());
  74. break;
  75. default:
  76. lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
  77. break;
  78. }
  79. NPire::TFsm ret = lexer.Parse();
  80. if (opts.Surround) {
  81. ret.Surround();
  82. }
  83. if (needDetermine) {
  84. ret.Determine();
  85. }
  86. return ret;
  87. }
  88. };
  89. template <class TScannerType>
  90. class TFsmParser: public TFsmBase {
  91. public:
  92. typedef TScannerType TScanner;
  93. public:
  94. inline explicit TFsmParser(const TStringBuf& regexp,
  95. const TOptions& opts = TOptions(), bool needDetermine = true)
  96. : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
  97. {
  98. }
  99. inline const TScanner& GetScanner() const noexcept {
  100. return Scanner;
  101. }
  102. static inline TFsmParser False() {
  103. return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
  104. }
  105. inline explicit TFsmParser(const TScanner& compiled)
  106. : Scanner(compiled)
  107. {
  108. if (Scanner.Empty())
  109. ythrow yexception() << "Can't create fsm with empty scanner";
  110. }
  111. private:
  112. TScanner Scanner;
  113. };
  114. class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
  115. public:
  116. inline explicit TFsm(const TStringBuf& regexp,
  117. const TOptions& opts = TOptions())
  118. : TFsmParser<TScanner>(regexp, opts)
  119. {
  120. }
  121. inline TFsm(const TFsmParser<TScanner>& fsm)
  122. : TFsmParser<TScanner>(fsm)
  123. {
  124. }
  125. static inline TFsm Glue(const TFsm& l, const TFsm& r) {
  126. return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
  127. }
  128. inline explicit TFsm(const TScanner& compiled)
  129. : TFsmParser<TScanner>(compiled)
  130. {
  131. }
  132. };
  133. static inline TFsm operator|(const TFsm& l, const TFsm& r) {
  134. return TFsm::Glue(l, r);
  135. }
  136. struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
  137. inline explicit TCapturingFsm(const TStringBuf& regexp,
  138. TOptions opts = TOptions())
  139. : TFsmParser<TScanner>(regexp,
  140. opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) {
  141. }
  142. inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
  143. : TFsmParser<TScanner>(fsm)
  144. {
  145. }
  146. };
  147. struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
  148. inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
  149. TOptions opts = TOptions())
  150. : TFsmParser<TScanner>(regexp,
  151. opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) {
  152. }
  153. inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
  154. : TFsmParser<TScanner>(fsm)
  155. {
  156. }
  157. };
  158. template <class TFsm>
  159. class TMatcherBase {
  160. public:
  161. typedef typename TFsm::TScanner::State TState;
  162. public:
  163. inline explicit TMatcherBase(const TFsm& fsm)
  164. : Fsm(fsm)
  165. {
  166. Fsm.GetScanner().Initialize(State);
  167. }
  168. inline bool Final() const noexcept {
  169. return GetScanner().Final(GetState());
  170. }
  171. protected:
  172. inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
  173. if (addBegin) {
  174. NPire::Step(GetScanner(), State, NPire::BeginMark);
  175. }
  176. NPire::Run(GetScanner(), State, data, data + len);
  177. if (addEnd) {
  178. NPire::Step(GetScanner(), State, NPire::EndMark);
  179. }
  180. }
  181. inline const typename TFsm::TScanner& GetScanner() const noexcept {
  182. return Fsm.GetScanner();
  183. }
  184. inline const TState& GetState() const noexcept {
  185. return State;
  186. }
  187. private:
  188. const TFsm& Fsm;
  189. TState State;
  190. };
  191. struct TMatcher : TMatcherBase<TFsm> {
  192. inline explicit TMatcher(const TFsm& fsm)
  193. : TMatcherBase<TFsm>(fsm)
  194. {
  195. }
  196. inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
  197. Run(data, len, addBegin, addEnd);
  198. return *this;
  199. }
  200. inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
  201. return Match(s.data(), s.size(), addBegin, addEnd);
  202. }
  203. inline const char* Find(const char* b, const char* e) noexcept {
  204. return NPire::ShortestPrefix(GetScanner(), b, e);
  205. }
  206. typedef std::pair<const size_t*, const size_t*> TMatchedRegexps;
  207. inline TMatchedRegexps MatchedRegexps() const noexcept {
  208. return GetScanner().AcceptedRegexps(GetState());
  209. }
  210. };
  211. class TSearcher: public TMatcherBase<TCapturingFsm> {
  212. public:
  213. inline explicit TSearcher(const TCapturingFsm& fsm)
  214. : TMatcherBase<TCapturingFsm>(fsm)
  215. {
  216. }
  217. inline bool Captured() const noexcept {
  218. return GetState().Captured();
  219. }
  220. inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
  221. Data = TStringBuf(data, len);
  222. Run(data, len, addBegin, addEnd);
  223. return *this;
  224. }
  225. inline TSearcher& Search(const TStringBuf& s) noexcept {
  226. return Search(s.data(), s.size());
  227. }
  228. inline TStringBuf GetCaptured() const noexcept {
  229. return TStringBuf(Data.data() + GetState().Begin() - 1,
  230. Data.data() + GetState().End() - 1);
  231. }
  232. private:
  233. TStringBuf Data;
  234. };
  235. class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{
  236. public:
  237. typedef typename TSlowCapturingFsm::TScanner::State TState;
  238. inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
  239. : TMatcherBase<TSlowCapturingFsm>(fsm)
  240. , HasCaptured(false)
  241. {
  242. }
  243. inline bool Captured() const noexcept {
  244. return HasCaptured;
  245. }
  246. inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
  247. TStringBuf textData(data, len);
  248. Data = textData;
  249. Run(Data.begin(), Data.size(), addBegin, addEnd);
  250. return GetAns();
  251. }
  252. inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
  253. return Search(s.data(), s.size());
  254. }
  255. inline TStringBuf GetCaptured() const noexcept {
  256. return Ans;
  257. }
  258. private:
  259. TStringBuf Data;
  260. TStringBuf Ans;
  261. bool HasCaptured;
  262. inline TSlowSearcher& GetAns() {
  263. auto state = GetState();
  264. Pire::SlowCapturingScanner::SingleState final;
  265. if (!GetScanner().GetCapture(state, final)) {
  266. HasCaptured = false;
  267. } else {
  268. if (!final.HasEnd()) {
  269. final.SetEnd(Data.size());
  270. }
  271. Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
  272. HasCaptured = true;
  273. }
  274. return *this;
  275. }
  276. };
  277. }