#pragma once

// NOTE(review): the seven #include directives after "pire.h" have no header name, and
// several templates below (TMaybe, TVector, std::char_traits, std::pair, the
// `template` heads, TFsmParser / TMatcherBase base specifiers, Compile()) have no
// argument list. This looks like angle-bracketed text lost in extraction — confirm
// against the original file before building.
#include "pire.h"
#include
#include
#include
#include
#include
#include
#include

// Convenience wrappers around Pire: option-driven regexp parsing, compiled scanner
// holders, and small stateful matcher/searcher classes.
namespace NRegExp {
    struct TMatcher;

    // Common base for the scanner holders: defines the parsing options and the static
    // Parse() helper that turns a regexp string into an NPire::TFsm.
    struct TFsmBase {
        // Parsing options. Every setter returns *this so calls can be chained.
        struct TOptions {
            inline TOptions& SetCaseInsensitive(bool v) noexcept {
                CaseInsensitive = v;
                return *this;
            }

            inline TOptions& SetSurround(bool v) noexcept {
                Surround = v;
                return *this;
            }

            // Stores `pos` in CapturePos; Parse() forwards it to NPire::NFeatures::Capture.
            inline TOptions& SetCapture(size_t pos) noexcept {
                CapturePos = pos;
                return *this;
            }

            inline TOptions& SetCharset(ECharset charset) noexcept {
                Charset = charset;
                return *this;
            }

            inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
                AndNotSupport = andNotSupport;
                return *this;
            }

            bool CaseInsensitive = false;
            bool Surround = false;
            // Unset by default; the capture feature is only added when this holds a value.
            TMaybe CapturePos;
            // CODES_UNKNOWN means "feed the regexp bytes to the lexer as-is" (see Parse).
            ECharset Charset = CODES_UNKNOWN;
            bool AndNotSupport = false;
        };

        // Builds an NPire::TFsm from `regexp` according to `opts`.
        //
        // regexp        - regexp text.
        // opts          - charset, features and surround flag to apply.
        // needDetermine - when true (default) Determine() is called on the result
        //                 before it is returned.
        static inline NPire::TFsm Parse(const TStringBuf& regexp, const TOptions& opts, const bool needDetermine = true) {
            NPire::TLexer lexer;
            if (opts.Charset == CODES_UNKNOWN) {
                // No charset given: hand the raw byte range to the lexer.
                lexer.Assign(regexp.data(), regexp.data() + regexp.size());
            } else {
                // Recode into a buffer with one extra slot reserved for the terminator.
                TVector ucs4(regexp.size() + 1);
                size_t inRead = 0;
                size_t outWritten = 0;
                int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(), regexp.size(), regexp.size(), inRead, outWritten);
                // The recode result is checked only through Y_ASSERT.
                // NOTE(review): presumably a no-op in release builds, which would mean
                // recode failures go unnoticed there — confirm.
                Y_ASSERT(recodeRes == RECODE_OK);
                Y_ASSERT(outWritten < ucs4.size());
                ucs4[outWritten] = 0;
                // The end of the range is found via char_traits::length (first zero
                // element), not via outWritten, so an embedded zero would cut the input short.
                lexer.Assign(ucs4.begin(), ucs4.begin() + std::char_traits::length(ucs4.data()));
            }
            if (opts.CaseInsensitive) {
                lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
            }
            if (opts.CapturePos) {
                lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
            }
            if (opts.AndNotSupport) {
                lexer.AddFeature(NPire::NFeatures::AndNotSupport());
            }
            // Pick the lexer encoding; CODES_UNKNOWN leaves the lexer's default untouched.
            switch (opts.Charset) {
                case CODES_UNKNOWN:
                    break;
                case CODES_UTF8:
                    lexer.SetEncoding(NPire::NEncodings::Utf8());
                    break;
                case CODES_KOI8:
                    lexer.SetEncoding(NPire::NEncodings::Koi8r());
                    break;
                default:
                    lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
                    break;
            }
            NPire::TFsm ret = lexer.Parse();
            if (opts.Surround) {
                ret.Surround();
            }
            if (needDetermine) {
                ret.Determine();
            }
            return ret;
        }
    };

    // Holds a compiled scanner of type TScanner, built either from a regexp string or
    // from an already compiled scanner.
    // NOTE(review): the template parameter list is missing here; the typedef below
    // suggests a single type parameter named TScannerType — confirm.
    template class TFsmParser: public TFsmBase {
    public:
        typedef TScannerType TScanner;

    public:
        // Parses `regexp` with `opts` and compiles the resulting fsm into Scanner.
        // Unlike the scanner-taking constructor, this one performs no Empty() check.
        inline explicit TFsmParser(const TStringBuf& regexp, const TOptions& opts = TOptions(), bool needDetermine = true)
            : Scanner(Parse(regexp, opts, needDetermine).template Compile())
        {
        }

        inline const TScanner& GetScanner() const noexcept {
            return Scanner;
        }

        // Builds a parser from the fsm returned by NPire::TFsm::MakeFalse().
        static inline TFsmParser False() {
            return TFsmParser(NPire::TFsm::MakeFalse().Compile());
        }

        // Wraps a copy of an already compiled scanner.
        // Throws yexception when the scanner reports Empty().
        inline explicit TFsmParser(const TScanner& compiled)
            : Scanner(compiled)
        {
            if (Scanner.Empty())
                ythrow yexception() << "Can't create fsm with empty scanner";
        }

    private:
        TScanner Scanner;
    };

    // Scanner holder used by TMatcher; adds Glue() for combining two scanners.
    class TFsm: public TFsmParser {
    public:
        inline explicit TFsm(const TStringBuf& regexp, const TOptions& opts = TOptions())
            : TFsmParser(regexp, opts)
        {
        }

        // Non-explicit: allows implicit conversion from the base parser type.
        inline TFsm(const TFsmParser& fsm)
            : TFsmParser(fsm)
        {
        }

        // Returns a TFsm built from TScanner::Glue applied to both scanners.
        static inline TFsm Glue(const TFsm& l, const TFsm& r) {
            return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
        }

        inline explicit TFsm(const TScanner& compiled)
            : TFsmParser(compiled)
        {
        }
    };

    // Shorthand for TFsm::Glue(l, r).
    static inline TFsm operator|(const TFsm& l, const TFsm& r) {
        return TFsm::Glue(l, r);
    }

    // Scanner holder used by TSearcher.
    struct TCapturingFsm : TFsmParser {
        // `opts` is taken by value, so the adjustments below do not touch the caller's
        // object. Surround is always switched on; if no capture position was set,
        // position 1 is used.
        inline explicit TCapturingFsm(const TStringBuf& regexp, TOptions opts = TOptions())
            : TFsmParser(regexp, opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1))
        {
        }

        inline TCapturingFsm(const TFsmParser& fsm)
            : TFsmParser(fsm)
        {
        }
    };

    // Scanner holder used by TSlowSearcher.
    struct TSlowCapturingFsm : TFsmParser {
        // Same option handling as TCapturingFsm, but passes needDetermine = false,
        // so Parse() skips the Determine() step.
        inline explicit TSlowCapturingFsm(const TStringBuf& regexp, TOptions opts = TOptions())
            : TFsmParser(regexp, opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false)
        {
        }

        inline TSlowCapturingFsm(const TFsmParser& fsm)
            : TFsmParser(fsm)
        {
        }
    };

    // Pairs a scanner holder with a running scanner state.
    // NOTE(review): the template parameter list is missing; the body uses a type named
    // TFsm (which shadows NRegExp::TFsm inside this class) — confirm.
    template class TMatcherBase {
    public:
        typedef typename TFsm::TScanner::State TState;

    public:
        // Keeps a reference to `fsm` (not a copy), so `fsm` must outlive this object.
        // State is initialized here and nowhere else in this class.
        inline explicit TMatcherBase(const TFsm& fsm)
            : Fsm(fsm)
        {
            Fsm.GetScanner().Initialize(State);
        }

        // True when the scanner reports the current state as final.
        inline bool Final() const noexcept {
            return GetScanner().Final(GetState());
        }

    protected:
        // Advances State over [data, data + len), optionally stepping over
        // NPire::BeginMark first and NPire::EndMark afterwards. State is not reset,
        // so consecutive calls continue from where the previous one stopped.
        inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
            if (addBegin) {
                NPire::Step(GetScanner(), State, NPire::BeginMark);
            }
            NPire::Run(GetScanner(), State, data, data + len);
            if (addEnd) {
                NPire::Step(GetScanner(), State, NPire::EndMark);
            }
        }

        inline const typename TFsm::TScanner& GetScanner() const noexcept {
            return Fsm.GetScanner();
        }

        inline const TState& GetState() const noexcept {
            return State;
        }

    private:
        const TFsm& Fsm;
        TState State;
    };

    // Matcher over a TFsm scanner.
    struct TMatcher : TMatcherBase {
        inline explicit TMatcher(const TFsm& fsm)
            : TMatcherBase(fsm)
        {
        }

        // Feeds the buffer through Run(); begin/end marks are off by default.
        // Returns *this so that e.g. Final() can be chained.
        inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
            Run(data, len, addBegin, addEnd);
            return *this;
        }

        inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
            return Match(s.data(), s.size(), addBegin, addEnd);
        }

        // Forwards to NPire::ShortestPrefix over [b, e). The matcher's own State is
        // neither passed in nor modified.
        inline const char* Find(const char* b, const char* e) noexcept {
            return NPire::ShortestPrefix(GetScanner(), b, e);
        }

        typedef std::pair TMatchedRegexps;

        // Returns the scanner's AcceptedRegexps() for the current state.
        inline TMatchedRegexps MatchedRegexps() const noexcept {
            return GetScanner().AcceptedRegexps(GetState());
        }
    };

    // Searcher over a TCapturingFsm scanner; remembers the last searched buffer so the
    // captured range can be returned as a view into it.
    class TSearcher: public TMatcherBase {
    public:
        inline explicit TSearcher(const TCapturingFsm& fsm)
            : TMatcherBase(fsm)
        {
        }

        inline bool Captured() const noexcept {
            return GetState().Captured();
        }

        // Remembers the buffer and runs the scanner over it. Begin/end marks are ON by
        // default here (TMatcher::Match defaults them to off).
        inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
            Data = TStringBuf(data, len);
            Run(data, len, addBegin, addEnd);
            return *this;
        }

        // Always uses the default addBegin/addEnd values of the overload above.
        inline TSearcher& Search(const TStringBuf& s) noexcept {
            return Search(s.data(), s.size());
        }

        // Returns the view [Begin() - 1, End() - 1) into the last searched buffer.
        // No Captured() check is made here.
        // NOTE(review): the -1 presumably compensates for the begin mark counted by
        // the scanner — confirm against the scanner's position convention.
        inline TStringBuf GetCaptured() const noexcept {
            return TStringBuf(Data.data() + GetState().Begin() - 1, Data.data() + GetState().End() - 1);
        }

    private:
        TStringBuf Data;
    };

    // Searcher over a TSlowCapturingFsm scanner. Inherits TMatcherBase privately
    // (class default), so the base interface is not exposed to callers.
    class TSlowSearcher : TMatcherBase{
    public:
        typedef typename TSlowCapturingFsm::TScanner::State TState;

        inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
            : TMatcherBase(fsm)
            , HasCaptured(false)
        {
        }

        inline bool Captured() const noexcept {
            return HasCaptured;
        }

        // Remembers the buffer, runs the scanner and extracts the capture right away.
        // Begin/end marks are OFF by default here, unlike TSearcher::Search.
        inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
            TStringBuf textData(data, len);
            Data = textData;
            Run(Data.begin(), Data.size(), addBegin, addEnd);
            return GetAns();
        }

        inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
            return Search(s.data(), s.size());
        }

        // Returns the view stored by the last successful capture.
        inline TStringBuf GetCaptured() const noexcept {
            return Ans;
        }

    private:
        TStringBuf Data;
        TStringBuf Ans;
        bool HasCaptured;

        // Queries the scanner for a capture in the current state and updates
        // HasCaptured / Ans. On failure only HasCaptured is cleared; Ans keeps its
        // previous value. This function is not noexcept although it is called from the
        // noexcept Search().
        inline TSlowSearcher& GetAns() {
            // Works on a copy of the state.
            auto state = GetState();
            // NOTE(review): spelled Pire:: here while the rest of the file uses NPire:: — confirm.
            Pire::SlowCapturingScanner::SingleState final;
            if (!GetScanner().GetCapture(state, final)) {
                HasCaptured = false;
            } else {
                // A capture with no end recorded is extended to the end of the buffer.
                if (!final.HasEnd()) {
                    final.SetEnd(Data.size());
                }
                Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
                HasCaptured = true;
            }
            return *this;
        }
    };
}