123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337 |
- #pragma once
- #include "pire.h"
- #include <library/cpp/charset/doccodes.h>
- #include <library/cpp/charset/recyr.hh>
- #include <util/generic/maybe.h>
- #include <util/generic/strbuf.h>
- #include <util/generic/string.h>
- #include <util/generic/vector.h>
- #include <util/generic/yexception.h>
- namespace NRegExp {
- struct TMatcher;
- struct TFsmBase {
- struct TOptions {
- inline TOptions& SetCaseInsensitive(bool v) noexcept {
- CaseInsensitive = v;
- return *this;
- }
- inline TOptions& SetSurround(bool v) noexcept {
- Surround = v;
- return *this;
- }
- inline TOptions& SetCapture(size_t pos) noexcept {
- CapturePos = pos;
- return *this;
- }
- inline TOptions& SetCharset(ECharset charset) noexcept {
- Charset = charset;
- return *this;
- }
- inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
- AndNotSupport = andNotSupport;
- return *this;
- }
- bool CaseInsensitive = false;
- bool Surround = false;
- TMaybe<size_t> CapturePos;
- ECharset Charset = CODES_UNKNOWN;
- bool AndNotSupport = false;
- };
- static inline NPire::TFsm Parse(const TStringBuf& regexp,
- const TOptions& opts, const bool needDetermine = true) {
- NPire::TLexer lexer;
- if (opts.Charset == CODES_UNKNOWN) {
- lexer.Assign(regexp.data(), regexp.data() + regexp.size());
- } else {
- TVector<wchar32> ucs4(regexp.size() + 1);
- size_t inRead = 0;
- size_t outWritten = 0;
- int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
- regexp.size(), regexp.size(), inRead, outWritten);
- Y_ASSERT(recodeRes == RECODE_OK);
- Y_ASSERT(outWritten < ucs4.size());
- ucs4[outWritten] = 0;
- lexer.Assign(ucs4.begin(),
- ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data()));
- }
- if (opts.CaseInsensitive) {
- lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
- }
- if (opts.CapturePos) {
- lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
- }
- if (opts.AndNotSupport) {
- lexer.AddFeature(NPire::NFeatures::AndNotSupport());
- }
- switch (opts.Charset) {
- case CODES_UNKNOWN:
- break;
- case CODES_UTF8:
- lexer.SetEncoding(NPire::NEncodings::Utf8());
- break;
- case CODES_KOI8:
- lexer.SetEncoding(NPire::NEncodings::Koi8r());
- break;
- default:
- lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
- break;
- }
- NPire::TFsm ret = lexer.Parse();
- if (opts.Surround) {
- ret.Surround();
- }
- if (needDetermine) {
- ret.Determine();
- }
- return ret;
- }
- };
- template <class TScannerType>
- class TFsmParser: public TFsmBase {
- public:
- typedef TScannerType TScanner;
- public:
- inline explicit TFsmParser(const TStringBuf& regexp,
- const TOptions& opts = TOptions(), bool needDetermine = true)
- : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
- {
- }
- inline const TScanner& GetScanner() const noexcept {
- return Scanner;
- }
- static inline TFsmParser False() {
- return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
- }
- inline explicit TFsmParser(const TScanner& compiled)
- : Scanner(compiled)
- {
- if (Scanner.Empty())
- ythrow yexception() << "Can't create fsm with empty scanner";
- }
- private:
- TScanner Scanner;
- };
- class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
- public:
- inline explicit TFsm(const TStringBuf& regexp,
- const TOptions& opts = TOptions())
- : TFsmParser<TScanner>(regexp, opts)
- {
- }
- inline TFsm(const TFsmParser<TScanner>& fsm)
- : TFsmParser<TScanner>(fsm)
- {
- }
- static inline TFsm Glue(const TFsm& l, const TFsm& r) {
- return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
- }
- inline explicit TFsm(const TScanner& compiled)
- : TFsmParser<TScanner>(compiled)
- {
- }
- };
- static inline TFsm operator|(const TFsm& l, const TFsm& r) {
- return TFsm::Glue(l, r);
- }
- struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
- inline explicit TCapturingFsm(const TStringBuf& regexp,
- TOptions opts = TOptions())
- : TFsmParser<TScanner>(regexp,
- opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) {
- }
- inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
- : TFsmParser<TScanner>(fsm)
- {
- }
- };
- struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
- inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
- TOptions opts = TOptions())
- : TFsmParser<TScanner>(regexp,
- opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) {
- }
- inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
- : TFsmParser<TScanner>(fsm)
- {
- }
- };
- template <class TFsm>
- class TMatcherBase {
- public:
- typedef typename TFsm::TScanner::State TState;
- public:
- inline explicit TMatcherBase(const TFsm& fsm)
- : Fsm(fsm)
- {
- Fsm.GetScanner().Initialize(State);
- }
- inline bool Final() const noexcept {
- return GetScanner().Final(GetState());
- }
- protected:
- inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
- if (addBegin) {
- NPire::Step(GetScanner(), State, NPire::BeginMark);
- }
- NPire::Run(GetScanner(), State, data, data + len);
- if (addEnd) {
- NPire::Step(GetScanner(), State, NPire::EndMark);
- }
- }
- inline const typename TFsm::TScanner& GetScanner() const noexcept {
- return Fsm.GetScanner();
- }
- inline const TState& GetState() const noexcept {
- return State;
- }
- private:
- const TFsm& Fsm;
- TState State;
- };
- struct TMatcher : TMatcherBase<TFsm> {
- inline explicit TMatcher(const TFsm& fsm)
- : TMatcherBase<TFsm>(fsm)
- {
- }
- inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
- Run(data, len, addBegin, addEnd);
- return *this;
- }
- inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
- return Match(s.data(), s.size(), addBegin, addEnd);
- }
- inline const char* Find(const char* b, const char* e) noexcept {
- return NPire::ShortestPrefix(GetScanner(), b, e);
- }
- typedef std::pair<const size_t*, const size_t*> TMatchedRegexps;
- inline TMatchedRegexps MatchedRegexps() const noexcept {
- return GetScanner().AcceptedRegexps(GetState());
- }
- };
- class TSearcher: public TMatcherBase<TCapturingFsm> {
- public:
- inline explicit TSearcher(const TCapturingFsm& fsm)
- : TMatcherBase<TCapturingFsm>(fsm)
- {
- }
- inline bool Captured() const noexcept {
- return GetState().Captured();
- }
- inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
- Data = TStringBuf(data, len);
- Run(data, len, addBegin, addEnd);
- return *this;
- }
- inline TSearcher& Search(const TStringBuf& s) noexcept {
- return Search(s.data(), s.size());
- }
- inline TStringBuf GetCaptured() const noexcept {
- return TStringBuf(Data.data() + GetState().Begin() - 1,
- Data.data() + GetState().End() - 1);
- }
- private:
- TStringBuf Data;
- };
- class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{
- public:
- typedef typename TSlowCapturingFsm::TScanner::State TState;
- inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
- : TMatcherBase<TSlowCapturingFsm>(fsm)
- , HasCaptured(false)
- {
- }
- inline bool Captured() const noexcept {
- return HasCaptured;
- }
- inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
- TStringBuf textData(data, len);
- Data = textData;
- Run(Data.begin(), Data.size(), addBegin, addEnd);
- return GetAns();
- }
- inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
- return Search(s.data(), s.size());
- }
- inline TStringBuf GetCaptured() const noexcept {
- return Ans;
- }
- private:
- TStringBuf Data;
- TStringBuf Ans;
- bool HasCaptured;
- inline TSlowSearcher& GetAns() {
- auto state = GetState();
- Pire::SlowCapturingScanner::SingleState final;
- if (!GetScanner().GetCapture(state, final)) {
- HasCaptured = false;
- } else {
- if (!final.HasEnd()) {
- final.SetEnd(Data.size());
- }
- Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
- HasCaptured = true;
- }
- return *this;
- }
- };
- }
|