#pragma once #include "traits.h" #include #include #include #include #include namespace NPcre { //! Start and end offset for match group. using TPcreMatch = std::pair; //! Full match result containing all capturing groups. /*! * At zero index we have whole matched string start and end offsets. * All other elements will contain capturing groups positions. * Non-captured capturing groups will have {-1, -1} offsets. */ using TPcreMatches = TVector; //! Compiled pattern optimization strategy. enum class EOptimize { //! No optimization. /*! * Useful for non-reusable patterns where compile time matters. */ None, //! Basic optimization via |pcre_study|. /*! * Could give up to 4x match speed boost in exchange of increased * construction time. Could not. */ Study, //! PCRE JIT optimization. /*! * Could give up to 10x match speed bust in exchange of significantly * increased compile time. Also, for very complex patterns |pcre_exec| * could return |PCRE_ERROR_JIT_STACKLIMIT|. See * https://www.pcre.org/original/doc/html/pcrejit.html for details. */ JIT }; //! PCRE code container. Controls its life time and provides handy wrapper. template class TPcre { private: using TCodeType = typename TPcreTraits::TCodeType; using TExtraType = typename TPcreTraits::TExtraType; using TStringType = typename TPcreTraits::TStringType; using TTraits = TPcreTraits; static constexpr size_t DefaultWorkspaceSize = 16; public: //! Compiles regexp into internal representation for future use. /*! * \param pattern Regular expression to be compiled. * \param optimize If |EOptimize::JIT|, perform additional * analysis, which will take extra time, but could * speed up matching. |None| to omit optimization. * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html **/ TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) { int errcode; const char* errptr; int erroffset; Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr)); if (!Code) { ythrow yexception() << "Failed to compile pattern <" << pattern << ">, because of error at pos " << erroffset << ", error code " << errcode << ": " << errptr; } if (optimize != EOptimize::None) { errptr = nullptr; int options; if (optimize == EOptimize::Study) { options = 0; } else { options = PCRE_STUDY_JIT_COMPILE; } Extra.Reset(TTraits::Study(Code.Get(), options, &errptr)); if (errptr) { ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr; } } } //! Check if compiled pattern matches string. /*! * \param string String to search in. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html * \param workspaceSize Amount of space which will be allocated for * back references. PCRE could allocate more * heap space is provided workspaceSize won't * fit all of them. * \returns |true| if there is a match. */ bool Matches(TBasicStringBuf string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { Y_ASSERT(workspaceSize >= 0); size_t ovecsize = workspaceSize * 3; NStackArray::TStackArray ovector(ALLOC_ON_STACK(int, ovecsize)); return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize)); } //! Find compiled pattern in string. /*! * \param string String to search in. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html * \param workspaceSize Amount of space which will be allocated for * back references. PCRE could allocate more * heap space is provided workspaceSize won't * fit all of them. * \returns Start and end offsets pair if there is a * match. |Nothing| otherwise. */ Y_NO_SANITIZE("memory") TMaybe Find(TBasicStringBuf string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { Y_ASSERT(workspaceSize >= 0); size_t ovecsize = workspaceSize * 3; NStackArray::TStackArray ovector(ALLOC_ON_STACK(int, ovecsize)); for (size_t i = 0; i < ovecsize; ++i) { ovector[i] = -4; } int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); if (ConvertReturnCode(rc)) { return MakeMaybe(ovector[0], ovector[1]); } else { return Nothing(); } } //! Find and return all capturing groups in string. /*! * \param string String to search in. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html * \param initialWorkspaceSize Capturing groups vector initial size. * Workspace will be grown and search will * be repeated if there is not enough * space. * \returns List of capturing groups start and end * offsets. First element will contain * whole matched substring start and end * offsets. For non-matched capturing * groups, result will contain {-1, -1} * pair. * If pattern not found in string, result * vector will be empty. */ Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const { Y_ASSERT(initialWorkspaceSize > 0); size_t ovecsize = (initialWorkspaceSize + 1) * 3; while (true) { NStackArray::TStackArray ovector(ALLOC_ON_STACK(int, ovecsize)); int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); if (rc > 0) { TPcreMatches result(Reserve(rc >> 1)); for (int i = 0, pos = 0; i < rc; ++i) { int start = ovector[pos++]; int end = ovector[pos++]; result.emplace_back(start, end); } return result; } else if (rc == 0) { ovecsize <<= 1; } else if (rc == PCRE_ERROR_NOMATCH) { return TPcreMatches{}; } else if (rc < 0) { ythrow yexception() << "Error. RC = " << rc; } } } private: TPcreCode Code; TPcreExtra Extra; private: static inline bool ConvertReturnCode(int rc) { if (rc >= 0) { return true; } else if (rc == PCRE_ERROR_NOMATCH) { return false; } else { ythrow yexception() << "Error. RC = " << rc; } } }; }