123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191 |
- #pragma once
- #include "traits.h"
- #include <library/cpp/containers/stack_array/stack_array.h>
- #include <util/generic/maybe.h>
- #include <util/generic/strbuf.h>
- #include <util/generic/vector.h>
- #include <util/generic/yexception.h>
- namespace NPcre {
//! Start and end offset of a single match group.
/*!
 * |first| is the start offset and |second| is the end offset, in the order
 * the matching routines read them from the PCRE output vector.
 */
using TPcreMatch = std::pair<int, int>;
- //! Full match result containing all capturing groups.
- /*!
- * At zero index we have whole matched string start and end offsets.
- * All other elements will contain capturing groups positions.
- * Non-captured capturing groups will have {-1, -1} offsets.
- */
- using TPcreMatches = TVector<TPcreMatch>;
//! Compiled pattern optimization strategy.
enum class EOptimize {
    //! No optimization.
    /*!
     * Useful for non-reusable patterns where compile time matters.
     */
    None,

    //! Basic optimization via |pcre_study|.
    /*!
     * May give up to a 4x match speed boost in exchange for increased
     * construction time. A speed-up is not guaranteed.
     */
    Study,

    //! PCRE JIT optimization.
    /*!
     * May give up to a 10x match speed boost in exchange for significantly
     * increased compile time. Also, for very complex patterns |pcre_exec|
     * could return |PCRE_ERROR_JIT_STACKLIMIT|. See
     * https://www.pcre.org/original/doc/html/pcrejit.html for details.
     */
    JIT
};
- //! PCRE code container. Controls its life time and provides handy wrapper.
- template <class TCharType>
- class TPcre {
- private:
- using TCodeType = typename TPcreTraits<TCharType>::TCodeType;
- using TExtraType = typename TPcreTraits<TCharType>::TExtraType;
- using TStringType = typename TPcreTraits<TCharType>::TStringType;
- using TTraits = TPcreTraits<TCharType>;
- static constexpr size_t DefaultWorkspaceSize = 16;
- public:
- //! Compiles regexp into internal representation for future use.
- /*!
- * \param pattern Regular expression to be compiled.
- * \param optimize If |EOptimize::JIT|, perform additional
- * analysis, which will take extra time, but could
- * speed up matching. |None| to omit optimization.
- * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html
- **/
- TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) {
- int errcode;
- const char* errptr;
- int erroffset;
- Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr));
- if (!Code) {
- ythrow yexception() << "Failed to compile pattern <" << pattern
- << ">, because of error at pos " << erroffset
- << ", error code " << errcode << ": " << errptr;
- }
- if (optimize != EOptimize::None) {
- errptr = nullptr;
- int options;
- if (optimize == EOptimize::Study) {
- options = 0;
- } else {
- options = PCRE_STUDY_JIT_COMPILE;
- }
- Extra.Reset(TTraits::Study(Code.Get(), options, &errptr));
- if (errptr) {
- ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr;
- }
- }
- }
- //! Check if compiled pattern matches string.
- /*!
- * \param string String to search in.
- * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
- * \param workspaceSize Amount of space which will be allocated for
- * back references. PCRE could allocate more
- * heap space is provided workspaceSize won't
- * fit all of them.
- * \returns |true| if there is a match.
- */
- bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
- Y_ASSERT(workspaceSize >= 0);
- size_t ovecsize = workspaceSize * 3;
- NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
- return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize));
- }
- //! Find compiled pattern in string.
- /*!
- * \param string String to search in.
- * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
- * \param workspaceSize Amount of space which will be allocated for
- * back references. PCRE could allocate more
- * heap space is provided workspaceSize won't
- * fit all of them.
- * \returns Start and end offsets pair if there is a
- * match. |Nothing| otherwise.
- */
- Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
- Y_ASSERT(workspaceSize >= 0);
- size_t ovecsize = workspaceSize * 3;
- NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
- for (size_t i = 0; i < ovecsize; ++i) {
- ovector[i] = -4;
- }
- int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
- if (ConvertReturnCode(rc)) {
- return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]);
- } else {
- return Nothing();
- }
- }
- //! Find and return all capturing groups in string.
- /*!
- * \param string String to search in.
- * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
- * \param initialWorkspaceSize Capturing groups vector initial size.
- * Workspace will be grown and search will
- * be repeated if there is not enough
- * space.
- * \returns List of capturing groups start and end
- * offsets. First element will contain
- * whole matched substring start and end
- * offsets. For non-matched capturing
- * groups, result will contain {-1, -1}
- * pair.
- * If pattern not found in string, result
- * vector will be empty.
- */
- Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const {
- Y_ASSERT(initialWorkspaceSize > 0);
- size_t ovecsize = (initialWorkspaceSize + 1) * 3;
- while (true) {
- NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
- int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
- if (rc > 0) {
- TPcreMatches result(Reserve(rc >> 1));
- for (int i = 0, pos = 0; i < rc; ++i) {
- int start = ovector[pos++];
- int end = ovector[pos++];
- result.emplace_back(start, end);
- }
- return result;
- } else if (rc == 0) {
- ovecsize <<= 1;
- } else if (rc == PCRE_ERROR_NOMATCH) {
- return TPcreMatches{};
- } else if (rc < 0) {
- ythrow yexception() << "Error. RC = " << rc;
- }
- }
- }
- private:
- TPcreCode<TCharType> Code;
- TPcreExtra<TCharType> Extra;
- private:
- static inline bool ConvertReturnCode(int rc) {
- if (rc >= 0) {
- return true;
- } else if (rc == PCRE_ERROR_NOMATCH) {
- return false;
- } else {
- ythrow yexception() << "Error. RC = " << rc;
- }
- }
- };
- }
|