pcre.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. #pragma once
  2. #include "traits.h"
  3. #include <library/cpp/containers/stack_array/stack_array.h>
  4. #include <util/generic/maybe.h>
  5. #include <util/generic/strbuf.h>
  6. #include <util/generic/vector.h>
  7. #include <util/generic/yexception.h>
  8. namespace NPcre {
  9. //! Start and end offset for match group.
  10. using TPcreMatch = std::pair<int, int>;
  11. //! Full match result containing all capturing groups.
  12. /*!
  13. * At zero index we have whole matched string start and end offsets.
  14. * All other elements will contain capturing groups positions.
  15. * Non-captured capturing groups will have {-1, -1} offsets.
  16. */
  17. using TPcreMatches = TVector<TPcreMatch>;
  //! How much effort to spend on optimizing a pattern once it is compiled.
  enum class EOptimize {
      //! Skip optimization altogether.
      /*!
       * A good fit for patterns that are used once, where compile time
       * matters more than match time.
       */
      None = 0,

      //! Basic optimization via |pcre_study|.
      /*!
       * May speed matching up by as much as 4x at the price of a longer
       * construction time. It may also give no speed-up at all.
       */
      Study = 1,

      //! PCRE JIT optimization.
      /*!
       * May speed matching up by as much as 10x at the price of a
       * significantly longer compile time. Note that for very complex
       * patterns |pcre_exec| could return |PCRE_ERROR_JIT_STACKLIMIT|. See
       * https://www.pcre.org/original/doc/html/pcrejit.html for details.
       */
      JIT = 2,
  };
  40. //! PCRE code container. Controls its life time and provides handy wrapper.
  41. template <class TCharType>
  42. class TPcre {
  43. private:
  44. using TCodeType = typename TPcreTraits<TCharType>::TCodeType;
  45. using TExtraType = typename TPcreTraits<TCharType>::TExtraType;
  46. using TStringType = typename TPcreTraits<TCharType>::TStringType;
  47. using TTraits = TPcreTraits<TCharType>;
  48. static constexpr size_t DefaultWorkspaceSize = 16;
  49. public:
  50. //! Compiles regexp into internal representation for future use.
  51. /*!
  52. * \param pattern Regular expression to be compiled.
  53. * \param optimize If |EOptimize::JIT|, perform additional
  54. * analysis, which will take extra time, but could
  55. * speed up matching. |None| to omit optimization.
  56. * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html
  57. **/
  58. TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) {
  59. int errcode;
  60. const char* errptr;
  61. int erroffset;
  62. Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr));
  63. if (!Code) {
  64. ythrow yexception() << "Failed to compile pattern <" << pattern
  65. << ">, because of error at pos " << erroffset
  66. << ", error code " << errcode << ": " << errptr;
  67. }
  68. if (optimize != EOptimize::None) {
  69. errptr = nullptr;
  70. int options;
  71. if (optimize == EOptimize::Study) {
  72. options = 0;
  73. } else {
  74. options = PCRE_STUDY_JIT_COMPILE;
  75. }
  76. Extra.Reset(TTraits::Study(Code.Get(), options, &errptr));
  77. if (errptr) {
  78. ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr;
  79. }
  80. }
  81. }
  82. //! Check if compiled pattern matches string.
  83. /*!
  84. * \param string String to search in.
  85. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
  86. * \param workspaceSize Amount of space which will be allocated for
  87. * back references. PCRE could allocate more
  88. * heap space is provided workspaceSize won't
  89. * fit all of them.
  90. * \returns |true| if there is a match.
  91. */
  92. bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
  93. Y_ASSERT(workspaceSize >= 0);
  94. size_t ovecsize = workspaceSize * 3;
  95. NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
  96. return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize));
  97. }
  98. //! Find compiled pattern in string.
  99. /*!
  100. * \param string String to search in.
  101. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
  102. * \param workspaceSize Amount of space which will be allocated for
  103. * back references. PCRE could allocate more
  104. * heap space is provided workspaceSize won't
  105. * fit all of them.
  106. * \returns Start and end offsets pair if there is a
  107. * match. |Nothing| otherwise.
  108. */
  109. Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
  110. Y_ASSERT(workspaceSize >= 0);
  111. size_t ovecsize = workspaceSize * 3;
  112. NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
  113. for (size_t i = 0; i < ovecsize; ++i) {
  114. ovector[i] = -4;
  115. }
  116. int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
  117. if (ConvertReturnCode(rc)) {
  118. return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]);
  119. } else {
  120. return Nothing();
  121. }
  122. }
  123. //! Find and return all capturing groups in string.
  124. /*!
  125. * \param string String to search in.
  126. * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
  127. * \param initialWorkspaceSize Capturing groups vector initial size.
  128. * Workspace will be grown and search will
  129. * be repeated if there is not enough
  130. * space.
  131. * \returns List of capturing groups start and end
  132. * offsets. First element will contain
  133. * whole matched substring start and end
  134. * offsets. For non-matched capturing
  135. * groups, result will contain {-1, -1}
  136. * pair.
  137. * If pattern not found in string, result
  138. * vector will be empty.
  139. */
  140. Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const {
  141. Y_ASSERT(initialWorkspaceSize > 0);
  142. size_t ovecsize = (initialWorkspaceSize + 1) * 3;
  143. while (true) {
  144. NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
  145. int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
  146. if (rc > 0) {
  147. TPcreMatches result(Reserve(rc >> 1));
  148. for (int i = 0, pos = 0; i < rc; ++i) {
  149. int start = ovector[pos++];
  150. int end = ovector[pos++];
  151. result.emplace_back(start, end);
  152. }
  153. return result;
  154. } else if (rc == 0) {
  155. ovecsize <<= 1;
  156. } else if (rc == PCRE_ERROR_NOMATCH) {
  157. return TPcreMatches{};
  158. } else if (rc < 0) {
  159. ythrow yexception() << "Error. RC = " << rc;
  160. }
  161. }
  162. }
  163. private:
  164. TPcreCode<TCharType> Code;
  165. TPcreExtra<TCharType> Extra;
  166. private:
  167. static inline bool ConvertReturnCode(int rc) {
  168. if (rc >= 0) {
  169. return true;
  170. } else if (rc == PCRE_ERROR_NOMATCH) {
  171. return false;
  172. } else {
  173. ythrow yexception() << "Error. RC = " << rc;
  174. }
  175. }
  176. };
  177. }