re_lexer.h 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. /*
  2. * re_lexer.h -- definition required for parsing regexps
  3. *
  4. * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
  5. * Alexander Gololobov <agololobov@gmail.com>
  6. *
  7. * This file is part of Pire, the Perl Incompatible
  8. * Regular Expressions library.
  9. *
  10. * Pire is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Lesser Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Pire is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Lesser Public License for more details.
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  21. */
  22. #ifndef PIRE_RE_LEXER_H
  23. #define PIRE_RE_LEXER_H
  24. #include <vector>
  25. #include <stack>
  26. #include <set>
  27. #include <utility>
  28. #include <stdexcept>
  29. #include <utility>
  30. #include <string.h>
  31. #include <contrib/libs/pire/pire/stub/defaults.h>
  32. #include <contrib/libs/pire/pire/stub/stl.h>
  33. #include "encoding.h"
  34. #include "any.h"
  35. namespace Pire {
  36. namespace Consts {
  37. enum { Inf = -1 };
  38. static const wchar32 Control = 0xF0000000;
  39. static const wchar32 ControlMask = 0xFF000000;
  40. static const wchar32 End = Control | 0xFF;
  41. };
  42. using namespace Consts;
  /**
   * Integer tags for the kinds of terminals (presumably the values
   * passed as the `type' of a Term -- confirm against the lexer).
   * Values are consecutive, starting from zero.
   */
  namespace TokenTypes {
      enum {
          None      = 0,
          Letters   = 1,
          Count     = 2,
          Dot       = 3,
          Open      = 4,
          Close     = 5,
          Or        = 6,
          And       = 7,
          Not       = 8,
          BeginMark = 9,
          EndMark   = 10,
          End       = 11
      };
  }
  59. /**
  60. * A single terminal character in regexp pattern.
  61. * Consists of a type (a character, a repetition count, an opening parenthesis, etc...)
  62. * and optional value.
  63. */
  64. class Term {
  65. public:
  66. typedef TVector<wchar32> String;
  67. typedef TSet<String> Strings;
  68. typedef ypair<int, int> RepetitionCount;
  69. typedef ypair<Strings, bool> CharacterRange;
  70. struct DotTag {};
  71. struct BeginTag {};
  72. struct EndTag {};
  73. Term(int type): m_type(type) {}
  74. template<class T> Term(int type, T t): m_type(type), m_value(t) {}
  75. Term(int type, const Any& value): m_type(type), m_value(value) {}
  76. static Term Character(wchar32 c);
  77. static Term Repetition(int lower, int upper);
  78. static Term Dot();
  79. static Term BeginMark();
  80. static Term EndMark();
  81. int Type() const { return m_type; }
  82. const Any& Value() const { return m_value; }
  83. private:
  84. int m_type;
  85. Any m_value;
  86. };
  87. class Feature;
  88. /**
  89. * A class performing regexp pattern parsing.
  90. */
  91. class Lexer {
  92. public:
  93. // One-size-fits-all constructor set.
  94. Lexer()
  95. : m_encoding(&Encodings::Latin1())
  96. { InstallDefaultFeatures(); }
  97. explicit Lexer(const char* str)
  98. : m_encoding(&Encodings::Latin1())
  99. {
  100. InstallDefaultFeatures();
  101. Assign(str, str + strlen(str));
  102. }
  103. template<class T> explicit Lexer(const T& t)
  104. : m_encoding(&Encodings::Latin1())
  105. {
  106. InstallDefaultFeatures();
  107. Assign(t.begin(), t.end());
  108. }
  109. template<class Iter> Lexer(Iter begin, Iter end)
  110. : m_encoding(&Encodings::Latin1())
  111. {
  112. InstallDefaultFeatures();
  113. Assign(begin, end);
  114. }
  115. ~Lexer();
  116. template<class Iter> void Assign(Iter begin, Iter end)
  117. {
  118. m_input.clear();
  119. std::copy(begin, end, std::back_inserter(m_input));
  120. }
  121. /// The main lexer function. Extracts and returns the next term in input sequence.
  122. Term Lex();
  123. /// Installs an additional lexer feature.
  124. /// We declare both lvalue and rvalue reference types to fix some linker errors.
  125. Lexer& AddFeature(THolder<Feature>& a);
  126. Lexer& AddFeature(THolder<Feature>&& a);
  127. const Pire::Encoding& Encoding() const { return *m_encoding; }
  128. Lexer& SetEncoding(const Pire::Encoding& encoding) { m_encoding = &encoding; return *this; }
  129. void SetError(const char* msg) { errmsg = msg; }
  130. void SetError(ystring msg) { errmsg = msg; }
  131. ystring& GetError() { return errmsg; }
  132. Any& Retval() { return m_retval; }
  133. Fsm Parse();
  134. void Parenthesized(Fsm& fsm);
  135. private:
  136. Term DoLex();
  137. wchar32 GetChar();
  138. wchar32 PeekChar();
  139. void UngetChar(wchar32 c);
  140. void Error(const char* msg) { throw Pire::Error(msg); }
  141. void InstallDefaultFeatures();
  142. TDeque<wchar32> m_input;
  143. const Pire::Encoding* m_encoding;
  144. TVector<THolder<Feature>> m_features;
  145. Any m_retval;
  146. ystring errmsg;
  147. friend class Feature;
  148. Lexer(const Lexer&);
  149. Lexer& operator = (const Lexer&);
  150. };
  151. /**
  152. * A basic class for Pire customization.
  153. * Features can be installed in the lexer and alter its behaviour.
  154. */
  155. class Feature {
  156. public:
  157. /// Precedence of features. The less the priority, the earlier
  158. /// will Lex() be called, and the later will Alter() and Parenthesized() be called.
  159. virtual int Priority() const { return 50; }
  160. /// Lexer will call this function to check whether the feature
  161. /// wants to handle the next part of the input sequence in its
  162. /// specific way. If it does not, features Lex() will not be called.
  163. virtual bool Accepts(wchar32 /*c*/) const { return false; }
  164. /// Should eat up some part of the input sequence, handle it
  165. /// somehow and produce a terminal.
  166. virtual Term Lex() { return Term(0); }
  167. /// This function recieves a shiny new terminal, and the feature
  168. /// has a chance to hack it somehow if it wants.
  169. virtual void Alter(Term&) {}
  170. /// This function recieves a parenthesized part of a pattern, and the feature
  171. /// has a chance to hack it somehow if it wants (its the way to implement
  172. /// those perl-style (?@#$%:..) clauses).
  173. virtual void Parenthesized(Fsm&) {}
  174. using Ptr = THolder<Feature>;
  175. virtual ~Feature() = default;
  176. protected:
  177. // These functions are exposed versions of the corresponding lexer functions.
  178. const Pire::Encoding& Encoding() const { return m_lexer->Encoding(); }
  179. wchar32 GetChar() { return m_lexer->GetChar(); }
  180. wchar32 PeekChar() { return m_lexer->PeekChar(); }
  181. void UngetChar(wchar32 c) { m_lexer->UngetChar(c); }
  182. wchar32 CorrectChar(wchar32 c, const char* controls);
  183. void Error(const char* msg) { m_lexer->Error(msg); }
  184. private:
  185. friend class Lexer;
  186. Lexer* m_lexer;
  187. };
  188. namespace Features {
  189. /// Disables case sensitivity
  190. Feature::Ptr CaseInsensitive();
  191. /**
  192. * Adds two more operations:
  193. * (pattern1)&(pattern2) -- matches those strings which match both /pattern1/ and /pattern2/;
  194. * ~(pattern) -- matches those strings which do not match /pattern/.
  195. */
  196. Feature::Ptr AndNotSupport();
  197. }
  198. }
  199. #endif