// unicode_set_parser.cpp (3.7 KB)
  1. #include "quoted_pair.h"
  2. #include "unicode_set_lexer.h"
  3. #include <util/string/cast.h>
  4. #include <util/charset/wide.h>
  5. namespace NUnicode {
  6. namespace NPrivate {
  // Throws a yexception whose message names the token most recently read from the lexer.
  // Expands to a throw-expression; a variable named `lexer` must be in scope at the use site.
  #define UNEXPECTED_TOKEN throw yexception() << "Unexpected token: " << lexer.GetLastToken()

  // Reads the next token from `lexer` and raises UNEXPECTED_TOKEN unless it equals `type`.
  // The do/while(false) wrapper makes the expansion a single statement, so the macro
  // can be followed by ';' inside an unbraced if/else without breaking the else branch.
  #define EXPECT_TOKEN(type)                    \
      do {                                      \
          if (lexer.GetToken() != (type)) {     \
              UNEXPECTED_TOKEN;                 \
          }                                     \
      } while (false)
  12. void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer);
  13. void ParseCharSequence(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
  14. wchar32 prevChar = 0;
  15. bool range = false;
  16. for (EUnicodeSetTokenType type = lexer.GetToken(); type != USTT_RBRACKET; type = lexer.GetToken()) {
  17. wchar32 curChar = 0;
  18. switch (type) {
  19. case USTT_SYMBOL:
  20. curChar = lexer.GetLastToken().Symbol;
  21. break;
  22. case USTT_NEGATION:
  23. curChar = '^';
  24. break;
  25. case USTT_QUOTED_PAIR:
  26. ResolveUnicodeQuotedPair(lexer.GetLastToken().Symbol, curChar, set);
  27. break;
  28. case USTT_CODEPOINT8:
  29. case USTT_CODEPOINT16:
  30. case USTT_CODEPOINT32:
  31. curChar = IntFromString<ui32, 16>(lexer.GetLastToken().Data);
  32. if (curChar >= TUnicodeSet::CODEPOINT_HIGH) {
  33. throw yexception() << "Invalid unicode codepoint: " << lexer.GetLastToken();
  34. }
  35. break;
  36. case USTT_RANGE:
  37. if (0 == prevChar) {
  38. UNEXPECTED_TOKEN;
  39. }
  40. range = true;
  41. continue;
  42. case USTT_LBRACKET: {
  43. lexer.PushBack();
  44. TUnicodeSet inner;
  45. ParseUnicodeSet(inner, lexer);
  46. set.Add(inner);
  47. break;
  48. }
  49. default:
  50. UNEXPECTED_TOKEN;
  51. }
  52. if (curChar) {
  53. if (range) {
  54. if (prevChar >= curChar) {
  55. throw yexception() << "Invalid character range";
  56. }
  57. set.Add(prevChar, curChar);
  58. curChar = 0;
  59. } else {
  60. set.Add(curChar);
  61. }
  62. } else if (range) {
  63. UNEXPECTED_TOKEN;
  64. }
  65. range = false;
  66. prevChar = curChar;
  67. }
  68. if (range) {
  69. UNEXPECTED_TOKEN;
  70. }
  71. lexer.PushBack();
  72. }
  73. void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
  74. EXPECT_TOKEN(USTT_LBRACKET);
  75. bool invert = false;
  76. if (USTT_NEGATION == lexer.GetToken()) {
  77. invert = true;
  78. } else {
  79. lexer.PushBack();
  80. }
  81. if (USTT_CATEGORY == lexer.GetToken()) {
  82. set.AddCategory(WideToUTF8(lexer.GetLastToken().Data));
  83. } else {
  84. lexer.PushBack();
  85. ParseCharSequence(set, lexer);
  86. }
  87. EXPECT_TOKEN(USTT_RBRACKET);
  88. if (invert) {
  89. set.Invert();
  90. }
  91. }
  92. void ParseUnicodeSet(TUnicodeSet& set, const TWtringBuf& data) {
  93. TUnicodeSetLexer lexer(data);
  94. ParseUnicodeSet(set, lexer);
  95. EXPECT_TOKEN(USTT_EOS);
  96. }
  97. } // NPrivate
  98. }