lexer_base.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. #pragma once
  2. #include "byte_reader.h"
  3. #include "cescape.h"
  4. #include "macros.h"
  5. #include "number.h"
  6. #include "percent_scalar.h"
  7. #include "stream_counter.h"
  8. #include "varint.h"
  9. #include <util/generic/maybe.h>
  10. #include <util/generic/vector.h>
  11. #include <util/string/cast.h>
  12. namespace NYsonPull {
  13. namespace NDetail {
  14. template <bool EnableLinePositionInfo>
  15. class lexer_base: public byte_reader<stream_counter<EnableLinePositionInfo>> {
  16. using Base = byte_reader<
  17. stream_counter<EnableLinePositionInfo>>;
  18. TVector<ui8> token_buffer_;
  19. TMaybe<size_t> memory_limit_;
  20. public:
  21. lexer_base(
  22. NYsonPull::NInput::IStream& buffer,
  23. TMaybe<size_t> memory_limit)
  24. : Base(buffer)
  25. , memory_limit_{memory_limit} {
  26. }
  27. ATTRIBUTE(noinline, hot)
  28. ui8 skip_space_and_get_byte() {
  29. auto& buf = Base::stream().buffer();
  30. if (Y_LIKELY(!buf.is_empty())) {
  31. auto ch = *buf.pos();
  32. if (Y_LIKELY(!is_space(ch))) {
  33. return ch;
  34. }
  35. }
  36. return skip_space_and_get_byte_fallback();
  37. }
  38. ATTRIBUTE(hot)
  39. ui8 get_byte() {
  40. auto& buf = Base::stream().buffer();
  41. if (Y_LIKELY(!buf.is_empty())) {
  42. return *buf.pos();
  43. }
  44. return Base::get_byte();
  45. }
  46. number read_numeric() {
  47. token_buffer_.clear();
  48. auto type = number_type::int64;
  49. while (true) {
  50. auto ch = this->Base::template get_byte<true>();
  51. if (isdigit(ch) || ch == '+' || ch == '-') {
  52. token_buffer_.push_back(ch);
  53. } else if (ch == '.' || ch == 'e' || ch == 'E') {
  54. token_buffer_.push_back(ch);
  55. type = number_type::float64;
  56. } else if (ch == 'u') {
  57. token_buffer_.push_back(ch);
  58. type = number_type::uint64;
  59. } else if (Y_UNLIKELY(isalpha(ch))) {
  60. COLD_BLOCK_BYVALUE
  61. Base::fail("Unexpected ", NCEscape::quote(ch), " in numeric literal");
  62. COLD_BLOCK_END
  63. } else {
  64. break;
  65. }
  66. check_memory_limit();
  67. Base::advance(1);
  68. }
  69. auto str = token_buffer();
  70. try {
  71. switch (type) {
  72. case number_type::float64:
  73. return FromString<double>(str);
  74. case number_type::int64:
  75. return FromString<i64>(str);
  76. case number_type::uint64:
  77. str.Chop(1); // 'u' suffix
  78. return FromString<ui64>(str);
  79. }
  80. Y_UNREACHABLE();
  81. } catch (const std::exception& err) {
  82. Base::fail(err.what());
  83. }
  84. }
  85. TStringBuf read_quoted_string() {
  86. auto count_trailing_slashes = [](ui8* begin, ui8* end) {
  87. auto count = size_t{0};
  88. if (begin < end) {
  89. for (auto p = end - 1; p >= begin && *p == '\\'; --p) {
  90. ++count;
  91. }
  92. }
  93. return count;
  94. };
  95. token_buffer_.clear();
  96. auto& buf = Base::stream().buffer();
  97. while (true) {
  98. this->Base::template fill_buffer<false>();
  99. auto* quote = reinterpret_cast<const ui8*>(
  100. ::memchr(buf.pos(), '"', buf.available()));
  101. if (quote == nullptr) {
  102. token_buffer_.insert(
  103. token_buffer_.end(),
  104. buf.pos(),
  105. buf.end());
  106. Base::advance(buf.available());
  107. continue;
  108. }
  109. token_buffer_.insert(
  110. token_buffer_.end(),
  111. buf.pos(),
  112. quote);
  113. Base::advance(quote - buf.pos() + 1); // +1 for the quote itself
  114. // We must count the number of '\' at the end of StringValue
  115. // to check if it's not \"
  116. int slash_count = count_trailing_slashes(
  117. token_buffer_.data(),
  118. token_buffer_.data() + token_buffer_.size());
  119. if (slash_count % 2 == 0) {
  120. break;
  121. } else {
  122. token_buffer_.push_back('"');
  123. }
  124. check_memory_limit();
  125. }
  126. NCEscape::decode_inplace(token_buffer_);
  127. return token_buffer();
  128. }
  129. TStringBuf read_unquoted_string() {
  130. token_buffer_.clear();
  131. while (true) {
  132. auto ch = this->Base::template get_byte<true>();
  133. if (isalpha(ch) || isdigit(ch) ||
  134. ch == '_' || ch == '-' || ch == '%' || ch == '.') {
  135. token_buffer_.push_back(ch);
  136. } else {
  137. break;
  138. }
  139. check_memory_limit();
  140. Base::advance(1);
  141. }
  142. return token_buffer();
  143. }
  144. ATTRIBUTE(noinline, hot)
  145. TStringBuf read_binary_string() {
  146. auto slength = NVarInt::read<i32>(*this);
  147. if (Y_UNLIKELY(slength < 0)) {
  148. COLD_BLOCK_BYVALUE
  149. Base::fail("Negative binary string literal length ", slength);
  150. COLD_BLOCK_END
  151. }
  152. auto length = static_cast<ui32>(slength);
  153. auto& buf = Base::stream().buffer();
  154. if (Y_LIKELY(buf.available() >= length)) {
  155. auto result = TStringBuf{
  156. reinterpret_cast<const char*>(buf.pos()),
  157. length};
  158. Base::advance(length);
  159. return result;
  160. } else { // reading in Buffer
  161. return read_binary_string_fallback(length);
  162. }
  163. }
  164. ATTRIBUTE(noinline)
  165. TStringBuf read_binary_string_fallback(size_t length) {
  166. auto& buf = Base::stream().buffer();
  167. auto needToRead = length;
  168. token_buffer_.clear();
  169. while (needToRead) {
  170. this->Base::template fill_buffer<false>();
  171. auto chunk_size = std::min(needToRead, buf.available());
  172. token_buffer_.insert(
  173. token_buffer_.end(),
  174. buf.pos(),
  175. buf.pos() + chunk_size);
  176. check_memory_limit();
  177. needToRead -= chunk_size;
  178. Base::advance(chunk_size);
  179. }
  180. return token_buffer();
  181. }
  182. percent_scalar read_percent_scalar() {
  183. auto throw_incorrect_percent_scalar = [&]() {
  184. Base::fail("Incorrect %-literal prefix ", NCEscape::quote(token_buffer()));
  185. };
  186. auto assert_literal = [&](TStringBuf literal) -> void {
  187. for (size_t i = 2; i < literal.size(); ++i) {
  188. token_buffer_.push_back(this->Base::template get_byte<false>());
  189. Base::advance(1);
  190. if (Y_UNLIKELY(token_buffer_.back() != literal[i])) {
  191. throw_incorrect_percent_scalar();
  192. }
  193. }
  194. };
  195. token_buffer_.clear();
  196. token_buffer_.push_back(this->Base::template get_byte<false>());
  197. Base::advance(1);
  198. switch (token_buffer_[0]) {
  199. case 't':
  200. assert_literal(percent_scalar::true_literal);
  201. return percent_scalar(true);
  202. case 'f':
  203. assert_literal(percent_scalar::false_literal);
  204. return percent_scalar(false);
  205. case 'n':
  206. assert_literal(percent_scalar::nan_literal);
  207. return percent_scalar(std::numeric_limits<double>::quiet_NaN());
  208. case 'i':
  209. assert_literal(percent_scalar::positive_inf_literal);
  210. return percent_scalar(std::numeric_limits<double>::infinity());
  211. case '-':
  212. assert_literal(percent_scalar::negative_inf_literal);
  213. return percent_scalar(-std::numeric_limits<double>::infinity());
  214. default:
  215. throw_incorrect_percent_scalar();
  216. }
  217. Y_UNREACHABLE();
  218. }
  219. i64 read_binary_int64() {
  220. return NVarInt::read<i64>(*this);
  221. }
  222. ui64 read_binary_uint64() {
  223. return NVarInt::read<ui64>(*this);
  224. }
  225. double read_binary_double() {
  226. union {
  227. double as_double;
  228. ui8 as_bytes[sizeof(double)];
  229. } data;
  230. static_assert(sizeof(data) == sizeof(double), "bad union size");
  231. auto needToRead = sizeof(double);
  232. auto& buf = Base::stream().buffer();
  233. while (needToRead != 0) {
  234. Base::fill_buffer();
  235. auto chunk_size = std::min(needToRead, buf.available());
  236. if (chunk_size == 0) {
  237. Base::fail("Error parsing binary double literal");
  238. }
  239. std::copy(
  240. buf.pos(),
  241. buf.pos() + chunk_size,
  242. data.as_bytes + (sizeof(double) - needToRead));
  243. needToRead -= chunk_size;
  244. Base::advance(chunk_size);
  245. }
  246. return data.as_double;
  247. }
  248. private:
  249. static bool is_space(ui8 ch) {
  250. static const ui8 lookupTable[] =
  251. {
  252. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
  253. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  254. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  255. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  256. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  257. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  258. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  259. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  260. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  261. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  262. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  263. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  264. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  265. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  266. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  267. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  268. return lookupTable[ch];
  269. }
  270. ATTRIBUTE(noinline, cold)
  271. ui8 skip_space_and_get_byte_fallback() {
  272. auto& buf = Base::stream().buffer();
  273. while (true) {
  274. // FIXME
  275. if (buf.is_empty()) {
  276. if (Base::stream().at_end()) {
  277. return '\0';
  278. }
  279. Base::fill_buffer();
  280. } else {
  281. if (!is_space(*buf.pos())) {
  282. break;
  283. }
  284. Base::advance(1);
  285. }
  286. }
  287. return Base::get_byte();
  288. }
  289. void check_memory_limit() {
  290. if (Y_UNLIKELY(memory_limit_ && token_buffer_.capacity() > *memory_limit_)) {
  291. COLD_BLOCK_BYVALUE
  292. Base::fail(
  293. "Memory limit exceeded while parsing YSON stream: "
  294. "allocated ",
  295. token_buffer_.capacity(),
  296. ", limit ", *memory_limit_);
  297. COLD_BLOCK_END
  298. }
  299. }
  300. TStringBuf token_buffer() const {
  301. auto* begin = reinterpret_cast<const char*>(token_buffer_.data());
  302. return {begin, token_buffer_.size()};
  303. }
  304. };
  305. }
  306. }