JsonIO.hh 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * https://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #ifndef avro_json_JsonIO_hh__
  19. #define avro_json_JsonIO_hh__
  20. #include <boost/lexical_cast.hpp>
  21. #include <boost/math/special_functions/fpclassify.hpp>
  22. #include <boost/utility.hpp>
  23. #include <locale>
  24. #include <sstream>
  25. #include <stack>
  26. #include <string>
  27. #include "Config.hh"
  28. #include "Stream.hh"
  29. namespace avro {
  30. namespace json {
  /**
   * Convert a nibble value to its lowercase hexadecimal digit.
   *
   * Values below 10 map onto '0'..'9'; any other value is offset from 'a'
   * (so 10..15 become 'a'..'f'). No range check is performed on n.
   */
  inline char toHex(unsigned int n) {
      if (n < 10) {
          return static_cast<char>(n + '0');
      }
      return static_cast<char>(n + 'a' - 10);
  }
  34. class AVRO_DECL JsonParser : boost::noncopyable {
  35. public:
  36. enum class Token {
  37. Null,
  38. Bool,
  39. Long,
  40. Double,
  41. String,
  42. ArrayStart,
  43. ArrayEnd,
  44. ObjectStart,
  45. ObjectEnd
  46. };
  47. size_t line() const { return line_; }
  48. private:
  49. enum State {
  50. stValue, // Expect a data type
  51. stArray0, // Expect a data type or ']'
  52. stArrayN, // Expect a ',' or ']'
  53. stObject0, // Expect a string or a '}'
  54. stObjectN, // Expect a ',' or '}'
  55. stKey // Expect a ':'
  56. };
  57. std::stack<State> stateStack;
  58. State curState;
  59. bool hasNext;
  60. char nextChar;
  61. bool peeked;
  62. StreamReader in_;
  63. Token curToken;
  64. bool bv;
  65. int64_t lv;
  66. double dv;
  67. std::string sv;
  68. size_t line_;
  69. Token doAdvance();
  70. Token tryLiteral(const char exp[], size_t n, Token tk);
  71. Token tryNumber(char ch);
  72. Token tryString();
  73. static Exception unexpected(unsigned char ch);
  74. char next();
  75. static std::string decodeString(const std::string &s, bool binary);
  76. public:
  77. JsonParser() : curState(stValue), hasNext(false), nextChar(0), peeked(false),
  78. curToken(Token::Null), bv(false), lv(0), dv(0), line_(1) {}
  79. void init(InputStream &is) {
  80. // Clear by swapping with an empty stack
  81. std::stack<State>().swap(stateStack);
  82. curState = stValue;
  83. hasNext = false;
  84. peeked = false;
  85. line_ = 1;
  86. in_.reset(is);
  87. }
  88. Token advance() {
  89. if (!peeked) {
  90. curToken = doAdvance();
  91. } else {
  92. peeked = false;
  93. }
  94. return curToken;
  95. }
  96. Token peek() {
  97. if (!peeked) {
  98. curToken = doAdvance();
  99. peeked = true;
  100. }
  101. return curToken;
  102. }
  103. void expectToken(Token tk);
  104. bool boolValue() const {
  105. return bv;
  106. }
  107. Token cur() const {
  108. return curToken;
  109. }
  110. double doubleValue() const {
  111. return dv;
  112. }
  113. int64_t longValue() const {
  114. return lv;
  115. }
  116. const std::string &rawString() const {
  117. return sv;
  118. }
  119. std::string stringValue() const {
  120. return decodeString(sv, false);
  121. }
  122. std::string bytesValue() const {
  123. return decodeString(sv, true);
  124. }
  125. void drain() {
  126. if (!stateStack.empty() || peeked) {
  127. throw Exception("Invalid state for draining");
  128. }
  129. in_.drain(hasNext);
  130. hasNext = false;
  131. }
  132. /**
  133. * Return UTF-8 encoded string value.
  134. */
  135. static std::string toStringValue(const std::string &sv) {
  136. return decodeString(sv, false);
  137. }
  138. /**
  139. * Return byte-encoded string value. It is an error if the input
  140. * JSON string contained unicode characters more than "\u00ff'.
  141. */
  142. static std::string toBytesValue(const std::string &sv) {
  143. return decodeString(sv, true);
  144. }
  145. static const char *const tokenNames[];
  146. static const char *toString(Token tk) {
  147. return tokenNames[static_cast<size_t>(tk)];
  148. }
  149. };
  150. class AVRO_DECL JsonNullFormatter {
  151. public:
  152. explicit JsonNullFormatter(StreamWriter &) {}
  153. void handleObjectStart() {}
  154. void handleObjectEnd() {}
  155. void handleValueEnd() {}
  156. void handleColon() {}
  157. };
  158. class AVRO_DECL JsonPrettyFormatter {
  159. StreamWriter &out_;
  160. size_t level_;
  161. std::vector<uint8_t> indent_;
  162. static const int CHARS_PER_LEVEL = 2;
  163. void printIndent() {
  164. size_t charsToIndent = level_ * CHARS_PER_LEVEL;
  165. if (indent_.size() < charsToIndent) {
  166. indent_.resize(charsToIndent * 2, ' ');
  167. }
  168. out_.writeBytes(indent_.data(), charsToIndent);
  169. }
  170. public:
  171. explicit JsonPrettyFormatter(StreamWriter &out) : out_(out), level_(0), indent_(10, ' ') {}
  172. void handleObjectStart() {
  173. out_.write('\n');
  174. ++level_;
  175. printIndent();
  176. }
  177. void handleObjectEnd() {
  178. out_.write('\n');
  179. --level_;
  180. printIndent();
  181. }
  182. void handleValueEnd() {
  183. out_.write('\n');
  184. printIndent();
  185. }
  186. void handleColon() {
  187. out_.write(' ');
  188. }
  189. };
  190. template<class F>
  191. class AVRO_DECL JsonGenerator {
  192. StreamWriter out_;
  193. F formatter_;
  194. enum State {
  195. stStart,
  196. stArray0,
  197. stArrayN,
  198. stMap0,
  199. stMapN,
  200. stKey,
  201. };
  202. std::stack<State> stateStack;
  203. State top;
  204. void write(const char *b, const char *p) {
  205. if (b != p) {
  206. out_.writeBytes(reinterpret_cast<const uint8_t *>(b), p - b);
  207. }
  208. }
  209. void escape(char c, const char *b, const char *p) {
  210. write(b, p);
  211. out_.write('\\');
  212. out_.write(c);
  213. }
  214. void escapeCtl(char c) {
  215. escapeUnicode(static_cast<uint8_t>(c));
  216. }
  217. void writeHex(char c) {
  218. out_.write(toHex((static_cast<unsigned char>(c)) / 16));
  219. out_.write(toHex((static_cast<unsigned char>(c)) % 16));
  220. }
  221. void escapeUnicode16(uint32_t c) {
  222. out_.write('\\');
  223. out_.write('u');
  224. writeHex(static_cast<char>((c >> 8) & 0xff));
  225. writeHex(static_cast<char>(c & 0xff));
  226. }
  227. void escapeUnicode(uint32_t c) {
  228. if (c < 0x10000) {
  229. escapeUnicode16(c);
  230. } else if (c < 0x110000) {
  231. c -= 0x10000;
  232. escapeUnicode16(((c >> 10) & 0x3ff) | 0xd800);
  233. escapeUnicode16((c & 0x3ff) | 0xdc00);
  234. } else {
  235. throw Exception("Invalid code-point: {}", c);
  236. }
  237. }
  238. void doEncodeString(const char *b, size_t len, bool binary) {
  239. const char *e = b + len;
  240. out_.write('"');
  241. for (const char *p = b; p != e; p++) {
  242. if ((*p & 0x80) != 0) {
  243. write(b, p);
  244. if (binary) {
  245. escapeCtl(*p);
  246. } else if ((*p & 0x40) == 0) {
  247. throw Exception("Invalid UTF-8 sequence");
  248. } else {
  249. int more = 1;
  250. uint32_t value;
  251. if ((*p & 0x20) != 0) {
  252. more++;
  253. if ((*p & 0x10) != 0) {
  254. more++;
  255. if ((*p & 0x08) != 0) {
  256. throw Exception("Invalid UTF-8 sequence");
  257. } else {
  258. value = *p & 0x07;
  259. }
  260. } else {
  261. value = *p & 0x0f;
  262. }
  263. } else {
  264. value = *p & 0x1f;
  265. }
  266. for (int i = 0; i < more; ++i) {
  267. if (++p == e || (*p & 0xc0) != 0x80) {
  268. throw Exception("Invalid UTF-8 sequence");
  269. }
  270. value <<= 6;
  271. value |= *p & 0x3f;
  272. }
  273. escapeUnicode(value);
  274. }
  275. } else {
  276. switch (*p) {
  277. case '\\':
  278. case '"':
  279. escape(*p, b, p);
  280. break;
  281. case '\b':
  282. escape('b', b, p);
  283. break;
  284. case '\f':
  285. escape('f', b, p);
  286. break;
  287. case '\n':
  288. escape('n', b, p);
  289. break;
  290. case '\r':
  291. escape('r', b, p);
  292. break;
  293. case '\t':
  294. escape('t', b, p);
  295. break;
  296. default:
  297. if (std::iscntrl(*p, std::locale::classic())) {
  298. write(b, p);
  299. escapeCtl(*p);
  300. break;
  301. } else {
  302. continue;
  303. }
  304. }
  305. }
  306. b = p + 1;
  307. }
  308. write(b, e);
  309. out_.write('"');
  310. }
  311. void sep() {
  312. if (top == stArrayN) {
  313. out_.write(',');
  314. formatter_.handleValueEnd();
  315. } else if (top == stArray0) {
  316. top = stArrayN;
  317. }
  318. }
  319. void sep2() {
  320. if (top == stKey) {
  321. top = stMapN;
  322. }
  323. }
  324. public:
  325. JsonGenerator() : formatter_(out_), top(stStart) {}
  326. void init(OutputStream &os) {
  327. out_.reset(os);
  328. }
  329. void flush() {
  330. out_.flush();
  331. }
  332. int64_t byteCount() const {
  333. return out_.byteCount();
  334. }
  335. void encodeNull() {
  336. sep();
  337. out_.writeBytes(reinterpret_cast<const uint8_t *>("null"), 4);
  338. sep2();
  339. }
  340. void encodeBool(bool b) {
  341. sep();
  342. if (b) {
  343. out_.writeBytes(reinterpret_cast<const uint8_t *>("true"), 4);
  344. } else {
  345. out_.writeBytes(reinterpret_cast<const uint8_t *>("false"), 5);
  346. }
  347. sep2();
  348. }
  349. template<typename T>
  350. void encodeNumber(T t) {
  351. sep();
  352. std::ostringstream oss;
  353. oss << boost::lexical_cast<std::string>(t);
  354. const std::string s = oss.str();
  355. out_.writeBytes(reinterpret_cast<const uint8_t *>(s.data()), s.size());
  356. sep2();
  357. }
  358. void encodeNumber(double t) {
  359. sep();
  360. std::ostringstream oss;
  361. if (boost::math::isfinite(t)) {
  362. oss << boost::lexical_cast<std::string>(t);
  363. } else if (boost::math::isnan(t)) {
  364. oss << "NaN";
  365. } else if (t == std::numeric_limits<double>::infinity()) {
  366. oss << "Infinity";
  367. } else {
  368. oss << "-Infinity";
  369. }
  370. const std::string s = oss.str();
  371. out_.writeBytes(reinterpret_cast<const uint8_t *>(s.data()), s.size());
  372. sep2();
  373. }
  374. void encodeString(const std::string &s) {
  375. if (top == stMap0) {
  376. top = stKey;
  377. } else if (top == stMapN) {
  378. out_.write(',');
  379. formatter_.handleValueEnd();
  380. top = stKey;
  381. } else if (top == stKey) {
  382. top = stMapN;
  383. } else {
  384. sep();
  385. }
  386. doEncodeString(s.c_str(), s.size(), false);
  387. if (top == stKey) {
  388. out_.write(':');
  389. formatter_.handleColon();
  390. }
  391. }
  392. void encodeBinary(const uint8_t *bytes, size_t len) {
  393. sep();
  394. doEncodeString(reinterpret_cast<const char *>(bytes), len, true);
  395. sep2();
  396. }
  397. void arrayStart() {
  398. sep();
  399. stateStack.push(top);
  400. top = stArray0;
  401. out_.write('[');
  402. formatter_.handleObjectStart();
  403. }
  404. void arrayEnd() {
  405. top = stateStack.top();
  406. stateStack.pop();
  407. formatter_.handleObjectEnd();
  408. out_.write(']');
  409. sep2();
  410. }
  411. void objectStart() {
  412. sep();
  413. stateStack.push(top);
  414. top = stMap0;
  415. out_.write('{');
  416. formatter_.handleObjectStart();
  417. }
  418. void objectEnd() {
  419. top = stateStack.top();
  420. stateStack.pop();
  421. formatter_.handleObjectEnd();
  422. out_.write('}');
  423. sep2();
  424. }
  425. };
  426. } // namespace json
  427. } // namespace avro
  428. #endif