JsonIO.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * https://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #include "JsonIO.hh"
  19. namespace avro {
  20. namespace json {
  21. using std::ostringstream;
  22. using std::string;
  23. const char *const
  24. JsonParser::tokenNames[] = {
  25. "Null",
  26. "Bool",
  27. "Integer",
  28. "Double",
  29. "String",
  30. "Array start",
  31. "Array end",
  32. "Object start",
  33. "Object end",
  34. };
  35. char JsonParser::next() {
  36. char ch = hasNext ? nextChar : ' ';
  37. while (isspace(ch)) {
  38. if (ch == '\n') {
  39. line_++;
  40. }
  41. ch = in_.read();
  42. }
  43. hasNext = false;
  44. return ch;
  45. }
  46. void JsonParser::expectToken(Token tk) {
  47. if (advance() != tk) {
  48. if (tk == Token::Double) {
  49. if (cur() == Token::String
  50. && (sv == "Infinity" || sv == "-Infinity" || sv == "NaN")) {
  51. curToken = Token::Double;
  52. dv = sv == "Infinity" ? std::numeric_limits<double>::infinity() : sv == "-Infinity" ? -std::numeric_limits<double>::infinity()
  53. : std::numeric_limits<double>::quiet_NaN();
  54. return;
  55. } else if (cur() == Token::Long) {
  56. dv = double(lv);
  57. return;
  58. }
  59. }
  60. ostringstream oss;
  61. oss << "Incorrect token in the stream. Expected: "
  62. << JsonParser::toString(tk) << ", found "
  63. << JsonParser::toString(cur());
  64. throw Exception(oss.str());
  65. }
  66. }
  67. JsonParser::Token JsonParser::doAdvance() {
  68. char ch = next();
  69. if (ch == ']') {
  70. if (curState == stArray0 || curState == stArrayN) {
  71. curState = stateStack.top();
  72. stateStack.pop();
  73. return Token::ArrayEnd;
  74. } else {
  75. throw unexpected(ch);
  76. }
  77. } else if (ch == '}') {
  78. if (curState == stObject0 || curState == stObjectN) {
  79. curState = stateStack.top();
  80. stateStack.pop();
  81. return Token::ObjectEnd;
  82. } else {
  83. throw unexpected(ch);
  84. }
  85. } else if (ch == ',') {
  86. if (curState != stObjectN && curState != stArrayN) {
  87. throw unexpected(ch);
  88. }
  89. if (curState == stObjectN) {
  90. curState = stObject0;
  91. }
  92. ch = next();
  93. } else if (ch == ':') {
  94. if (curState != stKey) {
  95. throw unexpected(ch);
  96. }
  97. curState = stObjectN;
  98. ch = next();
  99. }
  100. if (curState == stObject0) {
  101. if (ch != '"') {
  102. throw unexpected(ch);
  103. }
  104. curState = stKey;
  105. } else if (curState == stArray0) {
  106. curState = stArrayN;
  107. }
  108. switch (ch) {
  109. case '[':
  110. stateStack.push(curState);
  111. curState = stArray0;
  112. return Token::ArrayStart;
  113. case '{':
  114. stateStack.push(curState);
  115. curState = stObject0;
  116. return Token::ObjectStart;
  117. case '"':
  118. return tryString();
  119. case 't':
  120. bv = true;
  121. return tryLiteral("rue", 3, Token::Bool);
  122. case 'f':
  123. bv = false;
  124. return tryLiteral("alse", 4, Token::Bool);
  125. case 'n':
  126. return tryLiteral("ull", 3, Token::Null);
  127. default:
  128. if (isdigit(ch) || ch == '-') {
  129. return tryNumber(ch);
  130. } else {
  131. throw unexpected(ch);
  132. }
  133. }
  134. }
  135. JsonParser::Token JsonParser::tryNumber(char ch) {
  136. sv.clear();
  137. sv.push_back(ch);
  138. hasNext = false;
  139. int state = (ch == '-') ? 0 : (ch == '0') ? 1
  140. : 2;
  141. for (;;) {
  142. switch (state) {
  143. case 0:
  144. if (in_.hasMore()) {
  145. ch = in_.read();
  146. if (isdigit(ch)) {
  147. state = (ch == '0') ? 1 : 2;
  148. sv.push_back(ch);
  149. continue;
  150. }
  151. hasNext = true;
  152. }
  153. break;
  154. case 1:
  155. if (in_.hasMore()) {
  156. ch = in_.read();
  157. if (ch == '.') {
  158. state = 3;
  159. sv.push_back(ch);
  160. continue;
  161. } else if (ch == 'e' || ch == 'E') {
  162. sv.push_back(ch);
  163. state = 5;
  164. continue;
  165. }
  166. hasNext = true;
  167. }
  168. break;
  169. case 2:
  170. if (in_.hasMore()) {
  171. ch = in_.read();
  172. if (isdigit(ch)) {
  173. sv.push_back(ch);
  174. continue;
  175. } else if (ch == '.') {
  176. state = 3;
  177. sv.push_back(ch);
  178. continue;
  179. } else if (ch == 'e' || ch == 'E') {
  180. sv.push_back(ch);
  181. state = 5;
  182. continue;
  183. }
  184. hasNext = true;
  185. }
  186. break;
  187. case 3:
  188. case 6:
  189. if (in_.hasMore()) {
  190. ch = in_.read();
  191. if (isdigit(ch)) {
  192. sv.push_back(ch);
  193. state++;
  194. continue;
  195. }
  196. hasNext = true;
  197. }
  198. break;
  199. case 4:
  200. if (in_.hasMore()) {
  201. ch = in_.read();
  202. if (isdigit(ch)) {
  203. sv.push_back(ch);
  204. continue;
  205. } else if (ch == 'e' || ch == 'E') {
  206. sv.push_back(ch);
  207. state = 5;
  208. continue;
  209. }
  210. hasNext = true;
  211. }
  212. break;
  213. case 5:
  214. if (in_.hasMore()) {
  215. ch = in_.read();
  216. if (ch == '+' || ch == '-') {
  217. sv.push_back(ch);
  218. state = 6;
  219. continue;
  220. } else if (isdigit(ch)) {
  221. sv.push_back(ch);
  222. state = 7;
  223. continue;
  224. }
  225. hasNext = true;
  226. }
  227. break;
  228. case 7:
  229. if (in_.hasMore()) {
  230. ch = in_.read();
  231. if (isdigit(ch)) {
  232. sv.push_back(ch);
  233. continue;
  234. }
  235. hasNext = true;
  236. }
  237. break;
  238. default:
  239. throw Exception("Unexpected JSON parse state");
  240. }
  241. if (state == 1 || state == 2 || state == 4 || state == 7) {
  242. if (hasNext) {
  243. nextChar = ch;
  244. }
  245. std::istringstream iss(sv);
  246. if (state == 1 || state == 2) {
  247. iss >> lv;
  248. return Token::Long;
  249. } else {
  250. iss >> dv;
  251. return Token::Double;
  252. }
  253. } else {
  254. if (hasNext) {
  255. throw unexpected(ch);
  256. } else {
  257. throw Exception("Unexpected EOF");
  258. }
  259. }
  260. }
  261. }
  262. JsonParser::Token JsonParser::tryString() {
  263. sv.clear();
  264. for (;;) {
  265. char ch = in_.read();
  266. if (ch == '"') {
  267. return Token::String;
  268. } else if (ch == '\\') {
  269. ch = in_.read();
  270. switch (ch) {
  271. case '"':
  272. case '\\':
  273. case '/':
  274. case 'b':
  275. case 'f':
  276. case 'n':
  277. case 'r':
  278. case 't':
  279. sv.push_back('\\');
  280. sv.push_back(ch);
  281. break;
  282. case 'u':
  283. case 'U': {
  284. uint32_t n = 0;
  285. char e[4];
  286. in_.readBytes(reinterpret_cast<uint8_t *>(e), 4);
  287. sv.push_back('\\');
  288. sv.push_back(ch);
  289. for (char c : e) {
  290. n *= 16;
  291. if (isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
  292. sv.push_back(c);
  293. } else {
  294. throw unexpected(c);
  295. }
  296. }
  297. } break;
  298. default:
  299. throw unexpected(ch);
  300. }
  301. } else {
  302. sv.push_back(ch);
  303. }
  304. }
  305. }
  306. // Decode the given string and return contents as UTF8-encoded bytes.
  307. // The input does not have the enclosing double-quotes.
  308. string JsonParser::decodeString(const string &s, bool binary) {
  309. string result;
  310. auto it = s.cbegin();
  311. const auto end = s.cend();
  312. const auto readNextByte = [&]() -> char {
  313. if (it == end) {
  314. throw Exception("Unexpected EOF");
  315. }
  316. return *it++;
  317. };
  318. const auto unicodeParse = [&]() {
  319. uint32_t n = 0;
  320. for (int i = 0; i < 4; i++) {
  321. auto c = readNextByte();
  322. n *= 16;
  323. if (isdigit(c)) {
  324. n += c - '0';
  325. } else if (c >= 'a' && c <= 'f') {
  326. n += c - 'a' + 10;
  327. } else if (c >= 'A' && c <= 'F') {
  328. n += c - 'A' + 10;
  329. } else {
  330. throw Exception("Invalid hex character: {}", c);
  331. }
  332. }
  333. return n;
  334. };
  335. while (it != end) {
  336. string::const_iterator startSeq = it;
  337. char ch = readNextByte();
  338. if (ch == '\\') {
  339. ch = readNextByte();
  340. switch (ch) {
  341. case '"':
  342. case '\\':
  343. case '/':
  344. result.push_back(ch);
  345. continue;
  346. case 'b':
  347. result.push_back('\b');
  348. continue;
  349. case 'f':
  350. result.push_back('\f');
  351. continue;
  352. case 'n':
  353. result.push_back('\n');
  354. continue;
  355. case 'r':
  356. result.push_back('\r');
  357. continue;
  358. case 't':
  359. result.push_back('\t');
  360. continue;
  361. case 'u':
  362. case 'U': {
  363. uint32_t n = unicodeParse();
  364. if (binary) {
  365. if (n > 0xff) {
  366. throw Exception("Invalid byte for binary: {}{}", ch, string(startSeq, ++it));
  367. } else {
  368. result.push_back(static_cast<char>(n));
  369. continue;
  370. }
  371. }
  372. if (n >= 0xd800 && n < 0xdc00) {
  373. ch = readNextByte();
  374. if (ch != '\\') {
  375. throw Exception("Invalid unicode sequence: {}", string(startSeq, it));
  376. }
  377. ch = readNextByte();
  378. if (ch != 'u' && ch != 'U') {
  379. throw Exception("Invalid unicode sequence: {}", string(startSeq, it));
  380. }
  381. uint32_t m = unicodeParse();
  382. if (m < 0xdc00 || m > 0xdfff) {
  383. throw Exception("Invalid unicode sequence: {}", string(startSeq, it));
  384. }
  385. n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00));
  386. } else if (n >= 0xdc00 && n < 0xdfff) {
  387. throw Exception("Invalid unicode sequence: {}", string(startSeq, it));
  388. }
  389. if (n < 0x80) {
  390. result.push_back(static_cast<char>(n));
  391. } else if (n < 0x800) {
  392. result.push_back(static_cast<char>((n >> 6) | 0xc0));
  393. result.push_back(static_cast<char>((n & 0x3f) | 0x80));
  394. } else if (n < 0x10000) {
  395. result.push_back(static_cast<char>((n >> 12) | 0xe0));
  396. result.push_back(static_cast<char>(((n >> 6) & 0x3f) | 0x80));
  397. result.push_back(static_cast<char>((n & 0x3f) | 0x80));
  398. } else if (n < 0x110000) {
  399. result.push_back(static_cast<char>((n >> 18) | 0xf0));
  400. result.push_back(static_cast<char>(((n >> 12) & 0x3f) | 0x80));
  401. result.push_back(static_cast<char>(((n >> 6) & 0x3f) | 0x80));
  402. result.push_back(static_cast<char>((n & 0x3f) | 0x80));
  403. } else {
  404. throw Exception("Invalid unicode value: {}{}", n, string(startSeq, ++it));
  405. }
  406. }
  407. continue;
  408. default:
  409. throw Exception("Unexpected JSON parse state");
  410. }
  411. } else {
  412. result.push_back(ch);
  413. }
  414. }
  415. return result;
  416. }
  417. Exception JsonParser::unexpected(unsigned char c) {
  418. std::ostringstream oss;
  419. oss << "Unexpected character in json " << toHex(c / 16) << toHex(c % 16);
  420. return Exception(oss.str());
  421. }
  422. JsonParser::Token JsonParser::tryLiteral(const char exp[], size_t n, Token tk) {
  423. char c[100];
  424. in_.readBytes(reinterpret_cast<uint8_t *>(c), n);
  425. for (size_t i = 0; i < n; ++i) {
  426. if (c[i] != exp[i]) {
  427. throw unexpected(c[i]);
  428. }
  429. }
  430. if (in_.hasMore()) {
  431. nextChar = in_.read();
  432. if (isdigit(nextChar) || isalpha(nextChar)) {
  433. throw unexpected(nextChar);
  434. }
  435. hasNext = true;
  436. }
  437. return tk;
  438. }
  439. } // namespace json
  440. } // namespace avro