JSON.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917
  1. //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
#include <cerrno>
  16. namespace llvm {
  17. namespace json {
  18. Value &Object::operator[](const ObjectKey &K) {
  19. return try_emplace(K, nullptr).first->getSecond();
  20. }
  21. Value &Object::operator[](ObjectKey &&K) {
  22. return try_emplace(std::move(K), nullptr).first->getSecond();
  23. }
  24. Value *Object::get(StringRef K) {
  25. auto I = find(K);
  26. if (I == end())
  27. return nullptr;
  28. return &I->second;
  29. }
  30. const Value *Object::get(StringRef K) const {
  31. auto I = find(K);
  32. if (I == end())
  33. return nullptr;
  34. return &I->second;
  35. }
  36. llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
  37. if (auto *V = get(K))
  38. return V->getAsNull();
  39. return llvm::None;
  40. }
  41. llvm::Optional<bool> Object::getBoolean(StringRef K) const {
  42. if (auto *V = get(K))
  43. return V->getAsBoolean();
  44. return llvm::None;
  45. }
  46. llvm::Optional<double> Object::getNumber(StringRef K) const {
  47. if (auto *V = get(K))
  48. return V->getAsNumber();
  49. return llvm::None;
  50. }
  51. llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
  52. if (auto *V = get(K))
  53. return V->getAsInteger();
  54. return llvm::None;
  55. }
  56. llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
  57. if (auto *V = get(K))
  58. return V->getAsString();
  59. return llvm::None;
  60. }
  61. const json::Object *Object::getObject(StringRef K) const {
  62. if (auto *V = get(K))
  63. return V->getAsObject();
  64. return nullptr;
  65. }
  66. json::Object *Object::getObject(StringRef K) {
  67. if (auto *V = get(K))
  68. return V->getAsObject();
  69. return nullptr;
  70. }
  71. const json::Array *Object::getArray(StringRef K) const {
  72. if (auto *V = get(K))
  73. return V->getAsArray();
  74. return nullptr;
  75. }
  76. json::Array *Object::getArray(StringRef K) {
  77. if (auto *V = get(K))
  78. return V->getAsArray();
  79. return nullptr;
  80. }
  81. bool operator==(const Object &LHS, const Object &RHS) {
  82. if (LHS.size() != RHS.size())
  83. return false;
  84. for (const auto &L : LHS) {
  85. auto R = RHS.find(L.first);
  86. if (R == RHS.end() || L.second != R->second)
  87. return false;
  88. }
  89. return true;
  90. }
  91. Array::Array(std::initializer_list<Value> Elements) {
  92. V.reserve(Elements.size());
  93. for (const Value &V : Elements) {
  94. emplace_back(nullptr);
  95. back().moveFrom(std::move(V));
  96. }
  97. }
  98. Value::Value(std::initializer_list<Value> Elements)
  99. : Value(json::Array(Elements)) {}
  100. void Value::copyFrom(const Value &M) {
  101. Type = M.Type;
  102. switch (Type) {
  103. case T_Null:
  104. case T_Boolean:
  105. case T_Double:
  106. case T_Integer:
  107. case T_UINT64:
  108. memcpy(&Union, &M.Union, sizeof(Union));
  109. break;
  110. case T_StringRef:
  111. create<StringRef>(M.as<StringRef>());
  112. break;
  113. case T_String:
  114. create<std::string>(M.as<std::string>());
  115. break;
  116. case T_Object:
  117. create<json::Object>(M.as<json::Object>());
  118. break;
  119. case T_Array:
  120. create<json::Array>(M.as<json::Array>());
  121. break;
  122. }
  123. }
  124. void Value::moveFrom(const Value &&M) {
  125. Type = M.Type;
  126. switch (Type) {
  127. case T_Null:
  128. case T_Boolean:
  129. case T_Double:
  130. case T_Integer:
  131. case T_UINT64:
  132. memcpy(&Union, &M.Union, sizeof(Union));
  133. break;
  134. case T_StringRef:
  135. create<StringRef>(M.as<StringRef>());
  136. break;
  137. case T_String:
  138. create<std::string>(std::move(M.as<std::string>()));
  139. M.Type = T_Null;
  140. break;
  141. case T_Object:
  142. create<json::Object>(std::move(M.as<json::Object>()));
  143. M.Type = T_Null;
  144. break;
  145. case T_Array:
  146. create<json::Array>(std::move(M.as<json::Array>()));
  147. M.Type = T_Null;
  148. break;
  149. }
  150. }
  151. void Value::destroy() {
  152. switch (Type) {
  153. case T_Null:
  154. case T_Boolean:
  155. case T_Double:
  156. case T_Integer:
  157. case T_UINT64:
  158. break;
  159. case T_StringRef:
  160. as<StringRef>().~StringRef();
  161. break;
  162. case T_String:
  163. as<std::string>().~basic_string();
  164. break;
  165. case T_Object:
  166. as<json::Object>().~Object();
  167. break;
  168. case T_Array:
  169. as<json::Array>().~Array();
  170. break;
  171. }
  172. }
  173. bool operator==(const Value &L, const Value &R) {
  174. if (L.kind() != R.kind())
  175. return false;
  176. switch (L.kind()) {
  177. case Value::Null:
  178. return *L.getAsNull() == *R.getAsNull();
  179. case Value::Boolean:
  180. return *L.getAsBoolean() == *R.getAsBoolean();
  181. case Value::Number:
  182. // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
  183. // The same integer must convert to the same double, per the standard.
  184. // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
  185. // So we avoid floating point promotion for exact comparisons.
  186. if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
  187. return L.getAsInteger() == R.getAsInteger();
  188. return *L.getAsNumber() == *R.getAsNumber();
  189. case Value::String:
  190. return *L.getAsString() == *R.getAsString();
  191. case Value::Array:
  192. return *L.getAsArray() == *R.getAsArray();
  193. case Value::Object:
  194. return *L.getAsObject() == *R.getAsObject();
  195. }
  196. llvm_unreachable("Unknown value kind");
  197. }
  198. void Path::report(llvm::StringLiteral Msg) {
  199. // Walk up to the root context, and count the number of segments.
  200. unsigned Count = 0;
  201. const Path *P;
  202. for (P = this; P->Parent != nullptr; P = P->Parent)
  203. ++Count;
  204. Path::Root *R = P->Seg.root();
  205. // Fill in the error message and copy the path (in reverse order).
  206. R->ErrorMessage = Msg;
  207. R->ErrorPath.resize(Count);
  208. auto It = R->ErrorPath.begin();
  209. for (P = this; P->Parent != nullptr; P = P->Parent)
  210. *It++ = P->Seg;
  211. }
  212. Error Path::Root::getError() const {
  213. std::string S;
  214. raw_string_ostream OS(S);
  215. OS << (ErrorMessage.empty() ? "invalid JSON contents" : ErrorMessage);
  216. if (ErrorPath.empty()) {
  217. if (!Name.empty())
  218. OS << " when parsing " << Name;
  219. } else {
  220. OS << " at " << (Name.empty() ? "(root)" : Name);
  221. for (const Path::Segment &S : llvm::reverse(ErrorPath)) {
  222. if (S.isField())
  223. OS << '.' << S.field();
  224. else
  225. OS << '[' << S.index() << ']';
  226. }
  227. }
  228. return createStringError(llvm::inconvertibleErrorCode(), OS.str());
  229. }
  230. namespace {
  231. std::vector<const Object::value_type *> sortedElements(const Object &O) {
  232. std::vector<const Object::value_type *> Elements;
  233. for (const auto &E : O)
  234. Elements.push_back(&E);
  235. llvm::sort(Elements,
  236. [](const Object::value_type *L, const Object::value_type *R) {
  237. return L->first < R->first;
  238. });
  239. return Elements;
  240. }
  241. // Prints a one-line version of a value that isn't our main focus.
  242. // We interleave writes to OS and JOS, exploiting the lack of extra buffering.
  243. // This is OK as we own the implementation.
  244. void abbreviate(const Value &V, OStream &JOS) {
  245. switch (V.kind()) {
  246. case Value::Array:
  247. JOS.rawValue(V.getAsArray()->empty() ? "[]" : "[ ... ]");
  248. break;
  249. case Value::Object:
  250. JOS.rawValue(V.getAsObject()->empty() ? "{}" : "{ ... }");
  251. break;
  252. case Value::String: {
  253. llvm::StringRef S = *V.getAsString();
  254. if (S.size() < 40) {
  255. JOS.value(V);
  256. } else {
  257. std::string Truncated = fixUTF8(S.take_front(37));
  258. Truncated.append("...");
  259. JOS.value(Truncated);
  260. }
  261. break;
  262. }
  263. default:
  264. JOS.value(V);
  265. }
  266. }
  267. // Prints a semi-expanded version of a value that is our main focus.
  268. // Array/Object entries are printed, but not recursively as they may be huge.
  269. void abbreviateChildren(const Value &V, OStream &JOS) {
  270. switch (V.kind()) {
  271. case Value::Array:
  272. JOS.array([&] {
  273. for (const auto &I : *V.getAsArray())
  274. abbreviate(I, JOS);
  275. });
  276. break;
  277. case Value::Object:
  278. JOS.object([&] {
  279. for (const auto *KV : sortedElements(*V.getAsObject())) {
  280. JOS.attributeBegin(KV->first);
  281. abbreviate(KV->second, JOS);
  282. JOS.attributeEnd();
  283. }
  284. });
  285. break;
  286. default:
  287. JOS.value(V);
  288. }
  289. }
  290. } // namespace
  291. void Path::Root::printErrorContext(const Value &R, raw_ostream &OS) const {
  292. OStream JOS(OS, /*IndentSize=*/2);
  293. // PrintValue recurses down the path, printing the ancestors of our target.
  294. // Siblings of nodes along the path are printed with abbreviate(), and the
  295. // target itself is printed with the somewhat richer abbreviateChildren().
  296. // 'Recurse' is the lambda itself, to allow recursive calls.
  297. auto PrintValue = [&](const Value &V, ArrayRef<Segment> Path, auto &Recurse) {
  298. // Print the target node itself, with the error as a comment.
  299. // Also used if we can't follow our path, e.g. it names a field that
  300. // *should* exist but doesn't.
  301. auto HighlightCurrent = [&] {
  302. std::string Comment = "error: ";
  303. Comment.append(ErrorMessage.data(), ErrorMessage.size());
  304. JOS.comment(Comment);
  305. abbreviateChildren(V, JOS);
  306. };
  307. if (Path.empty()) // We reached our target.
  308. return HighlightCurrent();
  309. const Segment &S = Path.back(); // Path is in reverse order.
  310. if (S.isField()) {
  311. // Current node is an object, path names a field.
  312. llvm::StringRef FieldName = S.field();
  313. const Object *O = V.getAsObject();
  314. if (!O || !O->get(FieldName))
  315. return HighlightCurrent();
  316. JOS.object([&] {
  317. for (const auto *KV : sortedElements(*O)) {
  318. JOS.attributeBegin(KV->first);
  319. if (FieldName.equals(KV->first))
  320. Recurse(KV->second, Path.drop_back(), Recurse);
  321. else
  322. abbreviate(KV->second, JOS);
  323. JOS.attributeEnd();
  324. }
  325. });
  326. } else {
  327. // Current node is an array, path names an element.
  328. const Array *A = V.getAsArray();
  329. if (!A || S.index() >= A->size())
  330. return HighlightCurrent();
  331. JOS.array([&] {
  332. unsigned Current = 0;
  333. for (const auto &V : *A) {
  334. if (Current++ == S.index())
  335. Recurse(V, Path.drop_back(), Recurse);
  336. else
  337. abbreviate(V, JOS);
  338. }
  339. });
  340. }
  341. };
  342. PrintValue(R, ErrorPath, PrintValue);
  343. }
  344. namespace {
  345. // Simple recursive-descent JSON parser.
  346. class Parser {
  347. public:
  348. Parser(StringRef JSON)
  349. : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
  350. bool checkUTF8() {
  351. size_t ErrOffset;
  352. if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
  353. return true;
  354. P = Start + ErrOffset; // For line/column calculation.
  355. return parseError("Invalid UTF-8 sequence");
  356. }
  357. bool parseValue(Value &Out);
  358. bool assertEnd() {
  359. eatWhitespace();
  360. if (P == End)
  361. return true;
  362. return parseError("Text after end of document");
  363. }
  364. Error takeError() {
  365. assert(Err);
  366. return std::move(*Err);
  367. }
  368. private:
  369. void eatWhitespace() {
  370. while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
  371. ++P;
  372. }
  373. // On invalid syntax, parseX() functions return false and set Err.
  374. bool parseNumber(char First, Value &Out);
  375. bool parseString(std::string &Out);
  376. bool parseUnicode(std::string &Out);
  377. bool parseError(const char *Msg); // always returns false
  378. char next() { return P == End ? 0 : *P++; }
  379. char peek() { return P == End ? 0 : *P; }
  380. static bool isNumber(char C) {
  381. return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
  382. C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
  383. C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
  384. }
  385. Optional<Error> Err;
  386. const char *Start, *P, *End;
  387. };
  388. bool Parser::parseValue(Value &Out) {
  389. eatWhitespace();
  390. if (P == End)
  391. return parseError("Unexpected EOF");
  392. switch (char C = next()) {
  393. // Bare null/true/false are easy - first char identifies them.
  394. case 'n':
  395. Out = nullptr;
  396. return (next() == 'u' && next() == 'l' && next() == 'l') ||
  397. parseError("Invalid JSON value (null?)");
  398. case 't':
  399. Out = true;
  400. return (next() == 'r' && next() == 'u' && next() == 'e') ||
  401. parseError("Invalid JSON value (true?)");
  402. case 'f':
  403. Out = false;
  404. return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
  405. parseError("Invalid JSON value (false?)");
  406. case '"': {
  407. std::string S;
  408. if (parseString(S)) {
  409. Out = std::move(S);
  410. return true;
  411. }
  412. return false;
  413. }
  414. case '[': {
  415. Out = Array{};
  416. Array &A = *Out.getAsArray();
  417. eatWhitespace();
  418. if (peek() == ']') {
  419. ++P;
  420. return true;
  421. }
  422. for (;;) {
  423. A.emplace_back(nullptr);
  424. if (!parseValue(A.back()))
  425. return false;
  426. eatWhitespace();
  427. switch (next()) {
  428. case ',':
  429. eatWhitespace();
  430. continue;
  431. case ']':
  432. return true;
  433. default:
  434. return parseError("Expected , or ] after array element");
  435. }
  436. }
  437. }
  438. case '{': {
  439. Out = Object{};
  440. Object &O = *Out.getAsObject();
  441. eatWhitespace();
  442. if (peek() == '}') {
  443. ++P;
  444. return true;
  445. }
  446. for (;;) {
  447. if (next() != '"')
  448. return parseError("Expected object key");
  449. std::string K;
  450. if (!parseString(K))
  451. return false;
  452. eatWhitespace();
  453. if (next() != ':')
  454. return parseError("Expected : after object key");
  455. eatWhitespace();
  456. if (!parseValue(O[std::move(K)]))
  457. return false;
  458. eatWhitespace();
  459. switch (next()) {
  460. case ',':
  461. eatWhitespace();
  462. continue;
  463. case '}':
  464. return true;
  465. default:
  466. return parseError("Expected , or } after object property");
  467. }
  468. }
  469. }
  470. default:
  471. if (isNumber(C))
  472. return parseNumber(C, Out);
  473. return parseError("Invalid JSON value");
  474. }
  475. }
  476. bool Parser::parseNumber(char First, Value &Out) {
  477. // Read the number into a string. (Must be null-terminated for strto*).
  478. SmallString<24> S;
  479. S.push_back(First);
  480. while (isNumber(peek()))
  481. S.push_back(next());
  482. char *End;
  483. // Try first to parse as integer, and if so preserve full 64 bits.
  484. // strtoll returns long long >= 64 bits, so check it's in range too.
  485. auto I = std::strtoll(S.c_str(), &End, 10);
  486. if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
  487. I <= std::numeric_limits<int64_t>::max()) {
  488. Out = int64_t(I);
  489. return true;
  490. }
  491. // If it's not an integer
  492. Out = std::strtod(S.c_str(), &End);
  493. return End == S.end() || parseError("Invalid JSON value (number?)");
  494. }
  495. bool Parser::parseString(std::string &Out) {
  496. // leading quote was already consumed.
  497. for (char C = next(); C != '"'; C = next()) {
  498. if (LLVM_UNLIKELY(P == End))
  499. return parseError("Unterminated string");
  500. if (LLVM_UNLIKELY((C & 0x1f) == C))
  501. return parseError("Control character in string");
  502. if (LLVM_LIKELY(C != '\\')) {
  503. Out.push_back(C);
  504. continue;
  505. }
  506. // Handle escape sequence.
  507. switch (C = next()) {
  508. case '"':
  509. case '\\':
  510. case '/':
  511. Out.push_back(C);
  512. break;
  513. case 'b':
  514. Out.push_back('\b');
  515. break;
  516. case 'f':
  517. Out.push_back('\f');
  518. break;
  519. case 'n':
  520. Out.push_back('\n');
  521. break;
  522. case 'r':
  523. Out.push_back('\r');
  524. break;
  525. case 't':
  526. Out.push_back('\t');
  527. break;
  528. case 'u':
  529. if (!parseUnicode(Out))
  530. return false;
  531. break;
  532. default:
  533. return parseError("Invalid escape sequence");
  534. }
  535. }
  536. return true;
  537. }
  538. static void encodeUtf8(uint32_t Rune, std::string &Out) {
  539. if (Rune < 0x80) {
  540. Out.push_back(Rune & 0x7F);
  541. } else if (Rune < 0x800) {
  542. uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
  543. uint8_t SecondByte = 0x80 | (Rune & 0x3F);
  544. Out.push_back(FirstByte);
  545. Out.push_back(SecondByte);
  546. } else if (Rune < 0x10000) {
  547. uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
  548. uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
  549. uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
  550. Out.push_back(FirstByte);
  551. Out.push_back(SecondByte);
  552. Out.push_back(ThirdByte);
  553. } else if (Rune < 0x110000) {
  554. uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
  555. uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
  556. uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
  557. uint8_t FourthByte = 0x80 | (Rune & 0x3F);
  558. Out.push_back(FirstByte);
  559. Out.push_back(SecondByte);
  560. Out.push_back(ThirdByte);
  561. Out.push_back(FourthByte);
  562. } else {
  563. llvm_unreachable("Invalid codepoint");
  564. }
  565. }
  566. // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
  567. // May parse several sequential escapes to ensure proper surrogate handling.
  568. // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
  569. // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
  570. bool Parser::parseUnicode(std::string &Out) {
  571. // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
  572. auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
  573. // Decodes 4 hex digits from the stream into Out, returns false on error.
  574. auto Parse4Hex = [this](uint16_t &Out) -> bool {
  575. Out = 0;
  576. char Bytes[] = {next(), next(), next(), next()};
  577. for (unsigned char C : Bytes) {
  578. if (!std::isxdigit(C))
  579. return parseError("Invalid \\u escape sequence");
  580. Out <<= 4;
  581. Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
  582. }
  583. return true;
  584. };
  585. uint16_t First; // UTF-16 code unit from the first \u escape.
  586. if (!Parse4Hex(First))
  587. return false;
  588. // We loop to allow proper surrogate-pair error handling.
  589. while (true) {
  590. // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
  591. if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
  592. encodeUtf8(First, Out);
  593. return true;
  594. }
  595. // Case 2: it's an (unpaired) trailing surrogate.
  596. if (LLVM_UNLIKELY(First >= 0xDC00)) {
  597. Invalid();
  598. return true;
  599. }
  600. // Case 3: it's a leading surrogate. We expect a trailing one next.
  601. // Case 3a: there's no trailing \u escape. Don't advance in the stream.
  602. if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
  603. Invalid(); // Leading surrogate was unpaired.
  604. return true;
  605. }
  606. P += 2;
  607. uint16_t Second;
  608. if (!Parse4Hex(Second))
  609. return false;
  610. // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
  611. if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
  612. Invalid(); // Leading surrogate was unpaired.
  613. First = Second; // Second escape still needs to be processed.
  614. continue;
  615. }
  616. // Case 3c: a valid surrogate pair encoding an astral codepoint.
  617. encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
  618. return true;
  619. }
  620. }
  621. bool Parser::parseError(const char *Msg) {
  622. int Line = 1;
  623. const char *StartOfLine = Start;
  624. for (const char *X = Start; X < P; ++X) {
  625. if (*X == 0x0A) {
  626. ++Line;
  627. StartOfLine = X + 1;
  628. }
  629. }
  630. Err.emplace(
  631. std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
  632. return false;
  633. }
  634. } // namespace
  635. Expected<Value> parse(StringRef JSON) {
  636. Parser P(JSON);
  637. Value E = nullptr;
  638. if (P.checkUTF8())
  639. if (P.parseValue(E))
  640. if (P.assertEnd())
  641. return std::move(E);
  642. return P.takeError();
  643. }
  644. char ParseError::ID = 0;
  645. bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
  646. // Fast-path for ASCII, which is valid UTF-8.
  647. if (LLVM_LIKELY(isASCII(S)))
  648. return true;
  649. const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
  650. if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
  651. return true;
  652. if (ErrOffset)
  653. *ErrOffset = Rest - Data;
  654. return false;
  655. }
  656. std::string fixUTF8(llvm::StringRef S) {
  657. // This isn't particularly efficient, but is only for error-recovery.
  658. std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
  659. const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
  660. UTF32 *Out32 = Codepoints.data();
  661. ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
  662. lenientConversion);
  663. Codepoints.resize(Out32 - Codepoints.data());
  664. std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
  665. const UTF32 *In32 = Codepoints.data();
  666. UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
  667. ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
  668. strictConversion);
  669. Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
  670. return Res;
  671. }
  672. static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
  673. OS << '\"';
  674. for (unsigned char C : S) {
  675. if (C == 0x22 || C == 0x5C)
  676. OS << '\\';
  677. if (C >= 0x20) {
  678. OS << C;
  679. continue;
  680. }
  681. OS << '\\';
  682. switch (C) {
  683. // A few characters are common enough to make short escapes worthwhile.
  684. case '\t':
  685. OS << 't';
  686. break;
  687. case '\n':
  688. OS << 'n';
  689. break;
  690. case '\r':
  691. OS << 'r';
  692. break;
  693. default:
  694. OS << 'u';
  695. llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
  696. break;
  697. }
  698. }
  699. OS << '\"';
  700. }
  701. void llvm::json::OStream::value(const Value &V) {
  702. switch (V.kind()) {
  703. case Value::Null:
  704. valueBegin();
  705. OS << "null";
  706. return;
  707. case Value::Boolean:
  708. valueBegin();
  709. OS << (*V.getAsBoolean() ? "true" : "false");
  710. return;
  711. case Value::Number:
  712. valueBegin();
  713. if (V.Type == Value::T_Integer)
  714. OS << *V.getAsInteger();
  715. else if (V.Type == Value::T_UINT64)
  716. OS << *V.getAsUINT64();
  717. else
  718. OS << format("%.*g", std::numeric_limits<double>::max_digits10,
  719. *V.getAsNumber());
  720. return;
  721. case Value::String:
  722. valueBegin();
  723. quote(OS, *V.getAsString());
  724. return;
  725. case Value::Array:
  726. return array([&] {
  727. for (const Value &E : *V.getAsArray())
  728. value(E);
  729. });
  730. case Value::Object:
  731. return object([&] {
  732. for (const Object::value_type *E : sortedElements(*V.getAsObject()))
  733. attribute(E->first, E->second);
  734. });
  735. }
  736. }
  737. void llvm::json::OStream::valueBegin() {
  738. assert(Stack.back().Ctx != Object && "Only attributes allowed here");
  739. if (Stack.back().HasValue) {
  740. assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
  741. OS << ',';
  742. }
  743. if (Stack.back().Ctx == Array)
  744. newline();
  745. flushComment();
  746. Stack.back().HasValue = true;
  747. }
  748. void OStream::comment(llvm::StringRef Comment) {
  749. assert(PendingComment.empty() && "Only one comment per value!");
  750. PendingComment = Comment;
  751. }
  752. void OStream::flushComment() {
  753. if (PendingComment.empty())
  754. return;
  755. OS << (IndentSize ? "/* " : "/*");
  756. // Be sure not to accidentally emit "*/". Transform to "* /".
  757. while (!PendingComment.empty()) {
  758. auto Pos = PendingComment.find("*/");
  759. if (Pos == StringRef::npos) {
  760. OS << PendingComment;
  761. PendingComment = "";
  762. } else {
  763. OS << PendingComment.take_front(Pos) << "* /";
  764. PendingComment = PendingComment.drop_front(Pos + 2);
  765. }
  766. }
  767. OS << (IndentSize ? " */" : "*/");
  768. // Comments are on their own line unless attached to an attribute value.
  769. if (Stack.size() > 1 && Stack.back().Ctx == Singleton) {
  770. if (IndentSize)
  771. OS << ' ';
  772. } else {
  773. newline();
  774. }
  775. }
  776. void llvm::json::OStream::newline() {
  777. if (IndentSize) {
  778. OS.write('\n');
  779. OS.indent(Indent);
  780. }
  781. }
  782. void llvm::json::OStream::arrayBegin() {
  783. valueBegin();
  784. Stack.emplace_back();
  785. Stack.back().Ctx = Array;
  786. Indent += IndentSize;
  787. OS << '[';
  788. }
  789. void llvm::json::OStream::arrayEnd() {
  790. assert(Stack.back().Ctx == Array);
  791. Indent -= IndentSize;
  792. if (Stack.back().HasValue)
  793. newline();
  794. OS << ']';
  795. assert(PendingComment.empty());
  796. Stack.pop_back();
  797. assert(!Stack.empty());
  798. }
  799. void llvm::json::OStream::objectBegin() {
  800. valueBegin();
  801. Stack.emplace_back();
  802. Stack.back().Ctx = Object;
  803. Indent += IndentSize;
  804. OS << '{';
  805. }
  806. void llvm::json::OStream::objectEnd() {
  807. assert(Stack.back().Ctx == Object);
  808. Indent -= IndentSize;
  809. if (Stack.back().HasValue)
  810. newline();
  811. OS << '}';
  812. assert(PendingComment.empty());
  813. Stack.pop_back();
  814. assert(!Stack.empty());
  815. }
  816. void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
  817. assert(Stack.back().Ctx == Object);
  818. if (Stack.back().HasValue)
  819. OS << ',';
  820. newline();
  821. flushComment();
  822. Stack.back().HasValue = true;
  823. Stack.emplace_back();
  824. Stack.back().Ctx = Singleton;
  825. if (LLVM_LIKELY(isUTF8(Key))) {
  826. quote(OS, Key);
  827. } else {
  828. assert(false && "Invalid UTF-8 in attribute key");
  829. quote(OS, fixUTF8(Key));
  830. }
  831. OS.write(':');
  832. if (IndentSize)
  833. OS.write(' ');
  834. }
  835. void llvm::json::OStream::attributeEnd() {
  836. assert(Stack.back().Ctx == Singleton);
  837. assert(Stack.back().HasValue && "Attribute must have a value");
  838. assert(PendingComment.empty());
  839. Stack.pop_back();
  840. assert(Stack.back().Ctx == Object);
  841. }
  842. raw_ostream &llvm::json::OStream::rawValueBegin() {
  843. valueBegin();
  844. Stack.emplace_back();
  845. Stack.back().Ctx = RawValue;
  846. return OS;
  847. }
  848. void llvm::json::OStream::rawValueEnd() {
  849. assert(Stack.back().Ctx == RawValue);
  850. Stack.pop_back();
  851. }
  852. } // namespace json
  853. } // namespace llvm
  854. void llvm::format_provider<llvm::json::Value>::format(
  855. const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
  856. unsigned IndentAmount = 0;
  857. if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
  858. llvm_unreachable("json::Value format options should be an integer");
  859. json::OStream(OS, IndentAmount).value(E);
  860. }