JSON.cpp 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930
  1. //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===---------------------------------------------------------------------===//
  8. #include "llvm/Support/JSON.h"
  9. #include "llvm/ADT/STLExtras.h"
  10. #include "llvm/Support/ConvertUTF.h"
  11. #include "llvm/Support/Error.h"
  12. #include "llvm/Support/Format.h"
  13. #include "llvm/Support/raw_ostream.h"
  14. #include "llvm/Support/NativeFormatting.h"
  15. #include <cctype>
  16. #include <optional>
  17. namespace llvm {
  18. namespace json {
  19. Value &Object::operator[](const ObjectKey &K) {
  20. return try_emplace(K, nullptr).first->getSecond();
  21. }
  22. Value &Object::operator[](ObjectKey &&K) {
  23. return try_emplace(std::move(K), nullptr).first->getSecond();
  24. }
  25. Value *Object::get(StringRef K) {
  26. auto I = find(K);
  27. if (I == end())
  28. return nullptr;
  29. return &I->second;
  30. }
  31. const Value *Object::get(StringRef K) const {
  32. auto I = find(K);
  33. if (I == end())
  34. return nullptr;
  35. return &I->second;
  36. }
  37. std::optional<std::nullptr_t> Object::getNull(StringRef K) const {
  38. if (auto *V = get(K))
  39. return V->getAsNull();
  40. return std::nullopt;
  41. }
  42. std::optional<bool> Object::getBoolean(StringRef K) const {
  43. if (auto *V = get(K))
  44. return V->getAsBoolean();
  45. return std::nullopt;
  46. }
  47. std::optional<double> Object::getNumber(StringRef K) const {
  48. if (auto *V = get(K))
  49. return V->getAsNumber();
  50. return std::nullopt;
  51. }
  52. std::optional<int64_t> Object::getInteger(StringRef K) const {
  53. if (auto *V = get(K))
  54. return V->getAsInteger();
  55. return std::nullopt;
  56. }
  57. std::optional<llvm::StringRef> Object::getString(StringRef K) const {
  58. if (auto *V = get(K))
  59. return V->getAsString();
  60. return std::nullopt;
  61. }
  62. const json::Object *Object::getObject(StringRef K) const {
  63. if (auto *V = get(K))
  64. return V->getAsObject();
  65. return nullptr;
  66. }
  67. json::Object *Object::getObject(StringRef K) {
  68. if (auto *V = get(K))
  69. return V->getAsObject();
  70. return nullptr;
  71. }
  72. const json::Array *Object::getArray(StringRef K) const {
  73. if (auto *V = get(K))
  74. return V->getAsArray();
  75. return nullptr;
  76. }
  77. json::Array *Object::getArray(StringRef K) {
  78. if (auto *V = get(K))
  79. return V->getAsArray();
  80. return nullptr;
  81. }
  82. bool operator==(const Object &LHS, const Object &RHS) {
  83. if (LHS.size() != RHS.size())
  84. return false;
  85. for (const auto &L : LHS) {
  86. auto R = RHS.find(L.first);
  87. if (R == RHS.end() || L.second != R->second)
  88. return false;
  89. }
  90. return true;
  91. }
  92. Array::Array(std::initializer_list<Value> Elements) {
  93. V.reserve(Elements.size());
  94. for (const Value &V : Elements) {
  95. emplace_back(nullptr);
  96. back().moveFrom(std::move(V));
  97. }
  98. }
  99. Value::Value(std::initializer_list<Value> Elements)
  100. : Value(json::Array(Elements)) {}
  101. void Value::copyFrom(const Value &M) {
  102. Type = M.Type;
  103. switch (Type) {
  104. case T_Null:
  105. case T_Boolean:
  106. case T_Double:
  107. case T_Integer:
  108. case T_UINT64:
  109. memcpy(&Union, &M.Union, sizeof(Union));
  110. break;
  111. case T_StringRef:
  112. create<StringRef>(M.as<StringRef>());
  113. break;
  114. case T_String:
  115. create<std::string>(M.as<std::string>());
  116. break;
  117. case T_Object:
  118. create<json::Object>(M.as<json::Object>());
  119. break;
  120. case T_Array:
  121. create<json::Array>(M.as<json::Array>());
  122. break;
  123. }
  124. }
  125. void Value::moveFrom(const Value &&M) {
  126. Type = M.Type;
  127. switch (Type) {
  128. case T_Null:
  129. case T_Boolean:
  130. case T_Double:
  131. case T_Integer:
  132. case T_UINT64:
  133. memcpy(&Union, &M.Union, sizeof(Union));
  134. break;
  135. case T_StringRef:
  136. create<StringRef>(M.as<StringRef>());
  137. break;
  138. case T_String:
  139. create<std::string>(std::move(M.as<std::string>()));
  140. M.Type = T_Null;
  141. break;
  142. case T_Object:
  143. create<json::Object>(std::move(M.as<json::Object>()));
  144. M.Type = T_Null;
  145. break;
  146. case T_Array:
  147. create<json::Array>(std::move(M.as<json::Array>()));
  148. M.Type = T_Null;
  149. break;
  150. }
  151. }
  152. void Value::destroy() {
  153. switch (Type) {
  154. case T_Null:
  155. case T_Boolean:
  156. case T_Double:
  157. case T_Integer:
  158. case T_UINT64:
  159. break;
  160. case T_StringRef:
  161. as<StringRef>().~StringRef();
  162. break;
  163. case T_String:
  164. as<std::string>().~basic_string();
  165. break;
  166. case T_Object:
  167. as<json::Object>().~Object();
  168. break;
  169. case T_Array:
  170. as<json::Array>().~Array();
  171. break;
  172. }
  173. }
  174. bool operator==(const Value &L, const Value &R) {
  175. if (L.kind() != R.kind())
  176. return false;
  177. switch (L.kind()) {
  178. case Value::Null:
  179. return *L.getAsNull() == *R.getAsNull();
  180. case Value::Boolean:
  181. return *L.getAsBoolean() == *R.getAsBoolean();
  182. case Value::Number:
  183. // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
  184. // The same integer must convert to the same double, per the standard.
  185. // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
  186. // So we avoid floating point promotion for exact comparisons.
  187. if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
  188. return L.getAsInteger() == R.getAsInteger();
  189. return *L.getAsNumber() == *R.getAsNumber();
  190. case Value::String:
  191. return *L.getAsString() == *R.getAsString();
  192. case Value::Array:
  193. return *L.getAsArray() == *R.getAsArray();
  194. case Value::Object:
  195. return *L.getAsObject() == *R.getAsObject();
  196. }
  197. llvm_unreachable("Unknown value kind");
  198. }
  199. void Path::report(llvm::StringLiteral Msg) {
  200. // Walk up to the root context, and count the number of segments.
  201. unsigned Count = 0;
  202. const Path *P;
  203. for (P = this; P->Parent != nullptr; P = P->Parent)
  204. ++Count;
  205. Path::Root *R = P->Seg.root();
  206. // Fill in the error message and copy the path (in reverse order).
  207. R->ErrorMessage = Msg;
  208. R->ErrorPath.resize(Count);
  209. auto It = R->ErrorPath.begin();
  210. for (P = this; P->Parent != nullptr; P = P->Parent)
  211. *It++ = P->Seg;
  212. }
  213. Error Path::Root::getError() const {
  214. std::string S;
  215. raw_string_ostream OS(S);
  216. OS << (ErrorMessage.empty() ? "invalid JSON contents" : ErrorMessage);
  217. if (ErrorPath.empty()) {
  218. if (!Name.empty())
  219. OS << " when parsing " << Name;
  220. } else {
  221. OS << " at " << (Name.empty() ? "(root)" : Name);
  222. for (const Path::Segment &S : llvm::reverse(ErrorPath)) {
  223. if (S.isField())
  224. OS << '.' << S.field();
  225. else
  226. OS << '[' << S.index() << ']';
  227. }
  228. }
  229. return createStringError(llvm::inconvertibleErrorCode(), OS.str());
  230. }
  231. namespace {
  232. std::vector<const Object::value_type *> sortedElements(const Object &O) {
  233. std::vector<const Object::value_type *> Elements;
  234. for (const auto &E : O)
  235. Elements.push_back(&E);
  236. llvm::sort(Elements,
  237. [](const Object::value_type *L, const Object::value_type *R) {
  238. return L->first < R->first;
  239. });
  240. return Elements;
  241. }
  242. // Prints a one-line version of a value that isn't our main focus.
  243. // We interleave writes to OS and JOS, exploiting the lack of extra buffering.
  244. // This is OK as we own the implementation.
  245. void abbreviate(const Value &V, OStream &JOS) {
  246. switch (V.kind()) {
  247. case Value::Array:
  248. JOS.rawValue(V.getAsArray()->empty() ? "[]" : "[ ... ]");
  249. break;
  250. case Value::Object:
  251. JOS.rawValue(V.getAsObject()->empty() ? "{}" : "{ ... }");
  252. break;
  253. case Value::String: {
  254. llvm::StringRef S = *V.getAsString();
  255. if (S.size() < 40) {
  256. JOS.value(V);
  257. } else {
  258. std::string Truncated = fixUTF8(S.take_front(37));
  259. Truncated.append("...");
  260. JOS.value(Truncated);
  261. }
  262. break;
  263. }
  264. default:
  265. JOS.value(V);
  266. }
  267. }
  268. // Prints a semi-expanded version of a value that is our main focus.
  269. // Array/Object entries are printed, but not recursively as they may be huge.
  270. void abbreviateChildren(const Value &V, OStream &JOS) {
  271. switch (V.kind()) {
  272. case Value::Array:
  273. JOS.array([&] {
  274. for (const auto &I : *V.getAsArray())
  275. abbreviate(I, JOS);
  276. });
  277. break;
  278. case Value::Object:
  279. JOS.object([&] {
  280. for (const auto *KV : sortedElements(*V.getAsObject())) {
  281. JOS.attributeBegin(KV->first);
  282. abbreviate(KV->second, JOS);
  283. JOS.attributeEnd();
  284. }
  285. });
  286. break;
  287. default:
  288. JOS.value(V);
  289. }
  290. }
  291. } // namespace
  292. void Path::Root::printErrorContext(const Value &R, raw_ostream &OS) const {
  293. OStream JOS(OS, /*IndentSize=*/2);
  294. // PrintValue recurses down the path, printing the ancestors of our target.
  295. // Siblings of nodes along the path are printed with abbreviate(), and the
  296. // target itself is printed with the somewhat richer abbreviateChildren().
  297. // 'Recurse' is the lambda itself, to allow recursive calls.
  298. auto PrintValue = [&](const Value &V, ArrayRef<Segment> Path, auto &Recurse) {
  299. // Print the target node itself, with the error as a comment.
  300. // Also used if we can't follow our path, e.g. it names a field that
  301. // *should* exist but doesn't.
  302. auto HighlightCurrent = [&] {
  303. std::string Comment = "error: ";
  304. Comment.append(ErrorMessage.data(), ErrorMessage.size());
  305. JOS.comment(Comment);
  306. abbreviateChildren(V, JOS);
  307. };
  308. if (Path.empty()) // We reached our target.
  309. return HighlightCurrent();
  310. const Segment &S = Path.back(); // Path is in reverse order.
  311. if (S.isField()) {
  312. // Current node is an object, path names a field.
  313. llvm::StringRef FieldName = S.field();
  314. const Object *O = V.getAsObject();
  315. if (!O || !O->get(FieldName))
  316. return HighlightCurrent();
  317. JOS.object([&] {
  318. for (const auto *KV : sortedElements(*O)) {
  319. JOS.attributeBegin(KV->first);
  320. if (FieldName.equals(KV->first))
  321. Recurse(KV->second, Path.drop_back(), Recurse);
  322. else
  323. abbreviate(KV->second, JOS);
  324. JOS.attributeEnd();
  325. }
  326. });
  327. } else {
  328. // Current node is an array, path names an element.
  329. const Array *A = V.getAsArray();
  330. if (!A || S.index() >= A->size())
  331. return HighlightCurrent();
  332. JOS.array([&] {
  333. unsigned Current = 0;
  334. for (const auto &V : *A) {
  335. if (Current++ == S.index())
  336. Recurse(V, Path.drop_back(), Recurse);
  337. else
  338. abbreviate(V, JOS);
  339. }
  340. });
  341. }
  342. };
  343. PrintValue(R, ErrorPath, PrintValue);
  344. }
  345. namespace {
  346. // Simple recursive-descent JSON parser.
  347. class Parser {
  348. public:
  349. Parser(StringRef JSON)
  350. : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
  351. bool checkUTF8() {
  352. size_t ErrOffset;
  353. if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
  354. return true;
  355. P = Start + ErrOffset; // For line/column calculation.
  356. return parseError("Invalid UTF-8 sequence");
  357. }
  358. bool parseValue(Value &Out);
  359. bool assertEnd() {
  360. eatWhitespace();
  361. if (P == End)
  362. return true;
  363. return parseError("Text after end of document");
  364. }
  365. Error takeError() {
  366. assert(Err);
  367. return std::move(*Err);
  368. }
  369. private:
  370. void eatWhitespace() {
  371. while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
  372. ++P;
  373. }
  374. // On invalid syntax, parseX() functions return false and set Err.
  375. bool parseNumber(char First, Value &Out);
  376. bool parseString(std::string &Out);
  377. bool parseUnicode(std::string &Out);
  378. bool parseError(const char *Msg); // always returns false
  379. char next() { return P == End ? 0 : *P++; }
  380. char peek() { return P == End ? 0 : *P; }
  381. static bool isNumber(char C) {
  382. return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
  383. C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
  384. C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
  385. }
  386. std::optional<Error> Err;
  387. const char *Start, *P, *End;
  388. };
  389. bool Parser::parseValue(Value &Out) {
  390. eatWhitespace();
  391. if (P == End)
  392. return parseError("Unexpected EOF");
  393. switch (char C = next()) {
  394. // Bare null/true/false are easy - first char identifies them.
  395. case 'n':
  396. Out = nullptr;
  397. return (next() == 'u' && next() == 'l' && next() == 'l') ||
  398. parseError("Invalid JSON value (null?)");
  399. case 't':
  400. Out = true;
  401. return (next() == 'r' && next() == 'u' && next() == 'e') ||
  402. parseError("Invalid JSON value (true?)");
  403. case 'f':
  404. Out = false;
  405. return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
  406. parseError("Invalid JSON value (false?)");
  407. case '"': {
  408. std::string S;
  409. if (parseString(S)) {
  410. Out = std::move(S);
  411. return true;
  412. }
  413. return false;
  414. }
  415. case '[': {
  416. Out = Array{};
  417. Array &A = *Out.getAsArray();
  418. eatWhitespace();
  419. if (peek() == ']') {
  420. ++P;
  421. return true;
  422. }
  423. for (;;) {
  424. A.emplace_back(nullptr);
  425. if (!parseValue(A.back()))
  426. return false;
  427. eatWhitespace();
  428. switch (next()) {
  429. case ',':
  430. eatWhitespace();
  431. continue;
  432. case ']':
  433. return true;
  434. default:
  435. return parseError("Expected , or ] after array element");
  436. }
  437. }
  438. }
  439. case '{': {
  440. Out = Object{};
  441. Object &O = *Out.getAsObject();
  442. eatWhitespace();
  443. if (peek() == '}') {
  444. ++P;
  445. return true;
  446. }
  447. for (;;) {
  448. if (next() != '"')
  449. return parseError("Expected object key");
  450. std::string K;
  451. if (!parseString(K))
  452. return false;
  453. eatWhitespace();
  454. if (next() != ':')
  455. return parseError("Expected : after object key");
  456. eatWhitespace();
  457. if (!parseValue(O[std::move(K)]))
  458. return false;
  459. eatWhitespace();
  460. switch (next()) {
  461. case ',':
  462. eatWhitespace();
  463. continue;
  464. case '}':
  465. return true;
  466. default:
  467. return parseError("Expected , or } after object property");
  468. }
  469. }
  470. }
  471. default:
  472. if (isNumber(C))
  473. return parseNumber(C, Out);
  474. return parseError("Invalid JSON value");
  475. }
  476. }
  477. bool Parser::parseNumber(char First, Value &Out) {
  478. // Read the number into a string. (Must be null-terminated for strto*).
  479. SmallString<24> S;
  480. S.push_back(First);
  481. while (isNumber(peek()))
  482. S.push_back(next());
  483. char *End;
  484. // Try first to parse as integer, and if so preserve full 64 bits.
  485. // We check for errno for out of bounds errors and for End == S.end()
  486. // to make sure that the numeric string is not malformed.
  487. errno = 0;
  488. int64_t I = std::strtoll(S.c_str(), &End, 10);
  489. if (End == S.end() && errno != ERANGE) {
  490. Out = int64_t(I);
  491. return true;
  492. }
  493. // strtroull has a special handling for negative numbers, but in this
  494. // case we don't want to do that because negative numbers were already
  495. // handled in the previous block.
  496. if (First != '-') {
  497. errno = 0;
  498. uint64_t UI = std::strtoull(S.c_str(), &End, 10);
  499. if (End == S.end() && errno != ERANGE) {
  500. Out = UI;
  501. return true;
  502. }
  503. }
  504. // If it's not an integer
  505. Out = std::strtod(S.c_str(), &End);
  506. return End == S.end() || parseError("Invalid JSON value (number?)");
  507. }
  508. bool Parser::parseString(std::string &Out) {
  509. // leading quote was already consumed.
  510. for (char C = next(); C != '"'; C = next()) {
  511. if (LLVM_UNLIKELY(P == End))
  512. return parseError("Unterminated string");
  513. if (LLVM_UNLIKELY((C & 0x1f) == C))
  514. return parseError("Control character in string");
  515. if (LLVM_LIKELY(C != '\\')) {
  516. Out.push_back(C);
  517. continue;
  518. }
  519. // Handle escape sequence.
  520. switch (C = next()) {
  521. case '"':
  522. case '\\':
  523. case '/':
  524. Out.push_back(C);
  525. break;
  526. case 'b':
  527. Out.push_back('\b');
  528. break;
  529. case 'f':
  530. Out.push_back('\f');
  531. break;
  532. case 'n':
  533. Out.push_back('\n');
  534. break;
  535. case 'r':
  536. Out.push_back('\r');
  537. break;
  538. case 't':
  539. Out.push_back('\t');
  540. break;
  541. case 'u':
  542. if (!parseUnicode(Out))
  543. return false;
  544. break;
  545. default:
  546. return parseError("Invalid escape sequence");
  547. }
  548. }
  549. return true;
  550. }
  551. static void encodeUtf8(uint32_t Rune, std::string &Out) {
  552. if (Rune < 0x80) {
  553. Out.push_back(Rune & 0x7F);
  554. } else if (Rune < 0x800) {
  555. uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
  556. uint8_t SecondByte = 0x80 | (Rune & 0x3F);
  557. Out.push_back(FirstByte);
  558. Out.push_back(SecondByte);
  559. } else if (Rune < 0x10000) {
  560. uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
  561. uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
  562. uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
  563. Out.push_back(FirstByte);
  564. Out.push_back(SecondByte);
  565. Out.push_back(ThirdByte);
  566. } else if (Rune < 0x110000) {
  567. uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
  568. uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
  569. uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
  570. uint8_t FourthByte = 0x80 | (Rune & 0x3F);
  571. Out.push_back(FirstByte);
  572. Out.push_back(SecondByte);
  573. Out.push_back(ThirdByte);
  574. Out.push_back(FourthByte);
  575. } else {
  576. llvm_unreachable("Invalid codepoint");
  577. }
  578. }
  579. // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
  580. // May parse several sequential escapes to ensure proper surrogate handling.
  581. // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
  582. // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
  583. bool Parser::parseUnicode(std::string &Out) {
  584. // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
  585. auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
  586. // Decodes 4 hex digits from the stream into Out, returns false on error.
  587. auto Parse4Hex = [this](uint16_t &Out) -> bool {
  588. Out = 0;
  589. char Bytes[] = {next(), next(), next(), next()};
  590. for (unsigned char C : Bytes) {
  591. if (!std::isxdigit(C))
  592. return parseError("Invalid \\u escape sequence");
  593. Out <<= 4;
  594. Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
  595. }
  596. return true;
  597. };
  598. uint16_t First; // UTF-16 code unit from the first \u escape.
  599. if (!Parse4Hex(First))
  600. return false;
  601. // We loop to allow proper surrogate-pair error handling.
  602. while (true) {
  603. // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
  604. if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
  605. encodeUtf8(First, Out);
  606. return true;
  607. }
  608. // Case 2: it's an (unpaired) trailing surrogate.
  609. if (LLVM_UNLIKELY(First >= 0xDC00)) {
  610. Invalid();
  611. return true;
  612. }
  613. // Case 3: it's a leading surrogate. We expect a trailing one next.
  614. // Case 3a: there's no trailing \u escape. Don't advance in the stream.
  615. if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
  616. Invalid(); // Leading surrogate was unpaired.
  617. return true;
  618. }
  619. P += 2;
  620. uint16_t Second;
  621. if (!Parse4Hex(Second))
  622. return false;
  623. // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
  624. if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
  625. Invalid(); // Leading surrogate was unpaired.
  626. First = Second; // Second escape still needs to be processed.
  627. continue;
  628. }
  629. // Case 3c: a valid surrogate pair encoding an astral codepoint.
  630. encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
  631. return true;
  632. }
  633. }
  634. bool Parser::parseError(const char *Msg) {
  635. int Line = 1;
  636. const char *StartOfLine = Start;
  637. for (const char *X = Start; X < P; ++X) {
  638. if (*X == 0x0A) {
  639. ++Line;
  640. StartOfLine = X + 1;
  641. }
  642. }
  643. Err.emplace(
  644. std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
  645. return false;
  646. }
  647. } // namespace
  648. Expected<Value> parse(StringRef JSON) {
  649. Parser P(JSON);
  650. Value E = nullptr;
  651. if (P.checkUTF8())
  652. if (P.parseValue(E))
  653. if (P.assertEnd())
  654. return std::move(E);
  655. return P.takeError();
  656. }
  657. char ParseError::ID = 0;
  658. bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
  659. // Fast-path for ASCII, which is valid UTF-8.
  660. if (LLVM_LIKELY(isASCII(S)))
  661. return true;
  662. const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
  663. if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
  664. return true;
  665. if (ErrOffset)
  666. *ErrOffset = Rest - Data;
  667. return false;
  668. }
  669. std::string fixUTF8(llvm::StringRef S) {
  670. // This isn't particularly efficient, but is only for error-recovery.
  671. std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
  672. const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
  673. UTF32 *Out32 = Codepoints.data();
  674. ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
  675. lenientConversion);
  676. Codepoints.resize(Out32 - Codepoints.data());
  677. std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
  678. const UTF32 *In32 = Codepoints.data();
  679. UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
  680. ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
  681. strictConversion);
  682. Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
  683. return Res;
  684. }
  685. static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
  686. OS << '\"';
  687. for (unsigned char C : S) {
  688. if (C == 0x22 || C == 0x5C)
  689. OS << '\\';
  690. if (C >= 0x20) {
  691. OS << C;
  692. continue;
  693. }
  694. OS << '\\';
  695. switch (C) {
  696. // A few characters are common enough to make short escapes worthwhile.
  697. case '\t':
  698. OS << 't';
  699. break;
  700. case '\n':
  701. OS << 'n';
  702. break;
  703. case '\r':
  704. OS << 'r';
  705. break;
  706. default:
  707. OS << 'u';
  708. llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
  709. break;
  710. }
  711. }
  712. OS << '\"';
  713. }
  714. void llvm::json::OStream::value(const Value &V) {
  715. switch (V.kind()) {
  716. case Value::Null:
  717. valueBegin();
  718. OS << "null";
  719. return;
  720. case Value::Boolean:
  721. valueBegin();
  722. OS << (*V.getAsBoolean() ? "true" : "false");
  723. return;
  724. case Value::Number:
  725. valueBegin();
  726. if (V.Type == Value::T_Integer)
  727. OS << *V.getAsInteger();
  728. else if (V.Type == Value::T_UINT64)
  729. OS << *V.getAsUINT64();
  730. else
  731. OS << format("%.*g", std::numeric_limits<double>::max_digits10,
  732. *V.getAsNumber());
  733. return;
  734. case Value::String:
  735. valueBegin();
  736. quote(OS, *V.getAsString());
  737. return;
  738. case Value::Array:
  739. return array([&] {
  740. for (const Value &E : *V.getAsArray())
  741. value(E);
  742. });
  743. case Value::Object:
  744. return object([&] {
  745. for (const Object::value_type *E : sortedElements(*V.getAsObject()))
  746. attribute(E->first, E->second);
  747. });
  748. }
  749. }
  750. void llvm::json::OStream::valueBegin() {
  751. assert(Stack.back().Ctx != Object && "Only attributes allowed here");
  752. if (Stack.back().HasValue) {
  753. assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
  754. OS << ',';
  755. }
  756. if (Stack.back().Ctx == Array)
  757. newline();
  758. flushComment();
  759. Stack.back().HasValue = true;
  760. }
  761. void OStream::comment(llvm::StringRef Comment) {
  762. assert(PendingComment.empty() && "Only one comment per value!");
  763. PendingComment = Comment;
  764. }
  765. void OStream::flushComment() {
  766. if (PendingComment.empty())
  767. return;
  768. OS << (IndentSize ? "/* " : "/*");
  769. // Be sure not to accidentally emit "*/". Transform to "* /".
  770. while (!PendingComment.empty()) {
  771. auto Pos = PendingComment.find("*/");
  772. if (Pos == StringRef::npos) {
  773. OS << PendingComment;
  774. PendingComment = "";
  775. } else {
  776. OS << PendingComment.take_front(Pos) << "* /";
  777. PendingComment = PendingComment.drop_front(Pos + 2);
  778. }
  779. }
  780. OS << (IndentSize ? " */" : "*/");
  781. // Comments are on their own line unless attached to an attribute value.
  782. if (Stack.size() > 1 && Stack.back().Ctx == Singleton) {
  783. if (IndentSize)
  784. OS << ' ';
  785. } else {
  786. newline();
  787. }
  788. }
  789. void llvm::json::OStream::newline() {
  790. if (IndentSize) {
  791. OS.write('\n');
  792. OS.indent(Indent);
  793. }
  794. }
  795. void llvm::json::OStream::arrayBegin() {
  796. valueBegin();
  797. Stack.emplace_back();
  798. Stack.back().Ctx = Array;
  799. Indent += IndentSize;
  800. OS << '[';
  801. }
  802. void llvm::json::OStream::arrayEnd() {
  803. assert(Stack.back().Ctx == Array);
  804. Indent -= IndentSize;
  805. if (Stack.back().HasValue)
  806. newline();
  807. OS << ']';
  808. assert(PendingComment.empty());
  809. Stack.pop_back();
  810. assert(!Stack.empty());
  811. }
  812. void llvm::json::OStream::objectBegin() {
  813. valueBegin();
  814. Stack.emplace_back();
  815. Stack.back().Ctx = Object;
  816. Indent += IndentSize;
  817. OS << '{';
  818. }
  819. void llvm::json::OStream::objectEnd() {
  820. assert(Stack.back().Ctx == Object);
  821. Indent -= IndentSize;
  822. if (Stack.back().HasValue)
  823. newline();
  824. OS << '}';
  825. assert(PendingComment.empty());
  826. Stack.pop_back();
  827. assert(!Stack.empty());
  828. }
  829. void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
  830. assert(Stack.back().Ctx == Object);
  831. if (Stack.back().HasValue)
  832. OS << ',';
  833. newline();
  834. flushComment();
  835. Stack.back().HasValue = true;
  836. Stack.emplace_back();
  837. Stack.back().Ctx = Singleton;
  838. if (LLVM_LIKELY(isUTF8(Key))) {
  839. quote(OS, Key);
  840. } else {
  841. assert(false && "Invalid UTF-8 in attribute key");
  842. quote(OS, fixUTF8(Key));
  843. }
  844. OS.write(':');
  845. if (IndentSize)
  846. OS.write(' ');
  847. }
  848. void llvm::json::OStream::attributeEnd() {
  849. assert(Stack.back().Ctx == Singleton);
  850. assert(Stack.back().HasValue && "Attribute must have a value");
  851. assert(PendingComment.empty());
  852. Stack.pop_back();
  853. assert(Stack.back().Ctx == Object);
  854. }
  855. raw_ostream &llvm::json::OStream::rawValueBegin() {
  856. valueBegin();
  857. Stack.emplace_back();
  858. Stack.back().Ctx = RawValue;
  859. return OS;
  860. }
  861. void llvm::json::OStream::rawValueEnd() {
  862. assert(Stack.back().Ctx == RawValue);
  863. Stack.pop_back();
  864. }
  865. } // namespace json
  866. } // namespace llvm
  867. void llvm::format_provider<llvm::json::Value>::format(
  868. const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
  869. unsigned IndentAmount = 0;
  870. if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
  871. llvm_unreachable("json::Value format options should be an integer");
  872. json::OStream(OS, IndentAmount).value(E);
  873. }