Compiler.cc 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * https://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #include <boost/algorithm/string/replace.hpp>
  19. #include <sstream>
  20. #include <unordered_set>
  21. #include <utility>
  22. #include "Compiler.hh"
  23. #include "CustomAttributes.hh"
  24. #include "NodeConcepts.hh"
  25. #include "Schema.hh"
  26. #include "Stream.hh"
  27. #include "Types.hh"
  28. #include "ValidSchema.hh"
  29. #include "json/JsonDom.hh"
  30. using std::make_pair;
  31. using std::map;
  32. using std::pair;
  33. using std::string;
  34. using std::vector;
  35. namespace avro {
  36. using json::Array;
  37. using json::Entity;
  38. using json::EntityType;
  39. using json::Object;
  40. using SymbolTable = map<Name, NodePtr>;
  41. // #define DEBUG_VERBOSE
  42. static NodePtr makePrimitive(const string &t) {
  43. if (t == "null") {
  44. return NodePtr(new NodePrimitive(AVRO_NULL));
  45. } else if (t == "boolean") {
  46. return NodePtr(new NodePrimitive(AVRO_BOOL));
  47. } else if (t == "int") {
  48. return NodePtr(new NodePrimitive(AVRO_INT));
  49. } else if (t == "long") {
  50. return NodePtr(new NodePrimitive(AVRO_LONG));
  51. } else if (t == "float") {
  52. return NodePtr(new NodePrimitive(AVRO_FLOAT));
  53. } else if (t == "double") {
  54. return NodePtr(new NodePrimitive(AVRO_DOUBLE));
  55. } else if (t == "string") {
  56. return NodePtr(new NodePrimitive(AVRO_STRING));
  57. } else if (t == "bytes") {
  58. return NodePtr(new NodePrimitive(AVRO_BYTES));
  59. } else {
  60. return NodePtr();
  61. }
  62. }
  63. static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns);
  64. template<typename T>
  65. concepts::SingleAttribute<T> asSingleAttribute(const T &t) {
  66. concepts::SingleAttribute<T> n;
  67. n.add(t);
  68. return n;
  69. }
  70. static bool isFullName(const string &s) {
  71. return s.find('.') != string::npos;
  72. }
  73. static Name getName(const string &name, const string &ns) {
  74. return (isFullName(name)) ? Name(name) : Name(name, ns);
  75. }
  76. static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns) {
  77. NodePtr result = makePrimitive(t);
  78. if (result) {
  79. return result;
  80. }
  81. Name n = getName(t, ns);
  82. auto it = st.find(n);
  83. if (it != st.end()) {
  84. return NodePtr(new NodeSymbolic(asSingleAttribute(n), it->second));
  85. }
  86. throw Exception("Unknown type: {}", n);
  87. }
  88. /** Returns "true" if the field is in the container */
  89. // e.g.: can be false for non-mandatory fields
  90. bool containsField(const Object &m, const string &fieldName) {
  91. auto it = m.find(fieldName);
  92. return (it != m.end());
  93. }
  94. json::Object::const_iterator findField(const Entity &e,
  95. const Object &m, const string &fieldName);
  96. template<typename T>
  97. void ensureType(const Entity &e, const string &name) {
  98. if (e.type() != json::type_traits<T>::type()) {
  99. throw Exception("Json field \"{}\" is not a {}: {}", name, json::type_traits<T>::name(), e.toString());
  100. }
  101. }
  102. string getStringField(const Entity &e, const Object &m,
  103. const string &fieldName) {
  104. auto it = findField(e, m, fieldName);
  105. ensureType<string>(it->second, fieldName);
  106. return it->second.stringValue();
  107. }
  108. const Array &getArrayField(const Entity &e, const Object &m,
  109. const string &fieldName);
  110. int64_t getLongField(const Entity &e, const Object &m,
  111. const string &fieldName) {
  112. auto it = findField(e, m, fieldName);
  113. ensureType<int64_t>(it->second, fieldName);
  114. return it->second.longValue();
  115. }
  116. // Unescape double quotes (") for de-serialization. This method complements the
  117. // method NodeImpl::escape() which is used for serialization.
  118. static void unescape(string &s) {
  119. boost::replace_all(s, "\\\"", "\"");
  120. }
  121. string getDocField(const Entity &e, const Object &m) {
  122. string doc = getStringField(e, m, "doc");
  123. unescape(doc);
  124. return doc;
  125. }
  126. struct Field {
  127. const string name;
  128. const vector<string> aliases;
  129. const NodePtr schema;
  130. const GenericDatum defaultValue;
  131. const CustomAttributes customAttributes;
  132. Field(string n, vector<string> a, NodePtr v, GenericDatum dv, const CustomAttributes &ca)
  133. : name(std::move(n)), aliases(std::move(a)), schema(std::move(v)), defaultValue(std::move(dv)), customAttributes(ca) {}
  134. };
  135. static void assertType(const Entity &e, EntityType et) {
  136. if (e.type() != et) {
  137. throw Exception(
  138. "Unexpected type for default value: Expected {}, but found {} in line {}",
  139. json::typeToString(et), json::typeToString(e.type()), e.line());
  140. }
  141. }
  142. static vector<uint8_t> toBin(const string &s) {
  143. vector<uint8_t> result(s.size());
  144. if (!s.empty()) {
  145. std::copy(s.c_str(), s.c_str() + s.size(), result.data());
  146. }
  147. return result;
  148. }
  149. static GenericDatum makeGenericDatum(NodePtr n,
  150. const Entity &e, const SymbolTable &st) {
  151. Type t = n->type();
  152. EntityType dt = e.type();
  153. if (t == AVRO_SYMBOLIC) {
  154. n = st.find(n->name())->second;
  155. t = n->type();
  156. }
  157. switch (t) {
  158. case AVRO_STRING:
  159. assertType(e, json::EntityType::String);
  160. return GenericDatum(e.stringValue());
  161. case AVRO_BYTES:
  162. assertType(e, json::EntityType::String);
  163. return GenericDatum(toBin(e.bytesValue()));
  164. case AVRO_INT:
  165. assertType(e, json::EntityType::Long);
  166. return GenericDatum(static_cast<int32_t>(e.longValue()));
  167. case AVRO_LONG:
  168. assertType(e, json::EntityType::Long);
  169. return GenericDatum(e.longValue());
  170. case AVRO_FLOAT:
  171. if (dt == json::EntityType::Long) {
  172. return GenericDatum(static_cast<float>(e.longValue()));
  173. }
  174. assertType(e, json::EntityType::Double);
  175. return GenericDatum(static_cast<float>(e.doubleValue()));
  176. case AVRO_DOUBLE:
  177. if (dt == json::EntityType::Long) {
  178. return GenericDatum(static_cast<double>(e.longValue()));
  179. }
  180. assertType(e, json::EntityType::Double);
  181. return GenericDatum(e.doubleValue());
  182. case AVRO_BOOL:
  183. assertType(e, json::EntityType::Bool);
  184. return GenericDatum(e.boolValue());
  185. case AVRO_NULL:
  186. assertType(e, json::EntityType::Null);
  187. return GenericDatum();
  188. case AVRO_RECORD: {
  189. assertType(e, json::EntityType::Obj);
  190. GenericRecord result(n);
  191. const map<string, Entity> &v = e.objectValue();
  192. for (size_t i = 0; i < n->leaves(); ++i) {
  193. auto it = v.find(n->nameAt(i));
  194. if (it == v.end()) {
  195. throw Exception(
  196. "No value found in default for {}",
  197. n->nameAt(i));
  198. }
  199. result.setFieldAt(i,
  200. makeGenericDatum(n->leafAt(i), it->second, st));
  201. }
  202. return GenericDatum(n, result);
  203. }
  204. case AVRO_ENUM:
  205. assertType(e, json::EntityType::String);
  206. return GenericDatum(n, GenericEnum(n, e.stringValue()));
  207. case AVRO_ARRAY: {
  208. assertType(e, json::EntityType::Arr);
  209. GenericArray result(n);
  210. const vector<Entity> &elements = e.arrayValue();
  211. for (const auto &element : elements) {
  212. result.value().push_back(makeGenericDatum(n->leafAt(0), element, st));
  213. }
  214. return GenericDatum(n, result);
  215. }
  216. case AVRO_MAP: {
  217. assertType(e, json::EntityType::Obj);
  218. GenericMap result(n);
  219. const map<string, Entity> &v = e.objectValue();
  220. for (const auto &it : v) {
  221. result.value().push_back(make_pair(it.first,
  222. makeGenericDatum(n->leafAt(1), it.second, st)));
  223. }
  224. return GenericDatum(n, result);
  225. }
  226. case AVRO_UNION: {
  227. GenericUnion result(n);
  228. result.selectBranch(0);
  229. result.datum() = makeGenericDatum(n->leafAt(0), e, st);
  230. return GenericDatum(n, result);
  231. }
  232. case AVRO_FIXED:
  233. assertType(e, json::EntityType::String);
  234. return GenericDatum(n, GenericFixed(n, toBin(e.bytesValue())));
  235. default: throw Exception("Unknown type: {}", t);
  236. }
  237. }
  238. static const std::unordered_set<std::string> &getKnownFields() {
  239. // return known fields
  240. static const std::unordered_set<std::string> kKnownFields =
  241. {"name", "type", "aliases", "default", "doc", "size", "logicalType",
  242. "values", "precision", "scale", "namespace"};
  243. return kKnownFields;
  244. }
  245. static void getCustomAttributes(const Object &m, CustomAttributes &customAttributes) {
  246. // Don't add known fields on primitive type and fixed type into custom
  247. // fields.
  248. const std::unordered_set<std::string> &kKnownFields = getKnownFields();
  249. for (const auto &entry : m) {
  250. if (kKnownFields.find(entry.first) == kKnownFields.end()) {
  251. customAttributes.addAttribute(entry.first, entry.second.stringValue());
  252. }
  253. }
  254. }
  255. static Field makeField(const Entity &e, SymbolTable &st, const string &ns) {
  256. const Object &m = e.objectValue();
  257. string n = getStringField(e, m, "name");
  258. vector<string> aliases;
  259. string aliasesName = "aliases";
  260. if (containsField(m, aliasesName)) {
  261. for (const auto &alias : getArrayField(e, m, aliasesName)) {
  262. aliases.emplace_back(alias.stringValue());
  263. }
  264. }
  265. auto it = findField(e, m, "type");
  266. auto it2 = m.find("default");
  267. NodePtr node = makeNode(it->second, st, ns);
  268. if (containsField(m, "doc")) {
  269. node->setDoc(getDocField(e, m));
  270. }
  271. GenericDatum d = (it2 == m.end()) ? GenericDatum() : makeGenericDatum(node, it2->second, st);
  272. // Get custom attributes
  273. CustomAttributes customAttributes;
  274. getCustomAttributes(m, customAttributes);
  275. return Field(std::move(n), std::move(aliases), node, d, customAttributes);
  276. }
  277. // Extended makeRecordNode (with doc).
  278. static NodePtr makeRecordNode(const Entity &e, const Name &name,
  279. const string *doc, const Object &m,
  280. SymbolTable &st, const string &ns) {
  281. concepts::MultiAttribute<string> fieldNames;
  282. vector<vector<string>> fieldAliases;
  283. concepts::MultiAttribute<NodePtr> fieldValues;
  284. concepts::MultiAttribute<CustomAttributes> customAttributes;
  285. vector<GenericDatum> defaultValues;
  286. string fields = "fields";
  287. for (const auto &it : getArrayField(e, m, fields)) {
  288. Field f = makeField(it, st, ns);
  289. fieldNames.add(f.name);
  290. fieldAliases.push_back(f.aliases);
  291. fieldValues.add(f.schema);
  292. defaultValues.push_back(f.defaultValue);
  293. customAttributes.add(f.customAttributes);
  294. }
  295. NodeRecord *node;
  296. if (doc == nullptr) {
  297. node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
  298. fieldAliases, defaultValues, customAttributes);
  299. } else {
  300. node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
  301. fieldValues, fieldNames, fieldAliases, defaultValues, customAttributes);
  302. }
  303. return NodePtr(node);
  304. }
  305. static LogicalType makeLogicalType(const Entity &e, const Object &m) {
  306. if (!containsField(m, "logicalType")) {
  307. return LogicalType(LogicalType::NONE);
  308. }
  309. const std::string &typeField = getStringField(e, m, "logicalType");
  310. if (typeField == "decimal") {
  311. LogicalType decimalType(LogicalType::DECIMAL);
  312. try {
  313. // Precision probably won't go over 38 and scale beyond -77/+77
  314. decimalType.setPrecision(static_cast<int32_t>(getLongField(e, m, "precision")));
  315. if (containsField(m, "scale")) {
  316. decimalType.setScale(static_cast<int32_t>(getLongField(e, m, "scale")));
  317. }
  318. } catch (Exception &ex) {
  319. // If any part of the logical type is malformed, per the standard we
  320. // must ignore the whole attribute.
  321. return LogicalType(LogicalType::NONE);
  322. }
  323. return decimalType;
  324. }
  325. LogicalType::Type t = LogicalType::NONE;
  326. if (typeField == "date")
  327. t = LogicalType::DATE;
  328. else if (typeField == "time-millis")
  329. t = LogicalType::TIME_MILLIS;
  330. else if (typeField == "time-micros")
  331. t = LogicalType::TIME_MICROS;
  332. else if (typeField == "timestamp-millis")
  333. t = LogicalType::TIMESTAMP_MILLIS;
  334. else if (typeField == "timestamp-micros")
  335. t = LogicalType::TIMESTAMP_MICROS;
  336. else if (typeField == "duration")
  337. t = LogicalType::DURATION;
  338. else if (typeField == "uuid")
  339. t = LogicalType::UUID;
  340. return LogicalType(t);
  341. }
  342. static NodePtr makeEnumNode(const Entity &e,
  343. const Name &name, const Object &m) {
  344. string symbolsName = "symbols";
  345. const Array &v = getArrayField(e, m, symbolsName);
  346. concepts::MultiAttribute<string> symbols;
  347. for (const auto &it : v) {
  348. if (it.type() != json::EntityType::String) {
  349. throw Exception("Enum symbol not a string: {}", it.toString());
  350. }
  351. symbols.add(it.stringValue());
  352. }
  353. NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
  354. if (containsField(m, "doc")) {
  355. node->setDoc(getDocField(e, m));
  356. }
  357. return node;
  358. }
  359. static NodePtr makeFixedNode(const Entity &e,
  360. const Name &name, const Object &m) {
  361. int64_t v = getLongField(e, m, "size");
  362. if (v <= 0) {
  363. throw Exception("Size for fixed is not positive: {}", e.toString());
  364. }
  365. NodePtr node =
  366. NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(static_cast<size_t>(v))));
  367. if (containsField(m, "doc")) {
  368. node->setDoc(getDocField(e, m));
  369. }
  370. return node;
  371. }
  372. static NodePtr makeArrayNode(const Entity &e, const Object &m,
  373. SymbolTable &st, const string &ns) {
  374. auto it = findField(e, m, "items");
  375. NodePtr node = NodePtr(new NodeArray(
  376. asSingleAttribute(makeNode(it->second, st, ns))));
  377. if (containsField(m, "doc")) {
  378. node->setDoc(getDocField(e, m));
  379. }
  380. return node;
  381. }
  382. static NodePtr makeMapNode(const Entity &e, const Object &m,
  383. SymbolTable &st, const string &ns) {
  384. auto it = findField(e, m, "values");
  385. NodePtr node = NodePtr(new NodeMap(
  386. asSingleAttribute(makeNode(it->second, st, ns))));
  387. if (containsField(m, "doc")) {
  388. node->setDoc(getDocField(e, m));
  389. }
  390. return node;
  391. }
  392. static Name getName(const Entity &e, const Object &m, const string &ns) {
  393. const string &name = getStringField(e, m, "name");
  394. Name result;
  395. if (isFullName(name)) {
  396. result = Name(name);
  397. } else {
  398. auto it = m.find("namespace");
  399. if (it != m.end()) {
  400. if (it->second.type() != json::type_traits<string>::type()) {
  401. throw Exception(
  402. "Json field \"namespace\" is not a string: {}",
  403. it->second.toString());
  404. }
  405. result = Name(name, it->second.stringValue());
  406. } else {
  407. result = Name(name, ns);
  408. }
  409. }
  410. std::string aliases = "aliases";
  411. if (containsField(m, aliases)) {
  412. for (const auto &alias : getArrayField(e, m, aliases)) {
  413. result.addAlias(alias.stringValue());
  414. }
  415. }
  416. return result;
  417. }
  418. static NodePtr makeNode(const Entity &e, const Object &m,
  419. SymbolTable &st, const string &ns) {
  420. const string &type = getStringField(e, m, "type");
  421. NodePtr result;
  422. if (type == "record" || type == "error" || type == "enum" || type == "fixed") {
  423. Name nm = getName(e, m, ns);
  424. if (type == "record" || type == "error") {
  425. result = NodePtr(new NodeRecord());
  426. st[nm] = result;
  427. // Get field doc
  428. if (containsField(m, "doc")) {
  429. string doc = getDocField(e, m);
  430. NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
  431. (std::dynamic_pointer_cast<NodeRecord>(r))->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
  432. } else { // No doc
  433. NodePtr r =
  434. makeRecordNode(e, nm, nullptr, m, st, nm.ns());
  435. (std::dynamic_pointer_cast<NodeRecord>(r))
  436. ->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
  437. }
  438. } else {
  439. result = (type == "enum") ? makeEnumNode(e, nm, m) : makeFixedNode(e, nm, m);
  440. st[nm] = result;
  441. }
  442. } else if (type == "array") {
  443. result = makeArrayNode(e, m, st, ns);
  444. } else if (type == "map") {
  445. result = makeMapNode(e, m, st, ns);
  446. } else {
  447. result = makePrimitive(type);
  448. }
  449. if (result) {
  450. try {
  451. result->setLogicalType(makeLogicalType(e, m));
  452. } catch (Exception &ex) {
  453. // Per the standard we must ignore the logical type attribute if it
  454. // is malformed.
  455. }
  456. return result;
  457. }
  458. throw Exception("Unknown type definition: %1%", e.toString());
  459. }
  460. static NodePtr makeNode(const Entity &, const Array &m,
  461. SymbolTable &st, const string &ns) {
  462. concepts::MultiAttribute<NodePtr> mm;
  463. for (const auto &it : m) {
  464. mm.add(makeNode(it, st, ns));
  465. }
  466. return NodePtr(new NodeUnion(mm));
  467. }
  468. static NodePtr makeNode(const json::Entity &e, SymbolTable &st, const string &ns) {
  469. switch (e.type()) {
  470. case json::EntityType::String: return makeNode(e.stringValue(), st, ns);
  471. case json::EntityType::Obj: return makeNode(e, e.objectValue(), st, ns);
  472. case json::EntityType::Arr: return makeNode(e, e.arrayValue(), st, ns);
  473. default: throw Exception("Invalid Avro type: {}", e.toString());
  474. }
  475. }
  476. json::Object::const_iterator findField(const Entity &e, const Object &m, const string &fieldName) {
  477. auto it = m.find(fieldName);
  478. if (it == m.end()) {
  479. throw Exception("Missing Json field \"{}\": {}", fieldName, e.toString());
  480. } else {
  481. return it;
  482. }
  483. }
  484. const Array &getArrayField(const Entity &e, const Object &m, const string &fieldName) {
  485. auto it = findField(e, m, fieldName);
  486. ensureType<Array>(it->second, fieldName);
  487. return it->second.arrayValue();
  488. }
  489. ValidSchema compileJsonSchemaFromStream(InputStream &is) {
  490. json::Entity e = json::loadEntity(is);
  491. SymbolTable st;
  492. NodePtr n = makeNode(e, st, "");
  493. return ValidSchema(n);
  494. }
  495. AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char *filename) {
  496. std::unique_ptr<InputStream> s = fileInputStream(filename);
  497. return compileJsonSchemaFromStream(*s);
  498. }
  499. AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t *input, size_t len) {
  500. return compileJsonSchemaFromStream(*memoryInputStream(input, len));
  501. }
  502. AVRO_DECL ValidSchema compileJsonSchemaFromString(const char *input) {
  503. return compileJsonSchemaFromMemory(reinterpret_cast<const uint8_t *>(input),
  504. ::strlen(input));
  505. }
  506. AVRO_DECL ValidSchema compileJsonSchemaFromString(const string &input) {
  507. return compileJsonSchemaFromMemory(
  508. reinterpret_cast<const uint8_t *>(input.data()), input.size());
  509. }
  510. static ValidSchema compile(std::istream &is) {
  511. std::unique_ptr<InputStream> in = istreamInputStream(is);
  512. return compileJsonSchemaFromStream(*in);
  513. }
  514. void compileJsonSchema(std::istream &is, ValidSchema &schema) {
  515. if (!is.good()) {
  516. throw Exception("Input stream is not good");
  517. }
  518. schema = compile(is);
  519. }
  520. AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string &error) {
  521. try {
  522. compileJsonSchema(is, schema);
  523. return true;
  524. } catch (const Exception &e) {
  525. error = e.what();
  526. return false;
  527. }
  528. }
  529. } // namespace avro