xml_parser.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. /**
  2. * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  3. * SPDX-License-Identifier: Apache-2.0.
  4. */
  5. #include <aws/common/array_list.h>
  6. #include <aws/common/logging.h>
  7. #include <aws/common/private/xml_parser_impl.h>
  8. #ifdef _MSC_VER
  9. /* allow non-constant declared initializers. */
  10. # pragma warning(disable : 4204)
  11. #endif
  /* Nesting limit applied to a parser when the options passed to
   * aws_xml_parser_new() leave max_depth at zero. */
  static const size_t s_max_document_depth = 20;

  /* Longest node name the closing-tag scanner has stack space for. */
  #define MAX_NAME_LEN ((size_t)256)

  /* Extra bytes a closing tag adds around the node name: '<', '/' and '>'. */
  #define NODE_CLOSE_OVERHEAD ((size_t)3)
  15. struct cb_stack_data {
  16. aws_xml_parser_on_node_encountered_fn *cb;
  17. void *user_data;
  18. };
  19. struct aws_xml_parser *aws_xml_parser_new(
  20. struct aws_allocator *allocator,
  21. const struct aws_xml_parser_options *options) {
  22. AWS_PRECONDITION(allocator);
  23. AWS_PRECONDITION(options);
  24. struct aws_xml_parser *parser = aws_mem_calloc(allocator, 1, sizeof(struct aws_xml_parser));
  25. if (parser == NULL) {
  26. return NULL;
  27. }
  28. parser->allocator = allocator;
  29. parser->doc = options->doc;
  30. parser->max_depth = s_max_document_depth;
  31. parser->error = AWS_OP_SUCCESS;
  32. if (options->max_depth) {
  33. parser->max_depth = options->max_depth;
  34. }
  35. if (aws_array_list_init_dynamic(&parser->callback_stack, allocator, 4, sizeof(struct cb_stack_data))) {
  36. aws_mem_release(allocator, parser);
  37. return NULL;
  38. }
  39. return parser;
  40. }
  41. void aws_xml_parser_destroy(struct aws_xml_parser *parser) {
  42. AWS_PRECONDITION(parser);
  43. aws_array_list_clean_up(&parser->callback_stack);
  44. aws_mem_release(parser->allocator, parser);
  45. }
  /* defined near the end of this file; declared here because
   * aws_xml_parser_parse() calls it before the definition. */
  int s_node_next_sibling(struct aws_xml_parser *parser);

  /* Trim predicate: true only for the double-quote character. Used to strip the
   * quotes that surround attribute values. */
  static bool s_double_quote_fn(uint8_t value) {
      const uint8_t double_quote = '"';
      return double_quote == value;
  }
  50. /* load the node declaration line, parsing node name and attributes.
  51. *
  52. * something of the form:
  53. * <NodeName Attribute1=Value1 Attribute2=Value2 ...>
  54. * */
  55. static int s_load_node_decl(
  56. struct aws_xml_parser *parser,
  57. struct aws_byte_cursor *decl_body,
  58. struct aws_xml_node *node) {
  59. AWS_PRECONDITION(parser);
  60. AWS_PRECONDITION(decl_body);
  61. AWS_PRECONDITION(node);
  62. struct aws_array_list splits;
  63. AWS_ZERO_STRUCT(splits);
  64. AWS_ZERO_ARRAY(parser->split_scratch);
  65. aws_array_list_init_static(
  66. &splits, parser->split_scratch, AWS_ARRAY_SIZE(parser->split_scratch), sizeof(struct aws_byte_cursor));
  67. /* split by space, first split will be the node name, everything after will be attribute=value pairs. For now
  68. * we limit to 10 attributes, if this is exceeded we consider it invalid document. */
  69. if (aws_byte_cursor_split_on_char(decl_body, ' ', &splits)) {
  70. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  71. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  72. }
  73. size_t splits_count = aws_array_list_length(&splits);
  74. if (splits_count < 1) {
  75. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  76. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  77. }
  78. aws_array_list_get_at(&splits, &node->name, 0);
  79. AWS_ZERO_ARRAY(parser->attributes);
  80. if (splits.length > 1) {
  81. aws_array_list_init_static(
  82. &node->attributes,
  83. parser->attributes,
  84. AWS_ARRAY_SIZE(parser->attributes),
  85. sizeof(struct aws_xml_attribute));
  86. for (size_t i = 1; i < splits.length; ++i) {
  87. struct aws_byte_cursor attribute_pair;
  88. AWS_ZERO_STRUCT(attribute_pair);
  89. aws_array_list_get_at(&splits, &attribute_pair, i);
  90. struct aws_byte_cursor att_val_pair[2];
  91. AWS_ZERO_ARRAY(att_val_pair);
  92. struct aws_array_list att_val_pair_lst;
  93. AWS_ZERO_STRUCT(att_val_pair_lst);
  94. aws_array_list_init_static(&att_val_pair_lst, att_val_pair, 2, sizeof(struct aws_byte_cursor));
  95. if (!aws_byte_cursor_split_on_char(&attribute_pair, '=', &att_val_pair_lst)) {
  96. struct aws_xml_attribute attribute = {
  97. .name = att_val_pair[0],
  98. .value = aws_byte_cursor_trim_pred(&att_val_pair[1], s_double_quote_fn),
  99. };
  100. aws_array_list_push_back(&node->attributes, &attribute);
  101. }
  102. }
  103. }
  104. return AWS_OP_SUCCESS;
  105. }
  106. int aws_xml_parser_parse(
  107. struct aws_xml_parser *parser,
  108. aws_xml_parser_on_node_encountered_fn *on_node_encountered,
  109. void *user_data) {
  110. AWS_PRECONDITION(parser);
  111. if (on_node_encountered == NULL) {
  112. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "'on_node_encountered' argument for aws_xml_parser_parse is invalid.");
  113. aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
  114. return AWS_OP_ERR;
  115. }
  116. aws_array_list_clear(&parser->callback_stack);
  117. /* burn everything that precedes the actual xml nodes. */
  118. while (parser->doc.len) {
  119. const uint8_t *start = memchr(parser->doc.ptr, '<', parser->doc.len);
  120. if (!start) {
  121. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  122. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  123. }
  124. const uint8_t *location = memchr(parser->doc.ptr, '>', parser->doc.len);
  125. if (!location) {
  126. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  127. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  128. }
  129. aws_byte_cursor_advance(&parser->doc, start - parser->doc.ptr);
  130. /* if these are preamble statements, burn them. otherwise don't seek at all
  131. * and assume it's just the doc with no preamble statements. */
  132. if (*(parser->doc.ptr + 1) == '?' || *(parser->doc.ptr + 1) == '!') {
  133. /* nobody cares about the preamble */
  134. size_t advance = location - parser->doc.ptr + 1;
  135. aws_byte_cursor_advance(&parser->doc, advance);
  136. } else {
  137. break;
  138. }
  139. }
  140. /* now we should be at the start of the actual document. */
  141. struct cb_stack_data stack_data = {
  142. .cb = on_node_encountered,
  143. .user_data = user_data,
  144. };
  145. AWS_FATAL_ASSERT(!aws_array_list_push_back(&parser->callback_stack, &stack_data));
  146. return s_node_next_sibling(parser);
  147. }
  148. int s_advance_to_closing_tag(
  149. struct aws_xml_parser *parser,
  150. struct aws_xml_node *node,
  151. struct aws_byte_cursor *out_body) {
  152. AWS_PRECONDITION(parser);
  153. AWS_PRECONDITION(node);
  154. /* currently the max node name is 256 characters. This is arbitrary, but should be enough
  155. * for our uses. If we ever generalize this, we'll have to come back and rethink this. */
  156. uint8_t name_close[MAX_NAME_LEN + NODE_CLOSE_OVERHEAD] = {0};
  157. uint8_t name_open[MAX_NAME_LEN + NODE_CLOSE_OVERHEAD] = {0};
  158. struct aws_byte_buf closing_cmp_buf = aws_byte_buf_from_empty_array(name_close, sizeof(name_close));
  159. struct aws_byte_buf open_cmp_buf = aws_byte_buf_from_empty_array(name_open, sizeof(name_open));
  160. size_t closing_name_len = node->name.len + NODE_CLOSE_OVERHEAD;
  161. if (closing_name_len > node->doc_at_body.len) {
  162. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  163. parser->error = aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  164. return AWS_OP_ERR;
  165. }
  166. if (sizeof(name_close) < closing_name_len) {
  167. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  168. parser->error = aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  169. return AWS_OP_ERR;
  170. }
  171. struct aws_byte_cursor open_bracket = aws_byte_cursor_from_c_str("<");
  172. struct aws_byte_cursor close_token = aws_byte_cursor_from_c_str("/");
  173. struct aws_byte_cursor close_bracket = aws_byte_cursor_from_c_str(">");
  174. aws_byte_buf_append(&open_cmp_buf, &open_bracket);
  175. aws_byte_buf_append(&open_cmp_buf, &node->name);
  176. aws_byte_buf_append(&closing_cmp_buf, &open_bracket);
  177. aws_byte_buf_append(&closing_cmp_buf, &close_token);
  178. aws_byte_buf_append(&closing_cmp_buf, &node->name);
  179. aws_byte_buf_append(&closing_cmp_buf, &close_bracket);
  180. size_t depth_count = 1;
  181. struct aws_byte_cursor to_find_open = aws_byte_cursor_from_buf(&open_cmp_buf);
  182. struct aws_byte_cursor to_find_close = aws_byte_cursor_from_buf(&closing_cmp_buf);
  183. struct aws_byte_cursor close_find_result;
  184. AWS_ZERO_STRUCT(close_find_result);
  185. do {
  186. if (aws_byte_cursor_find_exact(&parser->doc, &to_find_close, &close_find_result)) {
  187. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  188. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  189. }
  190. /* if we find an opening node with the same name, before the closing tag keep going. */
  191. struct aws_byte_cursor open_find_result;
  192. AWS_ZERO_STRUCT(open_find_result);
  193. while (parser->doc.len) {
  194. if (!aws_byte_cursor_find_exact(&parser->doc, &to_find_open, &open_find_result)) {
  195. if (open_find_result.ptr < close_find_result.ptr) {
  196. size_t skip_len = open_find_result.ptr - parser->doc.ptr;
  197. aws_byte_cursor_advance(&parser->doc, skip_len + 1);
  198. depth_count++;
  199. continue;
  200. }
  201. }
  202. size_t skip_len = close_find_result.ptr - parser->doc.ptr;
  203. aws_byte_cursor_advance(&parser->doc, skip_len + closing_cmp_buf.len);
  204. depth_count--;
  205. break;
  206. }
  207. } while (depth_count > 0);
  208. size_t len = close_find_result.ptr - node->doc_at_body.ptr;
  209. if (out_body) {
  210. *out_body = aws_byte_cursor_from_array(node->doc_at_body.ptr, len);
  211. }
  212. return parser->error;
  213. }
  214. int aws_xml_node_as_body(struct aws_xml_parser *parser, struct aws_xml_node *node, struct aws_byte_cursor *out_body) {
  215. AWS_PRECONDITION(parser);
  216. AWS_PRECONDITION(node);
  217. node->processed = true;
  218. return s_advance_to_closing_tag(parser, node, out_body);
  219. }
  220. int aws_xml_node_traverse(
  221. struct aws_xml_parser *parser,
  222. struct aws_xml_node *node,
  223. aws_xml_parser_on_node_encountered_fn *on_node_encountered,
  224. void *user_data) {
  225. AWS_PRECONDITION(parser);
  226. AWS_PRECONDITION(node);
  227. if (on_node_encountered == NULL) {
  228. AWS_LOGF_ERROR(
  229. AWS_LS_COMMON_XML_PARSER, "Callback 'on_node_encountered' for aws_xml_node_traverse is invalid.");
  230. aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
  231. return AWS_OP_ERR;
  232. }
  233. node->processed = true;
  234. struct cb_stack_data stack_data = {
  235. .cb = on_node_encountered,
  236. .user_data = user_data,
  237. };
  238. size_t doc_depth = aws_array_list_length(&parser->callback_stack);
  239. if (doc_depth >= parser->max_depth) {
  240. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  241. parser->error = aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  242. return AWS_OP_ERR;
  243. }
  244. if (aws_array_list_push_back(&parser->callback_stack, &stack_data)) {
  245. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  246. parser->error = aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  247. return AWS_OP_ERR;
  248. }
  249. /* look for the next node at the current level. do this until we encounter the parent node's
  250. * closing tag. */
  251. while (!parser->stop_parsing && !parser->error) {
  252. const uint8_t *next_location = memchr(parser->doc.ptr, '<', parser->doc.len);
  253. if (!next_location) {
  254. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  255. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  256. }
  257. const uint8_t *end_location = memchr(parser->doc.ptr, '>', parser->doc.len);
  258. if (!end_location) {
  259. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  260. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  261. }
  262. bool parent_closed = false;
  263. if (*(next_location + 1) == '/') {
  264. parent_closed = true;
  265. }
  266. size_t node_name_len = end_location - next_location;
  267. aws_byte_cursor_advance(&parser->doc, end_location - parser->doc.ptr + 1);
  268. if (parent_closed) {
  269. break;
  270. }
  271. struct aws_byte_cursor decl_body = aws_byte_cursor_from_array(next_location + 1, node_name_len - 1);
  272. struct aws_xml_node next_node = {
  273. .doc_at_body = parser->doc,
  274. .processed = false,
  275. };
  276. if (s_load_node_decl(parser, &decl_body, &next_node)) {
  277. return AWS_OP_ERR;
  278. }
  279. if (!on_node_encountered(parser, &next_node, user_data)) {
  280. parser->stop_parsing = true;
  281. return parser->error;
  282. }
  283. /* if the user simply returned while skipping the node altogether, go ahead and do the skip over. */
  284. if (!parser->stop_parsing && !next_node.processed) {
  285. if (s_advance_to_closing_tag(parser, &next_node, NULL)) {
  286. return AWS_OP_ERR;
  287. }
  288. }
  289. }
  290. if (parser->stop_parsing) {
  291. return parser->error;
  292. }
  293. aws_array_list_pop_back(&parser->callback_stack);
  294. return parser->error;
  295. }
  296. int aws_xml_node_get_name(const struct aws_xml_node *node, struct aws_byte_cursor *out_name) {
  297. AWS_PRECONDITION(node);
  298. if (out_name == NULL) {
  299. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "'out_name' argument for aws_xml_node_get_name is invalid.");
  300. aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
  301. return AWS_OP_ERR;
  302. }
  303. *out_name = node->name;
  304. return AWS_OP_SUCCESS;
  305. }
  306. size_t aws_xml_node_get_num_attributes(const struct aws_xml_node *node) {
  307. AWS_PRECONDITION(node);
  308. return aws_array_list_length(&node->attributes);
  309. }
  310. int aws_xml_node_get_attribute(
  311. const struct aws_xml_node *node,
  312. size_t attribute_index,
  313. struct aws_xml_attribute *out_attribute) {
  314. AWS_PRECONDITION(node);
  315. if (out_attribute == NULL) {
  316. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "'out_attribute' argument for aws_xml_node_get_attribute is invalid.");
  317. aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
  318. return AWS_OP_ERR;
  319. }
  320. return aws_array_list_get_at(&node->attributes, out_attribute, attribute_index);
  321. }
  322. /* advance the parser to the next sibling node.*/
  323. int s_node_next_sibling(struct aws_xml_parser *parser) {
  324. AWS_PRECONDITION(parser);
  325. const uint8_t *next_location = memchr(parser->doc.ptr, '<', parser->doc.len);
  326. if (!next_location) {
  327. return parser->error;
  328. }
  329. aws_byte_cursor_advance(&parser->doc, next_location - parser->doc.ptr);
  330. const uint8_t *end_location = memchr(parser->doc.ptr, '>', parser->doc.len);
  331. if (!end_location) {
  332. AWS_LOGF_ERROR(AWS_LS_COMMON_XML_PARSER, "XML document is invalid.");
  333. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  334. }
  335. size_t node_name_len = end_location - next_location;
  336. aws_byte_cursor_advance(&parser->doc, end_location - parser->doc.ptr + 1);
  337. struct aws_byte_cursor node_decl_body = aws_byte_cursor_from_array(next_location + 1, node_name_len - 1);
  338. struct aws_xml_node sibling_node = {
  339. .doc_at_body = parser->doc,
  340. .processed = false,
  341. };
  342. if (s_load_node_decl(parser, &node_decl_body, &sibling_node)) {
  343. return AWS_OP_ERR;
  344. }
  345. struct cb_stack_data stack_data;
  346. AWS_ZERO_STRUCT(stack_data);
  347. aws_array_list_back(&parser->callback_stack, &stack_data);
  348. AWS_FATAL_ASSERT(stack_data.cb);
  349. parser->stop_parsing = !stack_data.cb(parser, &sibling_node, stack_data.user_data);
  350. /* if the user simply returned while skipping the node altogether, go ahead and do the skip over. */
  351. if (!sibling_node.processed) {
  352. if (s_advance_to_closing_tag(parser, &sibling_node, NULL)) {
  353. return AWS_OP_ERR;
  354. }
  355. }
  356. return parser->error;
  357. }