uri.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. /**
  2. * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  3. * SPDX-License-Identifier: Apache-2.0.
  4. */
  5. #include <aws/common/uri.h>
  6. #include <aws/common/common.h>
  7. #include <ctype.h>
  8. #include <inttypes.h>
  9. #include <stdio.h>
  10. #include <string.h>
  11. #ifdef _MSC_VER
  12. # pragma warning(disable : 4221) /* aggregate initializer using local variable addresses */
  13. # pragma warning(disable : 4204) /* non-constant aggregate initializer */
  14. #endif
  /*
   * States of the URI parsing state machine.
   *
   * The first four constants index the s_states handler table. FINISHED and
   * ERROR are terminal: the driver loop runs only while the state compares
   * below FINISHED, so ERROR must stay ordered after FINISHED.
   */
  enum parser_state {
      ON_SCHEME,       /* next: look for an optional "scheme://" prefix */
      ON_AUTHORITY,    /* next: userinfo, host name and port */
      ON_PATH,         /* next: path component */
      ON_QUERY_STRING, /* next: everything following '?' */
      FINISHED,        /* terminal: parsing succeeded */
      ERROR,           /* terminal: parsing failed */
  };
  23. struct uri_parser {
  24. struct aws_uri *uri;
  25. enum parser_state state;
  26. };
  /*
   * Signature shared by all state handlers. A handler consumes bytes from the
   * front of `str`, records what it found in `parser->uri`, and stores the next
   * state in `parser->state`.
   */
  typedef void(parse_fn)(struct uri_parser *parser, struct aws_byte_cursor *str);

  /* One handler per non-terminal parser state; definitions appear further down. */
  static void s_parse_scheme(struct uri_parser *parser, struct aws_byte_cursor *str);
  static void s_parse_authority(struct uri_parser *parser, struct aws_byte_cursor *str);
  static void s_parse_path(struct uri_parser *parser, struct aws_byte_cursor *str);
  static void s_parse_query_string(struct uri_parser *parser, struct aws_byte_cursor *str);
  32. static parse_fn *s_states[] = {
  33. [ON_SCHEME] = s_parse_scheme,
  34. [ON_AUTHORITY] = s_parse_authority,
  35. [ON_PATH] = s_parse_path,
  36. [ON_QUERY_STRING] = s_parse_query_string,
  37. };
  38. static int s_init_from_uri_str(struct aws_uri *uri) {
  39. struct uri_parser parser = {
  40. .state = ON_SCHEME,
  41. .uri = uri,
  42. };
  43. struct aws_byte_cursor uri_cur = aws_byte_cursor_from_buf(&uri->uri_str);
  44. while (parser.state < FINISHED) {
  45. s_states[parser.state](&parser, &uri_cur);
  46. }
  47. /* Each state function sets the next state, if something goes wrong it sets it to ERROR which is > FINISHED */
  48. if (parser.state == FINISHED) {
  49. return AWS_OP_SUCCESS;
  50. }
  51. aws_byte_buf_clean_up(&uri->uri_str);
  52. AWS_ZERO_STRUCT(*uri);
  53. return AWS_OP_ERR;
  54. }
  55. int aws_uri_init_parse(struct aws_uri *uri, struct aws_allocator *allocator, const struct aws_byte_cursor *uri_str) {
  56. AWS_ZERO_STRUCT(*uri);
  57. uri->self_size = sizeof(struct aws_uri);
  58. uri->allocator = allocator;
  59. if (aws_byte_buf_init_copy_from_cursor(&uri->uri_str, allocator, *uri_str)) {
  60. return AWS_OP_ERR;
  61. }
  62. return s_init_from_uri_str(uri);
  63. }
  64. int aws_uri_init_from_builder_options(
  65. struct aws_uri *uri,
  66. struct aws_allocator *allocator,
  67. struct aws_uri_builder_options *options) {
  68. AWS_ZERO_STRUCT(*uri);
  69. if (options->query_string.len && options->query_params) {
  70. return aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
  71. }
  72. uri->self_size = sizeof(struct aws_uri);
  73. uri->allocator = allocator;
  74. size_t buffer_size = 0;
  75. if (options->scheme.len) {
  76. /* 3 for :// */
  77. buffer_size += options->scheme.len + 3;
  78. }
  79. buffer_size += options->host_name.len;
  80. if (options->port) {
  81. /* max strlen of a 16 bit integer is 5 */
  82. buffer_size += 6;
  83. }
  84. buffer_size += options->path.len;
  85. if (options->query_params) {
  86. size_t query_len = aws_array_list_length(options->query_params);
  87. if (query_len) {
  88. /* for the '?' */
  89. buffer_size += 1;
  90. for (size_t i = 0; i < query_len; ++i) {
  91. struct aws_uri_param *uri_param_ptr = NULL;
  92. aws_array_list_get_at_ptr(options->query_params, (void **)&uri_param_ptr, i);
  93. /* 2 == 1 for '&' and 1 for '='. who cares if we over-allocate a little? */
  94. buffer_size += uri_param_ptr->key.len + uri_param_ptr->value.len + 2;
  95. }
  96. }
  97. } else if (options->query_string.len) {
  98. /* for the '?' */
  99. buffer_size += 1;
  100. buffer_size += options->query_string.len;
  101. }
  102. if (aws_byte_buf_init(&uri->uri_str, allocator, buffer_size)) {
  103. return AWS_OP_ERR;
  104. }
  105. uri->uri_str.len = 0;
  106. if (options->scheme.len) {
  107. aws_byte_buf_append(&uri->uri_str, &options->scheme);
  108. struct aws_byte_cursor scheme_app = aws_byte_cursor_from_c_str("://");
  109. aws_byte_buf_append(&uri->uri_str, &scheme_app);
  110. }
  111. aws_byte_buf_append(&uri->uri_str, &options->host_name);
  112. struct aws_byte_cursor port_app = aws_byte_cursor_from_c_str(":");
  113. if (options->port) {
  114. aws_byte_buf_append(&uri->uri_str, &port_app);
  115. char port_arr[6] = {0};
  116. snprintf(port_arr, sizeof(port_arr), "%" PRIu16, options->port);
  117. struct aws_byte_cursor port_csr = aws_byte_cursor_from_c_str(port_arr);
  118. aws_byte_buf_append(&uri->uri_str, &port_csr);
  119. }
  120. aws_byte_buf_append(&uri->uri_str, &options->path);
  121. struct aws_byte_cursor query_app = aws_byte_cursor_from_c_str("?");
  122. if (options->query_params) {
  123. struct aws_byte_cursor query_param_app = aws_byte_cursor_from_c_str("&");
  124. struct aws_byte_cursor key_value_delim = aws_byte_cursor_from_c_str("=");
  125. aws_byte_buf_append(&uri->uri_str, &query_app);
  126. size_t query_len = aws_array_list_length(options->query_params);
  127. for (size_t i = 0; i < query_len; ++i) {
  128. struct aws_uri_param *uri_param_ptr = NULL;
  129. aws_array_list_get_at_ptr(options->query_params, (void **)&uri_param_ptr, i);
  130. aws_byte_buf_append(&uri->uri_str, &uri_param_ptr->key);
  131. aws_byte_buf_append(&uri->uri_str, &key_value_delim);
  132. aws_byte_buf_append(&uri->uri_str, &uri_param_ptr->value);
  133. if (i < query_len - 1) {
  134. aws_byte_buf_append(&uri->uri_str, &query_param_app);
  135. }
  136. }
  137. } else if (options->query_string.len) {
  138. aws_byte_buf_append(&uri->uri_str, &query_app);
  139. aws_byte_buf_append(&uri->uri_str, &options->query_string);
  140. }
  141. return s_init_from_uri_str(uri);
  142. }
  143. void aws_uri_clean_up(struct aws_uri *uri) {
  144. if (uri->uri_str.allocator) {
  145. aws_byte_buf_clean_up(&uri->uri_str);
  146. }
  147. AWS_ZERO_STRUCT(*uri);
  148. }
  149. const struct aws_byte_cursor *aws_uri_scheme(const struct aws_uri *uri) {
  150. return &uri->scheme;
  151. }
  152. const struct aws_byte_cursor *aws_uri_authority(const struct aws_uri *uri) {
  153. return &uri->authority;
  154. }
  155. const struct aws_byte_cursor *aws_uri_path(const struct aws_uri *uri) {
  156. return &uri->path;
  157. }
  158. const struct aws_byte_cursor *aws_uri_query_string(const struct aws_uri *uri) {
  159. return &uri->query_string;
  160. }
  161. const struct aws_byte_cursor *aws_uri_path_and_query(const struct aws_uri *uri) {
  162. return &uri->path_and_query;
  163. }
  164. const struct aws_byte_cursor *aws_uri_host_name(const struct aws_uri *uri) {
  165. return &uri->host_name;
  166. }
  167. uint16_t aws_uri_port(const struct aws_uri *uri) {
  168. return uri->port;
  169. }
  170. bool aws_uri_query_string_next_param(const struct aws_uri *uri, struct aws_uri_param *param) {
  171. /* If param is zeroed, then this is the first run. */
  172. bool first_run = param->value.ptr == NULL;
  173. /* aws_byte_cursor_next_split() is used to iterate over params in the query string.
  174. * It takes an in/out substring arg similar to how this function works */
  175. struct aws_byte_cursor substr;
  176. if (first_run) {
  177. /* substring must be zeroed to start */
  178. AWS_ZERO_STRUCT(substr);
  179. } else {
  180. /* re-assemble substring which contained key and value */
  181. substr.ptr = param->key.ptr;
  182. substr.len = (param->value.ptr - param->key.ptr) + param->value.len;
  183. }
  184. /* The do-while is to skip over any empty substrings */
  185. do {
  186. if (!aws_byte_cursor_next_split(&uri->query_string, '&', &substr)) {
  187. /* no more splits, done iterating */
  188. return false;
  189. }
  190. } while (substr.len == 0);
  191. uint8_t *delim = memchr(substr.ptr, '=', substr.len);
  192. if (delim) {
  193. param->key.ptr = substr.ptr;
  194. param->key.len = delim - substr.ptr;
  195. param->value.ptr = delim + 1;
  196. param->value.len = substr.len - param->key.len - 1;
  197. } else {
  198. /* no '=', key gets substring, value is blank */
  199. param->key = substr;
  200. param->value.ptr = substr.ptr + substr.len;
  201. param->value.len = 0;
  202. }
  203. return true;
  204. }
  205. int aws_uri_query_string_params(const struct aws_uri *uri, struct aws_array_list *out_params) {
  206. struct aws_uri_param param;
  207. AWS_ZERO_STRUCT(param);
  208. while (aws_uri_query_string_next_param(uri, &param)) {
  209. if (aws_array_list_push_back(out_params, &param)) {
  210. return AWS_OP_ERR;
  211. }
  212. }
  213. return AWS_OP_SUCCESS;
  214. }
  215. static void s_parse_scheme(struct uri_parser *parser, struct aws_byte_cursor *str) {
  216. const uint8_t *location_of_colon = memchr(str->ptr, ':', str->len);
  217. if (!location_of_colon) {
  218. parser->state = ON_AUTHORITY;
  219. return;
  220. }
  221. /* make sure we didn't just pick up the port by mistake */
  222. if ((size_t)(location_of_colon - str->ptr) < str->len && *(location_of_colon + 1) != '/') {
  223. parser->state = ON_AUTHORITY;
  224. return;
  225. }
  226. const size_t scheme_len = location_of_colon - str->ptr;
  227. parser->uri->scheme = aws_byte_cursor_advance(str, scheme_len);
  228. if (str->len < 3 || str->ptr[0] != ':' || str->ptr[1] != '/' || str->ptr[2] != '/') {
  229. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  230. parser->state = ERROR;
  231. return;
  232. }
  233. /* advance past the "://" */
  234. aws_byte_cursor_advance(str, 3);
  235. parser->state = ON_AUTHORITY;
  236. }
  237. static void s_parse_authority(struct uri_parser *parser, struct aws_byte_cursor *str) {
  238. const uint8_t *location_of_slash = memchr(str->ptr, '/', str->len);
  239. const uint8_t *location_of_qmark = memchr(str->ptr, '?', str->len);
  240. if (!location_of_slash && !location_of_qmark && str->len) {
  241. parser->uri->authority.ptr = str->ptr;
  242. parser->uri->authority.len = str->len;
  243. parser->uri->path.ptr = NULL;
  244. parser->uri->path.len = 0;
  245. parser->uri->path_and_query = parser->uri->path;
  246. parser->state = FINISHED;
  247. aws_byte_cursor_advance(str, parser->uri->authority.len);
  248. } else if (!str->len) {
  249. parser->state = ERROR;
  250. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  251. return;
  252. } else {
  253. const uint8_t *end = str->ptr + str->len;
  254. if (location_of_slash) {
  255. parser->state = ON_PATH;
  256. end = location_of_slash;
  257. } else if (location_of_qmark) {
  258. parser->state = ON_QUERY_STRING;
  259. end = location_of_qmark;
  260. }
  261. parser->uri->authority = aws_byte_cursor_advance(str, end - str->ptr);
  262. }
  263. struct aws_byte_cursor authority_parse_csr = parser->uri->authority;
  264. if (authority_parse_csr.len) {
  265. /* RFC-3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ] */
  266. const uint8_t *userinfo_delim = memchr(authority_parse_csr.ptr, '@', authority_parse_csr.len);
  267. if (userinfo_delim) {
  268. parser->uri->userinfo =
  269. aws_byte_cursor_advance(&authority_parse_csr, userinfo_delim - authority_parse_csr.ptr);
  270. /* For the "@" mark */
  271. aws_byte_cursor_advance(&authority_parse_csr, 1);
  272. struct aws_byte_cursor userinfo_parse_csr = parser->uri->userinfo;
  273. uint8_t *info_delim = memchr(userinfo_parse_csr.ptr, ':', userinfo_parse_csr.len);
  274. /* RFC-3986 section 3.2.1: Use of the format "user:password" in the userinfo field is deprecated. But we
  275. * treat the userinfo as URL here, also, if the format is not following URL pattern, you have the whole
  276. * userinfo */
  277. /* RFC-1738 section 3.1: <user>:<password> */
  278. if (info_delim) {
  279. parser->uri->user.ptr = userinfo_parse_csr.ptr;
  280. parser->uri->user.len = info_delim - userinfo_parse_csr.ptr;
  281. parser->uri->password.ptr = info_delim + 1;
  282. parser->uri->password.len = parser->uri->userinfo.len - parser->uri->user.len - 1;
  283. } else {
  284. parser->uri->user = userinfo_parse_csr;
  285. }
  286. }
  287. /* RFC-3986 section 3.2: host identified by IPv6 literal address is
  288. * enclosed within square brackets. We must ignore any colons within
  289. * IPv6 literals and only search for port delimiter after closing bracket.*/
  290. const uint8_t *port_search_start = authority_parse_csr.ptr;
  291. size_t port_search_len = authority_parse_csr.len;
  292. if (authority_parse_csr.len > 0 && authority_parse_csr.ptr[0] == '[') {
  293. port_search_start = memchr(authority_parse_csr.ptr, ']', authority_parse_csr.len);
  294. if (!port_search_start) {
  295. parser->state = ERROR;
  296. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  297. return;
  298. }
  299. port_search_len = authority_parse_csr.len - (port_search_start - authority_parse_csr.ptr);
  300. }
  301. const uint8_t *port_delim = memchr(port_search_start, ':', port_search_len);
  302. if (!port_delim) {
  303. parser->uri->port = 0;
  304. parser->uri->host_name = authority_parse_csr;
  305. return;
  306. }
  307. parser->uri->host_name.ptr = authority_parse_csr.ptr;
  308. parser->uri->host_name.len = port_delim - authority_parse_csr.ptr;
  309. size_t port_len = authority_parse_csr.len - parser->uri->host_name.len - 1;
  310. port_delim += 1;
  311. for (size_t i = 0; i < port_len; ++i) {
  312. if (!aws_isdigit(port_delim[i])) {
  313. parser->state = ERROR;
  314. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  315. return;
  316. }
  317. }
  318. if (port_len > 5) {
  319. parser->state = ERROR;
  320. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  321. return;
  322. }
  323. /* why 6? because the port is a 16-bit unsigned integer*/
  324. char atoi_buf[6] = {0};
  325. memcpy(atoi_buf, port_delim, port_len);
  326. int port_int = atoi(atoi_buf);
  327. if (port_int > UINT16_MAX) {
  328. parser->state = ERROR;
  329. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  330. return;
  331. }
  332. parser->uri->port = (uint16_t)port_int;
  333. }
  334. }
  335. static void s_parse_path(struct uri_parser *parser, struct aws_byte_cursor *str) {
  336. parser->uri->path_and_query = *str;
  337. const uint8_t *location_of_q_mark = memchr(str->ptr, '?', str->len);
  338. if (!location_of_q_mark) {
  339. parser->uri->path.ptr = str->ptr;
  340. parser->uri->path.len = str->len;
  341. parser->state = FINISHED;
  342. aws_byte_cursor_advance(str, parser->uri->path.len);
  343. return;
  344. }
  345. if (!str->len) {
  346. parser->state = ERROR;
  347. aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  348. return;
  349. }
  350. parser->uri->path.ptr = str->ptr;
  351. parser->uri->path.len = location_of_q_mark - str->ptr;
  352. aws_byte_cursor_advance(str, parser->uri->path.len);
  353. parser->state = ON_QUERY_STRING;
  354. }
  355. static void s_parse_query_string(struct uri_parser *parser, struct aws_byte_cursor *str) {
  356. if (!parser->uri->path_and_query.ptr) {
  357. parser->uri->path_and_query = *str;
  358. }
  359. /* we don't want the '?' character. */
  360. if (str->len) {
  361. parser->uri->query_string.ptr = str->ptr + 1;
  362. parser->uri->query_string.len = str->len - 1;
  363. }
  364. aws_byte_cursor_advance(str, parser->uri->query_string.len + 1);
  365. parser->state = FINISHED;
  366. }
  /* Maps a nibble (0..15) to its uppercase hexadecimal ASCII digit: 0-9 then A-F. */
  static uint8_t s_to_uppercase_hex(uint8_t value) {
      AWS_ASSERT(value < 16);

      return (value < 10) ? (uint8_t)('0' + value) : (uint8_t)('A' + value - 10);
  }
  /*
   * Signature of the per-byte encoders used by s_encode_cursor_to_buffer.
   * "Unchecked": the encoder writes up to 3 bytes at the end of `buffer`
   * without growing it, so the caller must have reserved the room beforehand.
   */
  typedef void(unchecked_append_canonicalized_character_fn)(struct aws_byte_buf *buffer, uint8_t value);
  375. /*
  376. * Appends a character or its hex encoding to the buffer. We reserve enough space up front so that
  377. * we can do this with raw pointers rather than multiple function calls/cursors/etc...
  378. *
  379. * This function is for the uri path
  380. */
  381. static void s_unchecked_append_canonicalized_path_character(struct aws_byte_buf *buffer, uint8_t value) {
  382. AWS_ASSERT(buffer->len + 3 <= buffer->capacity);
  383. uint8_t *dest_ptr = buffer->buffer + buffer->len;
  384. if (aws_isalnum(value)) {
  385. ++buffer->len;
  386. *dest_ptr = value;
  387. return;
  388. }
  389. switch (value) {
  390. /* non-alpha-numeric unreserved, don't % encode them */
  391. case '-':
  392. case '_':
  393. case '.':
  394. case '~':
  395. /* reserved characters that we should not % encode in the path component */
  396. case '/':
  397. ++buffer->len;
  398. *dest_ptr = value;
  399. return;
  400. /*
  401. * everything else we should % encode, including from the reserved list
  402. */
  403. default:
  404. buffer->len += 3;
  405. *dest_ptr++ = '%';
  406. *dest_ptr++ = s_to_uppercase_hex(value >> 4);
  407. *dest_ptr = s_to_uppercase_hex(value & 0x0F);
  408. return;
  409. }
  410. }
  411. /*
  412. * Appends a character or its hex encoding to the buffer. We reserve enough space up front so that
  413. * we can do this with raw pointers rather than multiple function calls/cursors/etc...
  414. *
  415. * This function is for query params
  416. */
  417. static void s_raw_append_canonicalized_param_character(struct aws_byte_buf *buffer, uint8_t value) {
  418. AWS_ASSERT(buffer->len + 3 <= buffer->capacity);
  419. uint8_t *dest_ptr = buffer->buffer + buffer->len;
  420. if (aws_isalnum(value)) {
  421. ++buffer->len;
  422. *dest_ptr = value;
  423. return;
  424. }
  425. switch (value) {
  426. case '-':
  427. case '_':
  428. case '.':
  429. case '~': {
  430. ++buffer->len;
  431. *dest_ptr = value;
  432. return;
  433. }
  434. default:
  435. buffer->len += 3;
  436. *dest_ptr++ = '%';
  437. *dest_ptr++ = s_to_uppercase_hex(value >> 4);
  438. *dest_ptr = s_to_uppercase_hex(value & 0x0F);
  439. return;
  440. }
  441. }
  442. /*
  443. * Writes a cursor to a buffer using the supplied encoding function.
  444. */
  445. static int s_encode_cursor_to_buffer(
  446. struct aws_byte_buf *buffer,
  447. const struct aws_byte_cursor *cursor,
  448. unchecked_append_canonicalized_character_fn *append_canonicalized_character) {
  449. const uint8_t *current_ptr = cursor->ptr;
  450. const uint8_t *end_ptr = cursor->ptr + cursor->len;
  451. /*
  452. * reserve room up front for the worst possible case: everything gets % encoded
  453. */
  454. size_t capacity_needed = 0;
  455. if (AWS_UNLIKELY(aws_mul_size_checked(3, cursor->len, &capacity_needed))) {
  456. return AWS_OP_ERR;
  457. }
  458. if (aws_byte_buf_reserve_relative(buffer, capacity_needed)) {
  459. return AWS_OP_ERR;
  460. }
  461. while (current_ptr < end_ptr) {
  462. append_canonicalized_character(buffer, *current_ptr);
  463. ++current_ptr;
  464. }
  465. return AWS_OP_SUCCESS;
  466. }
  467. int aws_byte_buf_append_encoding_uri_path(struct aws_byte_buf *buffer, const struct aws_byte_cursor *cursor) {
  468. return s_encode_cursor_to_buffer(buffer, cursor, s_unchecked_append_canonicalized_path_character);
  469. }
  470. int aws_byte_buf_append_encoding_uri_param(struct aws_byte_buf *buffer, const struct aws_byte_cursor *cursor) {
  471. return s_encode_cursor_to_buffer(buffer, cursor, s_raw_append_canonicalized_param_character);
  472. }
  473. int aws_byte_buf_append_decoding_uri(struct aws_byte_buf *buffer, const struct aws_byte_cursor *cursor) {
  474. /* reserve room up front for worst possible case: no % and everything copies over 1:1 */
  475. if (aws_byte_buf_reserve_relative(buffer, cursor->len)) {
  476. return AWS_OP_ERR;
  477. }
  478. /* advance over cursor */
  479. struct aws_byte_cursor advancing = *cursor;
  480. uint8_t c;
  481. while (aws_byte_cursor_read_u8(&advancing, &c)) {
  482. if (c == '%') {
  483. /* two hex characters following '%' are the byte's value */
  484. if (AWS_UNLIKELY(aws_byte_cursor_read_hex_u8(&advancing, &c) == false)) {
  485. return aws_raise_error(AWS_ERROR_MALFORMED_INPUT_STRING);
  486. }
  487. }
  488. buffer->buffer[buffer->len++] = c;
  489. }
  490. return AWS_OP_SUCCESS;
  491. }