url.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "../libnetdata.h"
  3. // ----------------------------------------------------------------------------
  4. // URL encode / decode
  5. // code from: http://www.geekhideout.com/urlcode.shtml
  6. /* Converts a hex character to its integer value */
  7. char from_hex(char ch) {
  8. return (char)(isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10);
  9. }
  10. /* Converts an integer value to its hex character*/
  11. char to_hex(char code) {
  12. static char hex[] = "0123456789abcdef";
  13. return hex[code & 15];
  14. }
  15. /* Returns a url-encoded version of str */
  16. /* IMPORTANT: be sure to free() the returned string after use */
  17. char *url_encode(char *str) {
  18. char *buf, *pbuf;
  19. pbuf = buf = mallocz(strlen(str) * 3 + 1);
  20. while (*str) {
  21. if (isalnum(*str) || *str == '-' || *str == '_' || *str == '.' || *str == '~')
  22. *pbuf++ = *str;
  23. else if (*str == ' ')
  24. *pbuf++ = '+';
  25. else{
  26. *pbuf++ = '%';
  27. *pbuf++ = to_hex(*str >> 4);
  28. *pbuf++ = to_hex(*str & 15);
  29. }
  30. str++;
  31. }
  32. *pbuf = '\0';
  33. pbuf = strdupz(buf);
  34. freez(buf);
  35. return pbuf;
  36. }
  37. /**
  38. * Percentage escape decode
  39. *
  40. * Decode %XX character or return 0 if cannot
  41. *
  42. * @param s the string to decode
  43. *
  44. * @return The character decoded on success and 0 otherwise
  45. */
  46. char url_percent_escape_decode(char *s) {
  47. if(likely(s[1] && s[2]))
  48. return from_hex(s[1]) << 4 | from_hex(s[2]);
  49. return 0;
  50. }
  51. /**
  52. * Get byte length
  53. *
  54. * This (utf8 string related) should be moved in separate file in future
  55. *
  56. * @param c is the utf8 character
  57. * *
  58. * @return It returns the length of the specific character.
  59. */
  60. char url_utf8_get_byte_length(char c) {
  61. if(!IS_UTF8_BYTE(c))
  62. return 1;
  63. char length = 0;
  64. while(likely(c & 0x80)) {
  65. length++;
  66. c <<= 1;
  67. }
  68. //4 byte is max size for UTF-8 char
  69. //10XX XXXX is not valid character -> check length == 1
  70. if(length > 4 || length == 1)
  71. return -1;
  72. return length;
  73. }
  74. /**
  75. * Decode Multibyte UTF8
  76. *
  77. * Decode % encoded UTF-8 characters and copy them to *d
  78. *
  79. * @param s first address
  80. * @param d
  81. * @param d_end last address
  82. *
  83. * @return count of bytes written to *d
  84. */
  85. char url_decode_multibyte_utf8(char *s, char *d, char *d_end) {
  86. char first_byte = url_percent_escape_decode(s);
  87. if(unlikely(!first_byte || !IS_UTF8_STARTBYTE(first_byte)))
  88. return 0;
  89. char byte_length = url_utf8_get_byte_length(first_byte);
  90. if(unlikely(byte_length <= 0 || d+byte_length >= d_end))
  91. return 0;
  92. char to_read = byte_length;
  93. while(to_read > 0) {
  94. char c = url_percent_escape_decode(s);
  95. if(unlikely( !IS_UTF8_BYTE(c) ))
  96. return 0;
  97. if((to_read != byte_length) && IS_UTF8_STARTBYTE(c))
  98. return 0;
  99. *d++ = c;
  100. s+=3;
  101. to_read--;
  102. }
  103. return byte_length;
  104. }
  105. /*
  106. * The utf8_check() function scans the '\0'-terminated string starting
  107. * at s. It returns a pointer to the first byte of the first malformed
  108. * or overlong UTF-8 sequence found, or NULL if the string contains
  109. * only correct UTF-8. It also spots UTF-8 sequences that could cause
  110. * trouble if converted to UTF-16, namely surrogate characters
  111. * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
  112. * routine is very likely to find a malformed sequence if the input
  113. * uses any other encoding than UTF-8. It therefore can be used as a
  114. * very effective heuristic for distinguishing between UTF-8 and other
  115. * encodings.
  116. *
  117. * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
  118. * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
  119. */
  120. unsigned char *utf8_check(unsigned char *s)
  121. {
  122. while (*s)
  123. {
  124. if (*s < 0x80)
  125. /* 0xxxxxxx */
  126. s++;
  127. else if ((s[0] & 0xe0) == 0xc0)
  128. {
  129. /* 110XXXXx 10xxxxxx */
  130. if ((s[1] & 0xc0) != 0x80 ||
  131. (s[0] & 0xfe) == 0xc0) /* overlong? */
  132. return s;
  133. else
  134. s += 2;
  135. }
  136. else if ((s[0] & 0xf0) == 0xe0)
  137. {
  138. /* 1110XXXX 10Xxxxxx 10xxxxxx */
  139. if ((s[1] & 0xc0) != 0x80 ||
  140. (s[2] & 0xc0) != 0x80 ||
  141. (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
  142. (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
  143. (s[0] == 0xef && s[1] == 0xbf &&
  144. (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
  145. return s;
  146. else
  147. s += 3;
  148. }
  149. else if ((s[0] & 0xf8) == 0xf0)
  150. {
  151. /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
  152. if ((s[1] & 0xc0) != 0x80 ||
  153. (s[2] & 0xc0) != 0x80 ||
  154. (s[3] & 0xc0) != 0x80 ||
  155. (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
  156. (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
  157. return s;
  158. else
  159. s += 4;
  160. }
  161. else
  162. return s;
  163. }
  164. return NULL;
  165. }
  166. char *url_decode_r(char *to, char *url, size_t size) {
  167. char *s = url, // source
  168. *d = to, // destination
  169. *e = &to[size - 1]; // destination end
  170. while(*s && d < e) {
  171. if(unlikely(*s == '%')) {
  172. char t = url_percent_escape_decode(s);
  173. if(IS_UTF8_BYTE(t)) {
  174. char bytes_written = url_decode_multibyte_utf8(s, d, e);
  175. if(likely(bytes_written)){
  176. d += bytes_written;
  177. s += (bytes_written * 3)-1;
  178. }
  179. else {
  180. goto fail_cleanup;
  181. }
  182. }
  183. else if(likely(t) && isprint(t)) {
  184. // avoid HTTP header injection
  185. *d++ = t;
  186. s += 2;
  187. }
  188. else
  189. goto fail_cleanup;
  190. }
  191. else if(unlikely(*s == '+'))
  192. *d++ = ' ';
  193. else
  194. *d++ = *s;
  195. s++;
  196. }
  197. *d = '\0';
  198. if(unlikely( utf8_check((unsigned char *)to) )) //NULL means success here
  199. return NULL;
  200. return to;
  201. fail_cleanup:
  202. *d = '\0';
  203. return NULL;
  204. }
  205. /**
  206. * Is request complete?
  207. *
  208. * Check whether the request is complete.
  209. * This function cannot check all the requests METHODS, for example, case you are working with POST, it will fail.
  210. *
  211. * @param begin is the first character of the sequence to analyse.
  212. * @param end is the last character of the sequence
  213. * @param length is the length of the total of bytes read, it is not the difference between end and begin.
  214. *
  215. * @return It returns 1 when the request is complete and 0 otherwise.
  216. */
  217. inline int url_is_request_complete(char *begin, char *end, size_t length) {
  218. if ( begin == end) {
  219. //Message cannot be complete when first and last address are the same
  220. return 0;
  221. }
  222. //This math to verify the last is valid, because we are discarding the POST
  223. if (length > 4) {
  224. begin = end - 4;
  225. }
  226. return (strstr(begin, "\r\n\r\n"))?1:0;
  227. }
  228. /**
  229. * Find protocol
  230. *
  231. * Search for the string ' HTTP/' in the message given.
  232. *
  233. * @param s is the start of the user request.
  234. * @return
  235. */
  236. inline char *url_find_protocol(char *s) {
  237. while(*s) {
  238. // find the next space
  239. while (*s && *s != ' ') s++;
  240. // is it SPACE + "HTTP/" ?
  241. if(*s && !strncmp(s, " HTTP/", 6)) break;
  242. else s++;
  243. }
  244. return s;
  245. }
  246. /**
  247. * Map query string
  248. *
  249. * Map the query string fields that will be decoded.
  250. * This functions must be called after to check the presence of query strings,
  251. * here we are assuming that you already tested this.
  252. *
  253. * @param out the pointer to pointers that will be used to map
  254. * @param url the input url that we are decoding.
  255. *
  256. * @return It returns the number of total variables in the query string.
  257. */
  258. int url_map_query_string(char **out, char *url) {
  259. (void)out;
  260. (void)url;
  261. int count = 0;
  262. //First we try to parse considering that there was not URL encode process
  263. char *moveme = url;
  264. char *ptr;
  265. //We always we have at least one here, so I can set this.
  266. out[count++] = moveme;
  267. while(moveme) {
  268. ptr = strchr((moveme+1), '&');
  269. if(ptr) {
  270. out[count++] = ptr;
  271. }
  272. moveme = ptr;
  273. }
  274. //I could not find any '&', so I am assuming now it is like '%26'
  275. if (count == 1) {
  276. moveme = url;
  277. while(moveme) {
  278. ptr = strchr((moveme+1), '%');
  279. if(ptr) {
  280. char *test = (ptr+1);
  281. if (!strncmp(test, "3f", 2) || !strncmp(test, "3F", 2)) {
  282. out[count++] = ptr;
  283. }
  284. }
  285. moveme = ptr;
  286. }
  287. }
  288. return count;
  289. }
  290. /**
  291. * Parse query string
  292. *
  293. * Parse the query string mapped and store it inside output.
  294. *
  295. * @param output is a vector where I will store the string.
  296. * @param max is the maximum length of the output
  297. * @param map the map done by the function url_map_query_string.
  298. * @param total the total number of variables inside map
  299. *
  300. * @return It returns 0 on success and -1 otherwise
  301. */
  302. int url_parse_query_string(char *output, size_t max, char **map, int total) {
  303. if(!total) {
  304. return 0;
  305. }
  306. int counter, next;
  307. size_t length;
  308. char *end;
  309. char *begin = map[0];
  310. char save;
  311. size_t copied = 0;
  312. for(counter = 0, next=1 ; next <= total ; ++counter, ++next) {
  313. if (next != total) {
  314. end = map[next];
  315. length = (size_t) (end - begin);
  316. save = *end;
  317. *end = 0x00;
  318. } else {
  319. length = strlen(begin);
  320. end = NULL;
  321. }
  322. length++;
  323. if (length > (max - copied)) {
  324. error("Parsing query string: we cannot parse a query string so big");
  325. break;
  326. }
  327. if(!url_decode_r(output, begin, length)) {
  328. return -1;
  329. }
  330. length = strlen(output);
  331. copied += length;
  332. output += length;
  333. begin = end;
  334. if (begin) {
  335. *begin = save;
  336. }
  337. }
  338. return 0;
  339. }