url.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. #include "url.h"
  2. #include <util/string/cast.h>
  3. #include <util/string/util.h>
  4. #include <util/string/cstriter.h>
  5. #include <util/string/ascii.h>
  6. #include <util/string/strip.h>
  7. #include <util/charset/unidata.h> // for ToLower
  8. #include <util/system/defaults.h>
  9. #include <util/generic/algorithm.h>
  10. #include <util/generic/hash_set.h>
  11. #include <util/generic/yexception.h>
  12. #include <util/generic/singleton.h>
  13. #include <cstdlib>
  14. namespace {
  // Bounds policy that reports every requested size as available.
  struct TUncheckedSize {
      static bool Has(size_t /*required*/) {
          const bool alwaysAvailable = true;
          return alwaysAvailable;
      }
  };
  // Bounds policy for input of known length: a requested size is available
  // only when it does not exceed the length stored at construction.
  struct TKnownSize {
      size_t MySize;

      explicit TKnownSize(size_t sz)
          : MySize(sz)
      {
      }

      bool Has(size_t sz) const {
          const bool exceeds = MySize < sz;
          return !exceeds;
      }
  };
  // Compares the first n characters of s1, each passed through ToLower, with
  // the characters of s2 taken as they are.
  // Returns 0 when all n positions agree; otherwise -1 or 1 depending on
  // whether the lowered s1 character is less than the s2 character at the
  // first differing position.
  template <typename TChar1, typename TChar2>
  int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
      size_t pos = 0;
      while (pos < n) {
          const TChar1 lowered = static_cast<TChar1>(ToLower(s1[pos]));
          if (lowered != s2[pos]) {
              return lowered < s2[pos] ? -1 : 1;
          }
          ++pos;
      }
      return 0;
  }
  // Returns the length of a leading "http://" (7) or, unless ignorehttps is
  // set, "https://" (8); 0 when neither is present.
  // The url side of the comparison is lowercased, so the match is
  // case-insensitive. urlSize decides whether that many characters may be
  // inspected at all.
  template <typename TChar, typename TBounds>
  inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
      const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
      const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
      constexpr size_t httpLen = 7;
      constexpr size_t httpsLen = 8;

      if (urlSize.Has(httpLen) && Compare1Case2(url, httpPrefix, httpLen) == 0) {
          return httpLen;
      }
      if (ignorehttps) {
          return 0;
      }
      if (urlSize.Has(httpsLen) && Compare1Case2(url, httpsPrefix, httpsLen) == 0) {
          return httpsLen;
      }
      return 0;
  }
  48. template <typename T>
  49. inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
  50. size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps);
  51. if (prefixSize)
  52. return url.substr(prefixSize);
  53. return url;
  54. }
  55. }
  56. namespace NUrl {
  57. TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
  58. TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
  59. TStringBuf path = url;
  60. path.SkipPrefix(host);
  61. return {host, path};
  62. }
  63. } // namespace NUrl
  64. size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
  65. return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
  66. }
  67. size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
  68. return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
  69. }
  70. size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
  71. return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
  72. }
  73. size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
  74. return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
  75. }
  76. TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
  77. return CutHttpPrefixImpl(url, ignorehttps);
  78. }
  79. TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
  80. return CutHttpPrefixImpl(url, ignorehttps);
  81. }
  82. size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
  83. if (url.empty()) {
  84. return 0;
  85. }
  86. struct TDelim: public str_spn {
  87. inline TDelim()
  88. : str_spn("!-/:-@[-`{|}", true)
  89. {
  90. }
  91. };
  92. const auto& delim = *Singleton<TDelim>();
  93. const char* n = delim.brk(url.data(), url.end());
  94. if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') {
  95. return 0;
  96. }
  97. return n + 3 - url.begin();
  98. }
  99. TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
  100. return url.Head(GetSchemePrefixSize(url));
  101. }
  102. TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
  103. return url.Tail(GetSchemePrefixSize(url));
  104. }
  105. template <bool KeepPort>
  106. static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
  107. TStringBuf urlNoScheme = url;
  108. urlNoScheme.Skip(GetHttpPrefixSize(url));
  109. struct TDelim: public str_spn {
  110. inline TDelim()
  111. : str_spn(KeepPort ? "/;?#" : "/:;?#")
  112. {
  113. }
  114. };
  115. const auto& nonHostCharacters = *Singleton<TDelim>();
  116. const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end());
  117. if (firstNonHostCharacter != urlNoScheme.end()) {
  118. return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data());
  119. }
  120. return urlNoScheme;
  121. }
  122. TStringBuf GetHost(const TStringBuf url) noexcept {
  123. return GetHostAndPortImpl<false>(url);
  124. }
  125. TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
  126. return GetHostAndPortImpl<true>(url);
  127. }
  128. TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
  129. const size_t schemeSize = GetSchemePrefixSize(url);
  130. const TStringBuf scheme = url.Head(schemeSize);
  131. const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
  132. const TStringBuf host = GetHost(url.Tail(schemeSize));
  133. if (isHttp && trimHttp) {
  134. return host;
  135. } else {
  136. return TStringBuf(scheme.begin(), host.end());
  137. }
  138. }
  139. TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
  140. const size_t schemeSize = GetSchemePrefixSize(url);
  141. const TStringBuf scheme = url.Head(schemeSize);
  142. const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
  143. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  144. if (trimDefaultPort) {
  145. const size_t pos = hostAndPort.find(':');
  146. if (pos != TStringBuf::npos) {
  147. const bool isHttps = (scheme == TStringBuf("https://"));
  148. const TStringBuf port = hostAndPort.Tail(pos + 1);
  149. if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
  150. // trimming default port
  151. hostAndPort = hostAndPort.Head(pos);
  152. }
  153. }
  154. }
  155. if (isHttp && trimHttp) {
  156. return hostAndPort;
  157. } else {
  158. return TStringBuf(scheme.begin(), hostAndPort.end());
  159. }
  160. }
  161. void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
  162. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  163. host = hostBuf;
  164. path = pathBuf;
  165. }
  166. void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
  167. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  168. host = hostBuf;
  169. path = pathBuf;
  170. }
  171. void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
  172. TStringBuf urlWithoutFragment;
  173. if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
  174. fragment = "";
  175. urlWithoutFragment = url;
  176. }
  177. if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
  178. query = "";
  179. sanitizedUrl = urlWithoutFragment;
  180. }
  181. }
  182. bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  183. const size_t schemeSize = GetSchemePrefixSize(url);
  184. if (schemeSize != 0) {
  185. scheme = url.Head(schemeSize);
  186. }
  187. TStringBuf portStr;
  188. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  189. if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
  190. // URL has port
  191. if (!TryFromString(portStr, port)) {
  192. return false;
  193. }
  194. } else {
  195. host = hostAndPort;
  196. if (scheme == TStringBuf("https://")) {
  197. port = 443;
  198. } else if (scheme == TStringBuf("http://")) {
  199. port = 80;
  200. }
  201. }
  202. return true;
  203. }
  204. void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  205. bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port);
  206. Y_ENSURE(isOk, "cannot parse port number from URL: " << url);
  207. }
  208. TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
  209. return GetHost(CutSchemePrefix(url));
  210. }
  211. TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
  212. const size_t off = url.find('/', GetHttpPrefixSize(url));
  213. TStringBuf hostUnused, path;
  214. if (!url.TrySplitAt(off, hostUnused, path))
  215. return "/";
  216. return trimFragment ? path.Before('#') : path;
  217. }
  218. // this strange creature returns 2nd level domain, possibly with port
  219. TStringBuf GetDomain(const TStringBuf host) noexcept {
  220. const char* c = !host ? host.data() : host.end() - 1;
  221. for (bool wasPoint = false; c != host.data(); --c) {
  222. if (*c == '.') {
  223. if (wasPoint) {
  224. ++c;
  225. break;
  226. }
  227. wasPoint = true;
  228. }
  229. }
  230. return TStringBuf(c, host.end());
  231. }
  232. TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
  233. size_t pos = host.size();
  234. for (size_t i = 0; i < level; ++i) {
  235. pos = host.rfind('.', pos);
  236. if (pos == TString::npos)
  237. return host;
  238. }
  239. return host.SubStr(pos + 1);
  240. }
  241. TStringBuf GetZone(const TStringBuf host) noexcept {
  242. return GetParentDomain(host, 1);
  243. }
  244. TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
  245. if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3))
  246. return url.substr(4);
  247. return url;
  248. }
  249. TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
  250. auto it = url.begin();
  251. StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; });
  252. if (it == url.begin()) {
  253. return url;
  254. }
  255. StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); });
  256. if (it == url.end()) {
  257. return url;
  258. }
  259. if (*it++ == '.') {
  260. return url.Tail(it - url.begin());
  261. }
  262. return url;
  263. }
  264. TStringBuf CutMPrefix(const TStringBuf url) noexcept {
  265. if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) {
  266. return url.substr(2);
  267. }
  268. return url;
  269. }
  270. static inline bool IsSchemeChar(char c) noexcept {
  271. return IsAsciiAlnum(c); //what about '+' ?..
  272. }
  273. static bool HasPrefix(const TStringBuf url) noexcept {
  274. TStringBuf scheme, unused;
  275. if (!url.TrySplit(TStringBuf("://"), scheme, unused))
  276. return false;
  277. return AllOf(scheme, IsSchemeChar);
  278. }
  279. TString AddSchemePrefix(const TString& url) {
  280. return AddSchemePrefix(url, TStringBuf("http"));
  281. }
  282. TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
  283. if (HasPrefix(url)) {
  284. return url;
  285. }
  286. return TString::Join(scheme, TStringBuf("://"), url);
  287. }
  // Value of one ASCII hexadecimal digit, or -1 when c is not a hex digit.
  static inline int HexDigitValue(unsigned char c) {
      if (c >= '0' && c <= '9') {
          return c - '0';
      }
      if (c >= 'a' && c <= 'f') {
          return c - 'a' + 10;
      }
      if (c >= 'A' && c <= 'F') {
          return c - 'A' + 10;
      }
      return -1;
  }

  // Decodes the two hex digits x[0], x[1] into a byte value.
  // Returns -1 unless both are hex digits. x[1] is read only when x[0] is a
  // hex digit, so the call is safe when x points at the terminating NUL.
  static inline int x2c(unsigned char* x) {
      const int hi = HexDigitValue(x[0]);
      if (hi < 0) {
          return -1;
      }
      const int lo = HexDigitValue(x[1]);
      if (lo < 0) {
          return -1;
      }
      return hi * 16 + lo;
  }

  // Decodes %XX escapes of the NUL-terminated string str in place.
  //
  // A valid escape with a non-zero value is replaced by that byte; an invalid
  // escape and "%00" are replaced by the character '0'. In both cases the (up
  // to) two characters after '%' are consumed.
  // Returns the number of characters by which the string became shorter.
  static inline int Unescape(char* str) {
      int dlen = 0;
      str = strchr(str, '%');
      if (str == nullptr) {
          return dlen;
      }

      char* to = str;
      for (char* from = str; *from; ++from, ++to) {
          *to = *from;
          if (*from != '%') {
              continue;
          }
          const int c = x2c(reinterpret_cast<unsigned char*>(from + 1));
          *to = char((c > 0) ? c : '0');
          // Consume the escape's two characters, but never step over the
          // terminating NUL: a truncated escape at the end of the string
          // ("...%" or "...%4") must not move `from` past the buffer.
          for (int consumed = 0; consumed < 2 && from[1] != 0; ++consumed) {
              ++from;
              ++dlen;
          }
      }
      *to = 0; /* terminate it at the new length */
      return dlen;
  }
  311. size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
  312. if (source.empty() || source[0] == '?')
  313. return strlcpy(dest, "/", dest_size);
  314. size_t len = Min(dest_size - 1, source.length());
  315. memcpy(dest, source.data(), len);
  316. dest[len] = 0;
  317. len -= Unescape(dest);
  318. strlwr(dest);
  319. return len;
  320. }
  321. size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
  322. size_t len = Min(dest_size - 1, source.length());
  323. memcpy(dest, source.data(), len);
  324. dest[len] = 0;
  325. char buf[8] = ":";
  326. size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
  327. buf[buflen] = '\0';
  328. char* ptr = strstr(dest, buf);
  329. if (ptr && ptr[buflen] == 0) {
  330. len -= buflen;
  331. *ptr = 0;
  332. }
  333. strlwr(dest);
  334. return len;
  335. }
  336. TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
  337. if (str.EndsWith('/')) {
  338. str.Chop(1);
  339. }
  340. return str;
  341. }
  342. TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
  343. url = CutSchemePrefix(url);
  344. url = CutWWWPrefix(url);
  345. return url;
  346. }
  347. bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
  348. url = CutSchemePrefix(url);
  349. const TStringBuf noHostSuffix = url.After('/');
  350. if (noHostSuffix == url) {
  351. // no slash => no suffix with token info
  352. return false;
  353. }
  354. const bool suffixHasPrefix = noHostSuffix.StartsWith(token);
  355. if (!suffixHasPrefix) {
  356. return false;
  357. }
  358. const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length();
  359. const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length();
  360. const bool nothingAfterPrefix = noHostSuffix.length() <= token.length();
  361. const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix;
  362. return prefixIsToken;
  363. }