url.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. #include "url.h"
  2. #include <util/string/cast.h>
  3. #include <util/string/util.h>
  4. #include <util/string/cstriter.h>
  5. #include <util/string/ascii.h>
  6. #include <util/string/strip.h>
  7. #include <util/charset/unidata.h> // for ToLower
  8. #include <util/system/defaults.h>
  9. #include <util/generic/algorithm.h>
  10. #include <util/generic/hash_set.h>
  11. #include <util/generic/yexception.h>
  12. #include <util/generic/singleton.h>
  13. #include <cstdlib>
  14. namespace {
  // Bounds policy that performs no length check at all:
  // Has() answers "yes" for every requested size.
  struct TUncheckedSize {
      static bool Has(size_t /*required*/) {
          const bool alwaysEnough = true;
          return alwaysEnough;
      }
  };
  // Bounds policy for input whose length is known up front.
  // Has(sz) is true when at least `sz` characters are available.
  struct TKnownSize {
      size_t MySize; // number of characters available in the input

      explicit TKnownSize(size_t sz)
          : MySize(sz)
      {
      }

      bool Has(size_t sz) const {
          return !(MySize < sz);
      }
  };
  // Compares the first `n` characters of `s1` and `s2`, lower-casing the
  // characters of `s1` only (so `s2` is expected to be lower-case already).
  // Returns 0 when all `n` positions match, otherwise -1 / 1 according to the
  // first differing position. Scanning stops at the first mismatch, so no
  // character after it is read.
  template <typename TChar1, typename TChar2>
  int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
      size_t pos = 0;
      while (pos < n) {
          const TChar1 folded = (TChar1)ToLower(s1[pos]);
          if (folded != s2[pos]) {
              return folded < s2[pos] ? -1 : 1;
          }
          ++pos;
      }
      return 0;
  }
  // Returns the length of a leading "http://" (7) or "https://" (8) in `url`,
  // matched case-insensitively, or 0 when neither is present.
  //   urlSize     - bounds policy; Has(k) tells whether k characters may be examined.
  //   ignorehttps - when true, only "http://" is recognised.
  template <typename TChar, typename TBounds>
  inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
      const TChar plainPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
      const TChar securePrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
      const size_t plainLen = 7;
      const size_t secureLen = 8;

      const bool isPlain = urlSize.Has(plainLen) && Compare1Case2(url, plainPrefix, plainLen) == 0;
      if (isPlain) {
          return plainLen;
      }
      if (ignorehttps) {
          return 0;
      }
      const bool isSecure = urlSize.Has(secureLen) && Compare1Case2(url, securePrefix, secureLen) == 0;
      return isSecure ? secureLen : 0;
  }
  48. template <typename T>
  49. inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
  50. size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps);
  51. if (prefixSize)
  52. return url.substr(prefixSize);
  53. return url;
  54. }
  55. }
  56. namespace NUrl {
  57. TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
  58. TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
  59. TStringBuf path = url;
  60. path.SkipPrefix(host);
  61. return {host, path};
  62. }
  63. bool HasLowerHost(const TStringBuf &url) {
  64. for (size_t n = 0; n < url.length(); ++n) {
  65. if (url[n] == '/')
  66. break;
  67. if (isupper(url[n]))
  68. return false;
  69. }
  70. return true;
  71. }
  72. TStringBuf CutHttpWwwPrefixes(const TStringBuf &url) {
  73. TStringBuf urlCut = CutWWWPrefix(CutHttpPrefix(url));
  74. if (!urlCut.empty() && urlCut.back() == '/')
  75. urlCut = urlCut.substr(0, urlCut.length() - 1);
  76. return urlCut;
  77. }
  78. TString MakeLowerHost(const TStringBuf &url, size_t shift) {
  79. TString urlFixed(url);
  80. for (char *c = urlFixed.begin() + shift; *c && (*c != '/'); ++c) {
  81. *c = tolower(*c);
  82. }
  83. return urlFixed;
  84. }
  85. TString MakeNormalized(const TStringBuf &url) {
  86. TStringBuf urlCut = CutHttpWwwPrefixes(url);
  87. if (HasLowerHost(urlCut)) {
  88. return ToString(urlCut);
  89. }
  90. return MakeLowerHost(urlCut);
  91. }
  92. } // namespace NUrl
  93. size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
  94. return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
  95. }
  96. size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
  97. return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
  98. }
  99. size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
  100. return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
  101. }
  102. size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
  103. return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
  104. }
  105. TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
  106. return CutHttpPrefixImpl(url, ignorehttps);
  107. }
  108. TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
  109. return CutHttpPrefixImpl(url, ignorehttps);
  110. }
  111. size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
  112. if (url.empty()) {
  113. return 0;
  114. }
  115. struct TDelim: public str_spn {
  116. inline TDelim()
  117. : str_spn("!-/:-@[-`{|}", true)
  118. {
  119. }
  120. };
  121. const auto& delim = *Singleton<TDelim>();
  122. const char* n = delim.brk(url.data(), url.end());
  123. if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') {
  124. return 0;
  125. }
  126. return n + 3 - url.begin();
  127. }
  128. TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
  129. return url.Head(GetSchemePrefixSize(url));
  130. }
  131. TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
  132. return url.Tail(GetSchemePrefixSize(url));
  133. }
  134. template <bool KeepPort>
  135. static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
  136. TStringBuf urlNoScheme = url;
  137. urlNoScheme.Skip(GetHttpPrefixSize(url));
  138. struct TDelim: public str_spn {
  139. inline TDelim()
  140. : str_spn(KeepPort ? "/;?#" : "/:;?#")
  141. {
  142. }
  143. };
  144. const auto& nonHostCharacters = *Singleton<TDelim>();
  145. const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end());
  146. if (firstNonHostCharacter != urlNoScheme.end()) {
  147. return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data());
  148. }
  149. return urlNoScheme;
  150. }
  151. TStringBuf GetHost(const TStringBuf url) noexcept {
  152. return GetHostAndPortImpl<false>(url);
  153. }
  154. TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
  155. return GetHostAndPortImpl<true>(url);
  156. }
  157. TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
  158. const size_t schemeSize = GetSchemePrefixSize(url);
  159. const TStringBuf scheme = url.Head(schemeSize);
  160. const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
  161. const TStringBuf host = GetHost(url.Tail(schemeSize));
  162. if (isHttp && trimHttp) {
  163. return host;
  164. } else {
  165. return TStringBuf(scheme.begin(), host.end());
  166. }
  167. }
  168. TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
  169. const size_t schemeSize = GetSchemePrefixSize(url);
  170. const TStringBuf scheme = url.Head(schemeSize);
  171. const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
  172. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  173. if (trimDefaultPort) {
  174. const size_t pos = hostAndPort.find(':');
  175. if (pos != TStringBuf::npos) {
  176. const bool isHttps = (scheme == TStringBuf("https://"));
  177. const TStringBuf port = hostAndPort.Tail(pos + 1);
  178. if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
  179. // trimming default port
  180. hostAndPort = hostAndPort.Head(pos);
  181. }
  182. }
  183. }
  184. if (isHttp && trimHttp) {
  185. return hostAndPort;
  186. } else {
  187. return TStringBuf(scheme.begin(), hostAndPort.end());
  188. }
  189. }
  190. void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
  191. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  192. host = hostBuf;
  193. path = pathBuf;
  194. }
  195. void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
  196. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  197. host = hostBuf;
  198. path = pathBuf;
  199. }
  200. void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
  201. TStringBuf urlWithoutFragment;
  202. if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
  203. fragment = "";
  204. urlWithoutFragment = url;
  205. }
  206. if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
  207. query = "";
  208. sanitizedUrl = urlWithoutFragment;
  209. }
  210. }
  211. bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  212. const size_t schemeSize = GetSchemePrefixSize(url);
  213. if (schemeSize != 0) {
  214. scheme = url.Head(schemeSize);
  215. }
  216. TStringBuf portStr;
  217. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  218. if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
  219. // URL has port
  220. if (!TryFromString(portStr, port)) {
  221. return false;
  222. }
  223. } else {
  224. host = hostAndPort;
  225. if (scheme == TStringBuf("https://")) {
  226. port = 443;
  227. } else if (scheme == TStringBuf("http://")) {
  228. port = 80;
  229. }
  230. }
  231. return true;
  232. }
  233. void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  234. bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port);
  235. Y_ENSURE(isOk, "cannot parse port number from URL: " << url);
  236. }
  237. TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
  238. return GetHost(CutSchemePrefix(url));
  239. }
  240. TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
  241. const size_t off = url.find('/', GetHttpPrefixSize(url));
  242. TStringBuf hostUnused, path;
  243. if (!url.TrySplitAt(off, hostUnused, path))
  244. return "/";
  245. return trimFragment ? path.Before('#') : path;
  246. }
  247. // this strange creature returns 2nd level domain, possibly with port
  248. TStringBuf GetDomain(const TStringBuf host) noexcept {
  249. const char* c = !host ? host.data() : host.end() - 1;
  250. for (bool wasPoint = false; c != host.data(); --c) {
  251. if (*c == '.') {
  252. if (wasPoint) {
  253. ++c;
  254. break;
  255. }
  256. wasPoint = true;
  257. }
  258. }
  259. return TStringBuf(c, host.end());
  260. }
  261. TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
  262. size_t pos = host.size();
  263. for (size_t i = 0; i < level; ++i) {
  264. pos = host.rfind('.', pos);
  265. if (pos == TString::npos)
  266. return host;
  267. }
  268. return host.SubStr(pos + 1);
  269. }
  270. TStringBuf GetZone(const TStringBuf host) noexcept {
  271. return GetParentDomain(host, 1);
  272. }
  273. TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
  274. if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3))
  275. return url.substr(4);
  276. return url;
  277. }
  278. TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
  279. auto it = url.begin();
  280. StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; });
  281. if (it == url.begin()) {
  282. return url;
  283. }
  284. StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); });
  285. if (it == url.end()) {
  286. return url;
  287. }
  288. if (*it++ == '.') {
  289. return url.Tail(it - url.begin());
  290. }
  291. return url;
  292. }
  293. TStringBuf CutMPrefix(const TStringBuf url) noexcept {
  294. if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) {
  295. return url.substr(2);
  296. }
  297. return url;
  298. }
  299. static inline bool IsSchemeChar(char c) noexcept {
  300. return IsAsciiAlnum(c); //what about '+' ?..
  301. }
  302. static bool HasPrefix(const TStringBuf url) noexcept {
  303. TStringBuf scheme, unused;
  304. if (!url.TrySplit(TStringBuf("://"), scheme, unused))
  305. return false;
  306. return AllOf(scheme, IsSchemeChar);
  307. }
  308. TString AddSchemePrefix(const TString& url) {
  309. return AddSchemePrefix(url, TStringBuf("http"));
  310. }
  311. TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
  312. if (HasPrefix(url)) {
  313. return url;
  314. }
  315. return TString::Join(scheme, TStringBuf("://"), url);
  316. }
  317. #define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0'))
  318. static inline int x2c(unsigned char* x) {
  319. if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1]))
  320. return -1;
  321. return X(x[0]) * 16 + X(x[1]);
  322. }
  323. #undef X
  324. static inline int Unescape(char* str) {
  325. char *to, *from;
  326. int dlen = 0;
  327. if ((str = strchr(str, '%')) == nullptr)
  328. return dlen;
  329. for (to = str, from = str; *from; from++, to++) {
  330. if ((*to = *from) == '%') {
  331. int c = x2c((unsigned char*)from + 1);
  332. *to = char((c > 0) ? c : '0');
  333. from += 2;
  334. dlen += 2;
  335. }
  336. }
  337. *to = 0; /* terminate it at the new length */
  338. return dlen;
  339. }
  340. size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
  341. if (source.empty() || source[0] == '?')
  342. return strlcpy(dest, "/", dest_size);
  343. size_t len = Min(dest_size - 1, source.length());
  344. memcpy(dest, source.data(), len);
  345. dest[len] = 0;
  346. len -= Unescape(dest);
  347. strlwr(dest);
  348. return len;
  349. }
  350. size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
  351. size_t len = Min(dest_size - 1, source.length());
  352. memcpy(dest, source.data(), len);
  353. dest[len] = 0;
  354. char buf[8] = ":";
  355. size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
  356. buf[buflen] = '\0';
  357. char* ptr = strstr(dest, buf);
  358. if (ptr && ptr[buflen] == 0) {
  359. len -= buflen;
  360. *ptr = 0;
  361. }
  362. strlwr(dest);
  363. return len;
  364. }
  365. TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
  366. if (str.EndsWith('/')) {
  367. str.Chop(1);
  368. }
  369. return str;
  370. }
  371. TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
  372. url = CutSchemePrefix(url);
  373. url = CutWWWPrefix(url);
  374. return url;
  375. }
  376. bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
  377. url = CutSchemePrefix(url);
  378. const TStringBuf noHostSuffix = url.After('/');
  379. if (noHostSuffix == url) {
  380. // no slash => no suffix with token info
  381. return false;
  382. }
  383. const bool suffixHasPrefix = noHostSuffix.StartsWith(token);
  384. if (!suffixHasPrefix) {
  385. return false;
  386. }
  387. const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length();
  388. const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length();
  389. const bool nothingAfterPrefix = noHostSuffix.length() <= token.length();
  390. const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix;
  391. return prefixIsToken;
  392. }