url.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. #include "url.h"
  2. #include <util/string/cast.h>
  3. #include <util/string/util.h>
  4. #include <util/string/cstriter.h>
  5. #include <util/string/ascii.h>
  6. #include <util/string/strip.h>
  7. #include <util/charset/unidata.h> // for ToLower
  8. #include <util/system/defaults.h>
  9. #include <util/generic/algorithm.h>
  10. #include <util/generic/hash_set.h>
  11. #include <util/generic/yexception.h>
  12. #include <util/generic/singleton.h>
  13. #include <cstdlib>
  14. namespace {
  15. struct TUncheckedSize {
  16. static bool Has(size_t) {
  17. return true;
  18. }
  19. };
  20. struct TKnownSize {
  21. size_t MySize;
  22. explicit TKnownSize(size_t sz)
  23. : MySize(sz)
  24. {
  25. }
  26. bool Has(size_t sz) const {
  27. return sz <= MySize;
  28. }
  29. };
  30. template <typename TChar1, typename TChar2>
  31. int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
  32. for (size_t i = 0; i < n; ++i) {
  33. if ((TChar1)ToLower(s1[i]) != s2[i])
  34. return (TChar1)ToLower(s1[i]) < s2[i] ? -1 : 1;
  35. }
  36. return 0;
  37. }
  38. template <typename TChar, typename TBounds>
  39. inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
  40. const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
  41. const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
  42. if (urlSize.Has(7) && Compare1Case2(url, httpPrefix, 7) == 0)
  43. return 7;
  44. if (!ignorehttps && urlSize.Has(8) && Compare1Case2(url, httpsPrefix, 8) == 0)
  45. return 8;
  46. return 0;
  47. }
  48. template <typename T>
  49. inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
  50. size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps);
  51. if (prefixSize)
  52. return url.substr(prefixSize);
  53. return url;
  54. }
  55. }
  56. namespace NUrl {
  57. TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
  58. TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
  59. TStringBuf path = url;
  60. path.SkipPrefix(host);
  61. return {host, path};
  62. }
  63. } // namespace NUrl
  64. size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
  65. return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
  66. }
  67. size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
  68. return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
  69. }
  70. size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
  71. return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
  72. }
  73. size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
  74. return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
  75. }
  76. TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
  77. return CutHttpPrefixImpl(url, ignorehttps);
  78. }
  79. TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
  80. return CutHttpPrefixImpl(url, ignorehttps);
  81. }
  82. size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
  83. struct TDelim: public str_spn {
  84. inline TDelim()
  85. : str_spn("!-/:-@[-`{|}", true)
  86. {
  87. }
  88. };
  89. const auto& delim = *Singleton<TDelim>();
  90. const char* n = delim.brk(url.data(), url.end());
  91. if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') {
  92. return 0;
  93. }
  94. return n + 3 - url.begin();
  95. }
  96. TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
  97. return url.Head(GetSchemePrefixSize(url));
  98. }
  99. TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
  100. return url.Tail(GetSchemePrefixSize(url));
  101. }
  102. template <bool KeepPort>
  103. static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
  104. TStringBuf urlNoScheme = url;
  105. urlNoScheme.Skip(GetHttpPrefixSize(url));
  106. struct TDelim: public str_spn {
  107. inline TDelim()
  108. : str_spn(KeepPort ? "/;?#" : "/:;?#")
  109. {
  110. }
  111. };
  112. const auto& nonHostCharacters = *Singleton<TDelim>();
  113. const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end());
  114. if (firstNonHostCharacter != urlNoScheme.end()) {
  115. return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data());
  116. }
  117. return urlNoScheme;
  118. }
  119. TStringBuf GetHost(const TStringBuf url) noexcept {
  120. return GetHostAndPortImpl<false>(url);
  121. }
  122. TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
  123. return GetHostAndPortImpl<true>(url);
  124. }
  125. TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
  126. const size_t schemeSize = GetSchemePrefixSize(url);
  127. const TStringBuf scheme = url.Head(schemeSize);
  128. const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
  129. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  130. if (trimDefaultPort) {
  131. const size_t pos = hostAndPort.find(':');
  132. if (pos != TStringBuf::npos) {
  133. const bool isHttps = (scheme == TStringBuf("https://"));
  134. const TStringBuf port = hostAndPort.Tail(pos + 1);
  135. if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
  136. // trimming default port
  137. hostAndPort = hostAndPort.Head(pos);
  138. }
  139. }
  140. }
  141. if (isHttp && trimHttp) {
  142. return hostAndPort;
  143. } else {
  144. return TStringBuf(scheme.begin(), hostAndPort.end());
  145. }
  146. }
  147. void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
  148. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  149. host = hostBuf;
  150. path = pathBuf;
  151. }
  152. void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
  153. auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
  154. host = hostBuf;
  155. path = pathBuf;
  156. }
  157. void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
  158. TStringBuf urlWithoutFragment;
  159. if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
  160. fragment = "";
  161. urlWithoutFragment = url;
  162. }
  163. if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
  164. query = "";
  165. sanitizedUrl = urlWithoutFragment;
  166. }
  167. }
  168. bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  169. const size_t schemeSize = GetSchemePrefixSize(url);
  170. if (schemeSize != 0) {
  171. scheme = url.Head(schemeSize);
  172. }
  173. TStringBuf portStr;
  174. TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
  175. if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
  176. // URL has port
  177. if (!TryFromString(portStr, port)) {
  178. return false;
  179. }
  180. } else {
  181. host = hostAndPort;
  182. if (scheme == TStringBuf("https://")) {
  183. port = 443;
  184. } else if (scheme == TStringBuf("http://")) {
  185. port = 80;
  186. }
  187. }
  188. return true;
  189. }
  190. void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
  191. bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port);
  192. Y_ENSURE(isOk, "cannot parse port number from URL: " << url);
  193. }
  194. TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
  195. return GetHost(CutSchemePrefix(url));
  196. }
  197. TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
  198. const size_t off = url.find('/', GetHttpPrefixSize(url));
  199. TStringBuf hostUnused, path;
  200. if (!url.TrySplitAt(off, hostUnused, path))
  201. return "/";
  202. return trimFragment ? path.Before('#') : path;
  203. }
  204. // this strange creature returns 2nd level domain, possibly with port
  205. TStringBuf GetDomain(const TStringBuf host) noexcept {
  206. const char* c = !host ? host.data() : host.end() - 1;
  207. for (bool wasPoint = false; c != host.data(); --c) {
  208. if (*c == '.') {
  209. if (wasPoint) {
  210. ++c;
  211. break;
  212. }
  213. wasPoint = true;
  214. }
  215. }
  216. return TStringBuf(c, host.end());
  217. }
  218. TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
  219. size_t pos = host.size();
  220. for (size_t i = 0; i < level; ++i) {
  221. pos = host.rfind('.', pos);
  222. if (pos == TString::npos)
  223. return host;
  224. }
  225. return host.SubStr(pos + 1);
  226. }
  227. TStringBuf GetZone(const TStringBuf host) noexcept {
  228. return GetParentDomain(host, 1);
  229. }
  230. TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
  231. if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3))
  232. return url.substr(4);
  233. return url;
  234. }
  235. TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
  236. auto it = url.begin();
  237. StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; });
  238. if (it == url.begin()) {
  239. return url;
  240. }
  241. StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); });
  242. if (it == url.end()) {
  243. return url;
  244. }
  245. if (*it++ == '.') {
  246. return url.Tail(it - url.begin());
  247. }
  248. return url;
  249. }
  250. TStringBuf CutMPrefix(const TStringBuf url) noexcept {
  251. if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) {
  252. return url.substr(2);
  253. }
  254. return url;
  255. }
  256. static inline bool IsSchemeChar(char c) noexcept {
  257. return IsAsciiAlnum(c); //what about '+' ?..
  258. }
  259. static bool HasPrefix(const TStringBuf url) noexcept {
  260. TStringBuf scheme, unused;
  261. if (!url.TrySplit(TStringBuf("://"), scheme, unused))
  262. return false;
  263. return AllOf(scheme, IsSchemeChar);
  264. }
  265. TString AddSchemePrefix(const TString& url) {
  266. return AddSchemePrefix(url, TStringBuf("http"));
  267. }
  268. TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
  269. if (HasPrefix(url)) {
  270. return url;
  271. }
  272. return TString::Join(scheme, TStringBuf("://"), url);
  273. }
  274. #define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0'))
  275. static inline int x2c(unsigned char* x) {
  276. if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1]))
  277. return -1;
  278. return X(x[0]) * 16 + X(x[1]);
  279. }
  280. #undef X
  281. static inline int Unescape(char* str) {
  282. char *to, *from;
  283. int dlen = 0;
  284. if ((str = strchr(str, '%')) == nullptr)
  285. return dlen;
  286. for (to = str, from = str; *from; from++, to++) {
  287. if ((*to = *from) == '%') {
  288. int c = x2c((unsigned char*)from + 1);
  289. *to = char((c > 0) ? c : '0');
  290. from += 2;
  291. dlen += 2;
  292. }
  293. }
  294. *to = 0; /* terminate it at the new length */
  295. return dlen;
  296. }
  297. size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
  298. if (source.empty() || source[0] == '?')
  299. return strlcpy(dest, "/", dest_size);
  300. size_t len = Min(dest_size - 1, source.length());
  301. memcpy(dest, source.data(), len);
  302. dest[len] = 0;
  303. len -= Unescape(dest);
  304. strlwr(dest);
  305. return len;
  306. }
  307. size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
  308. size_t len = Min(dest_size - 1, source.length());
  309. memcpy(dest, source.data(), len);
  310. dest[len] = 0;
  311. char buf[8] = ":";
  312. size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
  313. buf[buflen] = '\0';
  314. char* ptr = strstr(dest, buf);
  315. if (ptr && ptr[buflen] == 0) {
  316. len -= buflen;
  317. *ptr = 0;
  318. }
  319. strlwr(dest);
  320. return len;
  321. }
  322. TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
  323. if (str.EndsWith('/')) {
  324. str.Chop(1);
  325. }
  326. return str;
  327. }
  328. TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
  329. url = CutSchemePrefix(url);
  330. url = CutWWWPrefix(url);
  331. return url;
  332. }
  333. bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
  334. url = CutSchemePrefix(url);
  335. const TStringBuf noHostSuffix = url.After('/');
  336. if (noHostSuffix == url) {
  337. // no slash => no suffix with token info
  338. return false;
  339. }
  340. const bool suffixHasPrefix = noHostSuffix.StartsWith(token);
  341. if (!suffixHasPrefix) {
  342. return false;
  343. }
  344. const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length();
  345. const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length();
  346. const bool nothingAfterPrefix = noHostSuffix.length() <= token.length();
  347. const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix;
  348. return prefixIsToken;
  349. }