123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439 |
- #include "url.h"
- #include <util/string/cast.h>
- #include <util/string/util.h>
- #include <util/string/cstriter.h>
- #include <util/string/ascii.h>
- #include <util/string/strip.h>
- #include <util/charset/unidata.h> // for ToLower
- #include <util/system/defaults.h>
- #include <util/generic/algorithm.h>
- #include <util/generic/hash_set.h>
- #include <util/generic/yexception.h>
- #include <util/generic/singleton.h>
- #include <cstdlib>
namespace {
    // Bounds policy for input whose length is not known (e.g. a NUL-terminated
    // C string): every requested prefix length is reported as available.
    struct TUncheckedSize {
        static bool Has(size_t) {
            return true;
        }
    };

    // Bounds policy for input whose length is known up front.
    struct TKnownSize {
        size_t MySize;

        explicit TKnownSize(size_t sz)
            : MySize(sz)
        {
        }

        // True when at least `sz` characters are available.
        bool Has(size_t sz) const {
            return sz <= MySize;
        }
    };

    // Compares the first `n` characters of `s1`, lower-cased via ToLower, with
    // `s2` taken as-is. Returns 0 when all `n` positions match, otherwise -1 or 1
    // according to the first differing position.
    template <typename TChar1, typename TChar2>
    int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
        for (size_t idx = 0; idx < n; ++idx) {
            const TChar1 lowered = (TChar1)ToLower(s1[idx]);
            if (lowered == s2[idx]) {
                continue;
            }
            return lowered < s2[idx] ? -1 : 1;
        }
        return 0;
    }

    // Returns 7 for a case-insensitive "http://" prefix, 8 for "https://"
    // (only when `ignorehttps` is false) and 0 otherwise. `urlSize` decides
    // whether that many characters may be inspected at all.
    template <typename TChar, typename TBounds>
    inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
        const TChar plainPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
        const TChar securePrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
        const size_t plainLen = 7;
        const size_t secureLen = 8;

        if (urlSize.Has(plainLen) && Compare1Case2(url, plainPrefix, plainLen) == 0) {
            return plainLen;
        }
        if (ignorehttps) {
            return 0;
        }
        if (urlSize.Has(secureLen) && Compare1Case2(url, securePrefix, secureLen) == 0) {
            return secureLen;
        }
        return 0;
    }

    // Returns `url` without its leading http(s) prefix, or `url` unchanged when
    // no such prefix is present.
    template <typename T>
    inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
        using TChar = typename T::char_type;
        const size_t skip = GetHttpPrefixSizeImpl<TChar>(url.data(), TKnownSize(url.size()), ignorehttps);
        if (skip == 0) {
            return url;
        }
        return url.substr(skip);
    }
}
- namespace NUrl {
- TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
- TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
- TStringBuf path = url;
- path.SkipPrefix(host);
- return {host, path};
- }
- } // namespace NUrl
- size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
- return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps);
- }
- size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
- return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps);
- }
- size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
- return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps);
- }
- size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
- return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps);
- }
- TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
- return CutHttpPrefixImpl(url, ignorehttps);
- }
- TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
- return CutHttpPrefixImpl(url, ignorehttps);
- }
- size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
- if (url.empty()) {
- return 0;
- }
- struct TDelim: public str_spn {
- inline TDelim()
- : str_spn("!-/:-@[-`{|}", true)
- {
- }
- };
- const auto& delim = *Singleton<TDelim>();
- const char* n = delim.brk(url.data(), url.end());
- if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') {
- return 0;
- }
- return n + 3 - url.begin();
- }
- TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
- return url.Head(GetSchemePrefixSize(url));
- }
- TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
- return url.Tail(GetSchemePrefixSize(url));
- }
- template <bool KeepPort>
- static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
- TStringBuf urlNoScheme = url;
- urlNoScheme.Skip(GetHttpPrefixSize(url));
- struct TDelim: public str_spn {
- inline TDelim()
- : str_spn(KeepPort ? "/;?#" : "/:;?#")
- {
- }
- };
- const auto& nonHostCharacters = *Singleton<TDelim>();
- const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end());
- if (firstNonHostCharacter != urlNoScheme.end()) {
- return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data());
- }
- return urlNoScheme;
- }
- TStringBuf GetHost(const TStringBuf url) noexcept {
- return GetHostAndPortImpl<false>(url);
- }
- TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
- return GetHostAndPortImpl<true>(url);
- }
- TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
- const size_t schemeSize = GetSchemePrefixSize(url);
- const TStringBuf scheme = url.Head(schemeSize);
- const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
- const TStringBuf host = GetHost(url.Tail(schemeSize));
- if (isHttp && trimHttp) {
- return host;
- } else {
- return TStringBuf(scheme.begin(), host.end());
- }
- }
- TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
- const size_t schemeSize = GetSchemePrefixSize(url);
- const TStringBuf scheme = url.Head(schemeSize);
- const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
- TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
- if (trimDefaultPort) {
- const size_t pos = hostAndPort.find(':');
- if (pos != TStringBuf::npos) {
- const bool isHttps = (scheme == TStringBuf("https://"));
- const TStringBuf port = hostAndPort.Tail(pos + 1);
- if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) {
- // trimming default port
- hostAndPort = hostAndPort.Head(pos);
- }
- }
- }
- if (isHttp && trimHttp) {
- return hostAndPort;
- } else {
- return TStringBuf(scheme.begin(), hostAndPort.end());
- }
- }
- void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
- auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
- host = hostBuf;
- path = pathBuf;
- }
- void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
- auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url);
- host = hostBuf;
- path = pathBuf;
- }
- void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
- TStringBuf urlWithoutFragment;
- if (!url.TrySplit('#', urlWithoutFragment, fragment)) {
- fragment = "";
- urlWithoutFragment = url;
- }
- if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) {
- query = "";
- sanitizedUrl = urlWithoutFragment;
- }
- }
- bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
- const size_t schemeSize = GetSchemePrefixSize(url);
- if (schemeSize != 0) {
- scheme = url.Head(schemeSize);
- }
- TStringBuf portStr;
- TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize));
- if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) {
- // URL has port
- if (!TryFromString(portStr, port)) {
- return false;
- }
- } else {
- host = hostAndPort;
- if (scheme == TStringBuf("https://")) {
- port = 443;
- } else if (scheme == TStringBuf("http://")) {
- port = 80;
- }
- }
- return true;
- }
- void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
- bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port);
- Y_ENSURE(isOk, "cannot parse port number from URL: " << url);
- }
- TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
- return GetHost(CutSchemePrefix(url));
- }
- TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
- const size_t off = url.find('/', GetHttpPrefixSize(url));
- TStringBuf hostUnused, path;
- if (!url.TrySplitAt(off, hostUnused, path))
- return "/";
- return trimFragment ? path.Before('#') : path;
- }
- // this strange creature returns 2nd level domain, possibly with port
- TStringBuf GetDomain(const TStringBuf host) noexcept {
- const char* c = !host ? host.data() : host.end() - 1;
- for (bool wasPoint = false; c != host.data(); --c) {
- if (*c == '.') {
- if (wasPoint) {
- ++c;
- break;
- }
- wasPoint = true;
- }
- }
- return TStringBuf(c, host.end());
- }
- TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
- size_t pos = host.size();
- for (size_t i = 0; i < level; ++i) {
- pos = host.rfind('.', pos);
- if (pos == TString::npos)
- return host;
- }
- return host.SubStr(pos + 1);
- }
- TStringBuf GetZone(const TStringBuf host) noexcept {
- return GetParentDomain(host, 1);
- }
- TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
- if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3))
- return url.substr(4);
- return url;
- }
- TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
- auto it = url.begin();
- StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; });
- if (it == url.begin()) {
- return url;
- }
- StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); });
- if (it == url.end()) {
- return url;
- }
- if (*it++ == '.') {
- return url.Tail(it - url.begin());
- }
- return url;
- }
- TStringBuf CutMPrefix(const TStringBuf url) noexcept {
- if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) {
- return url.substr(2);
- }
- return url;
- }
- static inline bool IsSchemeChar(char c) noexcept {
- return IsAsciiAlnum(c); //what about '+' ?..
- }
- static bool HasPrefix(const TStringBuf url) noexcept {
- TStringBuf scheme, unused;
- if (!url.TrySplit(TStringBuf("://"), scheme, unused))
- return false;
- return AllOf(scheme, IsSchemeChar);
- }
- TString AddSchemePrefix(const TString& url) {
- return AddSchemePrefix(url, TStringBuf("http"));
- }
- TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
- if (HasPrefix(url)) {
- return url;
- }
- return TString::Join(scheme, TStringBuf("://"), url);
- }
// Returns the numeric value (0..15) of an ASCII hex digit, or -1 for any other byte.
static inline int HexDigitValue(unsigned char c) noexcept {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    const unsigned char lower = static_cast<unsigned char>(c | 0x20);
    if (lower >= 'a' && lower <= 'f') {
        return lower - 'a' + 10;
    }
    return -1;
}

// Decodes the two hex digits at x[0], x[1] into a byte value, or returns -1
// when either is not a hex digit. x[1] is not read when x[0] is invalid, so a
// NUL at x[0] is safe.
static inline int x2c(unsigned char* x) {
    const int hi = HexDigitValue(x[0]);
    if (hi < 0) {
        return -1;
    }
    const int lo = HexDigitValue(x[1]);
    if (lo < 0) {
        return -1;
    }
    return hi * 16 + lo;
}

// Decodes %XX escapes in the NUL-terminated string `str` in place and returns
// the number of characters the string became shorter by.
// An invalid escape, and "%00", are replaced by the character '0'.
static inline int Unescape(char* str) {
    int dlen = 0;

    // Nothing before the first '%' needs to move.
    str = strchr(str, '%');
    if (str == nullptr) {
        return dlen;
    }

    char* to = str;
    for (char* from = str; *from; ++from, ++to) {
        *to = *from;
        if (*from != '%') {
            continue;
        }

        const int c = x2c(reinterpret_cast<unsigned char*>(from) + 1);
        *to = char((c > 0) ? c : '0');

        // Consume the two characters after '%', but never step over the
        // terminating NUL: a truncated escape at the end of the string
        // ("abc%" or "abc%4") must not push `from` out of the buffer.
        for (int skipped = 0; skipped < 2 && from[1] != 0; ++skipped) {
            ++from;
            ++dlen;
        }
    }

    *to = 0; /* terminate it at the new length */
    return dlen;
}
- size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
- if (source.empty() || source[0] == '?')
- return strlcpy(dest, "/", dest_size);
- size_t len = Min(dest_size - 1, source.length());
- memcpy(dest, source.data(), len);
- dest[len] = 0;
- len -= Unescape(dest);
- strlwr(dest);
- return len;
- }
- size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
- size_t len = Min(dest_size - 1, source.length());
- memcpy(dest, source.data(), len);
- dest[len] = 0;
- char buf[8] = ":";
- size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
- buf[buflen] = '\0';
- char* ptr = strstr(dest, buf);
- if (ptr && ptr[buflen] == 0) {
- len -= buflen;
- *ptr = 0;
- }
- strlwr(dest);
- return len;
- }
- TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
- if (str.EndsWith('/')) {
- str.Chop(1);
- }
- return str;
- }
- TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
- url = CutSchemePrefix(url);
- url = CutWWWPrefix(url);
- return url;
- }
- bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
- url = CutSchemePrefix(url);
- const TStringBuf noHostSuffix = url.After('/');
- if (noHostSuffix == url) {
- // no slash => no suffix with token info
- return false;
- }
- const bool suffixHasPrefix = noHostSuffix.StartsWith(token);
- if (!suffixHasPrefix) {
- return false;
- }
- const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length();
- const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length();
- const bool nothingAfterPrefix = noHostSuffix.length() <= token.length();
- const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix;
- return prefixIsToken;
- }
|