Browse Source

Support non-http scheme in Url::GetSchemeHostPort and Url::GetSchemeHost

alexromanov 2 years ago
parent
commit
fa3668fcff

+ 15 - 0
library/cpp/string_utils/url/url.cpp

@@ -154,6 +154,21 @@ TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
     return GetHostAndPortImpl<true>(url);
 }
 
+TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
+    const size_t schemeSize = GetSchemePrefixSize(url);
+    const TStringBuf scheme = url.Head(schemeSize);
+
+    const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
+
+    const TStringBuf host = GetHost(url.Tail(schemeSize));
+
+    if (isHttp && trimHttp) {
+        return host;
+    } else {
+        return TStringBuf(scheme.begin(), host.end());
+    }
+}
+
 TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
     const size_t schemeSize = GetSchemePrefixSize(url);
     const TStringBuf scheme = url.Head(schemeSize);

+ 3 - 0
library/cpp/string_utils/url/url.h

@@ -69,6 +69,9 @@ TStringBuf GetHost(const TStringBuf url) noexcept;
 Y_PURE_FUNCTION
 TStringBuf GetHostAndPort(const TStringBuf url) noexcept;
 
+Y_PURE_FUNCTION
+TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp = true) noexcept;
+
 Y_PURE_FUNCTION
 TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept;
 

+ 87 - 0
library/cpp/string_utils/url/url_ut.cpp

@@ -27,6 +27,93 @@ Y_UNIT_TEST_SUITE(TUtilUrlTest) {
         UNIT_ASSERT_VALUES_EQUAL("", GetHost(""));
     }
 
+    Y_UNIT_TEST(TestGetSchemeHostAndPortWithoutSplit) {
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://www.ya.ru", GetSchemeHostAndPort("https://www.ya.ru:443/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru:80", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru:443/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:8080", GetSchemeHostAndPort("ftp://ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:1234", GetSchemeHostAndPort("ftp://ya.ru:1234/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru:80", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+        // irl RFC3986 sometimes gets ignored
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973"));
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false, /*trimDefaultPort*/false));
+        // check simple string
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url"));
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false, /*trimDefaultPort*/false));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort(""));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false, /*trimDefaultPort*/false));
+    }
+
+    Y_UNIT_TEST(TestGetSchemeHost) {
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe", /*trimHttp*/false));
+
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:443/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:444/bebe", /*trimHttp*/false));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:1234/bebe", /*trimHttp*/false));
+
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81", /*trimHttp*/false));
+
+        // irl RFC3986 sometimes gets ignored
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973"));
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/ false));
+        // check simple string
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url"));
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url", /*trimHttp*/ false));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost(""));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost("", /*trimHttp*/ false));
+    }
+
     Y_UNIT_TEST(TestGetPathAndQuery) {
         UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org"));
         UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/"));

+ 2 - 2
ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h

@@ -59,7 +59,7 @@ namespace {
     SIMPLE_UDF(TGetSchemeHost, TOptional<char*>(TOptional<char*>)) {
         EMPTY_RESULT_ON_EMPTY_ARG(0);
         const std::string_view url(args[0].AsStringRef());
-        const std::string_view host(GetHost(url));
+        const std::string_view host(GetSchemeHost(url, /* trimHttp */ false));
         return host.empty() ? TUnboxedValue() :
             valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end()));
     }
@@ -67,7 +67,7 @@ namespace {
     SIMPLE_UDF(TGetSchemeHostPort, TOptional<char*>(TOptional<char*>)) {
         EMPTY_RESULT_ON_EMPTY_ARG(0);
         const std::string_view url(args[0].AsStringRef());
-        const std::string_view host(GetHostAndPort(url));
+        const std::string_view host(GetSchemeHostAndPort(url, /* trimHttp */ false, /* trimDefaultPort */ false));
         return host.empty() ? TUnboxedValue() :
             valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end()));
     }