url_base_udf.h 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586
  1. #pragma once
  2. #include "url_parse.h"
  3. #include "url_query.h"
  4. #include <yql/essentials/public/udf/udf_helpers.h>
  5. #include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>
  6. #include <library/cpp/tld/tld.h>
  7. #include <library/cpp/charset/wide.h>
  8. #include <library/cpp/unicode/punycode/punycode.h>
  9. #include <library/cpp/string_utils/quote/quote.h>
  10. #include <library/cpp/string_utils/url/url.h>
  11. #include <util/string/split.h>
  12. #include <util/string/subst.h>
  13. using namespace NKikimr;
  14. using namespace NUdf;
  15. using namespace NTld;
  16. using namespace NUrlUdf;
  17. inline bool PrepareUrl(const std::string_view& keyStr, TUri& parser) {
  18. const NUri::TParseFlags& parseFlags(TUri::FeaturesRecommended);
  19. return parser.ParseAbs(keyStr, parseFlags) == TUri::ParsedOK;
  20. }
  21. #define ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(udfName, functionName) \
  22. BEGIN_SIMPLE_ARROW_UDF(udfName, TOptional<char*>(TOptional<char*>)) { \
  23. EMPTY_RESULT_ON_EMPTY_ARG(0); \
  24. const std::string_view url(args[0].AsStringRef()); \
  25. const std::string_view res(functionName(url)); \
  26. return res.empty() ? TUnboxedValue() : \
  27. valueBuilder->SubString(args[0], std::distance(url.begin(), res.begin()), res.size()); \
  28. } \
  29. struct udfName##KernelExec : public TUnaryKernelExec<udfName##KernelExec> { \
  30. template <typename TSink> \
  31. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) { \
  32. if (!arg) { \
  33. return sink(TBlockItem()); \
  34. } \
  35. const std::string_view url(arg.AsStringRef()); \
  36. const std::string_view res(functionName(url)); \
  37. if (res.empty()) { \
  38. return sink(TBlockItem()); \
  39. } \
  40. sink(TBlockItem(TStringRef(res))); \
  41. } \
  42. }; \
  43. END_SIMPLE_ARROW_UDF(udfName, udfName##KernelExec::Do);
  44. BEGIN_SIMPLE_ARROW_UDF(TNormalize, TOptional<char*>(TOptional<char*>)) {
  45. EMPTY_RESULT_ON_EMPTY_ARG(0);
  46. TUri url;
  47. const bool success = PrepareUrl(args[0].AsStringRef(), url);
  48. return success
  49. ? valueBuilder->NewString(url.PrintS(TUri::FlagNoFrag))
  50. : TUnboxedValue();
  51. }
  52. struct TNormalizeKernelExec : public TUnaryKernelExec<TNormalizeKernelExec> {
  53. template <typename TSink>
  54. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  55. if (!arg) {
  56. return sink(TBlockItem());
  57. }
  58. TUri url;
  59. const bool success = PrepareUrl(arg.AsStringRef(), url);
  60. if (success) {
  61. return sink(TBlockItem(TStringRef(url.PrintS(TUri::FlagNoFrag))));
  62. }
  63. sink(TBlockItem());
  64. }
  65. };
  66. END_SIMPLE_ARROW_UDF(TNormalize, TNormalizeKernelExec::Do);
  67. BEGIN_SIMPLE_STRICT_ARROW_UDF(TGetScheme, char*(TAutoMap<char*>)) {
  68. const std::string_view url(args[0].AsStringRef());
  69. const std::string_view prefix(GetSchemePrefix(url));
  70. return valueBuilder->SubString(args[0], std::distance(url.begin(), prefix.begin()), prefix.size());
  71. }
  72. struct TGetSchemeKernelExec : public TUnaryKernelExec<TGetSchemeKernelExec> {
  73. template <typename TSink>
  74. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  75. const std::string_view url(arg.AsStringRef());
  76. const std::string_view prefix(GetSchemePrefix(url));
  77. const std::string_view scheme = url.substr(std::distance(url.begin(), prefix.begin()), prefix.size());
  78. sink(TBlockItem(scheme));
  79. }
  80. };
  81. END_SIMPLE_ARROW_UDF(TGetScheme, TGetSchemeKernelExec::Do);
  82. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TGetHost, GetOnlyHost)
  83. std::string_view GetHostAndPortAfterCut(const std::string_view url) {
  84. return GetHostAndPort(CutSchemePrefix(url));
  85. }
  86. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TGetHostPort, GetHostAndPortAfterCut)
  87. std::string_view GetSchemeHostParameterized(const std::string_view url) {
  88. return GetSchemeHost(url, /* trimHttp */ false);
  89. }
  90. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TGetSchemeHost, GetSchemeHostParameterized);
  91. std::string_view GetSchemeHostPortParameterized(const std::string_view url) {
  92. return GetSchemeHostAndPort(url, /* trimHttp */ false, /* trimDefaultPort */ false);
  93. }
  94. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TGetSchemeHostPort, GetSchemeHostPortParameterized);
  95. BEGIN_SIMPLE_ARROW_UDF(TGetPort, TOptional<ui64>(TOptional<char*>)) {
  96. EMPTY_RESULT_ON_EMPTY_ARG(0);
  97. Y_UNUSED(valueBuilder);
  98. ui16 port = 0;
  99. TStringBuf scheme, host;
  100. TString lowerUri(args[0].AsStringRef());
  101. std::transform(lowerUri.cbegin(), lowerUri.cbegin() + GetSchemePrefixSize(lowerUri),
  102. lowerUri.begin(), [](unsigned char c){ return std::tolower(c); });
  103. return TryGetSchemeHostAndPort(lowerUri, scheme, host, port) && port
  104. ? TUnboxedValuePod(port)
  105. : TUnboxedValuePod();
  106. }
  107. struct TGetPortKernelExec : public TUnaryKernelExec<TGetPortKernelExec> {
  108. template <typename TSink>
  109. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  110. if (!arg) {
  111. return sink(TBlockItem());
  112. }
  113. ui16 port = 0;
  114. TStringBuf scheme, host;
  115. TString lowerUri(arg.AsStringRef());
  116. std::transform(lowerUri.cbegin(), lowerUri.cbegin() + GetSchemePrefixSize(lowerUri),
  117. lowerUri.begin(), [](unsigned char c){ return std::tolower(c); });
  118. if (TryGetSchemeHostAndPort(lowerUri, scheme, host, port) && port) {
  119. return sink(TBlockItem(port));
  120. }
  121. sink(TBlockItem());
  122. }
  123. };
  124. END_SIMPLE_ARROW_UDF(TGetPort, TGetPortKernelExec::Do);
  125. BEGIN_SIMPLE_ARROW_UDF(TGetTail, TOptional<char*>(TOptional<char*>)) {
  126. EMPTY_RESULT_ON_EMPTY_ARG(0);
  127. const TStringBuf url(args[0].AsStringRef());
  128. TStringBuf host, tail;
  129. SplitUrlToHostAndPath(url, host, tail);
  130. return tail.StartsWith('/')
  131. ? valueBuilder->NewString(tail)
  132. : valueBuilder->NewString(TString('/').append(tail));
  133. }
  134. struct TGetTailKernelExec : public TUnaryKernelExec<TGetTailKernelExec> {
  135. template <typename TSink>
  136. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  137. if (!arg) {
  138. return sink(TBlockItem());
  139. }
  140. const TStringBuf url(arg.AsStringRef());
  141. TStringBuf host, tail;
  142. SplitUrlToHostAndPath(url, host, tail);
  143. if (tail.StartsWith('/')) {
  144. return sink(TBlockItem(TStringRef(tail)));
  145. }
  146. sink(TBlockItem(TStringRef(TString('/').append(tail))));
  147. }
  148. };
  149. END_SIMPLE_ARROW_UDF(TGetTail, TGetTailKernelExec::Do);
  150. BEGIN_SIMPLE_ARROW_UDF(TGetPath, TOptional<char*>(TOptional<char*>)) {
  151. EMPTY_RESULT_ON_EMPTY_ARG(0);
  152. const std::string_view url(args[0].AsStringRef());
  153. std::string_view cut(CutSchemePrefix(url));
  154. const auto s = cut.find('/');
  155. if (s == std::string_view::npos) {
  156. return valueBuilder->NewString("/");
  157. }
  158. cut.remove_prefix(s);
  159. const auto end = cut.find_first_of("?#");
  160. if (std::string_view::npos != end) {
  161. cut.remove_suffix(cut.size() - end);
  162. }
  163. return valueBuilder->SubString(args[0], std::distance(url.begin(), cut.begin()), cut.length());
  164. }
  165. struct TGetPathKernelExec : public TUnaryKernelExec<TGetPathKernelExec> {
  166. template <typename TSink>
  167. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  168. if (!arg) {
  169. return sink(TBlockItem());
  170. }
  171. const std::string_view url(arg.AsStringRef());
  172. std::string_view cut(CutSchemePrefix(url));
  173. const auto s = cut.find('/');
  174. if (s == std::string_view::npos) {
  175. return sink(TBlockItem(TStringRef("/")));
  176. }
  177. cut.remove_prefix(s);
  178. const auto end = cut.find_first_of("?#");
  179. if (std::string_view::npos != end) {
  180. cut.remove_suffix(cut.size() - end);
  181. }
  182. sink(TBlockItem(TStringRef(cut)));
  183. }
  184. };
  185. END_SIMPLE_ARROW_UDF(TGetPath, TGetPathKernelExec::Do);
  186. BEGIN_SIMPLE_ARROW_UDF(TGetFragment, TOptional<char*>(TOptional<char*>)) {
  187. EMPTY_RESULT_ON_EMPTY_ARG(0);
  188. const std::string_view url(args[0].AsStringRef());
  189. const auto pos = url.find('#');
  190. return pos == std::string_view::npos ? TUnboxedValue() :
  191. valueBuilder->SubString(args[0], pos + 1U, url.length() - pos - 1U);
  192. }
  193. struct TGetFragmentKernelExec : public TUnaryKernelExec<TGetFragmentKernelExec> {
  194. template <typename TSink>
  195. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  196. if (!arg) {
  197. return sink(TBlockItem());
  198. }
  199. const std::string_view url(arg.AsStringRef());
  200. const auto pos = url.find('#');
  201. if (pos == std::string_view::npos) {
  202. return sink(TBlockItem());
  203. }
  204. return sink(TBlockItem(arg.AsStringRef().Substring(pos + 1U, url.length() - pos - 1U)));
  205. }
  206. };
  207. END_SIMPLE_ARROW_UDF(TGetFragment, TGetFragmentKernelExec::Do);
  208. std::optional<std::pair<ui32, ui32>> GetDomain(const std::string_view url, const ui8 level) {
  209. const std::string_view host(GetOnlyHost(url));
  210. std::vector<std::string_view> parts;
  211. StringSplitter(host).Split('.').AddTo(&parts);
  212. if (level && parts.size() >= level) {
  213. const auto& result = host.substr(std::distance(host.begin(), parts[parts.size() - level].begin()));
  214. if (result.empty()) {
  215. return std::nullopt;
  216. }
  217. return std::make_pair(std::distance(url.begin(), result.begin()), result.size());
  218. }
  219. return std::nullopt;
  220. }
  221. BEGIN_SIMPLE_ARROW_UDF(TGetDomain, TOptional<char*>(TOptional<char*>, ui8)) {
  222. EMPTY_RESULT_ON_EMPTY_ARG(0);
  223. const std::string_view url = args[0].AsStringRef();
  224. const std::optional<std::pair<ui32, ui32>> resultOpt = GetDomain(url, args[1].Get<ui8>());
  225. if (!resultOpt) {
  226. return TUnboxedValue();
  227. }
  228. const std::pair<ui32, ui32> result = *resultOpt;
  229. return valueBuilder->SubString(args[0], result.first, result.second);
  230. }
  231. struct TGetDomainKernelExec : public TBinaryKernelExec<TGetDomainKernelExec> {
  232. template <typename TSink>
  233. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  234. if (!arg1) {
  235. return sink(TBlockItem());
  236. }
  237. const auto resultOpt = GetDomain(arg1.AsStringRef(), arg2.As<ui8>());
  238. if (!resultOpt) {
  239. return sink(TBlockItem());
  240. }
  241. const auto result = *resultOpt;
  242. sink(TBlockItem(arg1.AsStringRef().Substring(result.first, result.second)));
  243. }
  244. };
  245. END_SIMPLE_ARROW_UDF(TGetDomain, TGetDomainKernelExec::Do);
  246. BEGIN_SIMPLE_ARROW_UDF(TGetTLD, char*(TAutoMap<char*>)) {
  247. const TStringBuf url(args[0].AsStringRef());
  248. return valueBuilder->NewString(GetZone(GetOnlyHost(url)));
  249. }
  250. struct TGetTLDKernelExec : public TUnaryKernelExec<TGetTLDKernelExec> {
  251. template <typename TSink>
  252. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  253. const TStringBuf url(arg.AsStringRef());
  254. return sink(TBlockItem(GetZone(GetOnlyHost(url))));
  255. }
  256. };
  257. END_SIMPLE_ARROW_UDF(TGetTLD, TGetTLDKernelExec::Do);
  258. BEGIN_SIMPLE_ARROW_UDF(TGetDomainLevel, ui64(TAutoMap<char*>)) {
  259. Y_UNUSED(valueBuilder);
  260. std::vector<std::string_view> parts;
  261. StringSplitter(GetOnlyHost(args[0].AsStringRef())).Split('.').AddTo(&parts);
  262. return TUnboxedValuePod(ui64(parts.size()));
  263. }
  264. struct TGetDomainLevelKernelExec : public TUnaryKernelExec<TGetDomainLevelKernelExec> {
  265. template <typename TSink>
  266. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  267. std::vector<std::string_view> parts;
  268. StringSplitter(GetOnlyHost(arg.AsStringRef())).Split('.').AddTo(&parts);
  269. return sink(TBlockItem(ui64(parts.size())));
  270. }
  271. };
  272. END_SIMPLE_ARROW_UDF(TGetDomainLevel, TGetDomainLevelKernelExec::Do);
  273. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TGetSignificantDomain, char*(TAutoMap<char*>, TOptional<TListType<char*>>), 1) {
  274. const std::string_view url(args[0].AsStringRef());
  275. const std::string_view host(GetOnlyHost(url));
  276. std::vector<std::string_view> parts;
  277. StringSplitter(host).Split('.').AddTo(&parts);
  278. if (parts.size() > 2) {
  279. const auto& secondLevel = parts.at(parts.size() - 2);
  280. bool secondLevelIsZone = false;
  281. if (args[1]) {
  282. const auto& zonesIterator = args[1].GetListIterator();
  283. for (TUnboxedValue item; zonesIterator.Next(item);) {
  284. if (secondLevel == item.AsStringRef()) {
  285. secondLevelIsZone = true;
  286. break;
  287. }
  288. }
  289. } else {
  290. static const std::set<std::string_view> zones{"com", "net", "org", "co", "gov", "edu"};
  291. secondLevelIsZone = zones.count(secondLevel);
  292. }
  293. const auto from = parts[parts.size() - (secondLevelIsZone ? 3U : 2U)].begin();
  294. return valueBuilder->SubString(args[0], std::distance(url.begin(), from), std::distance(from, parts.back().end()));
  295. }
  296. return valueBuilder->SubString(args[0], std::distance(url.begin(), host.begin()), host.length());
  297. }
  298. std::optional<std::pair<ui32, ui32>> GetCGIParam(const std::string_view url, const std::string_view key) {
  299. const auto queryStart = url.find('?');
  300. if (queryStart != std::string_view::npos) {
  301. const auto from = queryStart + 1U;
  302. const auto anc = url.find('#', from);
  303. const auto end = anc == std::string_view::npos ? url.length() : anc;
  304. for (auto pos = from; pos && pos < end; ++pos) {
  305. const auto equal = url.find('=', pos);
  306. const auto amper = url.find('&', pos);
  307. if (equal < amper) {
  308. const auto& param = url.substr(pos, equal - pos);
  309. if (param == key) {
  310. return std::make_pair(equal + 1U, std::min(amper, end) - equal - 1U);
  311. }
  312. }
  313. pos = amper;
  314. }
  315. }
  316. return std::nullopt;
  317. }
  318. BEGIN_SIMPLE_ARROW_UDF(TGetCGIParam, TOptional<char*>(TOptional<char*>, char*)) {
  319. EMPTY_RESULT_ON_EMPTY_ARG(0);
  320. const std::string_view url = args[0].AsStringRef();
  321. const std::optional<std::pair<ui32, ui32>> resultOpt = GetCGIParam(url, args[1].AsStringRef());
  322. if (!resultOpt) {
  323. return TUnboxedValue();
  324. }
  325. const std::pair<ui32, ui32> result = *resultOpt;
  326. return valueBuilder->SubString(args[0], result.first, result.second);
  327. }
  328. struct TGetCGIParamKernelExec : public TBinaryKernelExec<TGetCGIParamKernelExec> {
  329. template <typename TSink>
  330. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  331. if (!arg1) {
  332. return sink(TBlockItem());
  333. }
  334. const auto resultOpt = GetCGIParam(arg1.AsStringRef(), arg2.AsStringRef());
  335. if (!resultOpt) {
  336. return sink(TBlockItem());
  337. }
  338. const auto result = *resultOpt;
  339. sink(TBlockItem(arg1.AsStringRef().Substring(result.first, result.second)));
  340. }
  341. };
  342. END_SIMPLE_ARROW_UDF(TGetCGIParam, TGetCGIParamKernelExec::Do);
  343. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TCutScheme, CutSchemePrefix)
  344. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TCutWWW, CutWWWPrefix)
  345. ARROW_UDF_SINGLE_STRING_FUNCTION_FOR_URL(TCutWWW2, CutWWWNumberedPrefix)
  346. BEGIN_SIMPLE_ARROW_UDF(TCutQueryStringAndFragment, char*(TAutoMap<char*>)) {
  347. const std::string_view input(args[0].AsStringRef());
  348. const auto cut = input.find_first_of("?#");
  349. return std::string_view::npos == cut ? NUdf::TUnboxedValue(args[0]) : valueBuilder->SubString(args[0], 0U, cut);
  350. }
  351. struct TCutQueryStringAndFragmentKernelExec : public TUnaryKernelExec<TCutQueryStringAndFragmentKernelExec> {
  352. template <typename TSink>
  353. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  354. const std::string_view input(arg.AsStringRef());
  355. const auto cut = input.find_first_of("?#");
  356. sink(TBlockItem(arg.AsStringRef().Substring(0U, cut)));
  357. }
  358. };
  359. END_SIMPLE_ARROW_UDF(TCutQueryStringAndFragment, TCutQueryStringAndFragmentKernelExec::Do);
  360. BEGIN_SIMPLE_ARROW_UDF(TEncode, TOptional<char*>(TOptional<char*>)) {
  361. EMPTY_RESULT_ON_EMPTY_ARG(0);
  362. const std::string_view input(args[0].AsStringRef());
  363. if (input.empty()) {
  364. return NUdf::TUnboxedValuePod();
  365. }
  366. TString url(input);
  367. UrlEscape(url);
  368. return input == url ? NUdf::TUnboxedValue(args[0]) : valueBuilder->NewString(url);
  369. }
  370. struct TEncodeKernelExec : public TUnaryKernelExec<TEncodeKernelExec> {
  371. template <typename TSink>
  372. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  373. if (!arg) {
  374. return sink(TBlockItem());
  375. }
  376. const std::string_view input(arg.AsStringRef());
  377. if (input.empty()) {
  378. return sink(TBlockItem());
  379. }
  380. TString url(input);
  381. UrlEscape(url);
  382. sink(TBlockItem(TStringRef(url)));
  383. }
  384. };
  385. END_SIMPLE_ARROW_UDF(TEncode, TEncodeKernelExec::Do);
  386. BEGIN_SIMPLE_ARROW_UDF(TDecode, TOptional<char*>(TOptional<char*>)) {
  387. EMPTY_RESULT_ON_EMPTY_ARG(0);
  388. const std::string_view input(args[0].AsStringRef());
  389. if (input.empty()) {
  390. return NUdf::TUnboxedValuePod();
  391. }
  392. TString url(input);
  393. SubstGlobal(url, '+', ' ');
  394. UrlUnescape(url);
  395. return input == url ? NUdf::TUnboxedValue(args[0]) : valueBuilder->NewString(url);
  396. }
  397. struct TDecodeKernelExec : public TUnaryKernelExec<TDecodeKernelExec> {
  398. template <typename TSink>
  399. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  400. if (!arg) {
  401. return sink(TBlockItem());
  402. }
  403. const std::string_view input(arg.AsStringRef());
  404. if (input.empty()) {
  405. return sink(TBlockItem());
  406. }
  407. TString url(input);
  408. SubstGlobal(url, '+', ' ');
  409. UrlUnescape(url);
  410. sink(TBlockItem(TStringRef(url)));
  411. }
  412. };
  413. END_SIMPLE_ARROW_UDF(TDecode, TDecodeKernelExec::Do);
  414. BEGIN_SIMPLE_ARROW_UDF(TIsKnownTLD, bool(TAutoMap<char*>)) {
  415. Y_UNUSED(valueBuilder);
  416. return TUnboxedValuePod(IsTld(args[0].AsStringRef()));
  417. }
  418. struct TIsKnownTLDKernelExec : public TUnaryKernelExec<TIsKnownTLDKernelExec> {
  419. template <typename TSink>
  420. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  421. sink(TBlockItem(static_cast<ui8>(IsTld(arg.AsStringRef()))));
  422. }
  423. };
  424. END_SIMPLE_ARROW_UDF(TIsKnownTLD, TIsKnownTLDKernelExec::Do);
  425. BEGIN_SIMPLE_ARROW_UDF(TIsWellKnownTLD, bool(TAutoMap<char*>)) {
  426. Y_UNUSED(valueBuilder);
  427. return TUnboxedValuePod(IsVeryGoodTld(args[0].AsStringRef()));
  428. }
  429. struct TIsWellKnownTLDKernelExec : public TUnaryKernelExec<TIsWellKnownTLDKernelExec> {
  430. template <typename TSink>
  431. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  432. sink(TBlockItem(static_cast<ui8>(IsVeryGoodTld(arg.AsStringRef()))));
  433. }
  434. };
  435. END_SIMPLE_ARROW_UDF(TIsWellKnownTLD, TIsWellKnownTLDKernelExec::Do);
  436. BEGIN_SIMPLE_ARROW_UDF(THostNameToPunycode, TOptional<char*>(TAutoMap<char*>)) try {
  437. const TUtf16String& input = UTF8ToWide(args[0].AsStringRef());
  438. return valueBuilder->NewString(HostNameToPunycode(input));
  439. } catch (TPunycodeError&) {
  440. return TUnboxedValue();
  441. }
  442. struct THostNameToPunycodeKernelExec : public TUnaryKernelExec<THostNameToPunycodeKernelExec> {
  443. template <typename TSink>
  444. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) try {
  445. const TUtf16String& input = UTF8ToWide(arg.AsStringRef());
  446. return sink(TBlockItem(TStringRef(HostNameToPunycode(input))));
  447. } catch (TPunycodeError&) {
  448. return sink(TBlockItem());
  449. }
  450. };
  451. END_SIMPLE_ARROW_UDF(THostNameToPunycode, THostNameToPunycodeKernelExec::Do);
  452. BEGIN_SIMPLE_ARROW_UDF(TForceHostNameToPunycode, char*(TAutoMap<char*>)) {
  453. const TUtf16String& input = UTF8ToWide(args[0].AsStringRef());
  454. return valueBuilder->NewString(ForceHostNameToPunycode(input));
  455. }
  456. struct TForceHostNameToPunycodeKernelExec : public TUnaryKernelExec<TForceHostNameToPunycodeKernelExec> {
  457. template <typename TSink>
  458. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  459. const TUtf16String& input = UTF8ToWide(arg.AsStringRef());
  460. sink(TBlockItem(TStringRef(ForceHostNameToPunycode(input))));
  461. }
  462. };
  463. END_SIMPLE_ARROW_UDF(TForceHostNameToPunycode, TForceHostNameToPunycodeKernelExec::Do);
  464. BEGIN_SIMPLE_ARROW_UDF(TPunycodeToHostName, TOptional<char*>(TAutoMap<char*>)) try {
  465. const TStringRef& input = args[0].AsStringRef();
  466. const auto& result = WideToUTF8(PunycodeToHostName(input));
  467. return valueBuilder->NewString(result);
  468. } catch (TPunycodeError&) {
  469. return TUnboxedValue();
  470. }
  471. struct TPunycodeToHostNameKernelExec : public TUnaryKernelExec<TPunycodeToHostNameKernelExec> {
  472. template <typename TSink>
  473. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) try {
  474. const TStringRef& input = arg.AsStringRef();
  475. const auto& result = WideToUTF8(PunycodeToHostName(input));
  476. return sink(TBlockItem(TStringRef(result)));
  477. } catch (TPunycodeError&) {
  478. return sink(TBlockItem());
  479. }
  480. };
  481. END_SIMPLE_ARROW_UDF(TPunycodeToHostName, TPunycodeToHostNameKernelExec::Do);
  482. BEGIN_SIMPLE_ARROW_UDF(TForcePunycodeToHostName, char*(TAutoMap<char*>)) {
  483. const TStringRef& input = args[0].AsStringRef();
  484. const auto& result = WideToUTF8(ForcePunycodeToHostName(input));
  485. return valueBuilder->NewString(result);
  486. }
  487. struct TForcePunycodeToHostNameKernelExec : public TUnaryKernelExec<TForcePunycodeToHostNameKernelExec> {
  488. template <typename TSink>
  489. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  490. const TStringRef& input = arg.AsStringRef();
  491. const auto& result = WideToUTF8(ForcePunycodeToHostName(input));
  492. sink(TBlockItem(TStringRef(result)));
  493. }
  494. };
  495. END_SIMPLE_ARROW_UDF(TForcePunycodeToHostName, TForcePunycodeToHostNameKernelExec::Do);
  496. BEGIN_SIMPLE_ARROW_UDF(TCanBePunycodeHostName, bool(TAutoMap<char*>)) {
  497. Y_UNUSED(valueBuilder);
  498. return TUnboxedValuePod(CanBePunycodeHostName(args[0].AsStringRef()));
  499. }
  500. struct TCanBePunycodeHostNameKernelExec : public TUnaryKernelExec<TCanBePunycodeHostNameKernelExec> {
  501. template <typename TSink>
  502. static void Process(const IValueBuilder*, TBlockItem arg, const TSink& sink) {
  503. sink(TBlockItem(static_cast<ui8>(CanBePunycodeHostName(arg.AsStringRef()))));
  504. }
  505. };
  506. END_SIMPLE_ARROW_UDF(TCanBePunycodeHostName, TCanBePunycodeHostNameKernelExec::Do);
  507. #define EXPORTED_URL_BASE_UDF \
  508. TNormalize, \
  509. TParse, \
  510. TGetScheme, \
  511. TGetHost, \
  512. TGetHostPort, \
  513. TGetSchemeHost, \
  514. TGetSchemeHostPort, \
  515. TGetPort, \
  516. TGetTail, \
  517. TGetPath, \
  518. TGetFragment, \
  519. TGetDomain, \
  520. TGetTLD, \
  521. TGetDomainLevel, \
  522. TGetSignificantDomain, \
  523. TGetCGIParam, \
  524. TCutScheme, \
  525. TCutWWW, \
  526. TCutWWW2, \
  527. TCutQueryStringAndFragment, \
  528. TEncode, \
  529. TDecode, \
  530. TIsKnownTLD, \
  531. TIsWellKnownTLD, \
  532. THostNameToPunycode, \
  533. TForceHostNameToPunycode, \
  534. TPunycodeToHostName, \
  535. TForcePunycodeToHostName, \
  536. TCanBePunycodeHostName, \
  537. TQueryStringToList, \
  538. TQueryStringToDict, \
  539. TBuildQueryString