// string_udf.cpp
  1. #include <yql/essentials/public/udf/udf_allocator.h>
  2. #include <yql/essentials/public/udf/udf_helpers.h>
  3. #include <yql/essentials/public/udf/udf_value_builder.h>
  4. #include <library/cpp/charset/codepage.h>
  5. #include <library/cpp/deprecated/split/split_iterator.h>
  6. #include <library/cpp/html/pcdata/pcdata.h>
  7. #include <library/cpp/string_utils/base32/base32.h>
  8. #include <library/cpp/string_utils/base64/base64.h>
  9. #include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>
  10. #include <library/cpp/string_utils/quote/quote.h>
  11. #include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>
  12. #include <util/charset/wide.h>
  13. #include <util/generic/vector.h>
  14. #include <util/stream/format.h>
  15. #include <util/string/ascii.h>
  16. #include <util/string/escape.h>
  17. #include <util/string/hex.h>
  18. #include <util/string/join.h>
  19. #include <util/string/reverse.h>
  20. #include <util/string/split.h>
  21. #include <util/string/strip.h>
  22. #include <util/string/subst.h>
  23. #include <util/string/util.h>
  24. #include <util/string/vector.h>
  25. using namespace NKikimr;
  26. using namespace NUdf;
  27. namespace {
  // Declares the strict UDF T<udfName> with a scalar body and an Arrow block
  // kernel. Both paths copy the argument into a TString, pass it to `function`
  // and emit whatever `function` returns as the result string.
  //
  // NOTE: the final line of the macro intentionally has no line continuation;
  // a trailing backslash there would splice the next source line into the macro.
  #define STRING_UDF(udfName, function) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<char*>)) { \
          const TString input(args[0].AsStringRef()); \
          const auto& result = function(input); \
          return valueBuilder->NewString(result); \
      } \
      \
      struct T##udfName##KernelExec \
          : public TUnaryKernelExec<T##udfName##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              const TString input(arg1.AsStringRef()); \
              const auto& result = function(input); \
              sink(TBlockItem(result)); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  // Despite the "unsafe" name this UDF is declared strict: an empty argument
  // produces an empty result, and a yexception thrown by `function` is caught
  // and reported as an empty result. Other exception types are not caught here.
  #define STRING_UNSAFE_UDF(udfName, function) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
          EMPTY_RESULT_ON_EMPTY_ARG(0); \
          const TString source(args[0].AsStringRef()); \
          try { \
              const auto& decoded = function(source); \
              return valueBuilder->NewString(decoded); \
          } catch (yexception&) { \
              return TUnboxedValue(); \
          } \
      } \
      \
      struct T##udfName##KernelExec \
          : public TUnaryKernelExec<T##udfName##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              if (arg1) { \
                  const TString source(arg1.AsStringRef()); \
                  try { \
                      const auto& decoded = function(source); \
                      sink(TBlockItem(decoded)); \
                  } catch (yexception&) { \
                      sink(TBlockItem()); \
                  } \
              } else { \
                  sink(TBlockItem()); \
              } \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  // Declares the strict scalar UDF T<udfName>: the UTF-8 argument is converted
  // to a UTF-16 string, `function` is applied to that string in place, and the
  // outcome is converted back to UTF-8. A yexception yields an empty value.
  #define STROKA_UDF(udfName, function) \
      SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
          EMPTY_RESULT_ON_EMPTY_ARG(0) \
          const TString utf8(args[0].AsStringRef()); \
          try { \
              TUtf16String text = UTF8ToWide(utf8); \
              function(text); \
              return valueBuilder->NewString(WideToUTF8(text)); \
          } catch (yexception&) { \
              return TUnboxedValue(); \
          } \
      }
  // Declares the strict scalar UDF T<udfName>: the UTF-8 argument is converted
  // to UTF-16, `function` is called with the buffer start and its length, and
  // the buffer is converted back to UTF-8. A yexception yields an empty value.
  #define STROKA_CASE_UDF(udfName, function) \
      SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
          EMPTY_RESULT_ON_EMPTY_ARG(0) \
          const TString utf8(args[0].AsStringRef()); \
          try { \
              TUtf16String text = UTF8ToWide(utf8); \
              function(text.begin(), text.size()); \
              return valueBuilder->NewString(WideToUTF8(text)); \
          } catch (yexception&) { \
              return TUnboxedValue(); \
          } \
      }
  // Declares the strict UDF T<udfName> (scalar + block kernel). `function` is
  // the name of a TString member that is invoked on a copy of the argument;
  // when it returns a false value the original argument is passed through
  // untouched, otherwise the modified copy is emitted.
  #define STROKA_ASCII_CASE_UDF(udfName, function) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<char*>)) { \
          TString text(args[0].AsStringRef()); \
          if (!text.function()) { \
              return args[0]; \
          } \
          return valueBuilder->NewString(text); \
      } \
      \
      struct T##udfName##KernelExec \
          : public TUnaryKernelExec<T##udfName##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              TString text(arg1.AsStringRef()); \
              if (!text.function()) { \
                  sink(arg1); \
              } else { \
                  sink(TBlockItem(text)); \
              } \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  // Declares the strict scalar UDF T<udfName> returning bool. `function` is the
  // name of a string member predicate (see STROKA_FIND_UDF_MAP) that is called
  // on the first argument with the second argument as its parameter. An empty
  // first argument gives false.
  //
  // Both arguments are only read, so they are viewed through TStringBuf instead
  // of being copied into owning TString objects.
  #define STROKA_FIND_UDF(udfName, function) \
      SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
          Y_UNUSED(valueBuilder); \
          if (!args[0]) { \
              return TUnboxedValuePod(false); \
          } \
          const TStringBuf haystack(args[0].AsStringRef()); \
          const TStringBuf needle(args[1].AsStringRef()); \
          return TUnboxedValuePod(haystack.function(needle)); \
      }
  // Declares the strict scalar UDF T<udfName> returning bool: the result of the
  // free function `function(haystack, needle)`. An empty first argument gives
  // false.
  //
  // Both arguments are only read, so they are viewed through TStringBuf instead
  // of being copied into owning TString objects.
  #define STRING_TWO_ARGS_UDF(udfName, function) \
      SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
          Y_UNUSED(valueBuilder); \
          if (!args[0]) { \
              return TUnboxedValuePod(false); \
          } \
          const TStringBuf haystack(args[0].AsStringRef()); \
          const TStringBuf needle(args[1].AsStringRef()); \
          return TUnboxedValuePod(function(haystack, needle)); \
      }
  // Declares the strict UDF T<function> (scalar + block kernel) returning bool:
  // true when the per-character predicate `function` holds for every byte of
  // the argument (hence also for an empty string), false when any byte fails
  // the predicate or when the argument itself is empty (null).
  #define IS_ASCII_UDF(function) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, bool(TOptional<char*>)) { \
          Y_UNUSED(valueBuilder); \
          if (!args[0]) { \
              return TUnboxedValuePod(false); \
          } \
          const TStringBuf text(args[0].AsStringRef()); \
          for (auto symbol : text) { \
              if (!function(symbol)) { \
                  return TUnboxedValuePod(false); \
              } \
          } \
          return TUnboxedValuePod(true); \
      } \
      \
      struct T##function##KernelExec \
          : public TUnaryKernelExec<T##function##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              bool matched = false; \
              if (arg1) { \
                  matched = true; \
                  const TStringBuf text(arg1.AsStringRef()); \
                  for (auto symbol : text) { \
                      if (!function(symbol)) { \
                          matched = false; \
                          break; \
                      } \
                  } \
              } \
              sink(TBlockItem(matched)); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do)
  // Declares the UDF T<function> (scalar + block kernel) taking a string, a pad
  // length and an optional one-byte padding string (a space is used when it is
  // absent). The stream manipulator `function(input, padLen, paddingSymbol)` is
  // written to a TStringStream whose contents become the result.
  //
  // Throws yexception when the padding string is not exactly one byte long or
  // when the pad length exceeds `padLim`.
  // NOTE(review): `padLim` is not declared in the visible part of the file; it
  // has to be in scope wherever this macro is instantiated.
  //
  // The padding byte is read straight from the string reference; no temporary
  // TString is created for it.
  #define STRING_STREAM_PAD_FORMATTER_UDF(function) \
      BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS(T##function, \
              char*(TAutoMap<char*>, ui64, TOptional<char*>), 1) \
      { \
          TStringStream result; \
          const TStringBuf input(args[0].AsStringRef()); \
          char paddingSymbol = ' '; \
          if (args[2]) { \
              const auto padding = args[2].AsStringRef(); \
              if (padding.Size() != 1) { \
                  ythrow yexception() << "Not 1 symbol in paddingSymbol"; \
              } \
              paddingSymbol = padding.Data()[0]; \
          } \
          const ui64 padLen = args[1].Get<ui64>(); \
          if (padLen > padLim) { \
              ythrow yexception() << "Padding length (" << padLen << ") exceeds maximum: " << padLim; \
          } \
          result << function(input, padLen, paddingSymbol); \
          return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \
      } \
      \
      struct T##function##KernelExec \
          : public TGenericKernelExec<T##function##KernelExec, 3> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { \
              TStringStream result; \
              const TStringBuf input(args.GetElement(0).AsStringRef()); \
              char paddingSymbol = ' '; \
              if (args.GetElement(2)) { \
                  const auto padding = args.GetElement(2).AsStringRef(); \
                  if (padding.Size() != 1) { \
                      ythrow yexception() << "Not 1 symbol in paddingSymbol"; \
                  } \
                  paddingSymbol = padding.Data()[0]; \
              } \
              const ui64 padLen = args.GetElement(1).Get<ui64>(); \
              if (padLen > padLim) { \
                  ythrow yexception() << "Padding length (" << padLen \
                                      << ") exceeds maximum: " << padLim; \
              } \
              result << function(input, padLen, paddingSymbol); \
              sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do)
  // Declares the strict UDF T<function> (scalar + block kernel): the argument is
  // read as `argType`, wrapped by the stream manipulator `function` and written
  // to a TStringStream; the stream contents are the resulting string.
  #define STRING_STREAM_NUM_FORMATTER_UDF(function, argType) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<argType>)) { \
          TStringStream out; \
          out << function(args[0].Get<argType>()); \
          const TStringRef formatted(out.Data(), out.Size()); \
          return valueBuilder->NewString(formatted); \
      } \
      \
      struct T##function##KernelExec \
          : public TUnaryKernelExec<T##function##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              TStringStream out; \
              out << function(arg1.Get<argType>()); \
              const TStringRef formatted(out.Data(), out.Size()); \
              sink(TBlockItem(formatted)); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do)
  // Declares the strict UDF T<function> (scalar + block kernel): the string
  // argument is wrapped by the stream manipulator `function` and written to a
  // TStringStream; the stream contents are the resulting string.
  #define STRING_STREAM_TEXT_FORMATTER_UDF(function) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<char*>)) { \
          const TStringBuf text(args[0].AsStringRef()); \
          TStringStream out; \
          out << function(text); \
          const TStringRef formatted(out.Data(), out.Size()); \
          return valueBuilder->NewString(formatted); \
      } \
      \
      struct T##function##KernelExec \
          : public TUnaryKernelExec<T##function##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              const TStringBuf text(arg1.AsStringRef()); \
              TStringStream out; \
              out << function(text); \
              const TStringRef formatted(out.Data(), out.Size()); \
              sink(TBlockItem(formatted)); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do)
  // Declares the strict UDF T<udfName> (scalar + block kernel): the ui64
  // argument is written to a TStringStream through
  // HumanReadableSize(value, hrSize); the stream contents are the result.
  #define STRING_STREAM_HRSZ_FORMATTER_UDF(udfName, hrSize) \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<ui64>)) { \
          TStringStream out; \
          out << HumanReadableSize(args[0].Get<ui64>(), hrSize); \
          const TStringRef formatted(out.Data(), out.Size()); \
          return valueBuilder->NewString(formatted); \
      } \
      \
      struct T##udfName##KernelExec \
          : public TUnaryKernelExec<T##udfName##KernelExec> \
      { \
          template <typename TSink> \
          static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \
              TStringStream out; \
              out << HumanReadableSize(arg1.Get<ui64>(), hrSize); \
              const TStringRef formatted(out.Data(), out.Size()); \
              sink(TBlockItem(formatted)); \
          } \
      }; \
      \
      END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  // X-macro lists. Each list invokes XX once per UDF; the entries are meant to
  // be expanded with the generator macro of the matching name.

  // (udfName, function) pairs for STRING_UDF.
  #define STRING_UDF_MAP(XX)                  \
      XX(Base32Encode,    Base32Encode)       \
      XX(Base64Encode,    Base64Encode)       \
      XX(Base64EncodeUrl, Base64EncodeUrl)    \
      XX(EscapeC,         EscapeC)            \
      XX(UnescapeC,       UnescapeC)          \
      XX(HexEncode,       HexEncode)          \
      XX(EncodeHtml,      EncodeHtmlPcdata)   \
      XX(DecodeHtml,      DecodeHtmlPcdata)   \
      XX(CgiEscape,       CGIEscapeRet)       \
      XX(CgiUnescape,     CGIUnescapeRet)     \
      XX(Strip,           Strip)              \
      XX(Collapse,        Collapse)

  // (udfName, function) pairs for STRING_UNSAFE_UDF.
  #define STRING_UNSAFE_UDF_MAP(XX)               \
      XX(Base32Decode,       Base32Decode)        \
      XX(Base32StrictDecode, Base32StrictDecode)  \
      XX(Base64Decode,       Base64Decode)        \
      XX(Base64StrictDecode, Base64StrictDecode)  \
      XX(HexDecode,          HexDecode)

  // NOTE: The functions below are marked as deprecated, so block implementation
  // is not required for them. Hence, STROKA_CASE_UDF provides only the scalar
  // one at the moment.
  #define STROKA_CASE_UDF_MAP(XX) \
      XX(ToLower, ToLower)        \
      XX(ToUpper, ToUpper)        \
      XX(ToTitle, ToTitle)

  // (udfName, TString member name) pairs for STROKA_ASCII_CASE_UDF.
  #define STROKA_ASCII_CASE_UDF_MAP(XX) \
      XX(AsciiToLower, to_lower)        \
      XX(AsciiToUpper, to_upper)        \
      XX(AsciiToTitle, to_title)

  // NOTE: The functions below are marked as deprecated, so block implementation
  // is not required for them. Hence, STROKA_FIND_UDF provides only the scalar
  // one at the moment.
  #define STROKA_FIND_UDF_MAP(XX) \
      XX(StartsWith, StartsWith)  \
      XX(EndsWith,   EndsWith)    \
      XX(HasPrefix,  StartsWith)  \
      XX(HasSuffix,  EndsWith)

  // NOTE: The functions below are marked as deprecated, so block implementation
  // is not required for them. Hence, STRING_TWO_ARGS_UDF provides only the
  // scalar one at the moment.
  #define STRING_TWO_ARGS_UDF_MAP(XX)                      \
      XX(StartsWithIgnoreCase, AsciiHasPrefixIgnoreCase)   \
      XX(EndsWithIgnoreCase,   AsciiHasSuffixIgnoreCase)   \
      XX(HasPrefixIgnoreCase,  AsciiHasPrefixIgnoreCase)   \
      XX(HasSuffixIgnoreCase,  AsciiHasSuffixIgnoreCase)

  // NOTE: The functions below are marked as deprecated, so block implementation
  // is not required for them. Hence, STROKA_UDF provides only the scalar one at
  // the moment.
  #define STROKA_UDF_MAP(XX) \
      XX(Reverse, ReverseInPlace)

  // Per-character predicates for IS_ASCII_UDF.
  #define IS_ASCII_UDF_MAP(XX) \
      XX(IsAscii)              \
      XX(IsAsciiSpace)         \
      XX(IsAsciiUpper)         \
      XX(IsAsciiLower)         \
      XX(IsAsciiDigit)         \
      XX(IsAsciiAlpha)         \
      XX(IsAsciiAlnum)         \
      XX(IsAsciiHex)

  // Stream manipulators for STRING_STREAM_PAD_FORMATTER_UDF.
  #define STRING_STREAM_PAD_FORMATTER_UDF_MAP(XX) \
      XX(LeftPad)                                 \
      XX(RightPad)

  // (manipulator, argument type) pairs for STRING_STREAM_NUM_FORMATTER_UDF.
  #define STRING_STREAM_NUM_FORMATTER_UDF_MAP(XX) \
      XX(Hex,  ui64)                              \
      XX(SHex, i64)                               \
      XX(Bin,  ui64)                              \
      XX(SBin, i64)

  // Stream manipulators for STRING_STREAM_TEXT_FORMATTER_UDF.
  #define STRING_STREAM_TEXT_FORMATTER_UDF_MAP(XX) \
      XX(HexText)                                  \
      XX(BinText)

  // (udfName, size format) pairs for STRING_STREAM_HRSZ_FORMATTER_UDF.
  #define STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(XX) \
      XX(HumanReadableQuantity, SF_QUANTITY)       \
      XX(HumanReadableBytes,    SF_BYTES)
  370. BEGIN_SIMPLE_STRICT_ARROW_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) {
  371. TString input(args[0].AsStringRef());
  372. ui64 maxLength = args[1].Get<ui64>();
  373. CollapseText(input, maxLength);
  374. return valueBuilder->NewString(input);
  375. }
  376. struct TCollapseTextKernelExec
  377. : public TBinaryKernelExec<TCollapseTextKernelExec>
  378. {
  379. template <typename TSink>
  380. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  381. TString input(arg1.AsStringRef());
  382. ui64 maxLength = arg2.Get<ui64>();
  383. CollapseText(input, maxLength);
  384. return sink(TBlockItem(input));
  385. }
  386. };
  387. END_SIMPLE_ARROW_UDF(TCollapseText, TCollapseTextKernelExec::Do);
  388. BEGIN_SIMPLE_STRICT_ARROW_UDF(TContains, bool(TOptional<char*>, char*)) {
  389. Y_UNUSED(valueBuilder);
  390. if (!args[0])
  391. return TUnboxedValuePod(false);
  392. const TString haystack(args[0].AsStringRef());
  393. const TString needle(args[1].AsStringRef());
  394. return TUnboxedValuePod(haystack.Contains(needle));
  395. }
  396. struct TContainsKernelExec : public TBinaryKernelExec<TContainsKernelExec> {
  397. template <typename TSink>
  398. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  399. if (!arg1)
  400. return sink(TBlockItem(false));
  401. const TString haystack(arg1.AsStringRef());
  402. const TString needle(arg2.AsStringRef());
  403. sink(TBlockItem(haystack.Contains(needle)));
  404. }
  405. };
  406. END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do);
  407. BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
  408. if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
  409. return valueBuilder->NewString(result);
  410. else
  411. return args[0];
  412. }
  413. struct TReplaceAllKernelExec
  414. : public TGenericKernelExec<TReplaceAllKernelExec, 3>
  415. {
  416. template <typename TSink>
  417. static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) {
  418. TString result(args.GetElement(0).AsStringRef());
  419. const TStringBuf what(args.GetElement(1).AsStringRef());
  420. const TStringBuf with(args.GetElement(2).AsStringRef());
  421. if (SubstGlobal(result, what, with)) {
  422. return sink(TBlockItem(result));
  423. } else {
  424. return sink(args.GetElement(0));
  425. }
  426. }
  427. };
  428. END_SIMPLE_ARROW_UDF(TReplaceAll, TReplaceAllKernelExec::Do)
  429. BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) {
  430. std::string result(args[0].AsStringRef());
  431. const std::string_view what(args[1].AsStringRef());
  432. if (const auto index = result.find(what); index != std::string::npos) {
  433. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  434. return valueBuilder->NewString(result);
  435. }
  436. return args[0];
  437. }
  438. struct TReplaceFirstKernelExec
  439. : public TGenericKernelExec<TReplaceFirstKernelExec, 3>
  440. {
  441. template <typename TSink>
  442. static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) {
  443. std::string result(args.GetElement(0).AsStringRef());
  444. const std::string_view what(args.GetElement(1).AsStringRef());
  445. const std::string_view with(args.GetElement(2).AsStringRef());
  446. if (const auto index = result.find(what); index != std::string::npos) {
  447. result.replace(index, what.size(), with);
  448. return sink(TBlockItem(result));
  449. }
  450. return sink(args.GetElement(0));
  451. }
  452. };
  453. END_SIMPLE_ARROW_UDF(TReplaceFirst, TReplaceFirstKernelExec::Do)
  454. BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) {
  455. std::string result(args[0].AsStringRef());
  456. const std::string_view what(args[1].AsStringRef());
  457. if (const auto index = result.rfind(what); index != std::string::npos) {
  458. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  459. return valueBuilder->NewString(result);
  460. }
  461. return args[0];
  462. }
  463. struct TReplaceLastKernelExec
  464. : public TGenericKernelExec<TReplaceLastKernelExec, 3>
  465. {
  466. template <typename TSink>
  467. static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) {
  468. std::string result(args.GetElement(0).AsStringRef());
  469. const std::string_view what(args.GetElement(1).AsStringRef());
  470. const std::string_view with(args.GetElement(2).AsStringRef());
  471. if (const auto index = result.rfind(what); index != std::string::npos) {
  472. result.replace(index, what.size(), with);
  473. return sink(TBlockItem(result));
  474. }
  475. return sink(args.GetElement(0));
  476. }
  477. };
  478. END_SIMPLE_ARROW_UDF(TReplaceLast, TReplaceLastKernelExec::Do)
  479. BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) {
  480. std::string input(args[0].AsStringRef());
  481. const std::string_view remove(args[1].AsStringRef());
  482. std::array<bool, 256> chars{};
  483. for (const ui8 c : remove) {
  484. chars[c] = true;
  485. }
  486. size_t tpos = 0;
  487. for (const ui8 c : input) {
  488. if (!chars[c]) {
  489. input[tpos++] = c;
  490. }
  491. }
  492. if (tpos != input.size()) {
  493. input.resize(tpos);
  494. return valueBuilder->NewString(input);
  495. }
  496. return args[0];
  497. }
  498. struct TRemoveAllKernelExec
  499. : public TBinaryKernelExec<TRemoveAllKernelExec>
  500. {
  501. template <typename TSink>
  502. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  503. std::string input(arg1.AsStringRef());
  504. const std::string_view remove(arg2.AsStringRef());
  505. std::array<bool, 256> chars{};
  506. for (const ui8 c : remove) {
  507. chars[c] = true;
  508. }
  509. size_t tpos = 0;
  510. for (const ui8 c : input) {
  511. if (!chars[c]) {
  512. input[tpos++] = c;
  513. }
  514. }
  515. if (tpos != input.size()) {
  516. input.resize(tpos);
  517. return sink(TBlockItem(input));
  518. }
  519. sink(arg1);
  520. }
  521. };
  522. END_SIMPLE_ARROW_UDF(TRemoveAll, TRemoveAllKernelExec::Do)
  523. BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) {
  524. std::string input(args[0].AsStringRef());
  525. const std::string_view remove(args[1].AsStringRef());
  526. std::array<bool, 256> chars{};
  527. for (const ui8 c : remove) {
  528. chars[c] = true;
  529. }
  530. for (auto it = input.cbegin(); it != input.cend(); ++it) {
  531. if (chars[static_cast<ui8>(*it)]) {
  532. input.erase(it);
  533. return valueBuilder->NewString(input);
  534. }
  535. }
  536. return args[0];
  537. }
  538. struct TRemoveFirstKernelExec
  539. : public TBinaryKernelExec<TRemoveFirstKernelExec>
  540. {
  541. template <typename TSink>
  542. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  543. std::string input(arg1.AsStringRef());
  544. const std::string_view remove(arg2.AsStringRef());
  545. std::array<bool, 256> chars{};
  546. for (const ui8 c : remove) {
  547. chars[c] = true;
  548. }
  549. for (auto it = input.cbegin(); it != input.cend(); ++it) {
  550. if (chars[static_cast<ui8>(*it)]) {
  551. input.erase(it);
  552. return sink(TBlockItem(input));
  553. }
  554. }
  555. sink(arg1);
  556. }
  557. };
  558. END_SIMPLE_ARROW_UDF(TRemoveFirst, TRemoveFirstKernelExec::Do)
  559. BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) {
  560. std::string input(args[0].AsStringRef());
  561. const std::string_view remove(args[1].AsStringRef());
  562. std::array<bool, 256> chars{};
  563. for (const ui8 c : remove) {
  564. chars[c] = true;
  565. }
  566. for (auto it = input.crbegin(); it != input.crend(); ++it) {
  567. if (chars[static_cast<ui8>(*it)]) {
  568. input.erase(input.crend() - it - 1, 1);
  569. return valueBuilder->NewString(input);
  570. }
  571. }
  572. return args[0];
  573. }
  574. struct TRemoveLastKernelExec
  575. : public TBinaryKernelExec<TRemoveLastKernelExec>
  576. {
  577. template <typename TSink>
  578. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  579. std::string input(arg1.AsStringRef());
  580. const std::string_view remove(arg2.AsStringRef());
  581. std::array<bool, 256> chars{};
  582. for (const ui8 c : remove) {
  583. chars[c] = true;
  584. }
  585. for (auto it = input.crbegin(); it != input.crend(); ++it) {
  586. if (chars[static_cast<ui8>(*it)]) {
  587. input.erase(input.crend() - it - 1, 1);
  588. return sink(TBlockItem(input));
  589. }
  590. }
  591. sink(arg1);
  592. }
  593. };
  594. END_SIMPLE_ARROW_UDF(TRemoveLast, TRemoveLastKernelExec::Do)
  595. // NOTE: String::Find is marked as deprecated, so block implementation is
  596. // not required for them. Hence, only the scalar one is provided.
  597. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) {
  598. Y_UNUSED(valueBuilder);
  599. const TString haystack(args[0].AsStringRef());
  600. const TString needle(args[1].AsStringRef());
  601. const ui64 pos = args[2].GetOrDefault<ui64>(0);
  602. return TUnboxedValuePod(haystack.find(needle, pos));
  603. }
  604. // NOTE: String::ReverseFind is marked as deprecated, so block
  605. // implementation is not required for them. Hence, only the scalar one is
  606. // provided.
  607. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) {
  608. Y_UNUSED(valueBuilder);
  609. const TString haystack(args[0].AsStringRef());
  610. const TString needle(args[1].AsStringRef());
  611. const ui64 pos = args[2].GetOrDefault<ui64>(TString::npos);
  612. return TUnboxedValuePod(haystack.rfind(needle, pos));
  613. }
  614. // NOTE: String::Substring is marked as deprecated, so block implementation
  615. // is not required for them. Hence, only the scalar one is provided.
  616. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) {
  617. const TString input(args[0].AsStringRef());
  618. const ui64 from = args[1].GetOrDefault<ui64>(0);
  619. const ui64 count = args[2].GetOrDefault<ui64>(TString::npos);
  620. return valueBuilder->NewString(input.substr(from, count));
  621. }
  622. using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>;
  623. template <typename TIt>
  624. static void SplitToListImpl(
  625. const IValueBuilder* valueBuilder,
  626. const TUnboxedValue& input,
  627. const std::string_view::const_iterator from,
  628. const TIt& it,
  629. TTmpVector& result) {
  630. for (const auto& elem : it) {
  631. result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim())));
  632. }
  633. }
  634. template <typename TIt>
  635. static void SplitToListImpl(
  636. const IValueBuilder* valueBuilder,
  637. const TUnboxedValue& input,
  638. const std::string_view::const_iterator from,
  639. TIt& it,
  640. bool skipEmpty,
  641. TTmpVector& result) {
  642. if (skipEmpty) {
  643. SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result);
  644. } else {
  645. SplitToListImpl(valueBuilder, input, from, it, result);
  646. }
  647. }
  648. constexpr char delimeterStringName[] = "DelimeterString";
  649. constexpr char skipEmptyName[] = "SkipEmpty";
  650. constexpr char limitName[] = "Limit";
  651. using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>;
  652. using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>;
  653. using TLimitArg = TNamedArg<ui64, limitName>;
  654. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<char*>(
  655. TOptional<char*>,
  656. char*,
  657. TDelimeterStringArg,
  658. TSkipEmptyArg,
  659. TLimitArg
  660. ),
  661. 3) {
  662. TTmpVector result;
  663. if (args[0]) {
  664. const std::string_view input(args[0].AsStringRef());
  665. const std::string_view delimeter(args[1].AsStringRef());
  666. const bool delimiterString = args[2].GetOrDefault<bool>(true);
  667. const bool skipEmpty = args[3].GetOrDefault<bool>(false);
  668. const auto limit = args[4].GetOrDefault<ui64>(0);
  669. if (delimiterString) {
  670. if (limit) {
  671. auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1);
  672. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  673. } else {
  674. auto it = StringSplitter(input).SplitByString(delimeter);
  675. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  676. }
  677. } else {
  678. if (limit) {
  679. auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()).Limit(limit + 1);
  680. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  681. } else {
  682. auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str());
  683. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  684. }
  685. }
  686. }
  687. return valueBuilder->NewList(result.data(), result.size());
  688. }
  689. SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) {
  690. auto input = args[0].GetListIterator();
  691. const TString delimeter(args[1].AsStringRef());
  692. TVector<TString> items;
  693. for (TUnboxedValue current; input.Next(current);) {
  694. if (current) {
  695. TString item(current.AsStringRef());
  696. items.push_back(std::move(item));
  697. }
  698. }
  699. return valueBuilder->NewString(JoinSeq(delimeter, items));
  700. }
  701. BEGIN_SIMPLE_STRICT_ARROW_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) {
  702. Y_UNUSED(valueBuilder);
  703. const TStringBuf left(args[0].AsStringRef());
  704. const TStringBuf right(args[1].AsStringRef());
  705. const ui64 result = NLevenshtein::Distance(left, right);
  706. return TUnboxedValuePod(result);
  707. }
  708. struct TLevensteinDistanceKernelExec : public TBinaryKernelExec<TLevensteinDistanceKernelExec> {
  709. template <typename TSink>
  710. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  711. const std::string_view left(arg1.AsStringRef());
  712. const std::string_view right(arg2.AsStringRef());
  713. const ui64 result = NLevenshtein::Distance(left, right);
  714. sink(TBlockItem(result));
  715. }
  716. };
  717. END_SIMPLE_ARROW_UDF(TLevensteinDistance, TLevensteinDistanceKernelExec::Do);
  718. BEGIN_SIMPLE_STRICT_ARROW_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) {
  719. TStringStream result;
  720. result << HumanReadable(TDuration::MicroSeconds(args[0].Get<ui64>()));
  721. return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
  722. }
  723. struct THumanReadableDurationKernelExec
  724. : public TUnaryKernelExec<THumanReadableDurationKernelExec>
  725. {
  726. template <typename TSink>
  727. static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) {
  728. TStringStream result;
  729. result << HumanReadable(TDuration::MicroSeconds(arg1.Get<ui64>()));
  730. sink(TBlockItem(TStringRef(result.Data(), result.Size())));
  731. }
  732. };
  733. END_SIMPLE_ARROW_UDF(THumanReadableDuration, THumanReadableDurationKernelExec::Do)
  734. BEGIN_SIMPLE_STRICT_ARROW_UDF(TPrec, char*(TAutoMap<double>, ui64)) {
  735. TStringStream result;
  736. result << Prec(args[0].Get<double>(), args[1].Get<ui64>());
  737. return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
  738. }
  739. struct TPrecKernelExec : public TBinaryKernelExec<TPrecKernelExec> {
  740. template <typename TSink>
  741. static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
  742. TStringStream result;
  743. result << Prec(arg1.Get<double>(), arg2.Get<ui64>());
  744. sink(TBlockItem(TStringRef(result.Data(), result.Size())));
  745. }
  746. };
  747. END_SIMPLE_ARROW_UDF(TPrec, TPrecKernelExec::Do)
  748. SIMPLE_STRICT_UDF(TToByteList, TListType<ui8>(char*)) {
  749. const TStringBuf input(args[0].AsStringRef());
  750. TUnboxedValue* items = nullptr;
  751. TUnboxedValue result = valueBuilder->NewArray(input.size(), items);
  752. for (const unsigned char c : input) {
  753. *items++ = TUnboxedValuePod(c);
  754. }
  755. return result;
  756. }
  757. SIMPLE_STRICT_UDF(TFromByteList, char*(TListType<ui8>)) {
  758. auto input = args[0];
  759. if (auto elems = input.GetElements()) {
  760. const auto elemCount = input.GetListLength();
  761. TUnboxedValue result = valueBuilder->NewStringNotFilled(input.GetListLength());
  762. auto bufferPtr = result.AsStringRef().Data();
  763. for (ui64 i = 0; i != elemCount; ++i) {
  764. *(bufferPtr++) = elems[i].Get<ui8>();
  765. }
  766. return result;
  767. }
  768. std::vector<char, NKikimr::NUdf::TStdAllocatorForUdf<char>> buffer;
  769. buffer.reserve(TUnboxedValuePod::InternalBufferSize);
  770. const auto& iter = input.GetListIterator();
  771. for (NUdf::TUnboxedValue item; iter.Next(item); ) {
  772. buffer.push_back(item.Get<ui8>());
  773. }
  774. return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
  775. }
  // Registration helper: applied to a *_MAP list it turns every entry into the
  // UDF class name `T<udfName>` followed by a comma (extra entry arguments are
  // ignored). It is used in the SIMPLE_MODULE argument list below.
  776. #define STRING_REGISTER_UDF(udfName, ...) T##udfName,
  // Expand every UDF map with its per-entry macro. Both the *_MAP lists and
  // the per-entry macros are defined earlier in the file (not visible in this
  // chunk) - presumably each expansion defines the `T<udfName>` classes that
  // the module registers; confirm against those definitions.
  777. STRING_UDF_MAP(STRING_UDF)
  778. STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF)
  779. STROKA_UDF_MAP(STROKA_UDF)
  780. STROKA_CASE_UDF_MAP(STROKA_CASE_UDF)
  781. STROKA_ASCII_CASE_UDF_MAP(STROKA_ASCII_CASE_UDF)
  782. STROKA_FIND_UDF_MAP(STROKA_FIND_UDF)
  783. STRING_TWO_ARGS_UDF_MAP(STRING_TWO_ARGS_UDF)
  784. IS_ASCII_UDF_MAP(IS_ASCII_UDF)
  // NOTE(review): padLim is declared immediately before the pad-formatter
  // expansion, so it is presumably the limit that macro's generated code
  // refers to - keep this declaration ahead of the expansion; confirm in the
  // STRING_STREAM_PAD_FORMATTER_UDF definition.
  785. static constexpr ui64 padLim = 1000000;
  786. STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_STREAM_PAD_FORMATTER_UDF)
  787. STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_STREAM_NUM_FORMATTER_UDF)
  788. STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_STREAM_TEXT_FORMATTER_UDF)
  789. STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_STREAM_HRSZ_FORMATTER_UDF)
  // Declares the module TStringModule and lists every UDF class it contains.
  // The first group comes from the *_MAP lists: each entry expands through
  // STRING_REGISTER_UDF to `T<udfName>,`, so those lines need no separators
  // of their own. The second group names the hand-written UDF classes
  // explicitly; the final entry carries no trailing comma.
  790. SIMPLE_MODULE(TStringModule,
  791. STRING_UDF_MAP(STRING_REGISTER_UDF)
  792. STRING_UNSAFE_UDF_MAP(STRING_REGISTER_UDF)
  793. STROKA_UDF_MAP(STRING_REGISTER_UDF)
  794. STROKA_CASE_UDF_MAP(STRING_REGISTER_UDF)
  795. STROKA_ASCII_CASE_UDF_MAP(STRING_REGISTER_UDF)
  796. STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF)
  797. STRING_TWO_ARGS_UDF_MAP(STRING_REGISTER_UDF)
  798. IS_ASCII_UDF_MAP(STRING_REGISTER_UDF)
  799. STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
  800. STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
  801. STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
  802. STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
  // Hand-written UDF classes (not generated from a *_MAP list).
  803. TCollapseText,
  804. TReplaceAll,
  805. TReplaceFirst,
  806. TReplaceLast,
  807. TRemoveAll,
  808. TRemoveFirst,
  809. TRemoveLast,
  810. TContains,
  811. TFind,
  812. TReverseFind,
  813. TSubstring,
  814. TSplitToList,
  815. TJoinFromList,
  816. TLevensteinDistance,
  817. THumanReadableDuration,
  818. TPrec,
  819. TToByteList,
  820. TFromByteList)
  821. }
  // Passes TStringModule to REGISTER_MODULES (macro defined outside this file;
  // presumably it exposes the module to the UDF loader - confirm).
  822. REGISTER_MODULES(TStringModule)