// unicode_base_udf.h
  1. #pragma once
  2. #include <yql/essentials/public/udf/udf_allocator.h>
  3. #include <yql/essentials/public/udf/udf_helpers.h>
  4. #include <yql/essentials/utils/utf8.h>
  5. #include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>
  6. #include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>
  7. #include <library/cpp/unicode/normalization/normalization.h>
  8. #include <library/cpp/unicode/set/unicode_set.h>
  9. #include <library/cpp/deprecated/split/split_iterator.h>
  10. #include <util/string/join.h>
  11. #include <util/string/reverse.h>
  12. #include <util/string/split.h>
  13. #include <util/string/subst.h>
  14. #include <util/charset/wide.h>
  15. #include <util/charset/utf8.h>
  16. #include <util/generic/scope.h>
  17. #include <util/string/strip.h>
  18. #include <util/string/ascii.h>
  19. #include <util/charset/unidata.h>
  20. using namespace NYql;
  21. using namespace NUdf;
  22. using namespace NUnicode;
  23. namespace {
  24. #define DISABLE_IMPICT_ARGUMENT_CAST \
  25. template <typename... Args> \
  26. static auto Execute(Args&&... args) = delete;
  27. inline constexpr bool IsAscii(wchar32 c) noexcept {
  28. return ::IsAscii(c);
  29. }
  30. template <class It>
  31. struct TIsUnicodeSpaceAdapter {
  32. bool operator()(const It& it) const noexcept {
  33. return IsSpace(*it);
  34. }
  35. };
  36. template <class It>
  37. TIsUnicodeSpaceAdapter<It> IsUnicodeSpaceAdapter(It) {
  38. return {};
  39. }
// Marker alternative of `std::variant<TNoChangesTag, T>`: an operation returns
// it when the input was left untouched, and the mixins' ProcessResult overloads
// then hand back the original argument instead of building a new value.
struct TNoChangesTag {};
  41. template <typename TDerived>
  42. struct TScalarOperationMixin {
  43. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  44. requires requires { TDerived::Execute(TStringRef()); }
  45. {
  46. Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef()));
  47. auto executeResult = TDerived::Execute(args[0].AsStringRef());
  48. return ProcessResult(builder, std::move(executeResult), args);
  49. }
  50. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  51. requires requires { TDerived::Execute(TMaybe<TStringRef>(TStringRef())); }
  52. {
  53. auto executeResult = TDerived::Execute(args[0] ? TMaybe<TStringRef>(args[0].AsStringRef()) : Nothing());
  54. return ProcessResult(builder, std::move(executeResult), args);
  55. }
  56. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  57. requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
  58. {
  59. auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef());
  60. return ProcessResult(builder, std::move(executeResult), args);
  61. }
  62. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  63. requires requires { TDerived::Execute(TStringRef(), TMaybe<ui16>()); }
  64. {
  65. auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1] ? TMaybe<ui16>(args[1].Get<ui16>()) : Nothing());
  66. return ProcessResult(builder, std::move(executeResult), args);
  67. }
  68. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  69. requires requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); }
  70. {
  71. auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2].AsStringRef());
  72. return ProcessResult(builder, std::move(executeResult), args);
  73. }
  74. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  75. requires requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>()); }
  76. {
  77. auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2] ? TMaybe<ui64>(args[2].Get<ui64>()) : Nothing());
  78. return ProcessResult(builder, std::move(executeResult), args);
  79. }
  80. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
  81. requires requires { TDerived::Execute(TStringRef(), TMaybe<ui64>(), TMaybe<ui64>()); }
  82. {
  83. auto executeResult = TDerived::Execute(args[0].AsStringRef(),
  84. args[1] ? TMaybe<ui64>(args[1].Get<ui64>()) : Nothing(),
  85. args[2] ? TMaybe<ui64>(args[2].Get<ui64>()) : Nothing());
  86. return ProcessResult(builder, std::move(executeResult), args);
  87. }
  88. private:
  89. static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TString& newString, const TUnboxedValuePod*) {
  90. return builder->NewString(newString);
  91. }
  92. static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TStringBuf newString, const TUnboxedValuePod*) {
  93. return builder->NewString(newString);
  94. }
  95. template <typename T>
  96. static TUnboxedValue ProcessResult(const IValueBuilder* builder, const std::variant<TNoChangesTag, T>& newValue, const TUnboxedValuePod* initialArg) {
  97. if (std::holds_alternative<T>(newValue)) {
  98. return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg);
  99. } else {
  100. return initialArg[0];
  101. }
  102. }
  103. template <typename T>
  104. static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TMaybe<T>& newValue, const TUnboxedValuePod* initialArg) {
  105. if (newValue.Defined()) {
  106. return ProcessResult(builder, *newValue, initialArg);
  107. } else {
  108. return TUnboxedValuePod();
  109. }
  110. }
  111. template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>>
  112. static TUnboxedValue ProcessResult(const IValueBuilder* builder, T result, const TUnboxedValuePod*) {
  113. Y_UNUSED(builder);
  114. return TUnboxedValuePod(result);
  115. }
  116. };
  117. template <typename TDerived>
  118. struct TBlockOperationMixin {
  119. template <typename TSink>
  120. static void BlockDoExecute(const TBlockItem arg, const TSink& sink)
  121. requires requires { TDerived::Execute(TStringRef()); }
  122. {
  123. Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef()));
  124. auto executeResult = TDerived::Execute(arg.AsStringRef());
  125. TBlockItem boxedValue = ProcessResult(executeResult, arg);
  126. sink(boxedValue);
  127. }
  128. template <typename TSink>
  129. static void BlockDoExecute(const TBlockItem arg, const TSink& sink)
  130. requires requires { TDerived::Execute(TMaybe<TStringRef>(TStringRef())); }
  131. {
  132. auto executeResult = TDerived::Execute(arg ? TMaybe<TStringRef>(arg.AsStringRef()) : Nothing());
  133. TBlockItem boxedValue = ProcessResult(executeResult, arg);
  134. sink(boxedValue);
  135. }
  136. template <typename TSink>
  137. static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink)
  138. requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
  139. {
  140. auto executeResult = TDerived::Execute(arg1.AsStringRef(),
  141. arg2.AsStringRef());
  142. TBlockItem boxedValue = ProcessResult(executeResult, arg1);
  143. sink(boxedValue);
  144. }
  145. template <typename TSink>
  146. static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink)
  147. requires requires { TDerived::Execute(TStringRef(), TMaybe<ui16>()); }
  148. {
  149. auto executeResult = TDerived::Execute(arg1.AsStringRef(), arg2 ? TMaybe<ui16>(arg2.Get<ui16>()) : Nothing());
  150. TBlockItem boxedValue = ProcessResult(executeResult, arg1);
  151. sink(boxedValue);
  152. }
  153. template <typename TSink>
  154. static void BlockDoExecute(const TBlockItem args, const TSink& sink)
  155. requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); })
  156. {
  157. auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(),
  158. args.GetElement(1).AsStringRef(),
  159. args.GetElement(2).AsStringRef());
  160. TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
  161. sink(boxedValue);
  162. }
  163. template <typename TSink>
  164. static void BlockDoExecute(const TBlockItem args, const TSink& sink)
  165. requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>(0ULL)); })
  166. {
  167. auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(),
  168. args.GetElement(1).AsStringRef(),
  169. (args.GetElement(2) ? TMaybe<ui64>(args.GetElement(2).Get<ui64>()) : Nothing()));
  170. TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
  171. sink(boxedValue);
  172. }
  173. template <typename TSink>
  174. static void BlockDoExecute(const TBlockItem args, const TSink& sink)
  175. requires(requires { TDerived::Execute(TStringRef(), TMaybe<ui64>(0ULL), TMaybe<ui64>(0ULL)); })
  176. {
  177. auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(),
  178. (args.GetElement(1) ? TMaybe<ui64>(args.GetElement(1).Get<ui64>()) : Nothing()),
  179. (args.GetElement(2) ? TMaybe<ui64>(args.GetElement(2).Get<ui64>()) : Nothing()));
  180. TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
  181. sink(boxedValue);
  182. }
  183. private:
  184. static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) {
  185. Y_UNUSED(arg);
  186. return TBlockItem(newString);
  187. }
  188. static TBlockItem ProcessResult(const TStringBuf newString, const TBlockItem arg) {
  189. Y_UNUSED(arg);
  190. return TBlockItem(newString);
  191. }
  192. template <typename T>
  193. static TBlockItem ProcessResult(const TMaybe<T>& newValue, const TBlockItem arg) {
  194. if (newValue.Defined()) {
  195. return ProcessResult(*newValue, arg);
  196. } else {
  197. return TBlockItem();
  198. }
  199. }
  200. template <typename T>
  201. static TBlockItem ProcessResult(const std::variant<TNoChangesTag, T>& newValue, const TBlockItem arg) {
  202. if (std::holds_alternative<T>(newValue)) {
  203. return ProcessResult(std::get<T>(newValue), arg);
  204. } else {
  205. return arg;
  206. }
  207. }
  208. template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>>
  209. static TBlockItem ProcessResult(T result, const TBlockItem arg) {
  210. Y_UNUSED(arg);
  211. return TBlockItem(result);
  212. }
  213. };
  214. template <typename TDerived>
  215. struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {};
  216. template <auto mode>
  217. struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> {
  218. static TString Execute(TStringRef arg) {
  219. const TUtf16String& input = UTF8ToWide(arg.Data(), arg.Size());
  220. return WideToUTF8(Normalize<mode>(input));
  221. }
  222. DISABLE_IMPICT_ARGUMENT_CAST;
  223. };
  224. template <bool (*Function)(wchar32)>
  225. struct TCheckAllChars: public TOperationMixin<TCheckAllChars<Function>> {
  226. static bool Execute(TStringRef arg) {
  227. const TStringBuf input(arg);
  228. wchar32 rune;
  229. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
  230. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
  231. while (cur != last) {
  232. ReadUTF8CharAndAdvance(rune, cur, last);
  233. if (!static_cast<bool (*)(wchar32)>(Function)(rune)) {
  234. return false;
  235. }
  236. }
  237. return true;
  238. }
  239. DISABLE_IMPICT_ARGUMENT_CAST;
  240. };
  241. template <bool (*Function)(TUtf16String&, size_t pos, size_t count)>
  242. struct TStringToStringMapper: public TOperationMixin<TStringToStringMapper<Function>> {
  243. static std::variant<TNoChangesTag, TString> Execute(TStringRef arg) {
  244. if (auto wide = UTF8ToWide(arg);
  245. static_cast<bool (*)(TUtf16String&, size_t pos, size_t count)>(Function)(wide, 0, TUtf16String::npos)) {
  246. return WideToUTF8(std::move(wide));
  247. } else {
  248. return TNoChangesTag{};
  249. }
  250. }
  251. DISABLE_IMPICT_ARGUMENT_CAST;
  252. };
  253. struct TLengthGetter: public TOperationMixin<TLengthGetter> {
  254. static ui64 Execute(TStringRef inputRef) {
  255. size_t result;
  256. GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
  257. return static_cast<ui64>(result);
  258. }
  259. DISABLE_IMPICT_ARGUMENT_CAST;
  260. };
  261. struct TReverser: public TOperationMixin<TReverser> {
  262. static TString Execute(TStringRef inputRef) {
  263. auto wide = UTF8ToWide(inputRef);
  264. ReverseInPlace(wide);
  265. return WideToUTF8(wide);
  266. }
  267. DISABLE_IMPICT_ARGUMENT_CAST;
  268. };
  269. struct TStripper: public TOperationMixin<TStripper> {
  270. static TString Execute(TStringRef inputRef) {
  271. const TUtf32String input = UTF8ToUTF32<true>(inputRef);
  272. const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
  273. return WideToUTF8(result);
  274. }
  275. DISABLE_IMPICT_ARGUMENT_CAST;
  276. };
  277. struct TAllRemover: public TOperationMixin<TAllRemover> {
  278. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
  279. TUtf32String input = UTF8ToUTF32<true>(inputRef);
  280. const TUtf32String remove = UTF8ToUTF32<true>(removeRef);
  281. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  282. size_t tpos = 0;
  283. for (const wchar32 c : input) {
  284. if (!chars.contains(c)) {
  285. input[tpos++] = c;
  286. }
  287. }
  288. if (tpos != input.size()) {
  289. input.resize(tpos);
  290. return WideToUTF8(input);
  291. }
  292. return TNoChangesTag{};
  293. }
  294. DISABLE_IMPICT_ARGUMENT_CAST;
  295. };
  296. struct TFirstRemover: public TOperationMixin<TFirstRemover> {
  297. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
  298. TUtf32String input = UTF8ToUTF32<true>(inputRef);
  299. const auto remove = UTF8ToUTF32<true>(removeRef);
  300. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  301. for (auto it = input.cbegin(); it != input.cend(); ++it) {
  302. if (chars.contains(*it)) {
  303. input.erase(it);
  304. return WideToUTF8(input);
  305. }
  306. }
  307. return TNoChangesTag{};
  308. }
  309. DISABLE_IMPICT_ARGUMENT_CAST;
  310. };
  311. struct TUnicodeSetMatcher: public TOperationMixin<TUnicodeSetMatcher> {
  312. static bool Execute(TStringRef inputRef, TStringRef customCategoryRef) {
  313. const TStringBuf input(inputRef);
  314. const TUtf16String& customCategory = UTF8ToWide(customCategoryRef);
  315. TUnicodeSet unicodeSet;
  316. try {
  317. unicodeSet.Parse(customCategory);
  318. } catch (...) {
  319. UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
  320. }
  321. wchar32 rune;
  322. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
  323. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
  324. while (cur != last) {
  325. ReadUTF8CharAndAdvance(rune, cur, last);
  326. if (!unicodeSet.Has(rune)) {
  327. return false;
  328. }
  329. }
  330. return true;
  331. }
  332. DISABLE_IMPICT_ARGUMENT_CAST;
  333. };
  334. struct TLevensteinDistanceFinder: public TOperationMixin<TLevensteinDistanceFinder> {
  335. static ui64 Execute(TStringRef leftRef, TStringRef rightRef) {
  336. const TStringBuf left(leftRef);
  337. const TStringBuf right(rightRef);
  338. const auto& leftUtf32 = UTF8ToUTF32<true>(left);
  339. const auto& rightUtf32 = UTF8ToUTF32<true>(right);
  340. return NLevenshtein::Distance(leftUtf32, rightUtf32);
  341. }
  342. DISABLE_IMPICT_ARGUMENT_CAST;
  343. };
  344. struct TLastRemoval: public TOperationMixin<TLastRemoval> {
  345. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
  346. TUtf32String input = UTF8ToUTF32<true>(inputRef);
  347. const TUtf32String remove = UTF8ToUTF32<true>(removeRef);
  348. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  349. for (auto it = input.crbegin(); it != input.crend(); ++it) {
  350. if (chars.contains(*it)) {
  351. input.erase(input.crend() - it - 1, 1);
  352. return WideToUTF8(input);
  353. }
  354. }
  355. return TNoChangesTag{};
  356. }
  357. DISABLE_IMPICT_ARGUMENT_CAST;
  358. };
  359. struct TAllReplacer: public TOperationMixin<TAllReplacer> {
  360. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
  361. if (TString result(inputRef); SubstGlobal(result, whatReplace, toReplace)) {
  362. return result;
  363. } else {
  364. return TNoChangesTag{};
  365. }
  366. }
  367. DISABLE_IMPICT_ARGUMENT_CAST;
  368. };
  369. struct TFirstReplacer: public TOperationMixin<TFirstReplacer> {
  370. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
  371. std::string result(inputRef);
  372. const std::string_view what(whatReplace);
  373. if (const auto index = result.find(what); index != std::string::npos) {
  374. result.replace(index, what.size(), std::string_view(toReplace));
  375. return result;
  376. }
  377. return TNoChangesTag{};
  378. }
  379. DISABLE_IMPICT_ARGUMENT_CAST;
  380. };
  381. struct TLastReplacer: public TOperationMixin<TLastReplacer> {
  382. static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
  383. std::string result(inputRef);
  384. const std::string_view what(whatReplace);
  385. if (const auto index = result.rfind(what); index != std::string::npos) {
  386. result.replace(index, what.size(), std::string_view(toReplace));
  387. return result;
  388. }
  389. return TNoChangesTag{};
  390. }
  391. DISABLE_IMPICT_ARGUMENT_CAST;
  392. };
  393. struct TFinder: public TOperationMixin<TFinder> {
  394. static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) {
  395. const std::string_view string(inputRef);
  396. const std::string_view needle(whatFind);
  397. std::string_view::size_type pos = 0U;
  398. if (auto p = whereFind.GetOrElse(0ULL)) {
  399. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  400. const auto width = WideCharSize(*ptr);
  401. pos += width;
  402. ptr += width;
  403. }
  404. }
  405. if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
  406. size_t result;
  407. GetNumberOfUTF8Chars(string.data(), find, result);
  408. return static_cast<ui64>(result);
  409. }
  410. return Nothing();
  411. }
  412. DISABLE_IMPICT_ARGUMENT_CAST;
  413. };
  414. struct TRFinder: public TOperationMixin<TRFinder> {
  415. static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) {
  416. const std::string_view string(inputRef);
  417. const std::string_view needle(whatFind);
  418. std::string_view::size_type pos = std::string_view::npos;
  419. if (auto p = whereFind.GetOrElse(std::string_view::npos); std::string_view::npos != p) {
  420. pos = 0ULL;
  421. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  422. const auto width = WideCharSize(*ptr);
  423. pos += width;
  424. ptr += width;
  425. }
  426. }
  427. if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
  428. size_t result;
  429. GetNumberOfUTF8Chars(string.data(), find, result);
  430. return static_cast<ui64>(result);
  431. }
  432. return Nothing();
  433. }
  434. DISABLE_IMPICT_ARGUMENT_CAST;
  435. };
  436. template <bool strict>
  437. struct TToUint64Converter: public TOperationMixin<TToUint64Converter<strict>> {
  438. static TNothing Terminate(const char* message) {
  439. if constexpr (strict) {
  440. return Nothing();
  441. } else {
  442. UdfTerminate(message);
  443. }
  444. };
  445. static TMaybe<ui64> Execute(TStringRef inputRef, TMaybe<ui16> inputBase) {
  446. const TString inputStr(inputRef);
  447. const char* input = inputStr.data();
  448. const int base = inputBase.GetOrElse(0);
  449. char* pos = nullptr;
  450. auto prevErrno = errno;
  451. errno = 0;
  452. Y_DEFER {
  453. errno = prevErrno;
  454. };
  455. unsigned long long res = std::strtoull(input, &pos, base);
  456. if (!res && errno == EINVAL) {
  457. return Terminate("Incorrect base");
  458. }
  459. ui64 ret = static_cast<ui64>(res);
  460. if (!res && pos == input) {
  461. return Terminate("Input string is not a number");
  462. } else if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
  463. return Terminate("Converted value falls out of Uint64 range");
  464. } else if (*pos) {
  465. return Terminate("Input string contains junk after the number");
  466. }
  467. return ret;
  468. }
  469. DISABLE_IMPICT_ARGUMENT_CAST;
  470. };
  471. struct TUtf8Checker: public TOperationMixin<TUtf8Checker> {
  472. static bool Execute(TMaybe<TStringRef> inputRef) {
  473. if (!inputRef.Defined()) {
  474. return false;
  475. }
  476. return IsUtf8(*inputRef);
  477. }
  478. DISABLE_IMPICT_ARGUMENT_CAST;
  479. };
  480. struct TSubstringGetter: public TOperationMixin<TSubstringGetter> {
  481. static TStringBuf Execute(TStringRef inputRef Y_LIFETIME_BOUND, TMaybe<ui64> inputFrom, TMaybe<ui64> inputLen) {
  482. const TStringBuf input(inputRef);
  483. size_t from = inputFrom.GetOrElse(0);
  484. size_t len = inputLen.GetOrElse(TStringBuf::npos);
  485. return SubstrUTF8(input, from, len);
  486. }
  487. DISABLE_IMPICT_ARGUMENT_CAST;
  488. };
  489. #define DEFINE_UTF8_OPERATION_STRICT(udfName, Executor, signature, optArgs) \
  490. BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgs) { \
  491. return Executor::DoExecute(valueBuilder, args); \
  492. } \
  493. \
  494. struct T##udfName##KernelExec \
  495. : public TUnaryKernelExec<T##udfName##KernelExec> { \
  496. template <typename TSink> \
  497. static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \
  498. Y_UNUSED(valueBuilder); \
  499. Executor::BlockDoExecute(arg1, sink); \
  500. } \
  501. }; \
  502. \
  503. END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  504. #define DEFINE_UTF8_OPERATION_BIN_BASE(macro, udfName, Executor, signature, optArgs) \
  505. macro(T##udfName, signature, optArgs) { \
  506. return Executor::DoExecute(valueBuilder, args); \
  507. } \
  508. \
  509. struct T##udfName##KernelExec \
  510. : public TBinaryKernelExec<T##udfName##KernelExec> { \
  511. template <typename TSink> \
  512. static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { \
  513. Y_UNUSED(valueBuilder); \
  514. Executor::BlockDoExecute(arg1, arg2, sink); \
  515. } \
  516. }; \
  517. \
  518. END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  519. #define DEFINE_UTF8_OPERATION_BIN_STRICT(udfName, Executor, signature, optArgs) \
  520. DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)
  521. #define DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(udfName, Executor, signature, optArgs) \
  522. DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)
  523. #define DEFINE_UTF8_OPERATION_MANY_STRICT(udfName, Executor, signature, argsCount, optArgsCount) \
  524. BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgsCount) { \
  525. return Executor::DoExecute(valueBuilder, args); \
  526. } \
  527. \
  528. struct T##udfName##KernelExec \
  529. : public TGenericKernelExec<T##udfName##KernelExec, argsCount> { \
  530. template <typename TSink> \
  531. static void Process(const IValueBuilder* valueBuilder, TBlockItem args, const TSink& sink) { \
  532. Y_UNUSED(valueBuilder); \
  533. Executor::BlockDoExecute(args, sink); \
  534. } \
  535. }; \
  536. \
  537. END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
// UDF definitions. Every line below expands to a UDF struct `T<Name>` and its
// kernel struct `T<Name>KernelExec`, both delegating to the operation struct
// given as the second macro argument.
// NOTE(review): the trailing number is forwarded to the *_WITH_OPTIONAL_ARGS
// macros - presumably the count of trailing arguments a caller may omit;
// confirm against udf_helpers.h.

// Validation and normalization.
DEFINE_UTF8_OPERATION_STRICT(IsUtf, TUtf8Checker, bool(TOptional<char*>), /*optArgs=*/1);
DEFINE_UTF8_OPERATION_STRICT(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);

// Character-class predicates: true when every code point satisfies the check.
DEFINE_UTF8_OPERATION_STRICT(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);

// Case mapping and other whole-string transformations.
DEFINE_UTF8_OPERATION_STRICT(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(GetLength, TLengthGetter, ui64(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(Reverse, TReverser, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(Strip, TStripper, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(Substring, TSubstringGetter, TUtf8(TAutoMap<TUtf8>, TOptional<ui64>, TOptional<ui64>), /*argsCount=*/3, /*optArgsCount=*/1);

// Removal, set matching and distance (two-argument operations).
DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveAll, TAllRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveFirst, TFirstRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(IsUnicodeSet, TUnicodeSetMatcher, bool(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(LevensteinDistance, TLevensteinDistanceFinder, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveLast, TLastRemoval, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);

// Replacement and search (three-argument operations).
DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceAll, TAllReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optArgsCount=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceFirst, TFirstReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optArgsCount=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceLast, TLastReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optArgsCount=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(Find, TFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optArgsCount=*/1);
DEFINE_UTF8_OPERATION_MANY_STRICT(RFind, TRFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optArgsCount=*/1);

// Number parsing: ToUint64 terminates on bad input (hence NOT_STRICT), while
// TryToUint64 returns an empty optional instead.
DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(ToUint64, TToUint64Converter</*strict=*/false>, ui64(TAutoMap<TUtf8>, TOptional<ui16>), /*optArgs=*/1);
DEFINE_UTF8_OPERATION_BIN_STRICT(TryToUint64, TToUint64Converter</*strict=*/true>, TOptional<ui64>(TAutoMap<TUtf8>, TOptional<ui16>), /*optArgs=*/1);
// Accumulator for the pieces produced by the SplitToListImpl overloads below.
using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>;
  572. template <typename TIt>
  573. static void SplitToListImpl(
  574. const IValueBuilder* valueBuilder,
  575. const TUnboxedValue& input,
  576. const std::string_view::const_iterator from,
  577. const TIt& it,
  578. TTmpVector& result) {
  579. for (const auto& elem : it) {
  580. result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim())));
  581. }
  582. }
  583. template <typename TIt>
  584. static void SplitToListImpl(
  585. const IValueBuilder* valueBuilder,
  586. const TUnboxedValue& input,
  587. const TUtf32String::const_iterator start,
  588. const TIt& it,
  589. TTmpVector& result) {
  590. const std::string_view& original = input.AsStringRef();
  591. size_t charPos = 0U, bytePos = 0U;
  592. for (const auto& elem : it) {
  593. for (const size_t next = std::distance(start, elem.TokenStart()); charPos < next; ++charPos)
  594. bytePos += WideCharSize(original[bytePos]);
  595. const auto from = bytePos;
  596. for (const size_t next = charPos + std::distance(elem.TokenStart(), elem.TokenDelim()); charPos < next; ++charPos)
  597. bytePos += WideCharSize(original[bytePos]);
  598. const auto size = bytePos - from;
  599. result.emplace_back(valueBuilder->SubString(input, from, size));
  600. }
  601. }
  602. template <typename TIt, typename TStrIt>
  603. static void SplitToListImpl(
  604. const IValueBuilder* valueBuilder,
  605. const TUnboxedValue& input,
  606. const TStrIt from,
  607. TIt& it,
  608. bool skipEmpty,
  609. TTmpVector& result) {
  610. if (skipEmpty) {
  611. SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result);
  612. } else {
  613. SplitToListImpl(valueBuilder, input, from, it, result);
  614. }
  615. }
  // Names of the named optional arguments accepted by TSplitToList.
  // NOTE(review): "Delimeter" is misspelled, but these literals are presumably the argument names
  // callers use, so correcting the spelling would change the interface.
  616. constexpr char delimeterStringName[] = "DelimeterString";
  617. constexpr char skipEmptyName[] = "SkipEmpty";
  618. constexpr char limitName[] = "Limit";
  // Named-argument types built from the names above.
  // DelimeterString (bool): TSplitToList reads it with a default of true; true splits by the whole
  // delimiter string, false splits by the set of characters in the delimiter.
  619. using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>;
  // SkipEmpty (bool): read with a default of false; when true the splitter's SkipEmpty() view is used.
  620. using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>;
  // Limit (ui64): read with a default of 0, which TSplitToList treats as "no limit".
  621. using TLimitArg = TNamedArg<ui64, limitName>;
  622. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<TUtf8>(
  623. TOptional<TUtf8>,
  624. TUtf8,
  625. TDelimeterStringArg,
  626. TSkipEmptyArg,
  627. TLimitArg
  628. ),
  629. 3) {
  630. TTmpVector result;
  631. if (args[0]) {
  632. const bool delimiterString = args[2].GetOrDefault<bool>(true);
  633. const bool skipEmpty = args[3].GetOrDefault<bool>(false);
  634. const auto limit = args[4].GetOrDefault<ui64>(0);
  635. if (delimiterString) {
  636. const std::string_view input(args[0].AsStringRef());
  637. const std::string_view delimeter(args[1].AsStringRef());
  638. if (limit) {
  639. auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1);
  640. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  641. } else {
  642. auto it = StringSplitter(input).SplitByString(delimeter);
  643. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  644. }
  645. } else {
  646. const auto& input = UTF8ToUTF32<true>(args[0].AsStringRef());
  647. const auto& delimeter = UTF8ToUTF32<true>(args[1].AsStringRef());
  648. if (limit) {
  649. auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1);
  650. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  651. } else {
  652. auto it = StringSplitter(input).SplitBySet(delimeter.c_str());
  653. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  654. }
  655. }
  656. }
  657. return valueBuilder->NewList(result.data(), result.size());
  658. }
  659. SIMPLE_UDF(TJoinFromList, TUtf8(TAutoMap<TListType<TOptional<TUtf8>>>, TUtf8)) {
  660. const auto input = args[0].GetListIterator();
  661. const std::string_view delimeter(args[1].AsStringRef());
  662. std::vector<TString> items;
  663. for (TUnboxedValue current; input.Next(current);) {
  664. if (current) {
  665. items.emplace_back(current.AsStringRef());
  666. }
  667. }
  668. return valueBuilder->NewString(JoinSeq(delimeter, items));
  669. }
  670. SIMPLE_UDF(TToCodePointList, TListType<ui32>(TAutoMap<TUtf8>)) {
  671. size_t codePointCount = 0;
  672. const auto& inputRef = args[0].AsStringRef();
  673. if (!GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), codePointCount)) {
  674. // should not happen but still we have to check return code
  675. ythrow yexception() << "Unable to count code points";
  676. }
  677. TUnboxedValue* itemsPtr = nullptr;
  678. auto result = valueBuilder->NewArray(codePointCount, itemsPtr);
  679. const unsigned char* current = reinterpret_cast<const unsigned char*>(inputRef.Data());
  680. const unsigned char* end = current + inputRef.Size();
  681. wchar32 rune = BROKEN_RUNE;
  682. ui32 codePointIndex = 0;
  683. RECODE_RESULT retcode = RECODE_OK;
  684. while (current < end && RECODE_OK == (retcode = ReadUTF8CharAndAdvance(rune, current, end))) {
  685. if (codePointIndex >= codePointCount) {
  686. // sanity check
  687. ythrow yexception() << "Too big code point index " << codePointIndex << ", expecting only " << codePointCount << " code points";
  688. }
  689. itemsPtr[codePointIndex++] = TUnboxedValuePod(static_cast<ui32>(rune));
  690. }
  691. if (retcode != RECODE_OK) {
  692. ythrow yexception() << "Malformed UTF-8 string";
  693. }
  694. return result;
  695. }
  696. SIMPLE_UDF(TFromCodePointList, TUtf8(TAutoMap<TListType<ui32>>)) {
  697. auto input = args[0];
  698. if (auto elems = input.GetElements()) {
  699. const auto elemCount = input.GetListLength();
  700. auto bufferSize = WideToUTF8BufferSize(elemCount);
  701. TTempBuf buffer(bufferSize);
  702. auto bufferPtr = buffer.Data();
  703. auto bufferEnd = buffer.Data() + bufferSize;
  704. for (ui64 i = 0; i != elemCount; ++i) {
  705. const auto& item = elems[i];
  706. const wchar32 rune = item.Get<ui32>();
  707. size_t written = 0;
  708. WideToUTF8(&rune, 1, bufferPtr, written);
  709. Y_ENSURE(written <= 4);
  710. bufferPtr += written;
  711. Y_ENSURE(bufferPtr <= bufferEnd);
  712. }
  713. return valueBuilder->NewString(TStringRef(buffer.Data(), bufferPtr - buffer.Data()));
  714. }
  715. std::vector<char, NUdf::TStdAllocatorForUdf<char>> buffer;
  716. buffer.reserve(TUnboxedValuePod::InternalBufferSize);
  717. const auto& iter = input.GetListIterator();
  718. char runeBuffer[4] = {};
  719. for (NUdf::TUnboxedValue item; iter.Next(item); ) {
  720. const wchar32 rune = item.Get<ui32>();
  721. size_t written = 0;
  722. WideToUTF8(&rune, 1, runeBuffer, written);
  723. Y_ENSURE(written <= 4);
  724. buffer.insert(buffer.end(), runeBuffer, runeBuffer + written);
  725. }
  726. return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
  727. }
  // Comma-separated list of UDF class names (it includes TSplitToList, TJoinFromList, TToCodePointList
  // and TFromCodePointList defined above). A new UDF has to be appended here with a trailing
  // line-continuation backslash on the preceding entry.
  // NOTE(review): the consumer of this list is not visible in this file section - presumably it is
  // expanded where the module's functions are registered; confirm at the use site.
  728. #define EXPORTED_UNICODE_BASE_UDF \
  729. TIsUtf, \
  730. TGetLength, \
  731. TSubstring, \
  732. TFind, \
  733. TRFind, \
  734. TSplitToList, \
  735. TJoinFromList, \
  736. TLevensteinDistance, \
  737. TReplaceAll, \
  738. TReplaceFirst, \
  739. TReplaceLast, \
  740. TRemoveAll, \
  741. TRemoveFirst, \
  742. TRemoveLast, \
  743. TToCodePointList, \
  744. TFromCodePointList, \
  745. TReverse, \
  746. TToLower, \
  747. TToUpper, \
  748. TToTitle, \
  749. TToUint64, \
  750. TTryToUint64, \
  751. TStrip, \
  752. TIsUnicodeSet, \
  753. TNormalize, \
  754. TNormalizeNFD, \
  755. TNormalizeNFC, \
  756. TNormalizeNFKD, \
  757. TNormalizeNFKC, \
  758. TIsAscii, \
  759. TIsSpace, \
  760. TIsUpper, \
  761. TIsLower, \
  762. TIsDigit, \
  763. TIsAlpha, \
  764. TIsAlnum, \
  765. TIsHex
  766. }