unicode_base_udf.h 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629
  1. #pragma once
  2. #include <yql/essentials/public/udf/udf_allocator.h>
  3. #include <yql/essentials/public/udf/udf_helpers.h>
  4. #include <yql/essentials/utils/utf8.h>
  5. #include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>
  6. #include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>
  7. #include <library/cpp/unicode/normalization/normalization.h>
  8. #include <library/cpp/unicode/set/unicode_set.h>
  9. #include <library/cpp/deprecated/split/split_iterator.h>
  10. #include <util/string/join.h>
  11. #include <util/string/reverse.h>
  12. #include <util/string/split.h>
  13. #include <util/string/subst.h>
  14. #include <util/charset/wide.h>
  15. #include <util/charset/utf8.h>
  16. #include <util/string/strip.h>
  17. #include <util/string/ascii.h>
  18. #include <util/charset/unidata.h>
  19. using namespace NYql;
  20. using namespace NUdf;
  21. using namespace NUnicode;
  22. namespace {
  23. inline constexpr bool IsAscii(wchar32 c) noexcept {
  24. return ::IsAscii(c);
  25. }
  // Predicate over iterators: dereferences the iterator and reports whether the
  // character it points to satisfies IsSpace.
  template <class It>
  struct TIsUnicodeSpaceAdapter {
      bool operator()(const It& position) const noexcept {
          return IsSpace(*position);
      }
  };
  32. template <class It>
  33. TIsUnicodeSpaceAdapter<It> IsUnicodeSpaceAdapter(It) {
  34. return {};
  35. }
  // Empty tag type. An Execute() returning std::variant<TNoChangesTag, T> uses it
  // to tell the mixins to hand back the original argument unchanged.
  struct TNoChangesTag {
  };
  37. template <typename TDerived>
  38. struct TScalarOperationMixin {
  39. static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) {
  40. Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef()));
  41. auto&& executeResult = TDerived::Execute(args[0].AsStringRef());
  42. return ProcessResult(builder, std::move(executeResult), args);
  43. }
  44. private:
  45. static TUnboxedValue ProcessResult(const IValueBuilder* builder, TString&& newString, const TUnboxedValuePod*) {
  46. return builder->NewString(std::move(newString));
  47. }
  48. template <typename T>
  49. static TUnboxedValue ProcessResult(const IValueBuilder* builder, std::variant<TNoChangesTag, T> newValue, const TUnboxedValuePod* initialArg) {
  50. if (std::holds_alternative<T>(newValue)) {
  51. return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg);
  52. } else {
  53. return initialArg[0];
  54. }
  55. }
  56. static TUnboxedValue ProcessResult(const IValueBuilder* builder, bool result, const TUnboxedValuePod*) {
  57. Y_UNUSED(builder);
  58. return TUnboxedValuePod(result);
  59. }
  60. };
  61. template <typename TDerived>
  62. struct TBlockOperationMixin {
  63. template <typename Sync>
  64. static void DoExecute(const TBlockItem arg, const Sync& sync) {
  65. Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef()));
  66. auto&& executeResult = TDerived::Execute(arg.AsStringRef());
  67. TBlockItem boxedValue = ProcessResult(std::move(executeResult), arg);
  68. sync(boxedValue);
  69. }
  70. private:
  71. static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) {
  72. Y_UNUSED(arg);
  73. return TBlockItem(std::move(newString));
  74. }
  75. template <typename T>
  76. static TBlockItem ProcessResult(const std::variant<TNoChangesTag, T>& newValue, const TBlockItem arg) {
  77. if (std::holds_alternative<T>(newValue)) {
  78. return ProcessResult(std::get<T>(newValue), arg);
  79. } else {
  80. return arg;
  81. }
  82. }
  83. static TBlockItem ProcessResult(bool result, const TBlockItem arg) {
  84. Y_UNUSED(arg);
  85. return TBlockItem(result);
  86. }
  87. };
  88. template <typename TDerived>
  89. struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {
  90. using TBlockOperationMixin<TDerived>::DoExecute;
  91. using TScalarOperationMixin<TDerived>::DoExecute;
  92. };
  93. template <auto mode>
  94. struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> {
  95. static TString Execute(TStringRef arg) {
  96. const TUtf16String& input = UTF8ToWide(arg.Data(), arg.Size());
  97. return WideToUTF8(Normalize<mode>(input));
  98. }
  99. };
  100. template <bool (*Function)(wchar32)>
  101. struct TCheckAllChars: public TOperationMixin<TCheckAllChars<Function>> {
  102. static bool Execute(TStringRef arg) {
  103. const TStringBuf input(arg);
  104. wchar32 rune;
  105. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
  106. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
  107. while (cur != last) {
  108. ReadUTF8CharAndAdvance(rune, cur, last);
  109. if (!static_cast<bool (*)(wchar32)>(Function)(rune)) {
  110. return false;
  111. }
  112. }
  113. return true;
  114. }
  115. };
  116. template <bool (*Function)(TUtf16String&, size_t pos, size_t count)>
  117. struct TStringToStringMapper: public TOperationMixin<TStringToStringMapper<Function>> {
  118. static std::variant<TNoChangesTag, TString> Execute(TStringRef arg) {
  119. if (auto wide = UTF8ToWide(arg);
  120. static_cast<bool (*)(TUtf16String&, size_t pos, size_t count)>(Function)(wide, 0, TUtf16String::npos)) {
  121. return WideToUTF8(std::move(wide));
  122. } else {
  123. return TNoChangesTag{};
  124. }
  125. }
  126. };
  // Declares the UDF class T<udfName> with the given signature. The scalar body
  // and the block kernel (T<udfName>KernelExec) both delegate to
  // Executor::DoExecute, so one executor serves both execution paths.
  #define DEFINE_UTF8_OPERATION(udfName, Executor, signature)                              \
      BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, signature) {                               \
          return Executor::DoExecute(valueBuilder, args);                                  \
      }                                                                                    \
                                                                                           \
      struct T##udfName##KernelExec: public TUnaryKernelExec<T##udfName##KernelExec> {     \
          template <typename TConsumer>                                                    \
          static void Process(const IValueBuilder*, TBlockItem item, const TConsumer& consumer) { \
              Executor::DoExecute(item, consumer);                                         \
          }                                                                                \
      };                                                                                   \
                                                                                           \
      END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
  // Normalization UDFs. Plain Normalize uses the same NFC mode as NormalizeNFC.
  DEFINE_UTF8_OPERATION(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>));
  // Character-class predicates: true when every code point passes the check.
  DEFINE_UTF8_OPERATION(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>));
  // Case conversions applied to the whole string.
  DEFINE_UTF8_OPERATION(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>));
  DEFINE_UTF8_OPERATION(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>));
  158. SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) {
  159. Y_UNUSED(valueBuilder);
  160. if (args[0]) {
  161. return TUnboxedValuePod(IsUtf8(args[0].AsStringRef()));
  162. } else {
  163. return TUnboxedValuePod(false);
  164. }
  165. }
  166. SIMPLE_UDF(TGetLength, ui64(TAutoMap<TUtf8>)) {
  167. Y_UNUSED(valueBuilder);
  168. const auto& inputRef = args[0].AsStringRef();
  169. size_t result;
  170. GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
  171. return TUnboxedValuePod(static_cast<ui64>(result));
  172. }
  173. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TToUint64, ui64(TAutoMap<TUtf8>, TOptional<ui16>), 1) {
  174. Y_UNUSED(valueBuilder);
  175. const TString inputStr(args[0].AsStringRef());
  176. const char* input = inputStr.data();
  177. const int base = static_cast<int>(args[1].GetOrDefault<ui16>(0));
  178. char* pos = nullptr;
  179. errno = 0;
  180. unsigned long long res = std::strtoull(input, &pos, base);
  181. if (!res && errno == EINVAL) {
  182. UdfTerminate("Incorrect base");
  183. }
  184. ui64 ret = static_cast<ui64>(res);
  185. if (!res && pos == input) {
  186. UdfTerminate("Input string is not a number");
  187. } else if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
  188. UdfTerminate("Converted value falls out of Uint64 range");
  189. } else if (*pos) {
  190. UdfTerminate("Input string contains junk after the number");
  191. }
  192. return TUnboxedValuePod(ret);
  193. }
  194. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TTryToUint64, TOptional<ui64>(TAutoMap<TUtf8>, TOptional<ui16>), 1) {
  195. Y_UNUSED(valueBuilder);
  196. const TString inputStr(args[0].AsStringRef());
  197. const char* input = inputStr.data();
  198. const int base = static_cast<int>(args[1].GetOrDefault<ui16>(0));
  199. char* pos = nullptr;
  200. errno = 0;
  201. unsigned long long res = std::strtoull(input, &pos, base);
  202. if (!res && errno == EINVAL) {
  203. return TUnboxedValuePod();
  204. }
  205. ui64 ret = static_cast<ui64>(res);
  206. if (!res && pos == input) {
  207. return TUnboxedValuePod();
  208. }
  209. if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
  210. return TUnboxedValuePod();
  211. }
  212. if (*pos) {
  213. return TUnboxedValuePod();
  214. }
  215. return TUnboxedValuePod(ret);
  216. }
  217. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSubstring, TUtf8(TAutoMap<TUtf8>, TOptional<ui64>, TOptional<ui64>), 1) {
  218. const TStringBuf input(args[0].AsStringRef());
  219. size_t from = args[1].GetOrDefault<ui64>(0);
  220. size_t len = !args[2] ? TStringBuf::npos : size_t(args[2].Get<ui64>());
  221. return valueBuilder->NewString(SubstrUTF8(input, from, len));
  222. }
  223. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
  224. Y_UNUSED(valueBuilder);
  225. const std::string_view string(args[0].AsStringRef());
  226. const std::string_view needle(args[1].AsStringRef());
  227. std::string_view::size_type pos = 0U;
  228. if (auto p = args[2].GetOrDefault<ui64>(0ULL)) {
  229. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  230. const auto width = WideCharSize(*ptr);
  231. pos += width;
  232. ptr += width;
  233. }
  234. }
  235. if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
  236. size_t result;
  237. GetNumberOfUTF8Chars(string.data(), find, result);
  238. return TUnboxedValuePod(static_cast<ui64>(result));
  239. }
  240. return TUnboxedValuePod();
  241. }
  242. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TRFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
  243. Y_UNUSED(valueBuilder);
  244. const std::string_view string(args[0].AsStringRef());
  245. const std::string_view needle(args[1].AsStringRef());
  246. std::string_view::size_type pos = std::string_view::npos;
  247. if (auto p = args[2].GetOrDefault<ui64>(std::string_view::npos); std::string_view::npos != p) {
  248. pos = 0ULL;
  249. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  250. const auto width = WideCharSize(*ptr);
  251. pos += width;
  252. ptr += width;
  253. }
  254. }
  255. if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
  256. size_t result;
  257. GetNumberOfUTF8Chars(string.data(), find, result);
  258. return TUnboxedValuePod(static_cast<ui64>(result));
  259. }
  260. return TUnboxedValuePod();
  261. }
  262. using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>;
  263. template <typename TIt>
  264. static void SplitToListImpl(
  265. const IValueBuilder* valueBuilder,
  266. const TUnboxedValue& input,
  267. const std::string_view::const_iterator from,
  268. const TIt& it,
  269. TTmpVector& result) {
  270. for (const auto& elem : it) {
  271. result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim())));
  272. }
  273. }
  274. template <typename TIt>
  275. static void SplitToListImpl(
  276. const IValueBuilder* valueBuilder,
  277. const TUnboxedValue& input,
  278. const TUtf32String::const_iterator start,
  279. const TIt& it,
  280. TTmpVector& result) {
  281. const std::string_view& original = input.AsStringRef();
  282. size_t charPos = 0U, bytePos = 0U;
  283. for (const auto& elem : it) {
  284. for (const size_t next = std::distance(start, elem.TokenStart()); charPos < next; ++charPos)
  285. bytePos += WideCharSize(original[bytePos]);
  286. const auto from = bytePos;
  287. for (const size_t next = charPos + std::distance(elem.TokenStart(), elem.TokenDelim()); charPos < next; ++charPos)
  288. bytePos += WideCharSize(original[bytePos]);
  289. const auto size = bytePos - from;
  290. result.emplace_back(valueBuilder->SubString(input, from, size));
  291. }
  292. }
  293. template <typename TIt, typename TStrIt>
  294. static void SplitToListImpl(
  295. const IValueBuilder* valueBuilder,
  296. const TUnboxedValue& input,
  297. const TStrIt from,
  298. TIt& it,
  299. bool skipEmpty,
  300. TTmpVector& result) {
  301. if (skipEmpty) {
  302. SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result);
  303. } else {
  304. SplitToListImpl(valueBuilder, input, from, it, result);
  305. }
  306. }
  // Names of the optional named arguments of SplitToList.
  // NOTE(review): these literals are presumably the argument names visible to
  // callers, so the "Delimeter" spelling must not be corrected here - confirm.
  constexpr char delimeterStringName[] = "DelimeterString";
  constexpr char skipEmptyName[] = "SkipEmpty";
  constexpr char limitName[] = "Limit";
  // Typed wrappers binding each name to its value type.
  using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>;
  using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>;
  using TLimitArg = TNamedArg<ui64, limitName>;
  313. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<TUtf8>(
  314. TOptional<TUtf8>,
  315. TUtf8,
  316. TDelimeterStringArg,
  317. TSkipEmptyArg,
  318. TLimitArg
  319. ),
  320. 3) {
  321. TTmpVector result;
  322. if (args[0]) {
  323. const bool delimiterString = args[2].GetOrDefault<bool>(true);
  324. const bool skipEmpty = args[3].GetOrDefault<bool>(false);
  325. const auto limit = args[4].GetOrDefault<ui64>(0);
  326. if (delimiterString) {
  327. const std::string_view input(args[0].AsStringRef());
  328. const std::string_view delimeter(args[1].AsStringRef());
  329. if (limit) {
  330. auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1);
  331. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  332. } else {
  333. auto it = StringSplitter(input).SplitByString(delimeter);
  334. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  335. }
  336. } else {
  337. const auto& input = UTF8ToUTF32<true>(args[0].AsStringRef());
  338. const auto& delimeter = UTF8ToUTF32<true>(args[1].AsStringRef());
  339. if (limit) {
  340. auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1);
  341. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  342. } else {
  343. auto it = StringSplitter(input).SplitBySet(delimeter.c_str());
  344. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  345. }
  346. }
  347. }
  348. return valueBuilder->NewList(result.data(), result.size());
  349. }
  350. SIMPLE_UDF(TJoinFromList, TUtf8(TAutoMap<TListType<TOptional<TUtf8>>>, TUtf8)) {
  351. const auto input = args[0].GetListIterator();
  352. const std::string_view delimeter(args[1].AsStringRef());
  353. std::vector<TString> items;
  354. for (TUnboxedValue current; input.Next(current);) {
  355. if (current) {
  356. items.emplace_back(current.AsStringRef());
  357. }
  358. }
  359. return valueBuilder->NewString(JoinSeq(delimeter, items));
  360. }
  361. SIMPLE_UDF(TLevensteinDistance, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>)) {
  362. Y_UNUSED(valueBuilder);
  363. const TStringBuf left(args[0].AsStringRef());
  364. const TStringBuf right(args[1].AsStringRef());
  365. const auto& leftUtf32 = UTF8ToUTF32<true>(left);
  366. const auto& rightUtf32 = UTF8ToUTF32<true>(right);
  367. const ui64 result = NLevenshtein::Distance(leftUtf32, rightUtf32);
  368. return TUnboxedValuePod(result);
  369. }
  370. SIMPLE_UDF(TReplaceAll, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  371. if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
  372. return valueBuilder->NewString(result);
  373. else
  374. return args[0];
  375. }
  376. SIMPLE_UDF(TReplaceFirst, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  377. std::string result(args[0].AsStringRef());
  378. const std::string_view what(args[1].AsStringRef());
  379. if (const auto index = result.find(what); index != std::string::npos) {
  380. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  381. return valueBuilder->NewString(result);
  382. }
  383. return args[0];
  384. }
  385. SIMPLE_UDF(TReplaceLast, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  386. std::string result(args[0].AsStringRef());
  387. const std::string_view what(args[1].AsStringRef());
  388. if (const auto index = result.rfind(what); index != std::string::npos) {
  389. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  390. return valueBuilder->NewString(result);
  391. }
  392. return args[0];
  393. }
  394. SIMPLE_UDF(TRemoveAll, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  395. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  396. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  397. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  398. size_t tpos = 0;
  399. for (const wchar32 c : input) {
  400. if (!chars.contains(c)) {
  401. input[tpos++] = c;
  402. }
  403. }
  404. if (tpos != input.size()) {
  405. input.resize(tpos);
  406. return valueBuilder->NewString(WideToUTF8(input));
  407. }
  408. return args[0];
  409. }
  410. SIMPLE_UDF(TRemoveFirst, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  411. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  412. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  413. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  414. for (auto it = input.cbegin(); it != input.cend(); ++it) {
  415. if (chars.contains(*it)) {
  416. input.erase(it);
  417. return valueBuilder->NewString(WideToUTF8(input));
  418. }
  419. }
  420. return args[0];
  421. }
  422. SIMPLE_UDF(TRemoveLast, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  423. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  424. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  425. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  426. for (auto it = input.crbegin(); it != input.crend(); ++it) {
  427. if (chars.contains(*it)) {
  428. input.erase(input.crend() - it - 1, 1);
  429. return valueBuilder->NewString(WideToUTF8(input));
  430. }
  431. }
  432. return args[0];
  433. }
  434. SIMPLE_UDF(TToCodePointList, TListType<ui32>(TAutoMap<TUtf8>)) {
  435. size_t codePointCount = 0;
  436. const auto& inputRef = args[0].AsStringRef();
  437. if (!GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), codePointCount)) {
  438. // should not happen but still we have to check return code
  439. ythrow yexception() << "Unable to count code points";
  440. }
  441. TUnboxedValue* itemsPtr = nullptr;
  442. auto result = valueBuilder->NewArray(codePointCount, itemsPtr);
  443. const unsigned char* current = reinterpret_cast<const unsigned char*>(inputRef.Data());
  444. const unsigned char* end = current + inputRef.Size();
  445. wchar32 rune = BROKEN_RUNE;
  446. ui32 codePointIndex = 0;
  447. RECODE_RESULT retcode = RECODE_OK;
  448. while (current < end && RECODE_OK == (retcode = ReadUTF8CharAndAdvance(rune, current, end))) {
  449. if (codePointIndex >= codePointCount) {
  450. // sanity check
  451. ythrow yexception() << "Too big code point index " << codePointIndex << ", expecting only " << codePointCount << " code points";
  452. }
  453. itemsPtr[codePointIndex++] = TUnboxedValuePod(static_cast<ui32>(rune));
  454. }
  455. if (retcode != RECODE_OK) {
  456. ythrow yexception() << "Malformed UTF-8 string";
  457. }
  458. return result;
  459. }
  460. SIMPLE_UDF(TFromCodePointList, TUtf8(TAutoMap<TListType<ui32>>)) {
  461. auto input = args[0];
  462. if (auto elems = input.GetElements()) {
  463. const auto elemCount = input.GetListLength();
  464. auto bufferSize = WideToUTF8BufferSize(elemCount);
  465. TTempBuf buffer(bufferSize);
  466. auto bufferPtr = buffer.Data();
  467. auto bufferEnd = buffer.Data() + bufferSize;
  468. for (ui64 i = 0; i != elemCount; ++i) {
  469. const auto& item = elems[i];
  470. const wchar32 rune = item.Get<ui32>();
  471. size_t written = 0;
  472. WideToUTF8(&rune, 1, bufferPtr, written);
  473. Y_ENSURE(written <= 4);
  474. bufferPtr += written;
  475. Y_ENSURE(bufferPtr <= bufferEnd);
  476. }
  477. return valueBuilder->NewString(TStringRef(buffer.Data(), bufferPtr - buffer.Data()));
  478. }
  479. std::vector<char, NUdf::TStdAllocatorForUdf<char>> buffer;
  480. buffer.reserve(TUnboxedValuePod::InternalBufferSize);
  481. const auto& iter = input.GetListIterator();
  482. char runeBuffer[4] = {};
  483. for (NUdf::TUnboxedValue item; iter.Next(item); ) {
  484. const wchar32 rune = item.Get<ui32>();
  485. size_t written = 0;
  486. WideToUTF8(&rune, 1, runeBuffer, written);
  487. Y_ENSURE(written <= 4);
  488. buffer.insert(buffer.end(), runeBuffer, runeBuffer + written);
  489. }
  490. return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
  491. }
  492. SIMPLE_UDF(TReverse, TUtf8(TAutoMap<TUtf8>)) {
  493. auto wide = UTF8ToWide(args[0].AsStringRef());
  494. ReverseInPlace(wide);
  495. return valueBuilder->NewString(WideToUTF8(wide));
  496. }
  497. SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) {
  498. const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  499. const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
  500. return valueBuilder->NewString(WideToUTF8(result));
  501. }
  502. SIMPLE_UDF(TIsUnicodeSet, bool(TAutoMap<TUtf8>, TUtf8)) {
  503. Y_UNUSED(valueBuilder);
  504. const TStringBuf input(args[0].AsStringRef());
  505. const TUtf16String& customCategory = UTF8ToWide(args[1].AsStringRef());
  506. TUnicodeSet unicodeSet;
  507. try {
  508. unicodeSet.Parse(customCategory);
  509. } catch (...) {
  510. UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
  511. }
  512. bool result = true;
  513. wchar32 rune;
  514. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
  515. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
  516. while (cur != last) {
  517. ReadUTF8CharAndAdvance(rune, cur, last);
  518. if (!unicodeSet.Has(rune)) {
  519. result = false;
  520. break;
  521. }
  522. }
  523. return TUnboxedValuePod(result);
  524. }
  // Comma-separated list of every UDF class defined in this header, for use
  // where the module lists the functions it exports. Keep it in sync when a UDF
  // is added or removed.
  #define EXPORTED_UNICODE_BASE_UDF \
  TIsUtf, \
  TGetLength, \
  TSubstring, \
  TFind, \
  TRFind, \
  TSplitToList, \
  TJoinFromList, \
  TLevensteinDistance, \
  TReplaceAll, \
  TReplaceFirst, \
  TReplaceLast, \
  TRemoveAll, \
  TRemoveFirst, \
  TRemoveLast, \
  TToCodePointList, \
  TFromCodePointList, \
  TReverse, \
  TToLower, \
  TToUpper, \
  TToTitle, \
  TToUint64, \
  TTryToUint64, \
  TStrip, \
  TIsUnicodeSet, \
  TNormalize, \
  TNormalizeNFD, \
  TNormalizeNFC, \
  TNormalizeNFKD, \
  TNormalizeNFKC, \
  TIsAscii, \
  TIsSpace, \
  TIsUpper, \
  TIsLower, \
  TIsDigit, \
  TIsAlpha, \
  TIsAlnum, \
  TIsHex
  563. }