unicode_base_udf.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. #pragma once
  2. #include <yql/essentials/public/udf/udf_allocator.h>
  3. #include <yql/essentials/public/udf/udf_helpers.h>
  4. #include <yql/essentials/utils/utf8.h>
  5. #include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>
  6. #include <library/cpp/unicode/normalization/normalization.h>
  7. #include <library/cpp/unicode/set/unicode_set.h>
  8. #include <library/cpp/deprecated/split/split_iterator.h>
  9. #include <util/string/join.h>
  10. #include <util/string/reverse.h>
  11. #include <util/string/split.h>
  12. #include <util/string/subst.h>
  13. #include <util/charset/wide.h>
  14. #include <util/charset/utf8.h>
  15. #include <util/string/strip.h>
  16. #include <util/string/ascii.h>
  17. #include <util/charset/unidata.h>
  18. using namespace NYql;
  19. using namespace NUdf;
  20. using namespace NUnicode;
  21. namespace {
  22. template <class It>
  23. struct TIsUnicodeSpaceAdapter {
  24. bool operator()(const It& it) const noexcept {
  25. return IsSpace(*it);
  26. }
  27. };
  28. template <class It>
  29. TIsUnicodeSpaceAdapter<It> IsUnicodeSpaceAdapter(It) {
  30. return {};
  31. }
  32. #define NORMALIZE_UDF_MAP(XX) \
  33. XX(Normalize, NFC) \
  34. XX(NormalizeNFD, NFD) \
  35. XX(NormalizeNFC, NFC) \
  36. XX(NormalizeNFKD, NFKD) \
  37. XX(NormalizeNFKC, NFKC)
  38. #define IS_CATEGORY_UDF_MAP(XX) \
  39. XX(IsAscii, IsAscii) \
  40. XX(IsSpace, IsSpace) \
  41. XX(IsUpper, IsUpper) \
  42. XX(IsLower, IsLower) \
  43. XX(IsDigit, IsDigit) \
  44. XX(IsAlpha, IsAlpha) \
  45. XX(IsAlnum, IsAlnum) \
  46. XX(IsHex, IsHexdigit)
  47. #define NORMALIZE_UDF(name, mode) \
  48. SIMPLE_UDF(T##name, TUtf8(TAutoMap<TUtf8>)) { \
  49. const auto& inputRef = args[0].AsStringRef(); \
  50. const TUtf16String& input = UTF8ToWide(inputRef.Data(), inputRef.Size()); \
  51. const TString& output = WideToUTF8(Normalize<mode>(input)); \
  52. return valueBuilder->NewString(output); \
  53. }
  54. #define IS_CATEGORY_UDF(udfName, function) \
  55. SIMPLE_UDF(T##udfName, bool(TAutoMap<TUtf8>)) { \
  56. Y_UNUSED(valueBuilder); \
  57. const TStringBuf input(args[0].AsStringRef()); \
  58. bool result = true; \
  59. wchar32 rune; \
  60. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); \
  61. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end()); \
  62. while (cur != last) { \
  63. ReadUTF8CharAndAdvance(rune, cur, last); \
  64. if (!function(rune)) { \
  65. result = false; \
  66. break; \
  67. } \
  68. } \
  69. return TUnboxedValuePod(result); \
  70. }
  71. NORMALIZE_UDF_MAP(NORMALIZE_UDF)
  72. IS_CATEGORY_UDF_MAP(IS_CATEGORY_UDF)
  73. SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) {
  74. Y_UNUSED(valueBuilder);
  75. if (args[0]) {
  76. return TUnboxedValuePod(IsUtf8(args[0].AsStringRef()));
  77. } else {
  78. return TUnboxedValuePod(false);
  79. }
  80. }
  81. SIMPLE_UDF(TGetLength, ui64(TAutoMap<TUtf8>)) {
  82. Y_UNUSED(valueBuilder);
  83. const auto& inputRef = args[0].AsStringRef();
  84. size_t result;
  85. GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
  86. return TUnboxedValuePod(static_cast<ui64>(result));
  87. }
  88. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TToUint64, ui64(TAutoMap<TUtf8>, TOptional<ui16>), 1) {
  89. Y_UNUSED(valueBuilder);
  90. const TString inputStr(args[0].AsStringRef());
  91. const char* input = inputStr.data();
  92. const int base = static_cast<int>(args[1].GetOrDefault<ui16>(0));
  93. char* pos = nullptr;
  94. errno = 0;
  95. unsigned long long res = std::strtoull(input, &pos, base);
  96. if (!res && errno == EINVAL) {
  97. UdfTerminate("Incorrect base");
  98. }
  99. ui64 ret = static_cast<ui64>(res);
  100. if (!res && pos == input) {
  101. UdfTerminate("Input string is not a number");
  102. } else if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
  103. UdfTerminate("Converted value falls out of Uint64 range");
  104. } else if (*pos) {
  105. UdfTerminate("Input string contains junk after the number");
  106. }
  107. return TUnboxedValuePod(ret);
  108. }
  109. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TTryToUint64, TOptional<ui64>(TAutoMap<TUtf8>, TOptional<ui16>), 1) {
  110. Y_UNUSED(valueBuilder);
  111. const TString inputStr(args[0].AsStringRef());
  112. const char* input = inputStr.data();
  113. const int base = static_cast<int>(args[1].GetOrDefault<ui16>(0));
  114. char* pos = nullptr;
  115. errno = 0;
  116. unsigned long long res = std::strtoull(input, &pos, base);
  117. if (!res && errno == EINVAL) {
  118. return TUnboxedValuePod();
  119. }
  120. ui64 ret = static_cast<ui64>(res);
  121. if (!res && pos == input) {
  122. return TUnboxedValuePod();
  123. }
  124. if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
  125. return TUnboxedValuePod();
  126. }
  127. if (*pos) {
  128. return TUnboxedValuePod();
  129. }
  130. return TUnboxedValuePod(ret);
  131. }
  132. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSubstring, TUtf8(TAutoMap<TUtf8>, TOptional<ui64>, TOptional<ui64>), 1) {
  133. const TStringBuf input(args[0].AsStringRef());
  134. size_t from = args[1].GetOrDefault<ui64>(0);
  135. size_t len = !args[2] ? TStringBuf::npos : size_t(args[2].Get<ui64>());
  136. return valueBuilder->NewString(SubstrUTF8(input, from, len));
  137. }
  138. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
  139. Y_UNUSED(valueBuilder);
  140. const std::string_view string(args[0].AsStringRef());
  141. const std::string_view needle(args[1].AsStringRef());
  142. std::string_view::size_type pos = 0U;
  143. if (auto p = args[2].GetOrDefault<ui64>(0ULL)) {
  144. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  145. const auto width = WideCharSize(*ptr);
  146. pos += width;
  147. ptr += width;
  148. }
  149. }
  150. if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
  151. size_t result;
  152. GetNumberOfUTF8Chars(string.data(), find, result);
  153. return TUnboxedValuePod(static_cast<ui64>(result));
  154. }
  155. return TUnboxedValuePod();
  156. }
  157. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TRFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
  158. Y_UNUSED(valueBuilder);
  159. const std::string_view string(args[0].AsStringRef());
  160. const std::string_view needle(args[1].AsStringRef());
  161. std::string_view::size_type pos = std::string_view::npos;
  162. if (auto p = args[2].GetOrDefault<ui64>(std::string_view::npos); std::string_view::npos != p) {
  163. pos = 0ULL;
  164. for (auto ptr = string.data(); p && pos < string.size(); --p) {
  165. const auto width = WideCharSize(*ptr);
  166. pos += width;
  167. ptr += width;
  168. }
  169. }
  170. if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
  171. size_t result;
  172. GetNumberOfUTF8Chars(string.data(), find, result);
  173. return TUnboxedValuePod(static_cast<ui64>(result));
  174. }
  175. return TUnboxedValuePod();
  176. }
  177. using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>;
  178. template <typename TIt>
  179. static void SplitToListImpl(
  180. const IValueBuilder* valueBuilder,
  181. const TUnboxedValue& input,
  182. const std::string_view::const_iterator from,
  183. const TIt& it,
  184. TTmpVector& result) {
  185. for (const auto& elem : it) {
  186. result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim())));
  187. }
  188. }
  189. template <typename TIt>
  190. static void SplitToListImpl(
  191. const IValueBuilder* valueBuilder,
  192. const TUnboxedValue& input,
  193. const TUtf16String::const_iterator start,
  194. const TIt& it,
  195. TTmpVector& result) {
  196. const std::string_view& original = input.AsStringRef();
  197. size_t charPos = 0U, bytePos = 0U;
  198. for (const auto& elem : it) {
  199. for (const size_t next = std::distance(start, elem.TokenStart()); charPos < next; ++charPos)
  200. bytePos += WideCharSize(original[bytePos]);
  201. const auto from = bytePos;
  202. for (const size_t next = charPos + std::distance(elem.TokenStart(), elem.TokenDelim()); charPos < next; ++charPos)
  203. bytePos += WideCharSize(original[bytePos]);
  204. const auto size = bytePos - from;
  205. result.emplace_back(valueBuilder->SubString(input, from, size));
  206. }
  207. }
  208. template <typename TIt, typename TStrIt>
  209. static void SplitToListImpl(
  210. const IValueBuilder* valueBuilder,
  211. const TUnboxedValue& input,
  212. const TStrIt from,
  213. TIt& it,
  214. bool skipEmpty,
  215. TTmpVector& result) {
  216. if (skipEmpty) {
  217. SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result);
  218. } else {
  219. SplitToListImpl(valueBuilder, input, from, it, result);
  220. }
  221. }
  222. constexpr char delimeterStringName[] = "DelimeterString";
  223. constexpr char skipEmptyName[] = "SkipEmpty";
  224. constexpr char limitName[] = "Limit";
  225. using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>;
  226. using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>;
  227. using TLimitArg = TNamedArg<ui64, limitName>;
  228. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<TUtf8>(
  229. TOptional<TUtf8>,
  230. TUtf8,
  231. TDelimeterStringArg,
  232. TSkipEmptyArg,
  233. TLimitArg
  234. ),
  235. 3) {
  236. TTmpVector result;
  237. if (args[0]) {
  238. const bool delimiterString = args[2].GetOrDefault<bool>(true);
  239. const bool skipEmpty = args[3].GetOrDefault<bool>(false);
  240. const auto limit = args[4].GetOrDefault<ui64>(0);
  241. if (delimiterString) {
  242. const std::string_view input(args[0].AsStringRef());
  243. const std::string_view delimeter(args[1].AsStringRef());
  244. if (limit) {
  245. auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1);
  246. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  247. } else {
  248. auto it = StringSplitter(input).SplitByString(delimeter);
  249. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  250. }
  251. } else {
  252. const auto& input = UTF8ToWide(args[0].AsStringRef());
  253. const auto& delimeter = UTF8ToWide(args[1].AsStringRef());
  254. if (limit) {
  255. auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1);
  256. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  257. } else {
  258. auto it = StringSplitter(input).SplitBySet(delimeter.c_str());
  259. SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
  260. }
  261. }
  262. }
  263. return valueBuilder->NewList(result.data(), result.size());
  264. }
  265. SIMPLE_UDF(TJoinFromList, TUtf8(TAutoMap<TListType<TOptional<TUtf8>>>, TUtf8)) {
  266. const auto input = args[0].GetListIterator();
  267. const std::string_view delimeter(args[1].AsStringRef());
  268. std::vector<TString> items;
  269. for (TUnboxedValue current; input.Next(current);) {
  270. if (current) {
  271. items.emplace_back(current.AsStringRef());
  272. }
  273. }
  274. return valueBuilder->NewString(JoinSeq(delimeter, items));
  275. }
  276. SIMPLE_UDF(TLevensteinDistance, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>)) {
  277. Y_UNUSED(valueBuilder);
  278. const TStringBuf left(args[0].AsStringRef());
  279. const TStringBuf right(args[1].AsStringRef());
  280. const TUtf16String& leftWide = UTF8ToWide(left);
  281. const TUtf16String& rightWide = UTF8ToWide(right);
  282. const ui64 result = NLevenshtein::Distance(leftWide, rightWide);
  283. return TUnboxedValuePod(result);
  284. }
  285. SIMPLE_UDF(TReplaceAll, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  286. if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
  287. return valueBuilder->NewString(result);
  288. else
  289. return args[0];
  290. }
  291. SIMPLE_UDF(TReplaceFirst, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  292. std::string result(args[0].AsStringRef());
  293. const std::string_view what(args[1].AsStringRef());
  294. if (const auto index = result.find(what); index != std::string::npos) {
  295. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  296. return valueBuilder->NewString(result);
  297. }
  298. return args[0];
  299. }
  300. SIMPLE_UDF(TReplaceLast, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
  301. std::string result(args[0].AsStringRef());
  302. const std::string_view what(args[1].AsStringRef());
  303. if (const auto index = result.rfind(what); index != std::string::npos) {
  304. result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
  305. return valueBuilder->NewString(result);
  306. }
  307. return args[0];
  308. }
  309. SIMPLE_UDF(TRemoveAll, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  310. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  311. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  312. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  313. size_t tpos = 0;
  314. for (const wchar32 c : input) {
  315. if (!chars.contains(c)) {
  316. input[tpos++] = c;
  317. }
  318. }
  319. if (tpos != input.size()) {
  320. input.resize(tpos);
  321. return valueBuilder->NewString(WideToUTF8(input));
  322. }
  323. return args[0];
  324. }
  325. SIMPLE_UDF(TRemoveFirst, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  326. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  327. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  328. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  329. for (auto it = input.cbegin(); it != input.cend(); ++it) {
  330. if (chars.contains(*it)) {
  331. input.erase(it);
  332. return valueBuilder->NewString(WideToUTF8(input));
  333. }
  334. }
  335. return args[0];
  336. }
  337. SIMPLE_UDF(TRemoveLast, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
  338. TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  339. const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
  340. const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
  341. for (auto it = input.crbegin(); it != input.crend(); ++it) {
  342. if (chars.contains(*it)) {
  343. input.erase(input.crend() - it - 1, 1);
  344. return valueBuilder->NewString(WideToUTF8(input));
  345. }
  346. }
  347. return args[0];
  348. }
  349. SIMPLE_UDF(TToCodePointList, TListType<ui32>(TAutoMap<TUtf8>)) {
  350. size_t codePointCount = 0;
  351. const auto& inputRef = args[0].AsStringRef();
  352. if (!GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), codePointCount)) {
  353. // should not happen but still we have to check return code
  354. ythrow yexception() << "Unable to count code points";
  355. }
  356. TUnboxedValue* itemsPtr = nullptr;
  357. auto result = valueBuilder->NewArray(codePointCount, itemsPtr);
  358. const unsigned char* current = reinterpret_cast<const unsigned char*>(inputRef.Data());
  359. const unsigned char* end = current + inputRef.Size();
  360. wchar32 rune = BROKEN_RUNE;
  361. ui32 codePointIndex = 0;
  362. RECODE_RESULT retcode = RECODE_OK;
  363. while (current < end && RECODE_OK == (retcode = ReadUTF8CharAndAdvance(rune, current, end))) {
  364. if (codePointIndex >= codePointCount) {
  365. // sanity check
  366. ythrow yexception() << "Too big code point index " << codePointIndex << ", expecting only " << codePointCount << " code points";
  367. }
  368. itemsPtr[codePointIndex++] = TUnboxedValuePod(static_cast<ui32>(rune));
  369. }
  370. if (retcode != RECODE_OK) {
  371. ythrow yexception() << "Malformed UTF-8 string";
  372. }
  373. return result;
  374. }
  375. SIMPLE_UDF(TFromCodePointList, TUtf8(TAutoMap<TListType<ui32>>)) {
  376. auto input = args[0];
  377. if (auto elems = input.GetElements()) {
  378. const auto elemCount = input.GetListLength();
  379. auto bufferSize = WideToUTF8BufferSize(elemCount);
  380. TTempBuf buffer(bufferSize);
  381. auto bufferPtr = buffer.Data();
  382. auto bufferEnd = buffer.Data() + bufferSize;
  383. for (ui64 i = 0; i != elemCount; ++i) {
  384. const auto& item = elems[i];
  385. const wchar32 rune = item.Get<ui32>();
  386. size_t written = 0;
  387. WideToUTF8(&rune, 1, bufferPtr, written);
  388. Y_ENSURE(written <= 4);
  389. bufferPtr += written;
  390. Y_ENSURE(bufferPtr <= bufferEnd);
  391. }
  392. return valueBuilder->NewString(TStringRef(buffer.Data(), bufferPtr - buffer.Data()));
  393. }
  394. std::vector<char, NUdf::TStdAllocatorForUdf<char>> buffer;
  395. buffer.reserve(TUnboxedValuePod::InternalBufferSize);
  396. const auto& iter = input.GetListIterator();
  397. char runeBuffer[4] = {};
  398. for (NUdf::TUnboxedValue item; iter.Next(item); ) {
  399. const wchar32 rune = item.Get<ui32>();
  400. size_t written = 0;
  401. WideToUTF8(&rune, 1, runeBuffer, written);
  402. Y_ENSURE(written <= 4);
  403. buffer.insert(buffer.end(), runeBuffer, runeBuffer + written);
  404. }
  405. return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
  406. }
  407. SIMPLE_UDF(TReverse, TUtf8(TAutoMap<TUtf8>)) {
  408. auto wide = UTF8ToWide(args[0].AsStringRef());
  409. ReverseInPlace(wide);
  410. return valueBuilder->NewString(WideToUTF8(wide));
  411. }
  412. SIMPLE_UDF(TToLower, TUtf8(TAutoMap<TUtf8>)) {
  413. if (auto wide = UTF8ToWide(args->AsStringRef()); ToLower(wide))
  414. return valueBuilder->NewString(WideToUTF8(wide));
  415. else
  416. return *args;
  417. }
  418. SIMPLE_UDF(TToUpper, TUtf8(TAutoMap<TUtf8>)) {
  419. if (auto wide = UTF8ToWide(args->AsStringRef()); ToUpper(wide))
  420. return valueBuilder->NewString(WideToUTF8(wide));
  421. else
  422. return *args;
  423. }
  424. SIMPLE_UDF(TToTitle, TUtf8(TAutoMap<TUtf8>)) {
  425. if (auto wide = UTF8ToWide(args->AsStringRef()); ToTitle(wide))
  426. return valueBuilder->NewString(WideToUTF8(wide));
  427. else
  428. return *args;
  429. }
  430. SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) {
  431. const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
  432. const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
  433. return valueBuilder->NewString(WideToUTF8(result));
  434. }
  435. SIMPLE_UDF(TIsUnicodeSet, bool(TAutoMap<TUtf8>, TUtf8)) {
  436. Y_UNUSED(valueBuilder);
  437. const TStringBuf input(args[0].AsStringRef());
  438. const TUtf16String& customCategory = UTF8ToWide(args[1].AsStringRef());
  439. TUnicodeSet unicodeSet;
  440. try {
  441. unicodeSet.Parse(customCategory);
  442. } catch (...) {
  443. UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
  444. }
  445. bool result = true;
  446. wchar32 rune;
  447. const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
  448. const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
  449. while (cur != last) {
  450. ReadUTF8CharAndAdvance(rune, cur, last);
  451. if (!unicodeSet.Has(rune)) {
  452. result = false;
  453. break;
  454. }
  455. }
  456. return TUnboxedValuePod(result);
  457. }
  458. #define REGISTER_NORMALIZE_UDF(name, mode) T##name,
  459. #define REGISTER_IS_CATEGORY_UDF(name, function) T##name,
  460. #define EXPORTED_UNICODE_BASE_UDF \
  461. NORMALIZE_UDF_MAP(REGISTER_NORMALIZE_UDF) \
  462. IS_CATEGORY_UDF_MAP(REGISTER_IS_CATEGORY_UDF) \
  463. TIsUtf, \
  464. TGetLength, \
  465. TSubstring, \
  466. TFind, \
  467. TRFind, \
  468. TSplitToList, \
  469. TJoinFromList, \
  470. TLevensteinDistance, \
  471. TReplaceAll, \
  472. TReplaceFirst, \
  473. TReplaceLast, \
  474. TRemoveAll, \
  475. TRemoveFirst, \
  476. TRemoveLast, \
  477. TToCodePointList, \
  478. TFromCodePointList, \
  479. TReverse, \
  480. TToLower, \
  481. TToUpper, \
  482. TToTitle, \
  483. TToUint64, \
  484. TTryToUint64, \
  485. TStrip, \
  486. TIsUnicodeSet
  487. }