#pragma once

// NOTE(review): in this copy of the file the header names after `#include`
// and every `<...>` template parameter/argument list appear to have been
// stripped (e.g. `template struct ...`, `static_cast(...)`). The code tokens
// below are kept exactly as found; restore them from the upstream file before
// compiling.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace NYql;
using namespace NUdf;
using namespace NUnicode;

namespace {

// Placed inside every operation struct below. It declares a deleted `Execute`
// taking a forwarded parameter pack, next to the struct's real `Execute`.
// NOTE(review): judging by the name, the intent is to reject calls that would
// need an implicit argument conversion — confirm once the template header is
// restored.
#define DISABLE_IMPICT_ARGUMENT_CAST \
    template \
    static auto Execute(Args&&... args) = delete;

// wchar32 overload that forwards to the global ::IsAscii.
inline constexpr bool IsAscii(wchar32 c) noexcept {
    return ::IsAscii(c);
}

// Predicate over iterators: dereferences the iterator and tests the pointed-to
// character with IsSpace.
template
struct TIsUnicodeSpaceAdapter {
    bool operator()(const It& it) const noexcept {
        return IsSpace(*it);
    }
};

// Factory for TIsUnicodeSpaceAdapter; the (unnamed) argument value is not used.
template
TIsUnicodeSpaceAdapter IsUnicodeSpaceAdapter(It) {
    return {};
}

// Tag returned by operations to signal "input was not modified".
struct TNoChangesTag {};

// Scalar (row-by-row) entry points for an operation struct `TDerived`.
//
// Each DoExecute overload is enabled by a requires-clause that checks which
// argument shape `TDerived::Execute` accepts, unpacks `args` accordingly,
// calls `TDerived::Execute` and converts the result to a TUnboxedValue through
// one of the private ProcessResult overloads.
template
struct TScalarOperationMixin {
    // Execute(TStringRef): one string argument. The input is checked with
    // IsUtf8 through Y_DEBUG_ABORT_UNLESS before the call.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef()); }
    {
        Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef()));
        auto executeResult = TDerived::Execute(args[0].AsStringRef());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TMaybe(TStringRef)): one argument that may be empty; an empty
    // args[0] is passed on as Nothing().
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TMaybe(TStringRef())); }
    {
        auto executeResult = TDerived::Execute(args[0] ? TMaybe(args[0].AsStringRef()) : Nothing());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TStringRef, TStringRef): two string arguments.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
    {
        auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TStringRef, TMaybe): string plus one value that may be empty.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef(), TMaybe()); }
    {
        auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1] ? TMaybe(args[1].Get()) : Nothing());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TStringRef, TStringRef, TStringRef): three string arguments.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); }
    {
        auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2].AsStringRef());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TStringRef, TStringRef, TMaybe): two strings plus one value that
    // may be empty.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe()); }
    {
        auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2] ? TMaybe(args[2].Get()) : Nothing());
        return ProcessResult(builder, std::move(executeResult), args);
    }

    // Execute(TStringRef, TMaybe, TMaybe): string plus two values that may be
    // empty.
    static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
        requires requires { TDerived::Execute(TStringRef(), TMaybe(), TMaybe()); }
    {
        auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1] ? TMaybe(args[1].Get()) : Nothing(), args[2] ? TMaybe(args[2].Get()) : Nothing());
        return ProcessResult(builder, std::move(executeResult), args);
    }

private:
    // String results are turned into a new string value via the builder.
    static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TString& newString, const TUnboxedValuePod*) {
        return builder->NewString(newString);
    }

    static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TStringBuf newString, const TUnboxedValuePod*) {
        return builder->NewString(newString);
    }

    // Variant results: when the checked alternative is held, its value is
    // processed recursively; otherwise the first input argument is returned
    // as is.
    // NOTE(review): `newValue` is a const reference, so the std::move here
    // cannot actually move from it.
    template
    static TUnboxedValue ProcessResult(const IValueBuilder* builder, const std::variant& newValue, const TUnboxedValuePod* initialArg) {
        if (std::holds_alternative(newValue)) {
            return ProcessResult(builder, std::move(std::get(newValue)), initialArg);
        } else {
            return initialArg[0];
        }
    }

    // TMaybe results: a defined value is processed recursively, an undefined
    // one becomes a default-constructed TUnboxedValuePod.
    template
    static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TMaybe& newValue, const TUnboxedValuePod* initialArg) {
        if (newValue.Defined()) {
            return ProcessResult(builder, *newValue, initialArg);
        } else {
            return TUnboxedValuePod();
        }
    }

    // Remaining result types are wrapped directly into a TUnboxedValuePod; the
    // builder is not needed.
    template ::Result>>
    static TUnboxedValue ProcessResult(const IValueBuilder* builder, T result, const TUnboxedValuePod*) {
        Y_UNUSED(builder);
        return TUnboxedValuePod(result);
    }
};

// Block (TBlockItem-based) entry points for an operation struct `TDerived`.
//
// Mirrors TScalarOperationMixin: each BlockDoExecute overload is selected by a
// requires-clause on the shape of `TDerived::Execute`, the result is converted
// to a TBlockItem by a private ProcessResult overload and handed to `sink`.
// One- and two-argument forms receive the items directly; three-argument forms
// receive a single item and read their arguments with GetElement(0..2).
template
struct TBlockOperationMixin {
    // Execute(TStringRef); the input is checked with IsUtf8 through
    // Y_DEBUG_ABORT_UNLESS before the call.
    template
    static void BlockDoExecute(const TBlockItem arg, const TSink& sink)
        requires requires { TDerived::Execute(TStringRef()); }
    {
        Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef()));
        auto executeResult = TDerived::Execute(arg.AsStringRef());
        TBlockItem boxedValue = ProcessResult(executeResult, arg);
        sink(boxedValue);
    }

    // Execute(TMaybe(TStringRef)); an empty item is passed on as Nothing().
    template
    static void BlockDoExecute(const TBlockItem arg, const TSink& sink)
        requires requires { TDerived::Execute(TMaybe(TStringRef())); }
    {
        auto executeResult = TDerived::Execute(arg ? TMaybe(arg.AsStringRef()) : Nothing());
        TBlockItem boxedValue = ProcessResult(executeResult, arg);
        sink(boxedValue);
    }

    // Execute(TStringRef, TStringRef).
    template
    static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink)
        requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
    {
        auto executeResult = TDerived::Execute(arg1.AsStringRef(), arg2.AsStringRef());
        TBlockItem boxedValue = ProcessResult(executeResult, arg1);
        sink(boxedValue);
    }

    // Execute(TStringRef, TMaybe).
    template
    static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink)
        requires requires { TDerived::Execute(TStringRef(), TMaybe()); }
    {
        auto executeResult = TDerived::Execute(arg1.AsStringRef(), arg2 ? TMaybe(arg2.Get()) : Nothing());
        TBlockItem boxedValue = ProcessResult(executeResult, arg1);
        sink(boxedValue);
    }

    // Execute(TStringRef, TStringRef, TStringRef); arguments are elements
    // 0..2 of `args`.
    template
    static void BlockDoExecute(const TBlockItem args, const TSink& sink)
        requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); })
    {
        auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(), args.GetElement(1).AsStringRef(), args.GetElement(2).AsStringRef());
        TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
        sink(boxedValue);
    }

    // Execute(TStringRef, TStringRef, TMaybe(0ULL)); element 2 may be empty.
    template
    static void BlockDoExecute(const TBlockItem args, const TSink& sink)
        requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe(0ULL)); })
    {
        auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(), args.GetElement(1).AsStringRef(), (args.GetElement(2) ? TMaybe(args.GetElement(2).Get()) : Nothing()));
        TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
        sink(boxedValue);
    }

    // Execute(TStringRef, TMaybe(0ULL), TMaybe(0ULL)); elements 1 and 2 may be
    // empty.
    template
    static void BlockDoExecute(const TBlockItem args, const TSink& sink)
        requires(requires { TDerived::Execute(TStringRef(), TMaybe(0ULL), TMaybe(0ULL)); })
    {
        auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(), (args.GetElement(1) ? TMaybe(args.GetElement(1).Get()) : Nothing()), (args.GetElement(2) ? TMaybe(args.GetElement(2).Get()) : Nothing()));
        TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
        sink(boxedValue);
    }

private:
    // String results are wrapped in a TBlockItem built from the string.
    // NOTE(review): presumably the item only references the string's bytes, in
    // which case it is valid only while the caller's `executeResult` is alive
    // (it is, across the sink call) — confirm against TBlockItem.
    static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) {
        Y_UNUSED(arg);
        return TBlockItem(newString);
    }

    static TBlockItem ProcessResult(const TStringBuf newString, const TBlockItem arg) {
        Y_UNUSED(arg);
        return TBlockItem(newString);
    }

    // TMaybe results: defined value is processed recursively, undefined one
    // becomes a default-constructed TBlockItem.
    template
    static TBlockItem ProcessResult(const TMaybe& newValue, const TBlockItem arg) {
        if (newValue.Defined()) {
            return ProcessResult(*newValue, arg);
        } else {
            return TBlockItem();
        }
    }

    // Variant results: when the checked alternative is held its value is
    // processed recursively; otherwise the original input item is returned.
    template
    static TBlockItem ProcessResult(const std::variant& newValue, const TBlockItem arg) {
        if (std::holds_alternative(newValue)) {
            return ProcessResult(std::get(newValue), arg);
        } else {
            return arg;
        }
    }

    // Remaining result types are wrapped directly into a TBlockItem.
    template ::Result>>
    static TBlockItem ProcessResult(T result, const TBlockItem arg) {
        Y_UNUSED(arg);
        return TBlockItem(result);
    }
};

// Combines the block and scalar entry points for an operation struct.
template
struct TOperationMixin: public TBlockOperationMixin, public TScalarOperationMixin {};

// Normalization: converts the UTF-8 input to a wide string, applies Normalize
// and converts the result back to UTF-8.
template
struct TNormalizeUTF8: public TOperationMixin> {
    static TString Execute(TStringRef arg) {
        const TUtf16String& input = UTF8ToWide(arg.Data(), arg.Size());
        return WideToUTF8(Normalize(input));
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Returns true when `Function` holds for every rune decoded from the input
// (and therefore true for an empty input); stops at the first rune that fails.
// NOTE(review): the status returned by ReadUTF8CharAndAdvance is not checked
// here — presumably callers guarantee valid UTF-8; confirm.
template
struct TCheckAllChars: public TOperationMixin> {
    static bool Execute(TStringRef arg) {
        const TStringBuf input(arg);
        wchar32 rune;
        const unsigned char* cur = reinterpret_cast(input.begin());
        const unsigned char* last = reinterpret_cast(input.end());
        while (cur != last) {
            ReadUTF8CharAndAdvance(rune, cur, last);
            if (!static_cast(Function)(rune)) {
                return false;
            }
        }
        return true;
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Applies `Function` to the whole wide-string form of the input (arguments
// `wide, 0, TUtf16String::npos`). A true result yields the converted string;
// a false result yields TNoChangesTag so the mixin can reuse the input value.
template
struct TStringToStringMapper: public TOperationMixin> {
    static std::variant Execute(TStringRef arg) {
        if (auto wide = UTF8ToWide(arg);
            static_cast(Function)(wide, 0, TUtf16String::npos)) {
            return WideToUTF8(std::move(wide));
        } else {
            return TNoChangesTag{};
        }
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};
// NOTE(review): the `<...>` template argument lists appear to have been
// stripped from this copy (e.g. `TOperationMixin` without its argument,
// `static_cast(result)`, `std::variant Execute`). Code tokens are kept exactly
// as found; restore them from the upstream file before compiling.

// Counts the UTF-8 characters of the input with GetNumberOfUTF8Chars.
// NOTE(review): the return value of GetNumberOfUTF8Chars is ignored and
// `result` is not initialized, so the returned value is indeterminate if the
// call does not write it — confirm inputs are always valid UTF-8.
struct TLengthGetter: public TOperationMixin {
    static ui64 Execute(TStringRef inputRef) {
        size_t result;
        GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
        return static_cast(result);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Reverses the input after converting it to a wide string, then converts the
// result back to UTF-8.
struct TReverser: public TOperationMixin {
    static TString Execute(TStringRef inputRef) {
        auto wide = UTF8ToWide(inputRef);
        ReverseInPlace(wide);
        return WideToUTF8(wide);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Strips the UTF-32 form of the input with StripString, using the
// IsUnicodeSpaceAdapter predicate, and returns the result as UTF-8.
struct TStripper: public TOperationMixin {
    static TString Execute(TStringRef inputRef) {
        const TUtf32String input = UTF8ToUTF32(inputRef);
        const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
        return WideToUTF8(result);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Removes every code point of `inputRef` that occurs in `removeRef`.
// Returns TNoChangesTag when nothing was removed.
struct TAllRemover: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef removeRef) {
        TUtf32String input = UTF8ToUTF32(inputRef);
        const TUtf32String remove = UTF8ToUTF32(removeRef);
        const std::unordered_set chars(remove.cbegin(), remove.cend());
        // Compact in place: `tpos` is the write position for kept characters
        // and never runs ahead of the read position.
        size_t tpos = 0;
        for (const wchar32 c : input) {
            if (!chars.contains(c)) {
                input[tpos++] = c;
            }
        }
        if (tpos != input.size()) {
            input.resize(tpos);
            return WideToUTF8(input);
        }
        return TNoChangesTag{};
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Removes the first code point of `inputRef` that occurs in `removeRef`.
// Returns TNoChangesTag when no such code point exists.
struct TFirstRemover: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef removeRef) {
        TUtf32String input = UTF8ToUTF32(inputRef);
        const auto remove = UTF8ToUTF32(removeRef);
        const std::unordered_set chars(remove.cbegin(), remove.cend());
        for (auto it = input.cbegin(); it != input.cend(); ++it) {
            if (chars.contains(*it)) {
                // The iterator is not used after erase: we return right away.
                input.erase(it);
                return WideToUTF8(input);
            }
        }
        return TNoChangesTag{};
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Returns true when every rune of the input belongs to the unicode set parsed
// from `customCategoryRef` (true for an empty input). A parse failure ends in
// UdfTerminate with the exception message appended.
// NOTE(review): the set is parsed on every call, and the status returned by
// ReadUTF8CharAndAdvance is not checked.
struct TUnicodeSetMatcher: public TOperationMixin {
    static bool Execute(TStringRef inputRef, TStringRef customCategoryRef) {
        const TStringBuf input(inputRef);
        const TUtf16String& customCategory = UTF8ToWide(customCategoryRef);
        TUnicodeSet unicodeSet;
        try {
            unicodeSet.Parse(customCategory);
        } catch (...) {
            UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
        }
        wchar32 rune;
        const unsigned char* cur = reinterpret_cast(input.begin());
        const unsigned char* last = reinterpret_cast(input.end());
        while (cur != last) {
            ReadUTF8CharAndAdvance(rune, cur, last);
            if (!unicodeSet.Has(rune)) {
                return false;
            }
        }
        return true;
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Computes NLevenshtein::Distance between the UTF-32 forms of the two inputs,
// i.e. the comparison is done on code points rather than bytes.
struct TLevensteinDistanceFinder: public TOperationMixin {
    static ui64 Execute(TStringRef leftRef, TStringRef rightRef) {
        const TStringBuf left(leftRef);
        const TStringBuf right(rightRef);
        const auto& leftUtf32 = UTF8ToUTF32(left);
        const auto& rightUtf32 = UTF8ToUTF32(right);
        return NLevenshtein::Distance(leftUtf32, rightUtf32);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Removes the last code point of `inputRef` that occurs in `removeRef`.
// Returns TNoChangesTag when no such code point exists.
struct TLastRemoval: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef removeRef) {
        TUtf32String input = UTF8ToUTF32(inputRef);
        const TUtf32String remove = UTF8ToUTF32(removeRef);
        const std::unordered_set chars(remove.cbegin(), remove.cend());
        for (auto it = input.crbegin(); it != input.crend(); ++it) {
            if (chars.contains(*it)) {
                // `crend() - it - 1` turns the reverse iterator into a forward
                // index; exactly one character is erased at that index.
                input.erase(input.crend() - it - 1, 1);
                return WideToUTF8(input);
            }
        }
        return TNoChangesTag{};
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Replaces occurrences of `whatReplace` with `toReplace` using SubstGlobal on
// a copy of the input. A zero result from SubstGlobal yields TNoChangesTag.
struct TAllReplacer: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
        if (TString result(inputRef); SubstGlobal(result, whatReplace, toReplace)) {
            return result;
        } else {
            return TNoChangesTag{};
        }
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Replaces the first occurrence of `whatReplace` (found with std::string::find,
// i.e. a byte-wise search) with `toReplace`. Returns TNoChangesTag when there
// is no occurrence.
struct TFirstReplacer: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
        std::string result(inputRef);
        const std::string_view what(whatReplace);
        if (const auto index = result.find(what); index != std::string::npos) {
            result.replace(index, what.size(), std::string_view(toReplace));
            return result;
        }
        return TNoChangesTag{};
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};
// NOTE(review): the `<...>` template parameter/argument lists appear to have
// been stripped from this copy (e.g. `TMaybe Execute`, `static_cast(result)`,
// `template struct`, `TNamedArg`). Code tokens are kept exactly as found;
// restore them from the upstream file before compiling.

// Replaces the last occurrence of `whatReplace` (found with
// std::string::rfind, i.e. a byte-wise search) with `toReplace`. Returns
// TNoChangesTag when there is no occurrence.
struct TLastReplacer: public TOperationMixin {
    static std::variant Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
        std::string result(inputRef);
        const std::string_view what(whatReplace);
        if (const auto index = result.rfind(what); index != std::string::npos) {
            result.replace(index, what.size(), std::string_view(toReplace));
            return result;
        }
        return TNoChangesTag{};
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Forward search for `whatFind` in `inputRef`.
//
// `whereFind` (default 0) is a start offset expressed in characters: the loop
// below converts it to a byte offset by stepping `WideCharSize(*ptr)` bytes
// per character, stopping early at the end of the input. The search itself is
// a byte-wise std::string_view::find. On success the byte offset of the match
// is converted back with GetNumberOfUTF8Chars over the prefix; otherwise
// Nothing() is returned.
// NOTE(review): presumably WideCharSize returns the byte length of the UTF-8
// sequence that starts with the given lead byte — confirm. The return value of
// GetNumberOfUTF8Chars is ignored and `result` is uninitialized.
struct TFinder: public TOperationMixin {
    static TMaybe Execute(TStringRef inputRef, TStringRef whatFind, TMaybe whereFind) {
        const std::string_view string(inputRef);
        const std::string_view needle(whatFind);
        std::string_view::size_type pos = 0U;
        if (auto p = whereFind.GetOrElse(0ULL)) {
            for (auto ptr = string.data(); p && pos < string.size(); --p) {
                const auto width = WideCharSize(*ptr);
                pos += width;
                ptr += width;
            }
        }
        if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
            size_t result;
            GetNumberOfUTF8Chars(string.data(), find, result);
            return static_cast(result);
        }
        return Nothing();
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Backward search for `whatFind` in `inputRef`.
//
// Without `whereFind` (or when it equals npos) rfind starts from npos.
// Otherwise `whereFind` is converted from a character offset to a byte offset
// the same way as in TFinder and used as rfind's start position. The result is
// the character offset of the match, or Nothing() when there is none.
// NOTE(review): the return value of GetNumberOfUTF8Chars is ignored and
// `result` is uninitialized.
struct TRFinder: public TOperationMixin {
    static TMaybe Execute(TStringRef inputRef, TStringRef whatFind, TMaybe whereFind) {
        const std::string_view string(inputRef);
        const std::string_view needle(whatFind);
        std::string_view::size_type pos = std::string_view::npos;
        if (auto p = whereFind.GetOrElse(std::string_view::npos); std::string_view::npos != p) {
            pos = 0ULL;
            for (auto ptr = string.data(); p && pos < string.size(); --p) {
                const auto width = WideCharSize(*ptr);
                pos += width;
                ptr += width;
            }
        }
        if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
            size_t result;
            GetNumberOfUTF8Chars(string.data(), find, result);
            return static_cast(result);
        }
        return Nothing();
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Parses an unsigned integer from the input with std::strtoull.
//
// Failure handling depends on the compile-time flag `strict`: when it is true
// Terminate() returns Nothing(), otherwise it calls UdfTerminate(message).
template
struct TToUint64Converter: public TOperationMixin> {
    static TNothing Terminate(const char* message) {
        if constexpr (strict) {
            return Nothing();
        } else {
            UdfTerminate(message);
        }
    };

    // `inputBase` defaults to 0, which is passed straight to strtoull.
    // Failures are reported, in this order, for: EINVAL with a zero result,
    // no characters consumed, ERANGE / out-of-range value, and trailing
    // characters after the number.
    // NOTE(review): std::strtoull itself accepts leading whitespace and an
    // optional sign — confirm that such inputs are meant to be accepted.
    static TMaybe Execute(TStringRef inputRef, TMaybe inputBase) {
        // Copy into a TString so that strtoull gets a buffer we own.
        const TString inputStr(inputRef);
        const char* input = inputStr.data();
        const int base = inputBase.GetOrElse(0);
        char* pos = nullptr;
        // errno is cleared for the call and restored on scope exit.
        auto prevErrno = errno;
        errno = 0;
        Y_DEFER {
            errno = prevErrno;
        };
        unsigned long long res = std::strtoull(input, &pos, base);
        if (!res && errno == EINVAL) {
            return Terminate("Incorrect base");
        }

        ui64 ret = static_cast(res);
        if (!res && pos == input) {
            return Terminate("Input string is not a number");
        } else if ((res == ULLONG_MAX && errno == ERANGE) || ret != res) {
            return Terminate("Converted value falls out of Uint64 range");
        } else if (*pos) {
            return Terminate("Input string contains junk after the number");
        }
        return ret;
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Returns false for an undefined input, otherwise the result of IsUtf8.
struct TUtf8Checker: public TOperationMixin {
    static bool Execute(TMaybe inputRef) {
        if (!inputRef.Defined()) {
            return false;
        }
        return IsUtf8(*inputRef);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Returns SubstrUTF8(input, from, len) with `from` defaulting to 0 and `len`
// to TStringBuf::npos. The returned TStringBuf is tied to `inputRef`
// (parameter is marked Y_LIFETIME_BOUND).
struct TSubstringGetter: public TOperationMixin {
    static TStringBuf Execute(TStringRef inputRef Y_LIFETIME_BOUND, TMaybe inputFrom, TMaybe inputLen) {
        const TStringBuf input(inputRef);
        size_t from = inputFrom.GetOrElse(0);
        size_t len = inputLen.GetOrElse(TStringBuf::npos);
        return SubstrUTF8(input, from, len);
    }
    DISABLE_IMPICT_ARGUMENT_CAST;
};

// Defines UDF `T<udfName>` for a one-argument operation: the scalar body
// forwards to Executor::DoExecute and the `T<udfName>KernelExec` struct
// forwards block items to Executor::BlockDoExecute. The BEGIN_/END_ macros and
// TUnaryKernelExec come from headers not visible here.
#define DEFINE_UTF8_OPERATION_STRICT(udfName, Executor, signature, optArgs) \
    BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgs) { \
        return Executor::DoExecute(valueBuilder, args); \
    } \
 \
    struct T##udfName##KernelExec \
        : public TUnaryKernelExec { \
        template \
        static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \
            Y_UNUSED(valueBuilder); \
            Executor::BlockDoExecute(arg1, sink); \
        } \
    }; \
 \
    END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)

// Same as above for two-argument operations; `macro` selects which BEGIN_
// macro opens the UDF definition.
#define DEFINE_UTF8_OPERATION_BIN_BASE(macro, udfName, Executor, signature, optArgs) \
    macro(T##udfName, signature, optArgs) { \
        return Executor::DoExecute(valueBuilder, args); \
    } \
 \
    struct T##udfName##KernelExec \
        : public TBinaryKernelExec { \
        template \
        static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { \
            Y_UNUSED(valueBuilder); \
            Executor::BlockDoExecute(arg1, arg2, sink); \
        } \
    }; \
 \
    END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)

// Two-argument operation opened with the STRICT begin macro.
#define DEFINE_UTF8_OPERATION_BIN_STRICT(udfName, Executor, signature, optArgs) \
    DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)

// Two-argument operation opened with the non-strict begin macro.
#define DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(udfName, Executor, signature, optArgs) \
    DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)

// Variant for operations whose block form receives all arguments packed in a
// single TBlockItem (see the GetElement-based BlockDoExecute overloads).
// NOTE(review): `argsCount` is not referenced in the macro body as shown here;
// it may have been a stripped template argument of TGenericKernelExec.
#define DEFINE_UTF8_OPERATION_MANY_STRICT(udfName, Executor, signature, argsCount, optArgsCount) \
    BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgsCount) { \
        return Executor::DoExecute(valueBuilder, args); \
    } \
 \
    struct T##udfName##KernelExec \
        : public TGenericKernelExec { \
        template \
        static void Process(const IValueBuilder* valueBuilder, TBlockItem args, const TSink& sink) { \
            Y_UNUSED(valueBuilder); \
            Executor::BlockDoExecute(args, sink); \
        } \
    }; \
 \
    END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)

// UDF instantiations: each line binds a UDF name to an operation struct and a
// signature.
DEFINE_UTF8_OPERATION_STRICT(IsUtf, TUtf8Checker, bool(TOptional), /*optArgs=*/1);

DEFINE_UTF8_OPERATION_STRICT(Normalize, TNormalizeUTF8, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFD, TNormalizeUTF8, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFC, TNormalizeUTF8, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKD, TNormalizeUTF8, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKC, TNormalizeUTF8, TUtf8(TAutoMap), /*optArgs=*/0);

DEFINE_UTF8_OPERATION_STRICT(IsAscii, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsSpace, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsUpper, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsLower, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsDigit, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsAlpha, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsAlnum, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(IsHex, TCheckAllChars, bool(TAutoMap), /*optArgs=*/0);

DEFINE_UTF8_OPERATION_STRICT(ToTitle, TStringToStringMapper, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(ToUpper, TStringToStringMapper, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(ToLower, TStringToStringMapper, TUtf8(TAutoMap), /*optArgs=*/0);

DEFINE_UTF8_OPERATION_STRICT(GetLength, TLengthGetter, ui64(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(Reverse, TReverser, TUtf8(TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_STRICT(Strip, TStripper, TUtf8(TAutoMap), /*optArgs=*/0);

DEFINE_UTF8_OPERATION_MANY_STRICT(Substring, TSubstringGetter, TUtf8(TAutoMap, TOptional, TOptional), /*argsCount=*/3, /*optArgs=*/1);

DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveAll, TAllRemover, TUtf8(TAutoMap, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveFirst, TFirstRemover, TUtf8(TAutoMap, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(IsUnicodeSet, TUnicodeSetMatcher, bool(TAutoMap, TUtf8), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(LevensteinDistance, TLevensteinDistanceFinder, ui64(TAutoMap, TAutoMap), /*optArgs=*/0);
DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveLast, TLastRemoval, TUtf8(TAutoMap, TUtf8), /*optArgs=*/0);

DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceAll, TAllReplacer, TUtf8(TAutoMap, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceFirst, TFirstReplacer, TUtf8(TAutoMap, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceLast, TLastReplacer, TUtf8(TAutoMap, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
DEFINE_UTF8_OPERATION_MANY_STRICT(Find, TFinder, TOptional(TAutoMap, TUtf8, TOptional), /*argsCount=*/3, /*optionalArgs=*/1);
DEFINE_UTF8_OPERATION_MANY_STRICT(RFind, TRFinder, TOptional(TAutoMap, TUtf8, TOptional), /*argsCount=*/3, /*optionalArgs=*/1);

DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(ToUint64, TToUint64Converter, ui64(TAutoMap, TOptional), /*optionalArgs=*/1);
DEFINE_UTF8_OPERATION_BIN_STRICT(TryToUint64, TToUint64Converter, TOptional(TAutoMap, TOptional), /*optionalArgs=*/1);

// Temporary storage for the pieces produced by SplitToList.
using TTmpVector = TSmallVec;

// Byte-string variant: token iterators point into the original UTF-8 bytes, so
// each token maps directly to a SubString(input, offset, length) of the input
// value.
template
static void SplitToListImpl(
    const IValueBuilder* valueBuilder,
    const TUnboxedValue& input,
    const std::string_view::const_iterator from,
    const TIt& it,
    TTmpVector& result) {
    for (const auto& elem : it) {
        result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim())));
    }
}

// UTF-32 variant: token iterators point into the UTF-32 copy, so character
// positions are translated to byte positions in the original UTF-8 string.
// `charPos`/`bytePos` advance together and are carried across tokens, so the
// original bytes are walked only once.
// NOTE(review): presumably WideCharSize returns the byte length of the UTF-8
// sequence starting with the given lead byte — confirm.
template
static void SplitToListImpl(
    const IValueBuilder* valueBuilder,
    const TUnboxedValue& input,
    const TUtf32String::const_iterator start,
    const TIt& it,
    TTmpVector& result) {
    const std::string_view& original = input.AsStringRef();
    size_t charPos = 0U, bytePos = 0U;
    for (const auto& elem : it) {
        // Skip ahead to the first character of the token.
        for (const size_t next = std::distance(start, elem.TokenStart()); charPos < next; ++charPos)
            bytePos += WideCharSize(original[bytePos]);
        const auto from = bytePos;

        // Walk over the token itself to find its length in bytes.
        for (const size_t next = charPos + std::distance(elem.TokenStart(), elem.TokenDelim()); charPos < next; ++charPos)
            bytePos += WideCharSize(original[bytePos]);
        const auto size = bytePos - from;
        result.emplace_back(valueBuilder->SubString(input, from, size));
    }
}

// Dispatcher: applies SkipEmpty() to the splitter when requested and forwards
// to the matching overload above.
template
static void SplitToListImpl(
    const IValueBuilder* valueBuilder,
    const TUnboxedValue& input,
    const TStrIt from,
    TIt& it,
    bool skipEmpty,
    TTmpVector& result) {
    if (skipEmpty) {
        SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result);
    } else {
        SplitToListImpl(valueBuilder, input, from, it, result);
    }
}

// Names of the optional named arguments of SplitToList.
constexpr char delimeterStringName[] = "DelimeterString";
constexpr char skipEmptyName[] = "SkipEmpty";
constexpr char limitName[] = "Limit";

using TDelimeterStringArg = TNamedArg;
using TSkipEmptyArg = TNamedArg;
using TLimitArg = TNamedArg;

// SplitToList(input, delimiter, DelimeterString, SkipEmpty, Limit).
//
// An empty args[0] produces an empty list. Defaults read below:
// DelimeterString = true, SkipEmpty = false, Limit = 0.
// With DelimeterString the delimiter is matched as a whole string on the UTF-8
// bytes; otherwise both strings are converted to UTF-32 and the delimiter is
// treated as a set of characters. A non-zero limit caps the splitter at
// `limit + 1` parts.
SIMPLE_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType(
    TOptional,
    TUtf8,
    TDelimeterStringArg,
    TSkipEmptyArg,
    TLimitArg
    ),
    3) {
    TTmpVector result;
    if (args[0]) {
        const bool delimiterString = args[2].GetOrDefault(true);
        const bool skipEmpty = args[3].GetOrDefault(false);
        const auto limit = args[4].GetOrDefault(0);
        if (delimiterString) {
            const std::string_view input(args[0].AsStringRef());
            const std::string_view delimeter(args[1].AsStringRef());
            if (limit) {
                auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1);
                SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
            } else {
                auto it = StringSplitter(input).SplitByString(delimeter);
                SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
            }
        } else {
            const auto& input = UTF8ToUTF32(args[0].AsStringRef());
            const auto& delimeter = UTF8ToUTF32(args[1].AsStringRef());
            if (limit) {
                auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1);
                SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
            } else {
                auto it = StringSplitter(input).SplitBySet(delimeter.c_str());
                SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
            }
        }
    }
    return valueBuilder->NewList(result.data(), result.size());
}

// JoinFromList(list, delimiter): joins the non-empty items of the list with
// the delimiter; empty items are skipped.
SIMPLE_UDF(TJoinFromList, TUtf8(TAutoMap>>, TUtf8)) {
    const auto input = args[0].GetListIterator();
    const std::string_view delimeter(args[1].AsStringRef());
    std::vector items;

    for (TUnboxedValue current; input.Next(current);) {
        if (current) {
            items.emplace_back(current.AsStringRef());
        }
    }

    return valueBuilder->NewString(JoinSeq(delimeter, items));
}

// ToCodePointList(string): decodes the input into an array of code points.
// The array is sized up-front from GetNumberOfUTF8Chars; decoding failures and
// a count mismatch are reported by throwing yexception.
SIMPLE_UDF(TToCodePointList, TListType(TAutoMap)) {
    size_t codePointCount = 0;
    const auto& inputRef = args[0].AsStringRef();
    if (!GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), codePointCount)) {
        // should not happen but still we have to check return code
        ythrow yexception() << "Unable to count code points";
    }

    TUnboxedValue* itemsPtr = nullptr;
    auto result = valueBuilder->NewArray(codePointCount, itemsPtr);
    const unsigned char* current = reinterpret_cast(inputRef.Data());
    const unsigned char* end = current + inputRef.Size();
    wchar32 rune = BROKEN_RUNE;
    ui32 codePointIndex = 0;
    RECODE_RESULT retcode = RECODE_OK;
    while (current < end && RECODE_OK == (retcode = ReadUTF8CharAndAdvance(rune, current, end))) {
        if (codePointIndex >= codePointCount) {
            // sanity check
            ythrow yexception() << "Too big code point index " << codePointIndex << ", expecting only " << codePointCount << " code points";
        }
        itemsPtr[codePointIndex++] = TUnboxedValuePod(static_cast(rune));
    }

    if (retcode != RECODE_OK) {
        ythrow yexception() << "Malformed UTF-8 string";
    }

    return result;
}

// FromCodePointList(list): encodes a list of code points as a UTF-8 string.
SIMPLE_UDF(TFromCodePointList, TUtf8(TAutoMap>)) {
    auto input = args[0];

    // Path 1: GetElements() returned a non-null pointer, so the items can be
    // indexed directly and the output buffer sized from the element count.
    if (auto elems = input.GetElements()) {
        const auto elemCount = input.GetListLength();
        auto bufferSize = WideToUTF8BufferSize(elemCount);
        TTempBuf buffer(bufferSize);
        auto bufferPtr = buffer.Data();
        auto bufferEnd = buffer.Data() + bufferSize;
        for (ui64 i = 0; i != elemCount; ++i) {
            const auto& item = elems[i];
            const wchar32 rune = item.Get();
            size_t written = 0;
            WideToUTF8(&rune, 1, bufferPtr, written);
            Y_ENSURE(written <= 4);
            bufferPtr += written;
            Y_ENSURE(bufferPtr <= bufferEnd);
        }
        return valueBuilder->NewString(TStringRef(buffer.Data(), bufferPtr - buffer.Data()));
    }

    // Path 2: no direct element access; iterate the list and append each
    // encoded rune to a growable buffer.
    std::vector> buffer;
    buffer.reserve(TUnboxedValuePod::InternalBufferSize);

    const auto& iter = input.GetListIterator();
    char runeBuffer[4] = {};
    for (NUdf::TUnboxedValue item; iter.Next(item); ) {
        const wchar32 rune = item.Get();
        size_t written = 0;
        WideToUTF8(&rune, 1, runeBuffer, written);
        Y_ENSURE(written <= 4);
        buffer.insert(buffer.end(), runeBuffer, runeBuffer + written);
    }

    return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
}

// Comma-separated list of every UDF type defined in this file.
#define EXPORTED_UNICODE_BASE_UDF \
    TIsUtf, \
    TGetLength, \
    TSubstring, \
    TFind, \
    TRFind, \
    TSplitToList, \
    TJoinFromList, \
    TLevensteinDistance, \
    TReplaceAll, \
    TReplaceFirst, \
    TReplaceLast, \
    TRemoveAll, \
    TRemoveFirst, \
    TRemoveLast, \
    TToCodePointList, \
    TFromCodePointList, \
    TReverse, \
    TToLower, \
    TToUpper, \
    TToTitle, \
    TToUint64, \
    TTryToUint64, \
    TStrip, \
    TIsUnicodeSet, \
    TNormalize, \
    TNormalizeNFD, \
    TNormalizeNFC, \
    TNormalizeNFKD, \
    TNormalizeNFKC, \
    TIsAscii, \
    TIsSpace, \
    TIsUpper, \
    TIsLower, \
    TIsDigit, \
    TIsAlpha, \
    TIsAlnum, \
    TIsHex
}