// re2_udf.cpp (22 KB)
  1. #include <yql/essentials/public/udf/udf_helpers.h>
  2. #include <yql/essentials/public/udf/udf_value_builder.h>
  3. #include <contrib/libs/re2/re2/re2.h>
  4. #include <util/charset/utf8.h>
  5. #include <util/string/cast.h>
  6. using namespace re2;
  7. using namespace NKikimr;
  8. using namespace NUdf;
  9. namespace {
  // Identity converter: returns its argument unchanged.
  // Used as the `conv` column of OPTIONS_MAP for options whose value is
  // passed to the RE2 setter as is.
  template <typename TValue>
  TValue Id(TValue value) {
      return value;
  }
  14. re2::RE2::Options::Encoding EncodingFromBool(bool x) {
  15. return x ? re2::RE2::Options::Encoding::EncodingUTF8 : re2::RE2::Options::Encoding::EncodingLatin1;
  16. }
  // X-macro table of the RE2 options handled by this file.
  // Every row has the form
  //     xx(name, index, type, default value, RE2::Options setter, converter)
  // where `name` is the option/field name, `index` its logical position,
  // `type` the value type, `default value` the value used when the option is
  // not supplied, `setter` the RE2::Options member that applies it and
  // `converter` the function mapping the stored value to the setter argument.
  #define OPTIONS_MAP(xx)                                                        \
      xx(Utf8,          0,  bool, true,    set_encoding,       EncodingFromBool) \
      xx(PosixSyntax,   1,  bool, false,   set_posix_syntax,   Id)               \
      xx(LongestMatch,  2,  bool, false,   set_longest_match,  Id)               \
      xx(LogErrors,     3,  bool, true,    set_log_errors,     Id)               \
      xx(MaxMem,        4,  ui64, 8 << 20, set_max_mem,        Id)               \
      xx(Literal,       5,  bool, false,   set_literal,        Id)               \
      xx(NeverNl,       6,  bool, false,   set_never_nl,       Id)               \
      xx(DotNl,         7,  bool, false,   set_dot_nl,         Id)               \
      xx(NeverCapture,  8,  bool, false,   set_never_capture,  Id)               \
      xx(CaseSensitive, 9,  bool, true,    set_case_sensitive, Id)               \
      xx(PerlClasses,   10, bool, false,   set_perl_classes,   Id)               \
      xx(WordBoundary,  11, bool, false,   set_word_boundary,  Id)               \
      xx(OneLine,       12, bool, false,   set_one_line,       Id)
  31. enum EOptionsField : ui32 {
  32. OPTIONS_MAP(ENUM_VALUE_GEN)
  33. Count
  34. };
  35. struct TOptionsSchema {
  36. TType* StructType;
  37. ui32 Indices[EOptionsField::Count];
  38. };
  39. struct TRegexpGroups {
  40. TVector<TString> Names;
  41. TVector<ui32> Indexes;
  42. };
  43. class TRe2Udf: public TBoxedValue {
  44. public:
  45. enum EMode {
  46. MATCH,
  47. GREP,
  48. CAPTURE,
  49. REPLACE,
  50. COUNT,
  51. FIND_AND_CONSUME,
  52. };
  53. template <bool posix>
  54. class TFactory: public TBoxedValue {
  55. public:
  56. TFactory(
  57. EMode mode,
  58. const TOptionsSchema& optionsSchema,
  59. TSourcePosition pos,
  60. const TRegexpGroups& regexpGroups = TRegexpGroups())
  61. : Mode(mode)
  62. , OptionsSchema(optionsSchema)
  63. , Pos_(pos)
  64. , RegexpGroups(regexpGroups)
  65. {
  66. }
  67. private:
  68. TUnboxedValue Run(
  69. const IValueBuilder* valueBuilder,
  70. const TUnboxedValuePod* args) const override {
  71. return TUnboxedValuePod(
  72. new TRe2Udf(
  73. valueBuilder,
  74. args[0],
  75. RegexpGroups,
  76. Mode,
  77. posix,
  78. OptionsSchema,
  79. Pos_));
  80. }
  81. EMode Mode;
  82. const TOptionsSchema OptionsSchema;
  83. TSourcePosition Pos_;
  84. const TRegexpGroups RegexpGroups;
  85. };
  86. static const TStringRef& Name(EMode mode) {
  87. static auto match = TStringRef::Of("Match");
  88. static auto grep = TStringRef::Of("Grep");
  89. static auto capture = TStringRef::Of("Capture");
  90. static auto replace = TStringRef::Of("Replace");
  91. static auto count = TStringRef::Of("Count");
  92. static auto findAndconsume = TStringRef::Of("FindAndConsume");
  93. switch (mode) {
  94. case EMode::MATCH:
  95. return match;
  96. case EMode::GREP:
  97. return grep;
  98. case EMode::CAPTURE:
  99. return capture;
  100. case EMode::REPLACE:
  101. return replace;
  102. case EMode::COUNT:
  103. return count;
  104. case EMode::FIND_AND_CONSUME:
  105. return findAndconsume;
  106. }
  107. Y_ABORT("Unexpected mode");
  108. }
  109. TRe2Udf(
  110. const IValueBuilder*,
  111. const TUnboxedValuePod& runConfig,
  112. const TRegexpGroups regexpGroups,
  113. EMode mode,
  114. bool posix,
  115. const TOptionsSchema& optionsSchema,
  116. TSourcePosition pos)
  117. : RegexpGroups(regexpGroups)
  118. , Mode(mode)
  119. , Captured()
  120. , OptionsSchema(optionsSchema)
  121. , Pos_(pos)
  122. {
  123. try {
  124. auto patternValue = runConfig.GetElement(0);
  125. auto optionsValue = runConfig.GetElement(1);
  126. const std::string_view pattern(patternValue.AsStringRef());
  127. RE2::Options options;
  128. options.set_posix_syntax(posix);
  129. bool needUtf8 = (UTF8Detect(pattern) == UTF8);
  130. options.set_encoding(
  131. needUtf8
  132. ? RE2::Options::Encoding::EncodingUTF8
  133. : RE2::Options::Encoding::EncodingLatin1
  134. );
  135. if (optionsValue) {
  136. #define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(OptionsSchema.Indices[index]).Get<type>()));
  137. OPTIONS_MAP(FIELD_HANDLE)
  138. #undef FIELD_HANDLE
  139. }
  140. Regexp = std::make_unique<RE2>(StringPiece(pattern.data(), pattern.size()), options);
  141. if (mode == EMode::CAPTURE) {
  142. Captured = std::make_unique<StringPiece[]>(Regexp->NumberOfCapturingGroups() + 1);
  143. }
  144. } catch (const std::exception& e) {
  145. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  146. }
  147. }
  148. private:
  149. TUnboxedValue Run(
  150. const IValueBuilder* valueBuilder,
  151. const TUnboxedValuePod* args) const final try {
  152. RE2::Anchor anchor = RE2::UNANCHORED;
  153. if (args[0]) {
  154. const std::string_view input(args[0].AsStringRef());
  155. const StringPiece piece(input.data(), input.size());
  156. switch (Mode) {
  157. case MATCH:
  158. anchor = RE2::ANCHOR_BOTH;
  159. [[fallthrough]];
  160. case GREP:
  161. return TUnboxedValuePod(Regexp->Match(piece, 0, input.size(), anchor, nullptr, 0));
  162. case CAPTURE: {
  163. const int count = Regexp->NumberOfCapturingGroups() + 1;
  164. TUnboxedValue* items = nullptr;
  165. const auto result = valueBuilder->NewArray(RegexpGroups.Names.size(), items);
  166. if (Regexp->Match(piece, 0, input.size(), anchor, Captured.get(), count)) {
  167. for (int i = 0; i < count; ++i) {
  168. if (!Captured[i].empty()) {
  169. items[RegexpGroups.Indexes[i]] = valueBuilder->SubString(args[0], std::distance(piece.begin(), Captured[i].begin()), Captured[i].size());
  170. }
  171. }
  172. } else {
  173. return BuildEmptyStruct(valueBuilder);
  174. }
  175. return result;
  176. }
  177. case REPLACE: {
  178. const std::string_view rewriteRef(args[1].AsStringRef());
  179. const StringPiece rewrite(rewriteRef.data(), rewriteRef.size());
  180. TString rewriteError;
  181. if (!Regexp->CheckRewriteString(rewrite, &rewriteError)) {
  182. UdfTerminate((TStringBuilder() << Pos_ << " [rewrite error] " << rewriteError).data());
  183. }
  184. std::string result(input);
  185. RE2::GlobalReplace(&result, *Regexp, rewrite);
  186. return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result);
  187. }
  188. case COUNT: {
  189. std::string inputHolder(input);
  190. const ui32 result = RE2::GlobalReplace(&inputHolder, *Regexp, "");
  191. return TUnboxedValuePod(result);
  192. }
  193. case FIND_AND_CONSUME: {
  194. StringPiece text(piece);
  195. std::vector<TUnboxedValue> matches;
  196. for (StringPiece w; text.begin() < text.end() && RE2::FindAndConsume(&text, *Regexp, &w);) {
  197. if (w.size() == 0 && !text.empty()) {
  198. text.remove_prefix(1);
  199. }
  200. matches.emplace_back(valueBuilder->SubString(args[0], std::distance(piece.begin(), w.begin()), w.size()));
  201. }
  202. return valueBuilder->NewList(matches.data(), matches.size());
  203. }
  204. }
  205. Y_ABORT("Unexpected mode");
  206. } else {
  207. switch (Mode) {
  208. case MATCH:
  209. case GREP:
  210. return TUnboxedValuePod(false);
  211. case CAPTURE:
  212. return BuildEmptyStruct(valueBuilder);
  213. case REPLACE:
  214. return TUnboxedValuePod();
  215. case COUNT:
  216. return TUnboxedValuePod::Zero();
  217. case FIND_AND_CONSUME:
  218. return valueBuilder->NewEmptyList();
  219. }
  220. Y_ABORT("Unexpected mode");
  221. }
  222. } catch (const std::exception& e) {
  223. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  224. }
  225. std::unique_ptr<RE2> Regexp;
  226. const TRegexpGroups RegexpGroups;
  227. EMode Mode;
  228. std::unique_ptr<StringPiece[]> Captured;
  229. const TOptionsSchema OptionsSchema;
  230. TSourcePosition Pos_;
  231. TUnboxedValue BuildEmptyStruct(const IValueBuilder* valueBuilder) const {
  232. TUnboxedValue* items = nullptr;
  233. return valueBuilder->NewArray(RegexpGroups.Names.size(), items);
  234. }
  235. };
  236. SIMPLE_STRICT_UDF(TEscape, char*(char*)) {
  237. const std::string_view input(args[0].AsStringRef());
  238. const auto& result = RE2::QuoteMeta(StringPiece(input.data(), input.size()));
  239. return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result);
  240. }
  241. TOptionsSchema MakeOptionsSchema(::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder) {
  242. TOptionsSchema ret;
  243. auto structBuilder = builder.Struct(EOptionsField::Count);
  244. #define FIELD_HANDLE(name, index, type, ...) structBuilder->AddField<type>(TStringRef::Of(#name), &ret.Indices[index]);
  245. OPTIONS_MAP(FIELD_HANDLE)
  246. #undef FIELD_HANDLE
  247. ret.StructType = structBuilder->Build();
  248. return ret;
  249. }
  250. class TOptions: public TBoxedValue {
  251. private:
  252. const TOptionsSchema Schema_;
  253. public:
  254. TOptions(const TOptionsSchema& schema)
  255. : Schema_(schema)
  256. {
  257. }
  258. TUnboxedValue Run(
  259. const IValueBuilder* valueBuilder,
  260. const TUnboxedValuePod* args) const override {
  261. TUnboxedValue* items = nullptr;
  262. const auto result = valueBuilder->NewArray(EOptionsField::Count, items);
  263. #define FIELD_HANDLE(name, index, type, defVal, ...) \
  264. { \
  265. auto structIndex = Schema_.Indices[index]; \
  266. if (!args[index]) { \
  267. items[structIndex] = TUnboxedValuePod(static_cast<type>(defVal)); \
  268. } else { \
  269. items[structIndex] = args[index].GetOptionalValue(); \
  270. } \
  271. }
  272. OPTIONS_MAP(FIELD_HANDLE)
  273. #undef FIELD_HANDLE
  274. return result;
  275. }
  276. static const ::NKikimr::NUdf::TStringRef& Name() {
  277. static auto name = ::NKikimr::NUdf::TStringRef::Of("Options");
  278. return name;
  279. }
  280. static bool DeclareSignature(
  281. const ::NKikimr::NUdf::TStringRef& name,
  282. ::NKikimr::NUdf::TType* userType,
  283. ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder,
  284. bool typesOnly) {
  285. Y_UNUSED(userType);
  286. if (Name() == name) {
  287. builder.IsStrict();
  288. auto argsBuilder = builder.Args();
  289. #define FIELD_HANDLE(name, index, type, ...) argsBuilder->Add<TOptional<type>>().Name(TStringRef::Of(#name));
  290. OPTIONS_MAP(FIELD_HANDLE)
  291. #undef FIELD_HANDLE
  292. auto optionsSchema = MakeOptionsSchema(builder);
  293. builder.Returns(optionsSchema.StructType);
  294. builder.OptionalArgs(EOptionsField::Count);
  295. if (!typesOnly) {
  296. builder.Implementation(new TOptions(optionsSchema));
  297. }
  298. return true;
  299. } else {
  300. return false;
  301. }
  302. }
  303. };
  304. SIMPLE_UDF_WITH_OPTIONAL_ARGS(TPatternFromLike, char*(char*, TOptional<char*>), 1) {
  305. const std::string_view input(args[0].AsStringRef());
  306. const bool hasEscape = bool(args[1]);
  307. char escape = 0;
  308. if (hasEscape) {
  309. const std::string_view escapeRef(args[1].AsStringRef());
  310. if (escapeRef.size() != 1U) {
  311. UdfTerminate((TStringBuilder() << GetPos() << " Escape should be single character").data());
  312. }
  313. escape = escapeRef.front();
  314. }
  315. const TString escaped(RE2::QuoteMeta(StringPiece(input.data(), input.size())));
  316. TStringBuilder result;
  317. result << "(?s)";
  318. bool slash = false;
  319. bool escapeOn = false;
  320. for (const char& c : escaped) {
  321. switch (c) {
  322. case '\\':
  323. if (slash) {
  324. result << "\\\\";
  325. }
  326. slash = !slash;
  327. break;
  328. case '%':
  329. if (escapeOn) {
  330. result << "\\%";
  331. escapeOn = false;
  332. } else {
  333. result << ".*";
  334. }
  335. slash = false;
  336. break;
  337. case '_':
  338. if (escapeOn) {
  339. result << "\\_";
  340. escapeOn = false;
  341. } else {
  342. result << '.';
  343. }
  344. slash = false;
  345. break;
  346. default:
  347. if (hasEscape && c == escape) {
  348. if (escapeOn) {
  349. result << RE2::QuoteMeta(StringPiece(&c, 1));
  350. }
  351. escapeOn = !escapeOn;
  352. } else {
  353. if (slash)
  354. result << '\\';
  355. result << c;
  356. escapeOn = false;
  357. }
  358. slash = false;
  359. break;
  360. }
  361. }
  362. return valueBuilder->NewString(result);
  363. }
  364. TType* MakeRunConfigType(IFunctionTypeInfoBuilder& builder, TType* optOptionsStructType) {
  365. return builder.Tuple()->Add<char*>().Add(optOptionsStructType).Build();
  366. }
  367. template <bool posix>
  368. class TRe2Module: public IUdfModule {
  369. public:
  370. TStringRef Name() const {
  371. return posix ? TStringRef::Of("Re2posix") : TStringRef::Of("Re2");
  372. }
  373. void CleanupOnTerminate() const final {
  374. }
  375. void GetAllFunctions(IFunctionsSink& sink) const final {
  376. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::MATCH));
  377. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::GREP));
  378. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::CAPTURE))->SetTypeAwareness();
  379. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::REPLACE));
  380. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::COUNT));
  381. sink.Add(TRe2Udf::Name(TRe2Udf::EMode::FIND_AND_CONSUME));
  382. sink.Add(TEscape::Name());
  383. sink.Add(TPatternFromLike::Name());
  384. sink.Add(TOptions::Name());
  385. }
  386. void BuildFunctionTypeInfo(
  387. const TStringRef& name,
  388. TType* userType,
  389. const TStringRef& typeConfig,
  390. ui32 flags,
  391. IFunctionTypeInfoBuilder& builder) const final try {
  392. Y_UNUSED(userType);
  393. TOptionsSchema optionsSchema = MakeOptionsSchema(builder);
  394. auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build();
  395. bool typesOnly = (flags & TFlags::TypesOnly);
  396. bool isMatch = (TRe2Udf::Name(TRe2Udf::EMode::MATCH) == name);
  397. bool isGrep = (TRe2Udf::Name(TRe2Udf::EMode::GREP) == name);
  398. bool isCapture = (TRe2Udf::Name(TRe2Udf::EMode::CAPTURE) == name);
  399. bool isReplace = (TRe2Udf::Name(TRe2Udf::EMode::REPLACE) == name);
  400. bool isCount = (TRe2Udf::Name(TRe2Udf::EMode::COUNT) == name);
  401. bool isFindAndConsume = (TRe2Udf::Name(TRe2Udf::FIND_AND_CONSUME) == name);
  402. if (isMatch || isGrep) {
  403. builder.SimpleSignature<bool(TOptional<char*>)>()
  404. .RunConfig(MakeRunConfigType(builder, optOptionsStructType));
  405. if (!typesOnly) {
  406. const auto mode = isMatch ? TRe2Udf::EMode::MATCH : TRe2Udf::EMode::GREP;
  407. builder.Implementation(new TRe2Udf::TFactory<posix>(mode, optionsSchema, builder.GetSourcePosition()));
  408. }
  409. } else if (isCapture) {
  410. TRegexpGroups groups;
  411. auto optionalStringType = builder.Optional()->Item<char*>().Build();
  412. auto structBuilder = builder.Struct();
  413. RE2 regexp(StringPiece(typeConfig.Data(), typeConfig.Size()));
  414. const auto& groupNames = regexp.CapturingGroupNames();
  415. int groupCount = regexp.NumberOfCapturingGroups();
  416. if (groupCount >= 0) {
  417. std::unordered_set<std::string_view> groupNamesSet;
  418. int unnamedCount = 0;
  419. ++groupCount;
  420. groups.Indexes.resize(groupCount);
  421. groups.Names.resize(groupCount);
  422. for (int i = 0; i < groupCount; ++i) {
  423. TString fieldName;
  424. auto it = groupNames.find(i);
  425. if (it != groupNames.end()) {
  426. if (!groupNamesSet.insert(it->second).second) {
  427. builder.SetError(
  428. TStringBuilder() << "Regexp contains duplicate capturing group name: " << it->second);
  429. return;
  430. }
  431. fieldName = it->second;
  432. } else {
  433. fieldName = "_" + ToString(unnamedCount);
  434. ++unnamedCount;
  435. }
  436. groups.Names[i] = fieldName;
  437. structBuilder->AddField(fieldName, optionalStringType, &groups.Indexes[i]);
  438. }
  439. builder.Args(1)->Add(optionalStringType).Done().Returns(structBuilder->Build()).RunConfig(MakeRunConfigType(builder, optOptionsStructType));
  440. if (!typesOnly) {
  441. builder.Implementation(
  442. new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::CAPTURE, optionsSchema, builder.GetSourcePosition(), groups));
  443. }
  444. } else {
  445. if (regexp.ok()) {
  446. builder.SetError("Regexp contains no capturing groups");
  447. } else {
  448. builder.SetError(regexp.error());
  449. }
  450. }
  451. } else if (isReplace) {
  452. builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>()
  453. .RunConfig(MakeRunConfigType(builder, optOptionsStructType));
  454. if (!typesOnly) {
  455. builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::REPLACE, optionsSchema, builder.GetSourcePosition()));
  456. }
  457. } else if (isCount) {
  458. builder.SimpleSignature<ui32(TOptional<char*>)>()
  459. .RunConfig(MakeRunConfigType(builder, optOptionsStructType));
  460. if (!typesOnly) {
  461. builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::COUNT, optionsSchema, builder.GetSourcePosition()));
  462. }
  463. } else if (isFindAndConsume) {
  464. builder.SimpleSignature<TListType<char*>(TOptional<char*>)>()
  465. .RunConfig(MakeRunConfigType(builder, optOptionsStructType));
  466. if (!typesOnly) {
  467. builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::FIND_AND_CONSUME, optionsSchema, builder.GetSourcePosition()));
  468. }
  469. } else if (!(
  470. TEscape::DeclareSignature(name, userType, builder, typesOnly) ||
  471. TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) ||
  472. TOptions::DeclareSignature(name, userType, builder, typesOnly))) {
  473. builder.SetError(
  474. TStringBuilder() << "Unknown function name: " << TString(name));
  475. }
  476. } catch (const std::exception& e) {
  477. builder.SetError(CurrentExceptionMessage());
  478. }
  479. };
  480. }
  481. REGISTER_MODULES(
  482. TRe2Module<false>,
  483. TRe2Module<true>)