pire_udf.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. #include <yql/essentials/public/udf/udf_value.h>
  2. #include <yql/essentials/public/udf/udf_type_builder.h>
  3. #include <yql/essentials/public/udf/udf_registrator.h>
  4. #include <yql/essentials/public/udf/udf_value_builder.h>
  5. #include <yql/essentials/public/udf/udf_terminator.h>
  6. #include <library/cpp/regex/pire/regexp.h>
  7. #include <library/cpp/regex/pire/pcre2pire.h>
  8. #include <util/string/builder.h>
  9. using namespace NRegExp;
  10. using namespace NKikimr;
  11. using namespace NUdf;
  12. namespace {
  13. class TPireUdfBase: public TBoxedValue {
  14. protected:
  15. TPireUdfBase(TSourcePosition pos)
  16. : Pos_(pos)
  17. {}
  18. void SetCommonOptions(std::string_view& regex, TFsm::TOptions& options) {
  19. if (regex.size() >= 4U && regex.substr(0U, 4U) == "(?i)") {
  20. options.SetCaseInsensitive(true);
  21. regex.remove_prefix(4U);
  22. }
  23. if (UTF8Detect(regex) == UTF8) {
  24. options.SetCharset(CODES_UTF8);
  25. }
  26. }
  27. TSourcePosition Pos_;
  28. };
  29. class TPireMatch: public TPireUdfBase {
  30. public:
  31. class TFactory: public TPireUdfBase {
  32. public:
  33. TFactory(
  34. bool surroundMode,
  35. bool multiMode,
  36. TSourcePosition pos,
  37. size_t regexpsCount = 0)
  38. : TPireUdfBase(pos)
  39. , SurroundMode(surroundMode)
  40. , MultiMode(multiMode)
  41. , RegexpsCount(regexpsCount)
  42. {
  43. }
  44. private:
  45. TUnboxedValue Run(
  46. const IValueBuilder* valueBuilder,
  47. const TUnboxedValuePod* args) const final {
  48. return TUnboxedValuePod(
  49. new TPireMatch(
  50. valueBuilder,
  51. args[0],
  52. SurroundMode,
  53. MultiMode,
  54. Pos_,
  55. RegexpsCount));
  56. }
  57. bool SurroundMode;
  58. bool MultiMode;
  59. size_t RegexpsCount;
  60. };
  61. static const TStringRef& Name(bool surroundMode, bool multiMode) {
  62. static auto match = TStringRef::Of("Match");
  63. static auto grep = TStringRef::Of("Grep");
  64. static auto multiMatch = TStringRef::Of("MultiMatch");
  65. static auto multiGrep = TStringRef::Of("MultiGrep");
  66. if (surroundMode) {
  67. return multiMode ? multiGrep : grep;
  68. } else {
  69. return multiMode ? multiMatch : match;
  70. }
  71. }
  72. TPireMatch(
  73. const IValueBuilder* valueBuilder,
  74. const TUnboxedValuePod& runConfig,
  75. bool surroundMode,
  76. bool multiMode,
  77. TSourcePosition pos,
  78. size_t regexpsCount)
  79. : TPireUdfBase(pos)
  80. , MultiMode(multiMode)
  81. , RegexpsCount(regexpsCount)
  82. , SurroundMode(surroundMode)
  83. {
  84. Y_UNUSED(valueBuilder);
  85. try {
  86. std::string_view regex(runConfig.AsStringRef());
  87. TFsm::TOptions options;
  88. options.SetSurround(surroundMode);
  89. SetCommonOptions(regex, options);
  90. if (multiMode) {
  91. std::vector<std::string_view> parts;
  92. StringSplitter(regex).Split('\n').AddTo(&parts);
  93. for (const auto& part : parts) {
  94. if (!part.empty()) {
  95. if (Fsm_) try {
  96. *Fsm_ = *Fsm_ | TFsm(TString(part), options);
  97. } catch (const yexception&) {
  98. UdfTerminate((TStringBuilder() << Pos_ << " Failed to glue up regexes, probably the finite state machine appeared to be too large").data());
  99. } else {
  100. Fsm_.Reset(new TFsm(TString(part), options));
  101. }
  102. }
  103. }
  104. } else {
  105. Fsm_.Reset(new TFsm(TString(regex), options));
  106. }
  107. } catch (const std::exception& e) {
  108. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  109. }
  110. }
  111. private:
  112. TUnboxedValue Run(
  113. const IValueBuilder* valueBuilder,
  114. const TUnboxedValuePod* args) const final try {
  115. TUnboxedValue* items = nullptr;
  116. TUnboxedValue tuple;
  117. size_t i = 0;
  118. if (MultiMode) {
  119. tuple = valueBuilder->NewArray(RegexpsCount, items);
  120. for (i = 0; i < RegexpsCount; ++i) {
  121. items[i] = TUnboxedValuePod(false);
  122. }
  123. }
  124. if (args[0]) {
  125. const auto input = args[0].AsStringRef();
  126. TMatcher matcher(*Fsm_);
  127. const bool isMatch = matcher.Match(input.Data(), input.Size(), SurroundMode, SurroundMode).Final();
  128. if (MultiMode) {
  129. if (isMatch) {
  130. const auto& matchedRegexps = matcher.MatchedRegexps();
  131. size_t matchesCount = matchedRegexps.second - matchedRegexps.first;
  132. for (i = 0; i < matchesCount; ++i) {
  133. items[matchedRegexps.first[i]] = TUnboxedValuePod(true);
  134. }
  135. }
  136. return tuple;
  137. } else {
  138. return TUnboxedValuePod(isMatch);
  139. }
  140. } else {
  141. return MultiMode ? tuple : TUnboxedValue(TUnboxedValuePod(false));
  142. }
  143. } catch (const std::exception& e) {
  144. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  145. }
  146. private:
  147. TUniquePtr<TFsm> Fsm_;
  148. bool MultiMode;
  149. size_t RegexpsCount;
  150. bool SurroundMode;
  151. };
  152. class TPireCapture: public TPireUdfBase {
  153. public:
  154. class TFactory: public TPireUdfBase {
  155. public:
  156. TFactory(TSourcePosition pos)
  157. : TPireUdfBase(pos)
  158. {}
  159. private:
  160. TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
  161. return TUnboxedValuePod(new TPireCapture(args[0], Pos_));
  162. } catch (const std::exception& e) {
  163. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  164. }
  165. };
  166. static const TStringRef& Name() {
  167. static auto name = TStringRef::Of("Capture");
  168. return name;
  169. }
  170. TPireCapture(const TUnboxedValuePod& runConfig, TSourcePosition pos)
  171. : TPireUdfBase(pos)
  172. {
  173. std::string_view regex(runConfig.AsStringRef());
  174. TFsm::TOptions options;
  175. SetCommonOptions(regex, options);
  176. Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
  177. }
  178. private:
  179. TUnboxedValue Run(
  180. const IValueBuilder* valueBuilder,
  181. const TUnboxedValuePod* args) const final try {
  182. if (args[0]) {
  183. const std::string_view input = args[0].AsStringRef();
  184. TSlowSearcher searcher(*Fsm_);
  185. searcher.Search(input.data(), input.size());
  186. if (searcher.Captured()) {
  187. const auto& captured = searcher.GetCaptured();
  188. return valueBuilder->SubString(args[0], std::distance(input.begin(), captured.begin()), captured.length());
  189. }
  190. }
  191. return TUnboxedValue();
  192. } catch (const std::exception& e) {
  193. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  194. }
  195. TUniquePtr<TSlowCapturingFsm> Fsm_;
  196. };
  197. class TPireReplace: public TPireUdfBase {
  198. public:
  199. class TFactory: public TPireUdfBase {
  200. public:
  201. TFactory(TSourcePosition pos)
  202. : TPireUdfBase(pos)
  203. {}
  204. private:
  205. TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
  206. return TUnboxedValuePod(new TPireReplace(args[0], Pos_));
  207. } catch (const std::exception& e) {
  208. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  209. }
  210. };
  211. static const TStringRef& Name() {
  212. static auto name = TStringRef::Of("Replace");
  213. return name;
  214. }
  215. TPireReplace(const TUnboxedValuePod& runConfig, TSourcePosition pos)
  216. : TPireUdfBase(pos)
  217. {
  218. std::string_view regex(runConfig.AsStringRef());
  219. TFsm::TOptions options;
  220. SetCommonOptions(regex, options);
  221. Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
  222. }
  223. private:
  224. TUnboxedValue Run(
  225. const IValueBuilder* valueBuilder,
  226. const TUnboxedValuePod* args) const final try {
  227. if (args[0]) {
  228. const std::string_view input(args[0].AsStringRef());
  229. TSlowSearcher s(*Fsm_);
  230. s.Search(input.data(), input.size());
  231. if (s.Captured()) {
  232. const auto& captured = s.GetCaptured();
  233. const TString replacement(args[1].AsStringRef());
  234. TString replaced(args[0].AsStringRef());
  235. replaced.replace(std::distance(input.begin(), captured.begin()), captured.length(), replacement);
  236. return valueBuilder->NewString(replaced);
  237. } else {
  238. return TUnboxedValue(args[0]);
  239. }
  240. } else {
  241. return TUnboxedValue();
  242. }
  243. } catch (const std::exception& e) {
  244. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  245. }
  246. TUniquePtr<TSlowCapturingFsm> Fsm_;
  247. };
  248. class TPireModule: public IUdfModule {
  249. public:
  250. TStringRef Name() const {
  251. return TStringRef::Of("Pire");
  252. }
  253. void CleanupOnTerminate() const final {
  254. }
  255. void GetAllFunctions(IFunctionsSink& sink) const final {
  256. sink.Add(TPireMatch::Name(true, true))->SetTypeAwareness();
  257. sink.Add(TPireMatch::Name(false, true))->SetTypeAwareness();
  258. sink.Add(TPireMatch::Name(true, false));
  259. sink.Add(TPireMatch::Name(false, false));
  260. sink.Add(TPireCapture::Name());
  261. sink.Add(TPireReplace::Name());
  262. }
  263. void BuildFunctionTypeInfo(
  264. const TStringRef& name,
  265. TType*,
  266. const TStringRef& typeConfig,
  267. ui32 flags,
  268. IFunctionTypeInfoBuilder& builder) const final try {
  269. const bool typesOnly = (flags & TFlags::TypesOnly);
  270. const bool isMatch = (TPireMatch::Name(false, false) == name);
  271. const bool isGrep = (TPireMatch::Name(true, false) == name);
  272. const bool isMultiMatch = (TPireMatch::Name(false, true) == name);
  273. const bool isMultiGrep = (TPireMatch::Name(true, true) == name);
  274. if (isMatch || isGrep) {
  275. builder.SimpleSignature<bool(TOptional<char*>)>()
  276. .RunConfig<const char*>();
  277. if (!typesOnly) {
  278. builder.Implementation(new TPireMatch::TFactory(isGrep, false, builder.GetSourcePosition()));
  279. }
  280. } else if (isMultiMatch || isMultiGrep) {
  281. const auto boolType = builder.SimpleType<bool>();
  282. const auto optionalStringType = builder.Optional()->Item<char*>().Build();
  283. const std::string_view regexp(typeConfig);
  284. const size_t regexpCount = std::count(regexp.begin(), regexp.end(), '\n') + 1;
  285. const auto tuple = builder.Tuple();
  286. for (size_t i = 0; i < regexpCount; ++i) {
  287. tuple->Add(boolType);
  288. }
  289. const auto tupleType = tuple->Build();
  290. builder.Args(1)->Add(optionalStringType).Done().Returns(tupleType).RunConfig<char*>();
  291. if (!typesOnly) {
  292. builder.Implementation(new TPireMatch::TFactory(isMultiGrep, true, builder.GetSourcePosition(), regexpCount));
  293. }
  294. } else if (TPireCapture::Name() == name) {
  295. builder.SimpleSignature<TOptional<char*>(TOptional<char*>)>()
  296. .RunConfig<char*>();
  297. if (!typesOnly) {
  298. builder.Implementation(new TPireCapture::TFactory(builder.GetSourcePosition()));
  299. }
  300. } else if (TPireReplace::Name() == name) {
  301. builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>()
  302. .RunConfig<char*>();
  303. if (!typesOnly) {
  304. builder.Implementation(new TPireReplace::TFactory(builder.GetSourcePosition()));
  305. }
  306. }
  307. } catch (const std::exception& e) {
  308. builder.SetError(CurrentExceptionMessage());
  309. }
  310. };
  311. }
  312. REGISTER_MODULES(TPireModule)