hyperscan_udf.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  #include <yql/essentials/public/udf/udf_value.h>
  #include <yql/essentials/public/udf/udf_type_builder.h>
  #include <yql/essentials/public/udf/udf_registrator.h>
  #include <yql/essentials/public/udf/udf_value_builder.h>
  #include <yql/essentials/public/udf/udf_terminator.h>

  #include <library/cpp/regex/hyperscan/hyperscan.h>
  #include <library/cpp/regex/pcre/regexp.h>

  #include <util/charset/utf8.h>
  #include <util/generic/yexception.h>
  #include <util/string/split.h>
  #include <util/string/builder.h>
  #include <util/system/cpu_id.h>

  #include <algorithm>
  #include <functional>
  #include <iterator>
  #include <numeric>
  #include <string_view>
  #include <vector>
  12. using namespace NHyperscan;
  13. using namespace NKikimr;
  14. using namespace NUdf;
  15. namespace {
  16. using TOptions = ui32;
  17. class THyperscanUdfBase: public TBoxedValue {
  18. protected:
  19. constexpr static const char* IGNORE_CASE_PREFIX = "(?i)";
  20. static void SetCommonOptions(TString& regex, TOptions& options) {
  21. options |= HS_FLAG_ALLOWEMPTY;
  22. if (regex.StartsWith(IGNORE_CASE_PREFIX)) {
  23. options |= HS_FLAG_CASELESS;
  24. regex = regex.substr(4);
  25. }
  26. if (UTF8Detect(regex) == UTF8) {
  27. options |= HS_FLAG_UTF8;
  28. }
  29. if (NX86::HaveAVX2()) {
  30. options |= HS_CPU_FEATURES_AVX2;
  31. }
  32. }
  33. };
  34. class THyperscanMatch: public THyperscanUdfBase {
  35. public:
  36. enum class EMode {
  37. NORMAL,
  38. BACKTRACKING,
  39. MULTI
  40. };
  41. class TFactory: public THyperscanUdfBase {
  42. public:
  43. TFactory(
  44. TSourcePosition pos,
  45. bool surroundMode,
  46. THyperscanMatch::EMode mode,
  47. size_t regexpsCount = 0)
  48. : Pos_(pos)
  49. , SurroundMode(surroundMode)
  50. , Mode(mode)
  51. , RegexpsCount(regexpsCount)
  52. {
  53. }
  54. private:
  55. TUnboxedValue Run(
  56. const IValueBuilder* valueBuilder,
  57. const TUnboxedValuePod* args) const override {
  58. return TUnboxedValuePod(
  59. new THyperscanMatch(
  60. valueBuilder,
  61. args[0],
  62. SurroundMode,
  63. Mode,
  64. Pos_,
  65. RegexpsCount));
  66. }
  67. TSourcePosition Pos_;
  68. bool SurroundMode;
  69. THyperscanMatch::EMode Mode;
  70. size_t RegexpsCount;
  71. };
  72. static const TStringRef& Name(bool isGrep, THyperscanMatch::EMode mode) {
  73. static auto match = TStringRef::Of("Match");
  74. static auto grep = TStringRef::Of("Grep");
  75. static auto backtrackingMatch = TStringRef::Of("BacktrackingMatch");
  76. static auto backtrackingGrep = TStringRef::Of("BacktrackingGrep");
  77. static auto multiMatch = TStringRef::Of("MultiMatch");
  78. static auto multiGrep = TStringRef::Of("MultiGrep");
  79. if (isGrep) {
  80. switch (mode) {
  81. case THyperscanMatch::EMode::NORMAL:
  82. return grep;
  83. case THyperscanMatch::EMode::BACKTRACKING:
  84. return backtrackingGrep;
  85. case THyperscanMatch::EMode::MULTI:
  86. return multiGrep;
  87. }
  88. } else {
  89. switch (mode) {
  90. case THyperscanMatch::EMode::NORMAL:
  91. return match;
  92. case THyperscanMatch::EMode::BACKTRACKING:
  93. return backtrackingMatch;
  94. case THyperscanMatch::EMode::MULTI:
  95. return multiMatch;
  96. }
  97. }
  98. Y_ABORT("Unexpected");
  99. }
  100. THyperscanMatch(
  101. const IValueBuilder*,
  102. const TUnboxedValuePod& runConfig,
  103. bool surroundMode,
  104. THyperscanMatch::EMode mode,
  105. TSourcePosition pos,
  106. size_t regexpsCount)
  107. : Regex_(runConfig.AsStringRef())
  108. , Mode(mode)
  109. , Pos_(pos)
  110. , RegexpsCount(regexpsCount)
  111. {
  112. try {
  113. TOptions options = 0;
  114. int pcreOptions = REG_EXTENDED;
  115. if (Mode == THyperscanMatch::EMode::BACKTRACKING && Regex_.StartsWith(IGNORE_CASE_PREFIX)) {
  116. pcreOptions |= REG_ICASE;
  117. }
  118. auto regex = Regex_;
  119. SetCommonOptions(regex, options);
  120. switch (mode) {
  121. case THyperscanMatch::EMode::NORMAL: {
  122. if (!surroundMode) {
  123. regex = TStringBuilder() << '^' << regex << '$';
  124. }
  125. Database_ = Compile(regex, options);
  126. break;
  127. }
  128. case THyperscanMatch::EMode::BACKTRACKING: {
  129. if (!surroundMode) {
  130. regex = TStringBuilder() << '^' << regex << '$';
  131. }
  132. try {
  133. Database_ = Compile(regex, options);
  134. Mode = THyperscanMatch::EMode::NORMAL;
  135. } catch (const TCompileException&) {
  136. options |= HS_FLAG_PREFILTER;
  137. Database_ = Compile(regex, options);
  138. Fallback_ = TRegExMatch(regex, pcreOptions);
  139. }
  140. break;
  141. }
  142. case THyperscanMatch::EMode::MULTI: {
  143. std::vector<TString> regexes;
  144. TVector<const char*> cregexes;
  145. TVector<TOptions> flags;
  146. TVector<TOptions> ids;
  147. const auto func = [&regexes, &flags, surroundMode](const std::string_view& token) {
  148. TString regex(token);
  149. TOptions opt = 0;
  150. SetCommonOptions(regex, opt);
  151. if (!surroundMode) {
  152. regex = TStringBuilder() << '^' << regex << '$';
  153. }
  154. regexes.emplace_back(std::move(regex));
  155. flags.emplace_back(opt);
  156. };
  157. StringSplitter(Regex_).Split('\n').Consume(func);
  158. std::transform(regexes.cbegin(), regexes.cend(), std::back_inserter(cregexes), std::bind(&TString::c_str, std::placeholders::_1));
  159. ids.resize(regexes.size());
  160. std::iota(ids.begin(), ids.end(), 0);
  161. Database_ = CompileMulti(cregexes, flags, ids);
  162. break;
  163. }
  164. }
  165. Scratch_ = MakeScratch(Database_);
  166. } catch (const std::exception& e) {
  167. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  168. }
  169. }
  170. private:
  171. TUnboxedValue Run(
  172. const IValueBuilder* valueBuilder,
  173. const TUnboxedValuePod* args) const final try {
  174. TUnboxedValue* items = nullptr;
  175. TUnboxedValue tuple;
  176. size_t i = 0;
  177. if (Mode == THyperscanMatch::EMode::MULTI) {
  178. tuple = valueBuilder->NewArray(RegexpsCount, items);
  179. for (i = 0; i < RegexpsCount; ++i) {
  180. items[i] = TUnboxedValuePod(false);
  181. }
  182. }
  183. if (args[0]) {
  184. // XXX: StringRef data might not be a NTBS, though the function
  185. // <TRegExMatch::Match> expects ASCIIZ string. Explicitly copy
  186. // the given argument string and append the NUL terminator to it.
  187. const TString input(args[0].AsStringRef());
  188. if (Y_UNLIKELY(Mode == THyperscanMatch::EMode::MULTI)) {
  189. auto callback = [items] (TOptions id, ui64 /* from */, ui64 /* to */) {
  190. items[id] = TUnboxedValuePod(true);
  191. };
  192. Scan(Database_, Scratch_, input, callback);
  193. return tuple;
  194. } else {
  195. bool matches = Matches(Database_, Scratch_, input);
  196. if (matches && Mode == THyperscanMatch::EMode::BACKTRACKING) {
  197. matches = Fallback_.Match(input.data());
  198. }
  199. return TUnboxedValuePod(matches);
  200. }
  201. } else {
  202. return Mode == THyperscanMatch::EMode::MULTI ? tuple : TUnboxedValue(TUnboxedValuePod(false));
  203. }
  204. } catch (const std::exception& e) {
  205. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  206. }
  207. private:
  208. const TString Regex_;
  209. THyperscanMatch::EMode Mode;
  210. const TSourcePosition Pos_;
  211. const size_t RegexpsCount;
  212. TDatabase Database_;
  213. TScratch Scratch_;
  214. TRegExMatch Fallback_;
  215. };
  216. class THyperscanCapture: public THyperscanUdfBase {
  217. public:
  218. class TFactory: public THyperscanUdfBase {
  219. public:
  220. TFactory(TSourcePosition pos)
  221. : Pos_(pos)
  222. {}
  223. private:
  224. TUnboxedValue Run(const IValueBuilder*,
  225. const TUnboxedValuePod* args) const final try {
  226. return TUnboxedValuePod(new THyperscanCapture(args[0], Pos_));
  227. } catch (const std::exception& e) {
  228. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  229. }
  230. private:
  231. TSourcePosition Pos_;
  232. };
  233. static const TStringRef& Name() {
  234. static auto name = TStringRef::Of("Capture");
  235. return name;
  236. }
  237. THyperscanCapture(const TUnboxedValuePod& runConfig, TSourcePosition pos)
  238. : Pos_(pos)
  239. {
  240. Regex_ = runConfig.AsStringRef();
  241. TOptions options = HS_FLAG_SOM_LEFTMOST;
  242. SetCommonOptions(Regex_, options);
  243. Database_ = Compile(Regex_, options);
  244. Scratch_ = MakeScratch(Database_);
  245. }
  246. private:
  247. TUnboxedValue Run(
  248. const IValueBuilder* valueBuilder,
  249. const TUnboxedValuePod* args) const final try {
  250. if (const auto arg = args[0]) {
  251. TUnboxedValue result;
  252. auto callback = [valueBuilder, arg, &result] (TOptions id, ui64 from, ui64 to) {
  253. Y_UNUSED(id);
  254. if (!result) {
  255. result = valueBuilder->SubString(arg, from, to);
  256. }
  257. };
  258. Scan(Database_, Scratch_, arg.AsStringRef(), callback);
  259. return result;
  260. }
  261. return TUnboxedValue();
  262. } catch (const std::exception& e) {
  263. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  264. }
  265. TSourcePosition Pos_;
  266. TString Regex_;
  267. TDatabase Database_;
  268. TScratch Scratch_;
  269. };
  270. class THyperscanReplace: public THyperscanUdfBase {
  271. public:
  272. class TFactory: public THyperscanUdfBase {
  273. public:
  274. TFactory(TSourcePosition pos)
  275. : Pos_(pos)
  276. {}
  277. private:
  278. TUnboxedValue Run(const IValueBuilder*,
  279. const TUnboxedValuePod* args) const final try {
  280. return TUnboxedValuePod(new THyperscanReplace(args[0], Pos_));
  281. } catch (const std::exception& e) {
  282. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  283. }
  284. private:
  285. TSourcePosition Pos_;
  286. };
  287. static const TStringRef& Name() {
  288. static auto name = TStringRef::Of("Replace");
  289. return name;
  290. }
  291. THyperscanReplace(const TUnboxedValuePod& runConfig, TSourcePosition pos)
  292. : Pos_(pos)
  293. {
  294. Regex_ = runConfig.AsStringRef();
  295. TOptions options = HS_FLAG_SOM_LEFTMOST;
  296. SetCommonOptions(Regex_, options);
  297. Database_ = Compile(Regex_, options);
  298. Scratch_ = MakeScratch(Database_);
  299. }
  300. private:
  301. TUnboxedValue Run(
  302. const IValueBuilder* valueBuilder,
  303. const TUnboxedValuePod* args) const final try {
  304. if (args[0]) {
  305. const std::string_view input(args[0].AsStringRef());
  306. const std::string_view replacement(args[1].AsStringRef());
  307. ui64 index = 0;
  308. TStringBuilder result;
  309. auto callback = [input, replacement, &index, &result] (TOptions id, ui64 from, ui64 to) {
  310. Y_UNUSED(id);
  311. if (index != from) {
  312. result << input.substr(index, from - index);
  313. }
  314. result << replacement;
  315. index = to;
  316. };
  317. Scan(Database_, Scratch_, input, callback);
  318. if (!index) {
  319. return args[0];
  320. }
  321. result << input.substr(index);
  322. return valueBuilder->NewString(result);
  323. }
  324. return TUnboxedValue();
  325. } catch (const std::exception& e) {
  326. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  327. }
  328. TSourcePosition Pos_;
  329. TString Regex_;
  330. TDatabase Database_;
  331. TScratch Scratch_;
  332. };
  333. class THyperscanModule: public IUdfModule {
  334. public:
  335. TStringRef Name() const {
  336. return TStringRef::Of("Hyperscan");
  337. }
  338. void CleanupOnTerminate() const final {
  339. }
  340. void GetAllFunctions(IFunctionsSink& sink) const final {
  341. sink.Add(THyperscanMatch::Name(true, THyperscanMatch::EMode::NORMAL));
  342. sink.Add(THyperscanMatch::Name(false, THyperscanMatch::EMode::NORMAL));
  343. sink.Add(THyperscanMatch::Name(true, THyperscanMatch::EMode::BACKTRACKING));
  344. sink.Add(THyperscanMatch::Name(false, THyperscanMatch::EMode::BACKTRACKING));
  345. sink.Add(THyperscanMatch::Name(true, THyperscanMatch::EMode::MULTI))->SetTypeAwareness();
  346. sink.Add(THyperscanMatch::Name(false, THyperscanMatch::EMode::MULTI))->SetTypeAwareness();
  347. sink.Add(THyperscanCapture::Name());
  348. sink.Add(THyperscanReplace::Name());
  349. }
  350. void BuildFunctionTypeInfo(
  351. const TStringRef& name,
  352. TType* userType,
  353. const TStringRef& typeConfig,
  354. ui32 flags,
  355. IFunctionTypeInfoBuilder& builder) const final {
  356. try {
  357. Y_UNUSED(userType);
  358. bool typesOnly = (flags & TFlags::TypesOnly);
  359. bool isMatch = (THyperscanMatch::Name(false, THyperscanMatch::EMode::NORMAL) == name);
  360. bool isGrep = (THyperscanMatch::Name(true, THyperscanMatch::EMode::NORMAL) == name);
  361. bool isBacktrackingMatch = (THyperscanMatch::Name(false, THyperscanMatch::EMode::BACKTRACKING) == name);
  362. bool isBacktrackingGrep = (THyperscanMatch::Name(true, THyperscanMatch::EMode::BACKTRACKING) == name);
  363. bool isMultiMatch = (THyperscanMatch::Name(false, THyperscanMatch::EMode::MULTI) == name);
  364. bool isMultiGrep = (THyperscanMatch::Name(true, THyperscanMatch::EMode::MULTI) == name);
  365. if (isMatch || isGrep) {
  366. builder.SimpleSignature<bool(TOptional<char*>)>()
  367. .RunConfig<const char*>();
  368. if (!typesOnly) {
  369. builder.Implementation(new THyperscanMatch::TFactory(builder.GetSourcePosition(), isGrep, THyperscanMatch::EMode::NORMAL));
  370. }
  371. } else if (isBacktrackingMatch || isBacktrackingGrep) {
  372. builder.SimpleSignature<bool(TOptional<char*>)>()
  373. .RunConfig<const char*>();
  374. if (!typesOnly) {
  375. builder.Implementation(new THyperscanMatch::TFactory(builder.GetSourcePosition(), isBacktrackingGrep, THyperscanMatch::EMode::BACKTRACKING));
  376. }
  377. } else if (isMultiMatch || isMultiGrep) {
  378. auto boolType = builder.SimpleType<bool>();
  379. auto optionalStringType = builder.Optional()->Item<char*>().Build();
  380. const std::string_view regexp(typeConfig);
  381. size_t regexpCount = std::count(regexp.begin(), regexp.end(), '\n') + 1;
  382. auto tuple = builder.Tuple();
  383. for (size_t i = 0; i < regexpCount; ++i) {
  384. tuple->Add(boolType);
  385. }
  386. auto tupleType = tuple->Build();
  387. builder.Args(1)->Add(optionalStringType).Done().Returns(tupleType).RunConfig<char*>();
  388. if (!typesOnly) {
  389. builder.Implementation(new THyperscanMatch::TFactory(builder.GetSourcePosition(), isMultiGrep, THyperscanMatch::EMode::MULTI, regexpCount));
  390. }
  391. } else if (THyperscanCapture::Name() == name) {
  392. builder.SimpleSignature<TOptional<char*>(TOptional<char*>)>()
  393. .RunConfig<char*>();
  394. if (!typesOnly) {
  395. builder.Implementation(new THyperscanCapture::TFactory(builder.GetSourcePosition()));
  396. }
  397. } else if (THyperscanReplace::Name() == name) {
  398. builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>()
  399. .RunConfig<char*>();
  400. if (!typesOnly) {
  401. builder.Implementation(new THyperscanReplace::TFactory(builder.GetSourcePosition()));
  402. }
  403. }
  404. } catch (const std::exception& e) {
  405. builder.SetError(CurrentExceptionMessage());
  406. }
  407. }
  408. };
  409. class TPcreModule : public THyperscanModule {
  410. public:
  411. TStringRef Name() const {
  412. return TStringRef::Of("Pcre");
  413. }
  414. };
  415. }
  416. REGISTER_MODULES(THyperscanModule, TPcreModule)