hyperscan.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. #include "hyperscan.h"
  2. #include <contrib/libs/hyperscan/runtime_core2/hs_common.h>
  3. #include <contrib/libs/hyperscan/runtime_core2/hs_runtime.h>
  4. #include <contrib/libs/hyperscan/runtime_corei7/hs_common.h>
  5. #include <contrib/libs/hyperscan/runtime_corei7/hs_runtime.h>
  6. #include <contrib/libs/hyperscan/runtime_avx2/hs_common.h>
  7. #include <contrib/libs/hyperscan/runtime_avx2/hs_runtime.h>
  8. #include <util/generic/singleton.h>
  9. #include <util/system/sanitizers.h>
  10. namespace NHyperscan {
  11. using TSerializedDatabase = THolder<char, TDeleter<decltype(&free), &free>>;
  12. using TCompileError = THolder<hs_compile_error_t, TDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
  13. namespace NPrivate {
  14. ERuntime DetectCurrentRuntime() {
  15. // NOTE: We explicitly disable AVX512 runtime, there are bugs with
  16. // trivial string matching. See SPI-122953 & SPI-117618.
  17. if (NX86::HaveAVX() && NX86::HaveAVX2()) {
  18. return ERuntime::AVX2;
  19. } else if (NX86::HaveSSE42() && NX86::HavePOPCNT()) {
  20. return ERuntime::Corei7;
  21. } else {
  22. return ERuntime::Core2;
  23. }
  24. }
  25. TCPUFeatures RuntimeCpuFeatures(ERuntime runtime) {
  26. switch (runtime) {
  27. default:
  28. Y_ASSERT(false);
  29. [[fallthrough]];
  30. case ERuntime::Core2:
  31. case ERuntime::Corei7:
  32. return 0;
  33. case ERuntime::AVX2:
  34. return CPU_FEATURES_AVX2;
  35. }
  36. }
  37. hs_platform_info_t MakePlatformInfo(TCPUFeatures cpuFeatures) {
  38. hs_platform_info_t platformInfo{HS_TUNE_FAMILY_GENERIC, cpuFeatures, 0, 0};
  39. return platformInfo;
  40. }
  41. hs_platform_info_t MakeCurrentPlatformInfo() {
  42. return MakePlatformInfo(RuntimeCpuFeatures(DetectCurrentRuntime()));
  43. }
  44. TImpl::TImpl(ERuntime runtime) {
  45. switch (runtime) {
  46. default:
  47. Y_ASSERT(false);
  48. [[fallthrough]];
  49. case ERuntime::Core2:
  50. AllocScratch = core2_hs_alloc_scratch;
  51. Scan = core2_hs_scan;
  52. SerializeDatabase = core2_hs_serialize_database;
  53. DeserializeDatabase = core2_hs_deserialize_database;
  54. break;
  55. case ERuntime::Corei7:
  56. AllocScratch = corei7_hs_alloc_scratch;
  57. Scan = corei7_hs_scan;
  58. SerializeDatabase = corei7_hs_serialize_database;
  59. DeserializeDatabase = corei7_hs_deserialize_database;
  60. break;
  61. case ERuntime::AVX2:
  62. AllocScratch = avx2_hs_alloc_scratch;
  63. Scan = avx2_hs_scan;
  64. SerializeDatabase = avx2_hs_serialize_database;
  65. DeserializeDatabase = avx2_hs_deserialize_database;
  66. break;
  67. }
  68. }
  69. TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform) {
  70. hs_database_t* rawDb = nullptr;
  71. hs_compile_error_t* rawCompileErr = nullptr;
  72. hs_error_t status = hs_compile(
  73. regex.begin(),
  74. flags,
  75. HS_MODE_BLOCK,
  76. platform,
  77. &rawDb,
  78. &rawCompileErr);
  79. TDatabase db(rawDb);
  80. NHyperscan::TCompileError compileError(rawCompileErr);
  81. if (status != HS_SUCCESS) {
  82. ythrow TCompileException()
  83. << "Failed to compile regex: " << regex << ". "
  84. << "Error message (hyperscan): " << compileError->message;
  85. }
  86. return db;
  87. }
  88. TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform) {
  89. hs_database_t* rawDb = nullptr;
  90. hs_compile_error_t* rawCompileErr = nullptr;
  91. hs_error_t status = hs_compile_lit(
  92. literal.data(),
  93. flags,
  94. literal.size(),
  95. HS_MODE_BLOCK,
  96. platform,
  97. &rawDb,
  98. &rawCompileErr);
  99. TDatabase db(rawDb);
  100. NHyperscan::TCompileError compileError(rawCompileErr);
  101. if (status != HS_SUCCESS) {
  102. ythrow TCompileException()
  103. << "Failed to compile literal: " << literal << ". "
  104. << "Error message (hyperscan): " << compileError->message;
  105. }
  106. return db;
  107. }
  108. TDatabase CompileMulti(
  109. const TVector<const char*>& regexs,
  110. const TVector<unsigned int>& flags,
  111. const TVector<unsigned int>& ids,
  112. hs_platform_info_t* platform,
  113. const TVector<const hs_expr_ext_t*>* extendedParameters) {
  114. unsigned int count = regexs.size();
  115. if (flags.size() != count) {
  116. ythrow yexception()
  117. << "Mismatch of sizes vectors passed to CompileMulti. "
  118. << "size(regexs) = " << regexs.size() << ". "
  119. << "size(flags) = " << flags.size() << ".";
  120. }
  121. if (ids.size() != count) {
  122. ythrow yexception()
  123. << "Mismatch of sizes vectors passed to CompileMulti. "
  124. << "size(regexs) = " << regexs.size() << ". "
  125. << "size(ids) = " << ids.size() << ".";
  126. }
  127. if (extendedParameters && extendedParameters->size() != count) {
  128. ythrow yexception()
  129. << "Mismatch of sizes vectors passed to CompileMulti. "
  130. << "size(regexs) = " << regexs.size() << ". "
  131. << "size(extendedParameters) = " << extendedParameters->size() << ".";
  132. }
  133. hs_database_t* rawDb = nullptr;
  134. hs_compile_error_t* rawCompileErr = nullptr;
  135. hs_error_t status = hs_compile_ext_multi(
  136. regexs.data(),
  137. flags.data(),
  138. ids.data(),
  139. extendedParameters ? extendedParameters->data() : nullptr,
  140. count,
  141. HS_MODE_BLOCK,
  142. platform,
  143. &rawDb,
  144. &rawCompileErr);
  145. TDatabase db(rawDb);
  146. NHyperscan::TCompileError compileError(rawCompileErr);
  147. if (status != HS_SUCCESS) {
  148. if (compileError->expression >= 0) {
  149. const char* regex = regexs[compileError->expression];
  150. ythrow TCompileException()
  151. << "Failed to compile regex: " << regex << ". "
  152. << "Error message (hyperscan): " << compileError->message;
  153. } else {
  154. ythrow TCompileException()
  155. << "Failed to compile multiple regexs. "
  156. << "Error message (hyperscan): " << compileError->message;
  157. }
  158. }
  159. return db;
  160. }
  161. TDatabase CompileMultiLiteral(
  162. const TVector<const char*>& literals,
  163. const TVector<unsigned int>& flags,
  164. const TVector<unsigned int>& ids,
  165. const TVector<size_t>& lens,
  166. hs_platform_info_t* platform)
  167. {
  168. unsigned int count = literals.size();
  169. if (flags.size() != count) {
  170. ythrow yexception()
  171. << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
  172. << "size(literals) = " << literals.size() << ". "
  173. << "size(flags) = " << flags.size() << ".";
  174. }
  175. if (ids.size() != count) {
  176. ythrow yexception()
  177. << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
  178. << "size(literals) = " << literals.size() << ". "
  179. << "size(ids) = " << ids.size() << ".";
  180. }
  181. if (lens.size() != count) {
  182. ythrow yexception()
  183. << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
  184. << "size(literals) = " << literals.size() << ". "
  185. << "size(lens) = " << lens.size() << ".";
  186. }
  187. hs_database_t* rawDb = nullptr;
  188. hs_compile_error_t* rawCompileErr = nullptr;
  189. hs_error_t status = hs_compile_lit_multi(
  190. literals.data(),
  191. flags.data(),
  192. ids.data(),
  193. lens.data(),
  194. count,
  195. HS_MODE_BLOCK,
  196. platform,
  197. &rawDb,
  198. &rawCompileErr);
  199. TDatabase db(rawDb);
  200. NHyperscan::TCompileError compileError(rawCompileErr);
  201. if (status != HS_SUCCESS) {
  202. if (compileError->expression >= 0) {
  203. const char* literal = literals[compileError->expression];
  204. ythrow TCompileException()
  205. << "Failed to compile literal: " << literal << ". "
  206. << "Error message (hyperscan): " << compileError->message;
  207. } else {
  208. ythrow TCompileException()
  209. << "Failed to compile multiple literals. "
  210. << "Error message (hyperscan): " << compileError->message;
  211. }
  212. }
  213. return db;
  214. }
  215. bool Matches(
  216. const TDatabase& db,
  217. const TScratch& scratch,
  218. const TStringBuf& text,
  219. const TImpl& impl) {
  220. bool result = false;
  221. auto callback = [&](unsigned int /* id */, unsigned long long /* from */, unsigned long long /* to */) {
  222. result = true;
  223. return 1; // stop scan
  224. };
  225. Scan(
  226. db,
  227. scratch,
  228. text,
  229. callback,
  230. impl);
  231. return result;
  232. }
  233. } // namespace NPrivate
  234. TDatabase Compile(const TStringBuf& regex, unsigned int flags) {
  235. auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
  236. return NPrivate::Compile(regex, flags, &platformInfo);
  237. }
  238. TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures) {
  239. auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
  240. return NPrivate::Compile(regex, flags, &platformInfo);
  241. }
  242. TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags) {
  243. auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
  244. return NPrivate::CompileLiteral(literal, flags, &platformInfo);
  245. }
  246. TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures) {
  247. auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
  248. return NPrivate::CompileLiteral(literal, flags, &platformInfo);
  249. }
  250. TDatabase CompileMulti(
  251. const TVector<const char*>& regexs,
  252. const TVector<unsigned int>& flags,
  253. const TVector<unsigned int>& ids,
  254. const TVector<const hs_expr_ext_t*>* extendedParameters)
  255. {
  256. auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
  257. return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
  258. }
  259. TDatabase CompileMulti(
  260. const TVector<const char*>& regexs,
  261. const TVector<unsigned int>& flags,
  262. const TVector<unsigned int>& ids,
  263. TCPUFeatures cpuFeatures,
  264. const TVector<const hs_expr_ext_t*>* extendedParameters)
  265. {
  266. auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
  267. return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
  268. }
  269. TDatabase CompileMultiLiteral(
  270. const TVector<const char*>& literals,
  271. const TVector<unsigned int>& flags,
  272. const TVector<unsigned int>& ids,
  273. const TVector<size_t>& lens)
  274. {
  275. auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
  276. return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &platformInfo);
  277. }
  278. TDatabase CompileMultiLiteral(
  279. const TVector<const char*>& literals,
  280. const TVector<unsigned int>& flags,
  281. const TVector<unsigned int>& ids,
  282. const TVector<size_t>& lens,
  283. TCPUFeatures cpuFeatures)
  284. {
  285. auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
  286. return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &platformInfo);
  287. }
  288. TScratch MakeScratch(const TDatabase& db) {
  289. hs_scratch_t* rawScratch = nullptr;
  290. hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
  291. NHyperscan::TScratch scratch(rawScratch);
  292. if (status != HS_SUCCESS) {
  293. ythrow yexception() << "Failed to make scratch for hyperscan database";
  294. }
  295. return scratch;
  296. }
  297. void GrowScratch(TScratch& scratch, const TDatabase& db) {
  298. hs_scratch_t* rawScratch = scratch.Get();
  299. hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
  300. if (rawScratch != scratch.Get()) {
  301. Y_UNUSED(scratch.Release()); // freed by hs_alloc_scratch
  302. scratch.Reset(rawScratch);
  303. }
  304. if (status != HS_SUCCESS) {
  305. ythrow yexception() << "Failed to make grow scratch for hyperscan database";
  306. }
  307. }
  308. TScratch CloneScratch(const TScratch& scratch) {
  309. hs_scratch_t* rawScratch = nullptr;
  310. hs_error_t status = hs_clone_scratch(scratch.Get(), &rawScratch);
  311. TScratch scratchCopy(rawScratch);
  312. if (status != HS_SUCCESS) {
  313. ythrow yexception() << "Failed to clone scratch for hyperscan database";
  314. }
  315. return scratchCopy;
  316. }
  317. bool Matches(
  318. const TDatabase& db,
  319. const TScratch& scratch,
  320. const TStringBuf& text)
  321. {
  322. return NPrivate::Matches(db, scratch, text, *Singleton<NPrivate::TImpl>());
  323. }
  324. TString Serialize(const TDatabase& db) {
  325. char* databaseBytes = nullptr;
  326. size_t databaseLength;
  327. hs_error_t status = Singleton<NPrivate::TImpl>()->SerializeDatabase(
  328. db.Get(),
  329. &databaseBytes,
  330. &databaseLength);
  331. TSerializedDatabase serialization(databaseBytes);
  332. if (status != HS_SUCCESS) {
  333. ythrow yexception() << "Failed to serialize hyperscan database";
  334. }
  335. return TString(serialization.Get(), databaseLength);
  336. }
  337. TDatabase Deserialize(const TStringBuf& serialization) {
  338. hs_database_t* rawDb = nullptr;
  339. hs_error_t status = Singleton<NPrivate::TImpl>()->DeserializeDatabase(
  340. serialization.begin(),
  341. serialization.size(),
  342. &rawDb);
  343. TDatabase db(rawDb);
  344. if (status != HS_SUCCESS) {
  345. if (status == HS_DB_PLATFORM_ERROR) {
  346. ythrow yexception() << "Serialized Hyperscan database is incompatible with current CPU";
  347. } else if (status == HS_DB_VERSION_ERROR) {
  348. ythrow yexception() << "Need recreate Hyperscan database with new version Hyperscan";
  349. } else {
  350. ythrow yexception() << "Failed to deserialize hyperscan database (status = " << status << ")";
  351. }
  352. }
  353. return db;
  354. }
  355. }