hyperscan_ut.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. #include <library/cpp/regex/hyperscan/hyperscan.h>
  2. #include <library/cpp/testing/unittest/registar.h>
  3. #include <util/generic/set.h>
  4. #include <array>
  5. #include <algorithm>
  6. Y_UNIT_TEST_SUITE(HyperscanWrappers) {
  7. using namespace NHyperscan;
  8. using namespace NHyperscan::NPrivate;
  9. Y_UNIT_TEST(CompileAndScan) {
  10. TDatabase db = Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  11. TScratch scratch = MakeScratch(db);
  12. unsigned int foundId = 42;
  13. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  14. foundId = id;
  15. };
  16. NHyperscan::Scan(
  17. db,
  18. scratch,
  19. "abc",
  20. callback);
  21. UNIT_ASSERT_EQUAL(foundId, 0);
  22. }
  23. Y_UNIT_TEST(CompileLiteralAndScan) {
  24. TDatabase db = CompileLiteral("a.c?[)", HS_FLAG_SINGLEMATCH);
  25. TScratch scratch = MakeScratch(db);
  26. unsigned int foundId = 42;
  27. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  28. foundId = id;
  29. };
  30. NHyperscan::Scan(
  31. db,
  32. scratch,
  33. "a.c?[)",
  34. callback);
  35. UNIT_ASSERT_EQUAL(foundId, 0);
  36. }
  37. Y_UNIT_TEST(Matches) {
  38. NHyperscan::TDatabase db = NHyperscan::Compile(
  39. "a.c",
  40. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  41. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  42. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "abc"));
  43. UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "foo"));
  44. }
  45. Y_UNIT_TEST(Multi) {
  46. NHyperscan::TDatabase db = NHyperscan::CompileMulti(
  47. {
  48. "foo",
  49. "bar",
  50. },
  51. {
  52. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
  53. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
  54. },
  55. {
  56. 42,
  57. 241,
  58. });
  59. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  60. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo"));
  61. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar"));
  62. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR"));
  63. UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO"));
  64. TSet<unsigned int> foundIds;
  65. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  66. foundIds.insert(id);
  67. };
  68. NHyperscan::Scan(
  69. db,
  70. scratch,
  71. "fooBaR",
  72. callback);
  73. UNIT_ASSERT_EQUAL(foundIds.size(), 2);
  74. UNIT_ASSERT(foundIds.contains(42));
  75. UNIT_ASSERT(foundIds.contains(241));
  76. }
  77. Y_UNIT_TEST(MultiLiteral) {
  78. static const TVector<TString> LITERALS = {
  79. "foo.",
  80. "bar.",
  81. };
  82. NHyperscan::TDatabase db = NHyperscan::CompileMultiLiteral(
  83. {
  84. LITERALS[0].c_str(),
  85. LITERALS[1].c_str(),
  86. },
  87. {
  88. HS_FLAG_SINGLEMATCH,
  89. HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
  90. },
  91. {
  92. 42,
  93. 241,
  94. },
  95. {
  96. LITERALS[0].size(),
  97. LITERALS[1].size(),
  98. });
  99. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  100. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo."));
  101. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar."));
  102. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR."));
  103. UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO."));
  104. TSet<unsigned int> foundIds;
  105. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  106. foundIds.insert(id);
  107. };
  108. NHyperscan::Scan(
  109. db,
  110. scratch,
  111. "foo.BaR.",
  112. callback);
  113. UNIT_ASSERT_EQUAL(foundIds.size(), 2);
  114. UNIT_ASSERT(foundIds.contains(42));
  115. UNIT_ASSERT(foundIds.contains(241));
  116. }
  117. // https://ml.yandex-team.ru/thread/2370000002965712422/
  118. Y_UNIT_TEST(MultiRegression) {
  119. NHyperscan::CompileMulti(
  120. {
  121. "aa.bb/cc.dd",
  122. },
  123. {
  124. HS_FLAG_UTF8,
  125. },
  126. {
  127. 0,
  128. });
  129. }
  130. Y_UNIT_TEST(Serialize) {
  131. NHyperscan::TDatabase db = NHyperscan::Compile(
  132. "foo",
  133. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  134. TString serialization = Serialize(db);
  135. db.Reset();
  136. TDatabase db2 = Deserialize(serialization);
  137. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db2);
  138. UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "foo"));
  139. UNIT_ASSERT(!NHyperscan::Matches(db2, scratch, "FOO"));
  140. }
  141. Y_UNIT_TEST(GrowScratch) {
  142. NHyperscan::TDatabase db1 = NHyperscan::Compile(
  143. "foo",
  144. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  145. NHyperscan::TDatabase db2 = NHyperscan::Compile(
  146. "longer\\w\\w\\wpattern",
  147. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
  148. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db1);
  149. NHyperscan::GrowScratch(scratch, db2);
  150. UNIT_ASSERT(NHyperscan::Matches(db1, scratch, "foo"));
  151. UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "longerWWWpattern"));
  152. }
  153. Y_UNIT_TEST(CloneScratch) {
  154. NHyperscan::TDatabase db = NHyperscan::Compile(
  155. "foo",
  156. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  157. NHyperscan::TScratch scratch1 = NHyperscan::MakeScratch(db);
  158. NHyperscan::TScratch scratch2 = NHyperscan::CloneScratch(scratch1);
  159. scratch1.Reset();
  160. UNIT_ASSERT(NHyperscan::Matches(db, scratch2, "foo"));
  161. }
  162. class TSimpleSingleRegex {
  163. public:
  164. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  165. return NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, cpuFeatures);
  166. }
  167. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  168. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  169. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
  170. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
  171. }
  172. };
  173. // This regex uses AVX2 instructions on long (>70) texts.
  174. // It crushes when compiled for machine with AVX2 and run on machine without it.
  175. class TAvx2SingleRegex {
  176. public:
  177. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  178. auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
  179. "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
  180. unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
  181. return NHyperscan::Compile(regex, flags, cpuFeatures);
  182. }
  183. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  184. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  185. UNIT_ASSERT(NHyperscan::NPrivate::Matches(
  186. db,
  187. scratch,
  188. "_________________________________________________________________"
  189. "фу.bar"
  190. "_________________________________________________________________",
  191. impl));
  192. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(
  193. db,
  194. scratch,
  195. "_________________________________________________________________"
  196. "фу"
  197. "_________________________________________________________________",
  198. impl));
  199. }
  200. };
  201. class TSimpleMultiRegex {
  202. public:
  203. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  204. return NHyperscan::CompileMulti(
  205. {
  206. "foo",
  207. "bar",
  208. },
  209. {
  210. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
  211. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
  212. },
  213. {
  214. 42,
  215. 241,
  216. },
  217. cpuFeatures);
  218. }
  219. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  220. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  221. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
  222. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "bar", impl));
  223. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "BAR", impl));
  224. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
  225. TSet<unsigned int> foundIds;
  226. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  227. foundIds.insert(id);
  228. };
  229. NHyperscan::NPrivate::Scan(
  230. db,
  231. scratch,
  232. "fooBaR",
  233. callback,
  234. impl);
  235. UNIT_ASSERT_EQUAL(foundIds.size(), 2);
  236. UNIT_ASSERT(foundIds.contains(42));
  237. UNIT_ASSERT(foundIds.contains(241));
  238. }
  239. };
  240. template <class Regex>
  241. void TestCrossPlatformCompile() {
  242. const std::array<ERuntime, 4> runtimes = {
  243. ERuntime::Core2,
  244. ERuntime::Corei7,
  245. ERuntime::AVX2,
  246. };
  247. // Unfortunately, we cannot emulate runtimes with more capabilities than current machine.
  248. auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
  249. Y_ASSERT(currentRuntimeIter != runtimes.cend());
  250. for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) {
  251. auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
  252. Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
  253. }
  254. }
  255. Y_UNIT_TEST(CrossPlatformCompile) {
  256. TestCrossPlatformCompile<TSimpleSingleRegex>();
  257. TestCrossPlatformCompile<TAvx2SingleRegex>();
  258. TestCrossPlatformCompile<TSimpleMultiRegex>();
  259. }
  260. }