hyperscan_ut.cpp 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. #include <library/cpp/regex/hyperscan/hyperscan.h>
  2. #include <library/cpp/testing/unittest/registar.h>
  3. #include <util/generic/set.h>
  4. #include <array>
  5. #include <algorithm>
  6. Y_UNIT_TEST_SUITE(HyperscanWrappers) {
  7. using namespace NHyperscan;
  8. using namespace NHyperscan::NPrivate;
  9. Y_UNIT_TEST(CompileAndScan) {
  10. TDatabase db = Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  11. TScratch scratch = MakeScratch(db);
  12. unsigned int foundId = 42;
  13. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  14. foundId = id;
  15. };
  16. NHyperscan::Scan(
  17. db,
  18. scratch,
  19. "abc",
  20. callback);
  21. UNIT_ASSERT_EQUAL(foundId, 0);
  22. }
  23. Y_UNIT_TEST(Matches) {
  24. NHyperscan::TDatabase db = NHyperscan::Compile(
  25. "a.c",
  26. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  27. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  28. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "abc"));
  29. UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "foo"));
  30. }
  31. Y_UNIT_TEST(Multi) {
  32. NHyperscan::TDatabase db = NHyperscan::CompileMulti(
  33. {
  34. "foo",
  35. "bar",
  36. },
  37. {
  38. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
  39. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
  40. },
  41. {
  42. 42,
  43. 241,
  44. });
  45. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  46. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo"));
  47. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar"));
  48. UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR"));
  49. UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO"));
  50. TSet<unsigned int> foundIds;
  51. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  52. foundIds.insert(id);
  53. };
  54. NHyperscan::Scan(
  55. db,
  56. scratch,
  57. "fooBaR",
  58. callback);
  59. UNIT_ASSERT_EQUAL(foundIds.size(), 2);
  60. UNIT_ASSERT(foundIds.contains(42));
  61. UNIT_ASSERT(foundIds.contains(241));
  62. }
  63. // https://ml.yandex-team.ru/thread/2370000002965712422/
  64. Y_UNIT_TEST(MultiRegression) {
  65. NHyperscan::CompileMulti(
  66. {
  67. "aa.bb/cc.dd",
  68. },
  69. {
  70. HS_FLAG_UTF8,
  71. },
  72. {
  73. 0,
  74. });
  75. }
  76. Y_UNIT_TEST(Serialize) {
  77. NHyperscan::TDatabase db = NHyperscan::Compile(
  78. "foo",
  79. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  80. TString serialization = Serialize(db);
  81. db.Reset();
  82. TDatabase db2 = Deserialize(serialization);
  83. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db2);
  84. UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "foo"));
  85. UNIT_ASSERT(!NHyperscan::Matches(db2, scratch, "FOO"));
  86. }
  87. Y_UNIT_TEST(GrowScratch) {
  88. NHyperscan::TDatabase db1 = NHyperscan::Compile(
  89. "foo",
  90. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  91. NHyperscan::TDatabase db2 = NHyperscan::Compile(
  92. "longer\\w\\w\\wpattern",
  93. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
  94. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db1);
  95. NHyperscan::GrowScratch(scratch, db2);
  96. UNIT_ASSERT(NHyperscan::Matches(db1, scratch, "foo"));
  97. UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "longerWWWpattern"));
  98. }
  99. Y_UNIT_TEST(CloneScratch) {
  100. NHyperscan::TDatabase db = NHyperscan::Compile(
  101. "foo",
  102. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
  103. NHyperscan::TScratch scratch1 = NHyperscan::MakeScratch(db);
  104. NHyperscan::TScratch scratch2 = NHyperscan::CloneScratch(scratch1);
  105. scratch1.Reset();
  106. UNIT_ASSERT(NHyperscan::Matches(db, scratch2, "foo"));
  107. }
  108. class TSimpleSingleRegex {
  109. public:
  110. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  111. return NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, cpuFeatures);
  112. }
  113. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  114. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  115. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
  116. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
  117. }
  118. };
  119. // This regex uses AVX2 instructions on long (>70) texts.
  120. // It crushes when compiled for machine with AVX2 and run on machine without it.
  121. class TAvx2SingleRegex {
  122. public:
  123. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  124. auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
  125. "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
  126. unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
  127. return NHyperscan::Compile(regex, flags, cpuFeatures);
  128. }
  129. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  130. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  131. UNIT_ASSERT(NHyperscan::NPrivate::Matches(
  132. db,
  133. scratch,
  134. "_________________________________________________________________"
  135. "фу.bar"
  136. "_________________________________________________________________",
  137. impl));
  138. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(
  139. db,
  140. scratch,
  141. "_________________________________________________________________"
  142. "фу"
  143. "_________________________________________________________________",
  144. impl));
  145. }
  146. };
  147. class TSimpleMultiRegex {
  148. public:
  149. static TDatabase Compile(TCPUFeatures cpuFeatures) {
  150. return NHyperscan::CompileMulti(
  151. {
  152. "foo",
  153. "bar",
  154. },
  155. {
  156. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
  157. HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
  158. },
  159. {
  160. 42,
  161. 241,
  162. },
  163. cpuFeatures);
  164. }
  165. static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
  166. NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
  167. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
  168. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "bar", impl));
  169. UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "BAR", impl));
  170. UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
  171. TSet<unsigned int> foundIds;
  172. auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
  173. foundIds.insert(id);
  174. };
  175. NHyperscan::NPrivate::Scan(
  176. db,
  177. scratch,
  178. "fooBaR",
  179. callback,
  180. impl);
  181. UNIT_ASSERT_EQUAL(foundIds.size(), 2);
  182. UNIT_ASSERT(foundIds.contains(42));
  183. UNIT_ASSERT(foundIds.contains(241));
  184. }
  185. };
  186. template <class Regex>
  187. void TestCrossPlatformCompile() {
  188. const std::array<ERuntime, 4> runtimes = {
  189. ERuntime::Core2,
  190. ERuntime::Corei7,
  191. ERuntime::AVX2,
  192. ERuntime::AVX512
  193. };
  194. // Unfortunately, we cannot emulate runtimes with more capabilities than current machine.
  195. auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
  196. Y_ASSERT(currentRuntimeIter != runtimes.cend());
  197. for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) {
  198. auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
  199. Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
  200. }
  201. }
  202. Y_UNIT_TEST(CrossPlatformCompile) {
  203. TestCrossPlatformCompile<TSimpleSingleRegex>();
  204. TestCrossPlatformCompile<TAvx2SingleRegex>();
  205. TestCrossPlatformCompile<TSimpleMultiRegex>();
  206. }
  207. }