123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231 |
- #include <library/cpp/regex/hyperscan/hyperscan.h>
- #include <library/cpp/testing/unittest/registar.h>
- #include <util/generic/set.h>
- #include <array>
- #include <algorithm>
- Y_UNIT_TEST_SUITE(HyperscanWrappers) {
- using namespace NHyperscan;
- using namespace NHyperscan::NPrivate;
- Y_UNIT_TEST(CompileAndScan) {
- TDatabase db = Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
- TScratch scratch = MakeScratch(db);
- unsigned int foundId = 42;
- auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
- foundId = id;
- };
- NHyperscan::Scan(
- db,
- scratch,
- "abc",
- callback);
- UNIT_ASSERT_EQUAL(foundId, 0);
- }
- Y_UNIT_TEST(Matches) {
- NHyperscan::TDatabase db = NHyperscan::Compile(
- "a.c",
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
- UNIT_ASSERT(NHyperscan::Matches(db, scratch, "abc"));
- UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "foo"));
- }
- Y_UNIT_TEST(Multi) {
- NHyperscan::TDatabase db = NHyperscan::CompileMulti(
- {
- "foo",
- "bar",
- },
- {
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
- },
- {
- 42,
- 241,
- });
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
- UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo"));
- UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar"));
- UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR"));
- UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO"));
- TSet<unsigned int> foundIds;
- auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
- foundIds.insert(id);
- };
- NHyperscan::Scan(
- db,
- scratch,
- "fooBaR",
- callback);
- UNIT_ASSERT_EQUAL(foundIds.size(), 2);
- UNIT_ASSERT(foundIds.contains(42));
- UNIT_ASSERT(foundIds.contains(241));
- }
- // https://ml.yandex-team.ru/thread/2370000002965712422/
- Y_UNIT_TEST(MultiRegression) {
- NHyperscan::CompileMulti(
- {
- "aa.bb/cc.dd",
- },
- {
- HS_FLAG_UTF8,
- },
- {
- 0,
- });
- }
- Y_UNIT_TEST(Serialize) {
- NHyperscan::TDatabase db = NHyperscan::Compile(
- "foo",
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
- TString serialization = Serialize(db);
- db.Reset();
- TDatabase db2 = Deserialize(serialization);
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db2);
- UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "foo"));
- UNIT_ASSERT(!NHyperscan::Matches(db2, scratch, "FOO"));
- }
- Y_UNIT_TEST(GrowScratch) {
- NHyperscan::TDatabase db1 = NHyperscan::Compile(
- "foo",
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
- NHyperscan::TDatabase db2 = NHyperscan::Compile(
- "longer\\w\\w\\wpattern",
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db1);
- NHyperscan::GrowScratch(scratch, db2);
- UNIT_ASSERT(NHyperscan::Matches(db1, scratch, "foo"));
- UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "longerWWWpattern"));
- }
- Y_UNIT_TEST(CloneScratch) {
- NHyperscan::TDatabase db = NHyperscan::Compile(
- "foo",
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
- NHyperscan::TScratch scratch1 = NHyperscan::MakeScratch(db);
- NHyperscan::TScratch scratch2 = NHyperscan::CloneScratch(scratch1);
- scratch1.Reset();
- UNIT_ASSERT(NHyperscan::Matches(db, scratch2, "foo"));
- }
- class TSimpleSingleRegex {
- public:
- static TDatabase Compile(TCPUFeatures cpuFeatures) {
- return NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, cpuFeatures);
- }
- static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
- UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
- UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
- }
- };
- // This regex uses AVX2 instructions on long (>70) texts.
- // It crushes when compiled for machine with AVX2 and run on machine without it.
- class TAvx2SingleRegex {
- public:
- static TDatabase Compile(TCPUFeatures cpuFeatures) {
- auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
- "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
- unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
- return NHyperscan::Compile(regex, flags, cpuFeatures);
- }
- static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
- UNIT_ASSERT(NHyperscan::NPrivate::Matches(
- db,
- scratch,
- "_________________________________________________________________"
- "фу.bar"
- "_________________________________________________________________",
- impl));
- UNIT_ASSERT(!NHyperscan::NPrivate::Matches(
- db,
- scratch,
- "_________________________________________________________________"
- "фу"
- "_________________________________________________________________",
- impl));
- }
- };
- class TSimpleMultiRegex {
- public:
- static TDatabase Compile(TCPUFeatures cpuFeatures) {
- return NHyperscan::CompileMulti(
- {
- "foo",
- "bar",
- },
- {
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
- HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
- },
- {
- 42,
- 241,
- },
- cpuFeatures);
- }
- static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
- NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
- UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
- UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "bar", impl));
- UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "BAR", impl));
- UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
- TSet<unsigned int> foundIds;
- auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
- foundIds.insert(id);
- };
- NHyperscan::NPrivate::Scan(
- db,
- scratch,
- "fooBaR",
- callback,
- impl);
- UNIT_ASSERT_EQUAL(foundIds.size(), 2);
- UNIT_ASSERT(foundIds.contains(42));
- UNIT_ASSERT(foundIds.contains(241));
- }
- };
- template <class Regex>
- void TestCrossPlatformCompile() {
- const std::array<ERuntime, 4> runtimes = {
- ERuntime::Core2,
- ERuntime::Corei7,
- ERuntime::AVX2,
- ERuntime::AVX512
- };
- // Unfortunately, we cannot emulate runtimes with more capabilities than current machine.
- auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
- Y_ASSERT(currentRuntimeIter != runtimes.cend());
- for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) {
- auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
- Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
- }
- }
- Y_UNIT_TEST(CrossPlatformCompile) {
- TestCrossPlatformCompile<TSimpleSingleRegex>();
- TestCrossPlatformCompile<TAvx2SingleRegex>();
- TestCrossPlatformCompile<TSimpleMultiRegex>();
- }
- }
|