//===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // TrigramIndex implements a heuristic for SpecialCaseList that allows to // filter out ~99% incoming queries when all regular expressions in the // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more // complicated, the check is defeated and it will always pass the queries to a // full regex. // //===----------------------------------------------------------------------===// #include "llvm/Support/TrigramIndex.h" #include using namespace llvm; static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; static bool isAdvancedMetachar(unsigned Char) { return strchr(RegexAdvancedMetachars, Char) != nullptr; } void TrigramIndex::insert(const std::string &Regex) { if (Defeated) return; std::set Was; unsigned Cnt = 0; unsigned Tri = 0; unsigned Len = 0; bool Escaped = false; for (unsigned Char : Regex) { if (!Escaped) { // Regular expressions allow escaping symbols by preceding it with '\'. if (Char == '\\') { Escaped = true; continue; } if (isAdvancedMetachar(Char)) { // This is a more complicated regex than we can handle here. Defeated = true; return; } if (Char == '.' || Char == '*') { Tri = 0; Len = 0; continue; } } if (Escaped && Char >= '1' && Char <= '9') { Defeated = true; return; } // We have already handled escaping and can reset the flag. Escaped = false; Tri = ((Tri << 8) + Char) & 0xFFFFFF; Len++; if (Len < 3) continue; // We don't want the index to grow too much for the popular trigrams, // as they are weak signals. It's ok to still require them for the // rules we have already processed. It's just a small additional // computational cost. if (Index[Tri].size() >= 4) continue; Cnt++; if (!Was.count(Tri)) { // Adding the current rule to the index. Index[Tri].push_back(Counts.size()); Was.insert(Tri); } } if (!Cnt) { // This rule does not have remarkable trigrams to rely on. // We have to always call the full regex chain. Defeated = true; return; } Counts.push_back(Cnt); } bool TrigramIndex::isDefinitelyOut(StringRef Query) const { if (Defeated) return false; std::vector CurCounts(Counts.size()); unsigned Tri = 0; for (size_t I = 0; I < Query.size(); I++) { Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; if (I < 2) continue; const auto &II = Index.find(Tri); if (II == Index.end()) continue; for (size_t J : II->second) { CurCounts[J]++; // If we have reached a desired limit, we have to look at the query // more closely by running a full regex. if (CurCounts[J] >= Counts[J]) return false; } } return true; }