123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- #pragma once
- #ifdef __GNUC__
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wunused-parameter"
- #endif
- //===-- TrigramIndex.h - a heuristic for SpecialCaseList --------*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //===----------------------------------------------------------------------===//
- //
- // TrigramIndex implements a heuristic for SpecialCaseList that allows to
- // filter out ~99% incoming queries when all regular expressions in the
- // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
- // complicated, the check is defeated and it will always pass the queries to a
- // full regex.
- //
- // The basic idea is that in order for a wildcard to match a query, the query
- // needs to have all trigrams which occur in the wildcard. We create a trigram
- // index (trigram -> list of rules with it) and then count trigrams in the query
- // for each rule. If the count for one of the rules reaches the expected value,
- // the check passes the query to a regex. If none of the rules got enough
- // trigrams, the check tells that the query is definitely not matched by any
- // of the rules, and no regex matching is needed.
- // A similar idea was used in Google Code Search as described in the blog post:
- // https://swtch.com/~rsc/regexp/regexp4.html
- //
- //===----------------------------------------------------------------------===//
- #ifndef LLVM_SUPPORT_TRIGRAMINDEX_H
- #define LLVM_SUPPORT_TRIGRAMINDEX_H
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/StringRef.h"
- #include <string>
- #include <unordered_map>
- #include <vector>
- namespace llvm {
- class TrigramIndex {
- public:
- /// Inserts a new Regex into the index.
- void insert(const std::string &Regex);
- /// Returns true, if special case list definitely does not have a line
- /// that matches the query. Returns false, if it's not sure.
- bool isDefinitelyOut(StringRef Query) const;
- /// Returned true, iff the heuristic is defeated and not useful.
- /// In this case isDefinitelyOut always returns false.
- bool isDefeated() { return Defeated; }
- private:
- // If true, the rules are too complicated for the check to work, and full
- // regex matching is needed for every rule.
- bool Defeated = false;
- // The minimum number of trigrams which should match for a rule to have a
- // chance to match the query. The number of elements equals the number of
- // regex rules in the SpecialCaseList.
- std::vector<unsigned> Counts;
- // Index holds a list of rules indices for each trigram. The same indices
- // are used in Counts to store per-rule limits.
- // If a trigram is too common (>4 rules with it), we stop tracking it,
- // which increases the probability for a need to match using regex, but
- // decreases the costs in the regular case.
- std::unordered_map<unsigned, SmallVector<size_t, 4>> Index{256};
- };
- } // namespace llvm
- #endif // LLVM_SUPPORT_TRIGRAMINDEX_H
- #ifdef __GNUC__
- #pragma GCC diagnostic pop
- #endif
|