123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395 |
- //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This class can produce a generic deterministic finite state automaton (DFA),
- // given a set of possible states and transitions.
- //
- // The input transitions can be nondeterministic - this class will produce the
- // deterministic equivalent state machine.
- //
- // The generated code can run the DFA and produce an accepted / not accepted
- // state and also produce, given a sequence of transitions that results in an
- // accepted state, the sequence of intermediate states. This is useful if the
- // initial automaton was nondeterministic - it allows mapping back from the DFA
- // to the NFA.
- //
- //===----------------------------------------------------------------------===//
#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
- #define DEBUG_TYPE "dfa-emitter"
- using namespace llvm;
- //===----------------------------------------------------------------------===//
- // DfaEmitter implementation. This is independent of the GenAutomaton backend.
- //===----------------------------------------------------------------------===//
- void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
- Actions.insert(A);
- NfaStates.insert(From);
- NfaStates.insert(To);
- NfaTransitions[{From, A}].push_back(To);
- ++NumNfaTransitions;
- }
- void DfaEmitter::visitDfaState(const DfaState &DS) {
- // For every possible action...
- auto FromId = DfaStates.idFor(DS);
- for (action_type A : Actions) {
- DfaState NewStates;
- DfaTransitionInfo TI;
- // For every represented state, word pair in the original NFA...
- for (state_type FromState : DS) {
- // If this action is possible from this state add the transitioned-to
- // states to NewStates.
- auto I = NfaTransitions.find({FromState, A});
- if (I == NfaTransitions.end())
- continue;
- for (state_type &ToState : I->second) {
- NewStates.push_back(ToState);
- TI.emplace_back(FromState, ToState);
- }
- }
- if (NewStates.empty())
- continue;
- // Sort and unique.
- sort(NewStates);
- NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
- NewStates.end());
- sort(TI);
- TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
- unsigned ToId = DfaStates.insert(NewStates);
- DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
- }
- }
- void DfaEmitter::constructDfa() {
- DfaState Initial(1, /*NFA initial state=*/0);
- DfaStates.insert(Initial);
- // Note that UniqueVector starts indices at 1, not zero.
- unsigned DfaStateId = 1;
- while (DfaStateId <= DfaStates.size()) {
- DfaState S = DfaStates[DfaStateId];
- visitDfaState(S);
- DfaStateId++;
- }
- }
- void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
- constructDfa();
- OS << "// Input NFA has " << NfaStates.size() << " states with "
- << NumNfaTransitions << " transitions.\n";
- OS << "// Generated DFA has " << DfaStates.size() << " states with "
- << DfaTransitions.size() << " transitions.\n\n";
- // Implementation note: We don't bake a simple std::pair<> here as it requires
- // significantly more effort to parse. A simple test with a large array of
- // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
- // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
- // define the pair type.
- //
- // FIXME: It may make sense to emit these as ULEB sequences instead of
- // pairs of uint64_t.
- OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
- OS << "// transition implies a set of NFA transitions. These are referred\n";
- OS << "// to by index in " << Name << "Transitions[].\n";
- SequenceToOffsetTable<DfaTransitionInfo> Table;
- std::map<DfaTransitionInfo, unsigned> EmittedIndices;
- for (auto &T : DfaTransitions)
- Table.add(T.second.second);
- Table.layout();
- OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
- << "TransitionInfo = {{\n";
- Table.emit(
- OS,
- [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
- OS << "{" << P.first << ", " << P.second << "}";
- },
- "{0ULL, 0ULL}");
- OS << "}};\n\n";
- OS << "// A transition in the generated " << Name << " DFA.\n";
- OS << "struct " << Name << "Transition {\n";
- OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n";
- OS << " ";
- printActionType(OS);
- OS << " Action; // The input symbol that causes this transition.\n";
- OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n";
- OS << " unsigned InfoIdx; // Start index into " << Name
- << "TransitionInfo.\n";
- OS << "};\n\n";
- OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
- OS << "// The initial state is 1, not zero.\n";
- OS << "const std::array<" << Name << "Transition, "
- << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
- for (auto &KV : DfaTransitions) {
- dfa_state_type From = KV.first.first;
- dfa_state_type To = KV.second.first;
- action_type A = KV.first.second;
- unsigned InfoIdx = Table.get(KV.second.second);
- OS << " {" << From << ", ";
- printActionValue(A, OS);
- OS << ", " << To << ", " << InfoIdx << "},\n";
- }
- OS << "\n}};\n\n";
- }
- void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
- void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
- //===----------------------------------------------------------------------===//
- // AutomatonEmitter implementation
- //===----------------------------------------------------------------------===//
- namespace {
- // FIXME: This entire discriminated union could be removed with c++17:
- // using Action = std::variant<Record *, unsigned, std::string>;
- struct Action {
- Record *R = nullptr;
- unsigned I = 0;
- std::string S;
- Action() = default;
- Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
- void print(raw_ostream &OS) const {
- if (R)
- OS << R->getName();
- else if (!S.empty())
- OS << '"' << S << '"';
- else
- OS << I;
- }
- bool operator<(const Action &Other) const {
- return std::make_tuple(R, I, S) <
- std::make_tuple(Other.R, Other.I, Other.S);
- }
- };
- using ActionTuple = std::vector<Action>;
- class Automaton;
- class Transition {
- uint64_t NewState;
- // The tuple of actions that causes this transition.
- ActionTuple Actions;
- // The types of the actions; this is the same across all transitions.
- SmallVector<std::string, 4> Types;
- public:
- Transition(Record *R, Automaton *Parent);
- const ActionTuple &getActions() { return Actions; }
- SmallVector<std::string, 4> getTypes() { return Types; }
- bool canTransitionFrom(uint64_t State);
- uint64_t transitionFrom(uint64_t State);
- };
- class Automaton {
- RecordKeeper &Records;
- Record *R;
- std::vector<Transition> Transitions;
- /// All possible action tuples, uniqued.
- UniqueVector<ActionTuple> Actions;
- /// The fields within each Transition object to find the action symbols.
- std::vector<StringRef> ActionSymbolFields;
- public:
- Automaton(RecordKeeper &Records, Record *R);
- void emit(raw_ostream &OS);
- ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
- /// If the type of action A has been overridden (there exists a field
- /// "TypeOf_A") return that, otherwise return the empty string.
- StringRef getActionSymbolType(StringRef A);
- };
- class AutomatonEmitter {
- RecordKeeper &Records;
- public:
- AutomatonEmitter(RecordKeeper &R) : Records(R) {}
- void run(raw_ostream &OS);
- };
- /// A DfaEmitter implementation that can print our variant action type.
- class CustomDfaEmitter : public DfaEmitter {
- const UniqueVector<ActionTuple> &Actions;
- std::string TypeName;
- public:
- CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
- : Actions(Actions), TypeName(TypeName) {}
- void printActionType(raw_ostream &OS) override;
- void printActionValue(action_type A, raw_ostream &OS) override;
- };
- } // namespace
- void AutomatonEmitter::run(raw_ostream &OS) {
- for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
- Automaton A(Records, R);
- OS << "#ifdef GET_" << R->getName() << "_DECL\n";
- A.emit(OS);
- OS << "#endif // GET_" << R->getName() << "_DECL\n";
- }
- }
- Automaton::Automaton(RecordKeeper &Records, Record *R)
- : Records(Records), R(R) {
- LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
- ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
- }
- void Automaton::emit(raw_ostream &OS) {
- StringRef TransitionClass = R->getValueAsString("TransitionClass");
- for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
- assert(T->isSubClassOf("Transition"));
- Transitions.emplace_back(T, this);
- Actions.insert(Transitions.back().getActions());
- }
- LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size()
- << "\n");
- LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size()
- << " potential transitions.\n");
- StringRef Name = R->getName();
- CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
- // Starting from the initial state, build up a list of possible states and
- // transitions.
- std::deque<uint64_t> Worklist(1, 0);
- std::set<uint64_t> SeenStates;
- unsigned NumTransitions = 0;
- SeenStates.insert(Worklist.front());
- while (!Worklist.empty()) {
- uint64_t State = Worklist.front();
- Worklist.pop_front();
- for (Transition &T : Transitions) {
- if (!T.canTransitionFrom(State))
- continue;
- uint64_t NewState = T.transitionFrom(State);
- if (SeenStates.emplace(NewState).second)
- Worklist.emplace_back(NewState);
- ++NumTransitions;
- Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
- }
- }
- LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
- << " states with " << NumTransitions << " transitions.\n");
- const auto &ActionTypes = Transitions.back().getTypes();
- OS << "// The type of an action in the " << Name << " automaton.\n";
- if (ActionTypes.size() == 1) {
- OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
- } else {
- OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
- << ">;\n";
- }
- OS << "\n";
- Emitter.emit(Name, OS);
- }
- StringRef Automaton::getActionSymbolType(StringRef A) {
- Twine Ty = "TypeOf_" + A;
- if (!R->getValue(Ty.str()))
- return "";
- return R->getValueAsString(Ty.str());
- }
- Transition::Transition(Record *R, Automaton *Parent) {
- BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
- NewState = 0;
- assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
- "State cannot be represented in 64 bits!");
- for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
- if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
- if (Bit->getValue())
- NewState |= 1ULL << I;
- }
- }
- for (StringRef A : Parent->getActionSymbolFields()) {
- RecordVal *SymbolV = R->getValue(A);
- if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
- Actions.emplace_back(R->getValueAsDef(A), 0, "");
- Types.emplace_back(Ty->getAsString());
- } else if (isa<IntRecTy>(SymbolV->getType())) {
- Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
- Types.emplace_back("unsigned");
- } else if (isa<StringRecTy>(SymbolV->getType())) {
- Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A)));
- Types.emplace_back("std::string");
- } else {
- report_fatal_error("Unhandled symbol type!");
- }
- StringRef TypeOverride = Parent->getActionSymbolType(A);
- if (!TypeOverride.empty())
- Types.back() = std::string(TypeOverride);
- }
- }
- bool Transition::canTransitionFrom(uint64_t State) {
- if ((State & NewState) == 0)
- // The bits we want to set are not set;
- return true;
- return false;
- }
- uint64_t Transition::transitionFrom(uint64_t State) {
- return State | NewState;
- }
- void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
- void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
- const ActionTuple &AT = Actions[A];
- if (AT.size() > 1)
- OS << "std::make_tuple(";
- ListSeparator LS;
- for (const auto &SingleAction : AT) {
- OS << LS;
- SingleAction.print(OS);
- }
- if (AT.size() > 1)
- OS << ")";
- }
- namespace llvm {
- void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
- AutomatonEmitter(RK).run(OS);
- }
- } // namespace llvm
|