DFAEmitter.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This class can produce a generic deterministic finite state automaton (DFA),
  10. // given a set of possible states and transitions.
  11. //
  12. // The input transitions can be nondeterministic - this class will produce the
  13. // deterministic equivalent state machine.
  14. //
  15. // The generated code can run the DFA and produce an accepted / not accepted
  16. // state and also produce, given a sequence of transitions that results in an
  17. // accepted state, the sequence of intermediate states. This is useful if the
  18. // initial automaton was nondeterministic - it allows mapping back from the DFA
  19. // to the NFA.
  20. //
  21. //===----------------------------------------------------------------------===//
#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
  39. #define DEBUG_TYPE "dfa-emitter"
  40. using namespace llvm;
  41. //===----------------------------------------------------------------------===//
  42. // DfaEmitter implementation. This is independent of the GenAutomaton backend.
  43. //===----------------------------------------------------------------------===//
  44. void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
  45. Actions.insert(A);
  46. NfaStates.insert(From);
  47. NfaStates.insert(To);
  48. NfaTransitions[{From, A}].push_back(To);
  49. ++NumNfaTransitions;
  50. }
  51. void DfaEmitter::visitDfaState(const DfaState &DS) {
  52. // For every possible action...
  53. auto FromId = DfaStates.idFor(DS);
  54. for (action_type A : Actions) {
  55. DfaState NewStates;
  56. DfaTransitionInfo TI;
  57. // For every represented state, word pair in the original NFA...
  58. for (state_type FromState : DS) {
  59. // If this action is possible from this state add the transitioned-to
  60. // states to NewStates.
  61. auto I = NfaTransitions.find({FromState, A});
  62. if (I == NfaTransitions.end())
  63. continue;
  64. for (state_type &ToState : I->second) {
  65. NewStates.push_back(ToState);
  66. TI.emplace_back(FromState, ToState);
  67. }
  68. }
  69. if (NewStates.empty())
  70. continue;
  71. // Sort and unique.
  72. sort(NewStates);
  73. NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
  74. NewStates.end());
  75. sort(TI);
  76. TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
  77. unsigned ToId = DfaStates.insert(NewStates);
  78. DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
  79. }
  80. }
  81. void DfaEmitter::constructDfa() {
  82. DfaState Initial(1, /*NFA initial state=*/0);
  83. DfaStates.insert(Initial);
  84. // Note that UniqueVector starts indices at 1, not zero.
  85. unsigned DfaStateId = 1;
  86. while (DfaStateId <= DfaStates.size()) {
  87. DfaState S = DfaStates[DfaStateId];
  88. visitDfaState(S);
  89. DfaStateId++;
  90. }
  91. }
  92. void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
  93. constructDfa();
  94. OS << "// Input NFA has " << NfaStates.size() << " states with "
  95. << NumNfaTransitions << " transitions.\n";
  96. OS << "// Generated DFA has " << DfaStates.size() << " states with "
  97. << DfaTransitions.size() << " transitions.\n\n";
  98. // Implementation note: We don't bake a simple std::pair<> here as it requires
  99. // significantly more effort to parse. A simple test with a large array of
  100. // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
  101. // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
  102. // define the pair type.
  103. //
  104. // FIXME: It may make sense to emit these as ULEB sequences instead of
  105. // pairs of uint64_t.
  106. OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
  107. OS << "// transition implies a set of NFA transitions. These are referred\n";
  108. OS << "// to by index in " << Name << "Transitions[].\n";
  109. SequenceToOffsetTable<DfaTransitionInfo> Table;
  110. std::map<DfaTransitionInfo, unsigned> EmittedIndices;
  111. for (auto &T : DfaTransitions)
  112. Table.add(T.second.second);
  113. Table.layout();
  114. OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
  115. << "TransitionInfo = {{\n";
  116. Table.emit(
  117. OS,
  118. [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
  119. OS << "{" << P.first << ", " << P.second << "}";
  120. },
  121. "{0ULL, 0ULL}");
  122. OS << "}};\n\n";
  123. OS << "// A transition in the generated " << Name << " DFA.\n";
  124. OS << "struct " << Name << "Transition {\n";
  125. OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n";
  126. OS << " ";
  127. printActionType(OS);
  128. OS << " Action; // The input symbol that causes this transition.\n";
  129. OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n";
  130. OS << " unsigned InfoIdx; // Start index into " << Name
  131. << "TransitionInfo.\n";
  132. OS << "};\n\n";
  133. OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
  134. OS << "// The initial state is 1, not zero.\n";
  135. OS << "const std::array<" << Name << "Transition, "
  136. << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
  137. for (auto &KV : DfaTransitions) {
  138. dfa_state_type From = KV.first.first;
  139. dfa_state_type To = KV.second.first;
  140. action_type A = KV.first.second;
  141. unsigned InfoIdx = Table.get(KV.second.second);
  142. OS << " {" << From << ", ";
  143. printActionValue(A, OS);
  144. OS << ", " << To << ", " << InfoIdx << "},\n";
  145. }
  146. OS << "\n}};\n\n";
  147. }
  148. void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
  149. void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
  150. //===----------------------------------------------------------------------===//
  151. // AutomatonEmitter implementation
  152. //===----------------------------------------------------------------------===//
  153. namespace {
  154. // FIXME: This entire discriminated union could be removed with c++17:
  155. // using Action = std::variant<Record *, unsigned, std::string>;
  156. struct Action {
  157. Record *R = nullptr;
  158. unsigned I = 0;
  159. std::string S;
  160. Action() = default;
  161. Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
  162. void print(raw_ostream &OS) const {
  163. if (R)
  164. OS << R->getName();
  165. else if (!S.empty())
  166. OS << '"' << S << '"';
  167. else
  168. OS << I;
  169. }
  170. bool operator<(const Action &Other) const {
  171. return std::make_tuple(R, I, S) <
  172. std::make_tuple(Other.R, Other.I, Other.S);
  173. }
  174. };
  175. using ActionTuple = std::vector<Action>;
  176. class Automaton;
  177. class Transition {
  178. uint64_t NewState;
  179. // The tuple of actions that causes this transition.
  180. ActionTuple Actions;
  181. // The types of the actions; this is the same across all transitions.
  182. SmallVector<std::string, 4> Types;
  183. public:
  184. Transition(Record *R, Automaton *Parent);
  185. const ActionTuple &getActions() { return Actions; }
  186. SmallVector<std::string, 4> getTypes() { return Types; }
  187. bool canTransitionFrom(uint64_t State);
  188. uint64_t transitionFrom(uint64_t State);
  189. };
  190. class Automaton {
  191. RecordKeeper &Records;
  192. Record *R;
  193. std::vector<Transition> Transitions;
  194. /// All possible action tuples, uniqued.
  195. UniqueVector<ActionTuple> Actions;
  196. /// The fields within each Transition object to find the action symbols.
  197. std::vector<StringRef> ActionSymbolFields;
  198. public:
  199. Automaton(RecordKeeper &Records, Record *R);
  200. void emit(raw_ostream &OS);
  201. ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
  202. /// If the type of action A has been overridden (there exists a field
  203. /// "TypeOf_A") return that, otherwise return the empty string.
  204. StringRef getActionSymbolType(StringRef A);
  205. };
  206. class AutomatonEmitter {
  207. RecordKeeper &Records;
  208. public:
  209. AutomatonEmitter(RecordKeeper &R) : Records(R) {}
  210. void run(raw_ostream &OS);
  211. };
  212. /// A DfaEmitter implementation that can print our variant action type.
  213. class CustomDfaEmitter : public DfaEmitter {
  214. const UniqueVector<ActionTuple> &Actions;
  215. std::string TypeName;
  216. public:
  217. CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
  218. : Actions(Actions), TypeName(TypeName) {}
  219. void printActionType(raw_ostream &OS) override;
  220. void printActionValue(action_type A, raw_ostream &OS) override;
  221. };
  222. } // namespace
  223. void AutomatonEmitter::run(raw_ostream &OS) {
  224. for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
  225. Automaton A(Records, R);
  226. OS << "#ifdef GET_" << R->getName() << "_DECL\n";
  227. A.emit(OS);
  228. OS << "#endif // GET_" << R->getName() << "_DECL\n";
  229. }
  230. }
  231. Automaton::Automaton(RecordKeeper &Records, Record *R)
  232. : Records(Records), R(R) {
  233. LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
  234. ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
  235. }
  236. void Automaton::emit(raw_ostream &OS) {
  237. StringRef TransitionClass = R->getValueAsString("TransitionClass");
  238. for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
  239. assert(T->isSubClassOf("Transition"));
  240. Transitions.emplace_back(T, this);
  241. Actions.insert(Transitions.back().getActions());
  242. }
  243. LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size()
  244. << "\n");
  245. LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size()
  246. << " potential transitions.\n");
  247. StringRef Name = R->getName();
  248. CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
  249. // Starting from the initial state, build up a list of possible states and
  250. // transitions.
  251. std::deque<uint64_t> Worklist(1, 0);
  252. std::set<uint64_t> SeenStates;
  253. unsigned NumTransitions = 0;
  254. SeenStates.insert(Worklist.front());
  255. while (!Worklist.empty()) {
  256. uint64_t State = Worklist.front();
  257. Worklist.pop_front();
  258. for (Transition &T : Transitions) {
  259. if (!T.canTransitionFrom(State))
  260. continue;
  261. uint64_t NewState = T.transitionFrom(State);
  262. if (SeenStates.emplace(NewState).second)
  263. Worklist.emplace_back(NewState);
  264. ++NumTransitions;
  265. Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
  266. }
  267. }
  268. LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
  269. << " states with " << NumTransitions << " transitions.\n");
  270. const auto &ActionTypes = Transitions.back().getTypes();
  271. OS << "// The type of an action in the " << Name << " automaton.\n";
  272. if (ActionTypes.size() == 1) {
  273. OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
  274. } else {
  275. OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
  276. << ">;\n";
  277. }
  278. OS << "\n";
  279. Emitter.emit(Name, OS);
  280. }
  281. StringRef Automaton::getActionSymbolType(StringRef A) {
  282. Twine Ty = "TypeOf_" + A;
  283. if (!R->getValue(Ty.str()))
  284. return "";
  285. return R->getValueAsString(Ty.str());
  286. }
  287. Transition::Transition(Record *R, Automaton *Parent) {
  288. BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
  289. NewState = 0;
  290. assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
  291. "State cannot be represented in 64 bits!");
  292. for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
  293. if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
  294. if (Bit->getValue())
  295. NewState |= 1ULL << I;
  296. }
  297. }
  298. for (StringRef A : Parent->getActionSymbolFields()) {
  299. RecordVal *SymbolV = R->getValue(A);
  300. if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
  301. Actions.emplace_back(R->getValueAsDef(A), 0, "");
  302. Types.emplace_back(Ty->getAsString());
  303. } else if (isa<IntRecTy>(SymbolV->getType())) {
  304. Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
  305. Types.emplace_back("unsigned");
  306. } else if (isa<StringRecTy>(SymbolV->getType())) {
  307. Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A)));
  308. Types.emplace_back("std::string");
  309. } else {
  310. report_fatal_error("Unhandled symbol type!");
  311. }
  312. StringRef TypeOverride = Parent->getActionSymbolType(A);
  313. if (!TypeOverride.empty())
  314. Types.back() = std::string(TypeOverride);
  315. }
  316. }
  317. bool Transition::canTransitionFrom(uint64_t State) {
  318. if ((State & NewState) == 0)
  319. // The bits we want to set are not set;
  320. return true;
  321. return false;
  322. }
  323. uint64_t Transition::transitionFrom(uint64_t State) {
  324. return State | NewState;
  325. }
  326. void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
  327. void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
  328. const ActionTuple &AT = Actions[A];
  329. if (AT.size() > 1)
  330. OS << "std::make_tuple(";
  331. ListSeparator LS;
  332. for (const auto &SingleAction : AT) {
  333. OS << LS;
  334. SingleAction.print(OS);
  335. }
  336. if (AT.size() > 1)
  337. OS << ")";
  338. }
  339. namespace llvm {
  340. void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
  341. AutomatonEmitter(RK).run(OS);
  342. }
  343. } // namespace llvm