X86SpeculativeLoadHardening.cpp

  1. //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. ///
  10. /// Provide a pass which mitigates speculative execution attacks which operate
  11. /// by speculating incorrectly past some predicate (a type check, bounds check,
  12. /// or other condition) to reach a load with invalid inputs and leak the data
  13. /// accessed by that load using a side channel out of the speculative domain.
  14. ///
  15. /// For details on the attacks, see the first variant in both the Project Zero
  16. /// writeup and the Spectre paper:
  17. /// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
  18. /// https://spectreattack.com/spectre.pdf
  19. ///
  20. //===----------------------------------------------------------------------===//
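// As a brief illustration (not taken from this file; labels and registers are
// made up), the classic variant-1 gadget this pass targets looks roughly like:
// ```
//   cmpq  %rdx, %rsi              # bounds check: index %rsi vs. length %rdx
//   jae   .Lout                   # mispredicted not-taken under attack
//   movq  (%rdi,%rsi,8), %rax     # out-of-bounds load runs speculatively
//   movq  (%rcx,%rax,8), %rbx     # dependent load leaks %rax via the cache
// .Lout:
// ```
// The pass tracks an all-zeros/all-ones "predicate state" along each branch
// edge and uses it to mask load addresses (or loaded values) so a
// misspeculated load cannot observe attacker-controlled data.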
  21. #include "X86.h"
  22. #include "X86InstrBuilder.h"
  23. #include "X86InstrInfo.h"
  24. #include "X86Subtarget.h"
  25. #include "llvm/ADT/ArrayRef.h"
  26. #include "llvm/ADT/DenseMap.h"
  27. #include "llvm/ADT/Optional.h"
  28. #include "llvm/ADT/STLExtras.h"
  29. #include "llvm/ADT/ScopeExit.h"
  30. #include "llvm/ADT/SmallPtrSet.h"
  31. #include "llvm/ADT/SmallSet.h"
  32. #include "llvm/ADT/SmallVector.h"
  33. #include "llvm/ADT/SparseBitVector.h"
  34. #include "llvm/ADT/Statistic.h"
  35. #include "llvm/CodeGen/MachineBasicBlock.h"
  36. #include "llvm/CodeGen/MachineConstantPool.h"
  37. #include "llvm/CodeGen/MachineFunction.h"
  38. #include "llvm/CodeGen/MachineFunctionPass.h"
  39. #include "llvm/CodeGen/MachineInstr.h"
  40. #include "llvm/CodeGen/MachineInstrBuilder.h"
  41. #include "llvm/CodeGen/MachineModuleInfo.h"
  42. #include "llvm/CodeGen/MachineOperand.h"
  43. #include "llvm/CodeGen/MachineRegisterInfo.h"
  44. #include "llvm/CodeGen/MachineSSAUpdater.h"
  45. #include "llvm/CodeGen/TargetInstrInfo.h"
  46. #include "llvm/CodeGen/TargetRegisterInfo.h"
  47. #include "llvm/CodeGen/TargetSchedule.h"
  48. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  49. #include "llvm/IR/DebugLoc.h"
  50. #include "llvm/MC/MCSchedule.h"
  51. #include "llvm/Pass.h"
  52. #include "llvm/Support/CommandLine.h"
  53. #include "llvm/Support/Debug.h"
  54. #include "llvm/Support/raw_ostream.h"
  55. #include "llvm/Target/TargetMachine.h"
  56. #include <algorithm>
  57. #include <cassert>
  58. #include <iterator>
  59. #include <utility>
  60. using namespace llvm;
  61. #define PASS_KEY "x86-slh"
  62. #define DEBUG_TYPE PASS_KEY
  63. STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
  64. STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
  65. STATISTIC(NumAddrRegsHardened,
  66. "Number of address mode used registers hardened");
  67. STATISTIC(NumPostLoadRegsHardened,
  68. "Number of post-load register values hardened");
  69. STATISTIC(NumCallsOrJumpsHardened,
  70. "Number of calls or jumps requiring extra hardening");
  71. STATISTIC(NumInstsInserted, "Number of instructions inserted");
  72. STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
  73. static cl::opt<bool> EnableSpeculativeLoadHardening(
  74. "x86-speculative-load-hardening",
  75. cl::desc("Force enable speculative load hardening"), cl::init(false),
  76. cl::Hidden);
  77. static cl::opt<bool> HardenEdgesWithLFENCE(
  78. PASS_KEY "-lfence",
  79. cl::desc(
  80. "Use LFENCE along each conditional edge to harden against speculative "
  81. "loads rather than conditional movs and poisoned pointers."),
  82. cl::init(false), cl::Hidden);
  83. static cl::opt<bool> EnablePostLoadHardening(
  84. PASS_KEY "-post-load",
  85. cl::desc("Harden the value loaded *after* it is loaded by "
  86. "flushing the loaded bits to 1. This is hard to do "
  87. "in general but can be done easily for GPRs."),
  88. cl::init(true), cl::Hidden);
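// As an illustrative sketch (register names are hypothetical): with the
// predicate state in %rax (zero on the architectural path, all-ones after
// misspeculation), post-load hardening of a value loaded into %rdx is simply:
// ```
//   movq  (%rsi), %rdx
//   orq   %rax, %rdx              # value becomes all-ones if we misspeculated
// ```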
  89. static cl::opt<bool> FenceCallAndRet(
  90. PASS_KEY "-fence-call-and-ret",
  91. cl::desc("Use a full speculation fence to harden both call and ret edges "
  92. "rather than a lighter weight mitigation."),
  93. cl::init(false), cl::Hidden);
  94. static cl::opt<bool> HardenInterprocedurally(
  95. PASS_KEY "-ip",
  96. cl::desc("Harden interprocedurally by passing our state in and out of "
  97. "functions in the high bits of the stack pointer."),
  98. cl::init(true), cl::Hidden);
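// Sketch of the interprocedural convention (register names illustrative): the
// predicate state is mirrored into the otherwise-unused high bits of %rsp
// around calls and returns, e.g.
// ```
//   shlq  $47, %rax               # move the all-zeros/all-ones state up high
//   orq   %rax, %rsp              # poison the high bits of RSP if mispredicted
//   callq callee
//   movq  %rsp, %rax
//   sarq  $63, %rax               # smear the high bit back into a full mask
// ```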
  99. static cl::opt<bool>
  100. HardenLoads(PASS_KEY "-loads",
  101. cl::desc("Sanitize loads from memory. When disabled, no "
  102. "significant security is provided."),
  103. cl::init(true), cl::Hidden);
  104. static cl::opt<bool> HardenIndirectCallsAndJumps(
  105. PASS_KEY "-indirect",
  106. cl::desc("Harden indirect calls and jumps against using speculatively "
  107. "stored attacker controlled addresses. This is designed to "
  108. "mitigate Spectre v1.2 style attacks."),
  109. cl::init(true), cl::Hidden);
  110. namespace {
  111. class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
  112. public:
  113. X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }
  114. StringRef getPassName() const override {
  115. return "X86 speculative load hardening";
  116. }
  117. bool runOnMachineFunction(MachineFunction &MF) override;
  118. void getAnalysisUsage(AnalysisUsage &AU) const override;
  119. /// Pass identification, replacement for typeid.
  120. static char ID;
  121. private:
  122. /// The information about a block's conditional terminators needed to trace
  123. /// our predicate state through the exiting edges.
  124. struct BlockCondInfo {
  125. MachineBasicBlock *MBB;
  126. // We mostly have one conditional branch, and in extremely rare cases have
  127. // two. Three or more are so rare as to be unimportant for compile time.
  128. SmallVector<MachineInstr *, 2> CondBrs;
  129. MachineInstr *UncondBr;
  130. };
  131. /// Manages the predicate state traced through the program.
  132. struct PredState {
  133. unsigned InitialReg = 0;
  134. unsigned PoisonReg = 0;
  135. const TargetRegisterClass *RC;
  136. MachineSSAUpdater SSA;
  137. PredState(MachineFunction &MF, const TargetRegisterClass *RC)
  138. : RC(RC), SSA(MF) {}
  139. };
  140. const X86Subtarget *Subtarget = nullptr;
  141. MachineRegisterInfo *MRI = nullptr;
  142. const X86InstrInfo *TII = nullptr;
  143. const TargetRegisterInfo *TRI = nullptr;
  144. Optional<PredState> PS;
  145. void hardenEdgesWithLFENCE(MachineFunction &MF);
  146. SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
  147. SmallVector<MachineInstr *, 16>
  148. tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
  149. void unfoldCallAndJumpLoads(MachineFunction &MF);
  150. SmallVector<MachineInstr *, 16>
  151. tracePredStateThroughIndirectBranches(MachineFunction &MF);
  152. void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
  153. unsigned saveEFLAGS(MachineBasicBlock &MBB,
  154. MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
  155. void restoreEFLAGS(MachineBasicBlock &MBB,
  156. MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
  157. Register Reg);
  158. void mergePredStateIntoSP(MachineBasicBlock &MBB,
  159. MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
  160. unsigned PredStateReg);
  161. unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
  162. MachineBasicBlock::iterator InsertPt,
  163. DebugLoc Loc);
  164. void
  165. hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
  166. MachineOperand &IndexMO,
  167. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
  168. MachineInstr *
  169. sinkPostLoadHardenedInst(MachineInstr &MI,
  170. SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
  171. bool canHardenRegister(Register Reg);
  172. unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
  173. MachineBasicBlock::iterator InsertPt,
  174. DebugLoc Loc);
  175. unsigned hardenPostLoad(MachineInstr &MI);
  176. void hardenReturnInstr(MachineInstr &MI);
  177. void tracePredStateThroughCall(MachineInstr &MI);
  178. void hardenIndirectCallOrJumpInstr(
  179. MachineInstr &MI,
  180. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
  181. };
  182. } // end anonymous namespace
  183. char X86SpeculativeLoadHardeningPass::ID = 0;
  184. void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
  185. AnalysisUsage &AU) const {
  186. MachineFunctionPass::getAnalysisUsage(AU);
  187. }
  188. static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
  189. MachineBasicBlock &Succ, int SuccCount,
  190. MachineInstr *Br, MachineInstr *&UncondBr,
  191. const X86InstrInfo &TII) {
  192. assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
  193. MachineFunction &MF = *MBB.getParent();
  194. MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
  195. // We have to insert the new block immediately after the current one as we
  196. // don't know what layout-successor relationships the successor has and we
  197. // may not be able to (and generally don't want to) try to fix those up.
  198. MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
  199. // Update the branch instruction if necessary.
  200. if (Br) {
  201. assert(Br->getOperand(0).getMBB() == &Succ &&
  202. "Didn't start with the right target!");
  203. Br->getOperand(0).setMBB(&NewMBB);
  204. // If this successor was reached through a branch rather than fallthrough,
  205. // we might have *broken* fallthrough and so need to inject a new
  206. // unconditional branch.
  207. if (!UncondBr) {
  208. MachineBasicBlock &OldLayoutSucc =
  209. *std::next(MachineFunction::iterator(&NewMBB));
  210. assert(MBB.isSuccessor(&OldLayoutSucc) &&
  211. "Without an unconditional branch, the old layout successor should "
  212. "be an actual successor!");
  213. auto BrBuilder =
  214. BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
  215. // Update the unconditional branch now that we've added one.
  216. UncondBr = &*BrBuilder;
  217. }
  218. // Insert unconditional "jump Succ" instruction in the new block if
  219. // necessary.
  220. if (!NewMBB.isLayoutSuccessor(&Succ)) {
  221. SmallVector<MachineOperand, 4> Cond;
  222. TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
  223. }
  224. } else {
  225. assert(!UncondBr &&
  226. "Cannot have a branchless successor and an unconditional branch!");
  227. assert(NewMBB.isLayoutSuccessor(&Succ) &&
  228. "A non-branch successor must have been a layout successor before "
  229. "and now is a layout successor of the new block.");
  230. }
  231. // If this is the only edge to the successor, we can just replace it in the
  232. // CFG. Otherwise we need to add a new entry in the CFG for the new
  233. // successor.
  234. if (SuccCount == 1) {
  235. MBB.replaceSuccessor(&Succ, &NewMBB);
  236. } else {
  237. MBB.splitSuccessor(&Succ, &NewMBB);
  238. }
  239. // Hook up the edge from the new basic block to the old successor in the CFG.
  240. NewMBB.addSuccessor(&Succ);
  241. // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
  242. for (MachineInstr &MI : Succ) {
  243. if (!MI.isPHI())
  244. break;
  245. for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  246. OpIdx += 2) {
  247. MachineOperand &OpV = MI.getOperand(OpIdx);
  248. MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
  249. assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
  250. if (OpMBB.getMBB() != &MBB)
  251. continue;
  252. // If this is the last edge to the successor, just replace MBB in the PHI.
  253. if (SuccCount == 1) {
  254. OpMBB.setMBB(&NewMBB);
  255. break;
  256. }
  257. // Otherwise, append a new pair of operands for the new incoming edge.
  258. MI.addOperand(MF, OpV);
  259. MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
  260. break;
  261. }
  262. }
  263. // Inherit live-ins from the successor
  264. for (auto &LI : Succ.liveins())
  265. NewMBB.addLiveIn(LI);
  266. LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
  267. << Succ.getName() << "'.\n");
  268. return NewMBB;
  269. }
  270. /// Remove duplicate PHI operands to leave the PHI in a canonical and
  271. /// predictable form.
  272. ///
  273. /// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
  274. /// isn't what you might expect. We may have multiple entries in PHI nodes for
  275. /// a single predecessor. This makes CFG-updating extremely complex, so here we
  276. /// simplify all PHI nodes to a model even simpler than the IR's model: exactly
  277. /// one entry per predecessor, regardless of how many edges there are.
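/// As a small illustrative example (MIR-style, made up for this comment), a
/// PHI such as:
/// ```
///   %v:gr64 = PHI %a, %bb.1, %a, %bb.1, %c, %bb.2
/// ```
/// is rewritten to carry exactly one entry per predecessor:
/// ```
///   %v:gr64 = PHI %a, %bb.1, %c, %bb.2
/// ```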
  278. static void canonicalizePHIOperands(MachineFunction &MF) {
  279. SmallPtrSet<MachineBasicBlock *, 4> Preds;
  280. SmallVector<int, 4> DupIndices;
  281. for (auto &MBB : MF)
  282. for (auto &MI : MBB) {
  283. if (!MI.isPHI())
  284. break;
  285. // First we scan the operands of the PHI looking for duplicate entries for
  286. // a particular predecessor. We retain the operand index of each duplicate
  287. // entry found.
  288. for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  289. OpIdx += 2)
  290. if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
  291. DupIndices.push_back(OpIdx);
  292. // Now walk the duplicate indices, removing both the block and value. Note
  293. // that these are stored as a vector, making this element-wise removal
  295. // potentially quadratic.
  296. //
  297. // FIXME: It is really frustrating that we have to use a quadratic
  298. // removal algorithm here. There should be a better way, but the use-def
  299. // updates required make that impossible using the public API.
  300. //
  301. // Note that we have to process these backwards so that we don't
  302. // invalidate other indices with each removal.
  303. while (!DupIndices.empty()) {
  304. int OpIdx = DupIndices.pop_back_val();
  305. // Remove both the block and value operand, again in reverse order to
  306. // preserve indices.
  307. MI.RemoveOperand(OpIdx + 1);
  308. MI.RemoveOperand(OpIdx);
  309. }
  310. Preds.clear();
  311. }
  312. }
  313. /// Helper to scan a function for loads vulnerable to misspeculation that we
  314. /// want to harden.
  315. ///
  316. /// We use this to avoid making changes to functions where there is nothing we
  317. /// need to do to harden against misspeculation.
  318. static bool hasVulnerableLoad(MachineFunction &MF) {
  319. for (MachineBasicBlock &MBB : MF) {
  320. for (MachineInstr &MI : MBB) {
  321. // Loads within this basic block after an LFENCE are not at risk of
  322. // speculatively executing with invalid predicates from prior control
  323. // flow. So break out of this block but continue scanning the function.
  324. if (MI.getOpcode() == X86::LFENCE)
  325. break;
  326. // Looking for loads only.
  327. if (!MI.mayLoad())
  328. continue;
  329. // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
  330. if (MI.getOpcode() == X86::MFENCE)
  331. continue;
  332. // We found a load.
  333. return true;
  334. }
  335. }
  336. // No loads found.
  337. return false;
  338. }
  339. bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
  340. MachineFunction &MF) {
  341. LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
  342. << " **********\n");
  343. // Only run if this pass is force-enabled or we detect the relevant function
  344. // attribute requesting SLH.
  345. if (!EnableSpeculativeLoadHardening &&
  346. !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
  347. return false;
  348. Subtarget = &MF.getSubtarget<X86Subtarget>();
  349. MRI = &MF.getRegInfo();
  350. TII = Subtarget->getInstrInfo();
  351. TRI = Subtarget->getRegisterInfo();
  352. // FIXME: Support for 32-bit.
  353. PS.emplace(MF, &X86::GR64_NOSPRegClass);
  354. if (MF.begin() == MF.end())
  355. // Nothing to do for a degenerate empty function...
  356. return false;
  357. // We support an alternative hardening technique based on a debug flag.
  358. if (HardenEdgesWithLFENCE) {
  359. hardenEdgesWithLFENCE(MF);
  360. return true;
  361. }
  362. // Create a dummy debug loc to use for all the generated code here.
  363. DebugLoc Loc;
  364. MachineBasicBlock &Entry = *MF.begin();
  365. auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
  366. // Do a quick scan to see if we have any checkable loads.
  367. bool HasVulnerableLoad = hasVulnerableLoad(MF);
  368. // See if we have any conditional branching blocks that we will need to trace
  369. // predicate state through.
  370. SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
  371. // If we have no interesting conditions or loads, nothing to do here.
  372. if (!HasVulnerableLoad && Infos.empty())
  373. return true;
  374. // The poison value is required to be an all-ones value for many aspects of
  375. // this mitigation.
  376. const int PoisonVal = -1;
  377. PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
  378. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
  379. .addImm(PoisonVal);
  380. ++NumInstsInserted;
  381. // If we have loads being hardened and we've asked for call and ret edges to
  382. // get a full fence-based mitigation, inject that fence.
  383. if (HasVulnerableLoad && FenceCallAndRet) {
  384. // We need to insert an LFENCE at the start of the function to suspend any
  385. // incoming misspeculation from the caller. This helps two-fold: the caller
  386. // may not have been protected as this code has been, and this code gets to
  387. // not take any specific action to protect across calls.
  388. // FIXME: We could skip this for functions which unconditionally return
  389. // a constant.
  390. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
  391. ++NumInstsInserted;
  392. ++NumLFENCEsInserted;
  393. }
  394. // If we guarded the entry with an LFENCE and have no conditionals to protect
  395. // in blocks, then we're done.
  396. if (FenceCallAndRet && Infos.empty())
  397. // We may have changed the function's code at this point to insert fences.
  398. return true;
  399. // Establish the function's incoming predicate state.
  400. if (HardenInterprocedurally && !FenceCallAndRet) {
  401. // Set up the predicate state by extracting it from the incoming stack
  402. // pointer so we pick up any misspeculation in our caller.
  403. PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
  404. } else {
  405. // Otherwise, just build the predicate state itself by zeroing a register
  406. // as we don't need any initial state.
  407. PS->InitialReg = MRI->createVirtualRegister(PS->RC);
  408. Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  409. auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
  410. PredStateSubReg);
  411. ++NumInstsInserted;
  412. MachineOperand *ZeroEFLAGSDefOp =
  413. ZeroI->findRegisterDefOperand(X86::EFLAGS);
  414. assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
  415. "Must have an implicit def of EFLAGS!");
  416. ZeroEFLAGSDefOp->setIsDead(true);
  417. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
  418. PS->InitialReg)
  419. .addImm(0)
  420. .addReg(PredStateSubReg)
  421. .addImm(X86::sub_32bit);
  422. }
  423. // We're going to need to trace predicate state throughout the function's
  424. // CFG. Prepare for this by setting up our initial state of PHIs with unique
  425. // predecessor entries and all the initial predicate state.
  426. canonicalizePHIOperands(MF);
  427. // Track the updated values in an SSA updater to rewrite into SSA form at the
  428. // end.
  429. PS->SSA.Initialize(PS->InitialReg);
  430. PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
  431. // Trace through the CFG.
  432. auto CMovs = tracePredStateThroughCFG(MF, Infos);
  433. // We may also enter basic blocks in this function via exception handling
  434. // control flow. Here, if we are hardening interprocedurally, we need to
  435. // re-capture the predicate state from the throwing code. In the Itanium ABI,
  436. // the throw will always look like a call to __cxa_throw and will have the
  437. // predicate state in the stack pointer, so extract fresh predicate state from
  438. // the stack pointer and make it available in SSA.
  439. // FIXME: Handle non-itanium ABI EH models.
  440. if (HardenInterprocedurally) {
  441. for (MachineBasicBlock &MBB : MF) {
  442. assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
  443. assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
  444. assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
  445. if (!MBB.isEHPad())
  446. continue;
  447. PS->SSA.AddAvailableValue(
  448. &MBB,
  449. extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
  450. }
  451. }
  452. if (HardenIndirectCallsAndJumps) {
  453. // If we are going to harden calls and jumps we need to unfold their memory
  454. // operands.
  455. unfoldCallAndJumpLoads(MF);
  456. // Then we trace predicate state through the indirect branches.
  457. auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
  458. CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
  459. }
  460. // Now that we have the predicate state available at the start of each block
  461. // in the CFG, trace it through each block, hardening vulnerable instructions
  462. // as we go.
  463. tracePredStateThroughBlocksAndHarden(MF);
  464. // Now rewrite all the uses of the pred state using the SSA updater to insert
  465. // PHIs connecting the state between blocks along the CFG edges.
  466. for (MachineInstr *CMovI : CMovs)
  467. for (MachineOperand &Op : CMovI->operands()) {
  468. if (!Op.isReg() || Op.getReg() != PS->InitialReg)
  469. continue;
  470. PS->SSA.RewriteUse(Op);
  471. }
  472. LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
  473. dbgs() << "\n"; MF.verify(this));
  474. return true;
  475. }
  476. /// Implements the naive hardening approach of putting an LFENCE after every
  477. /// potentially mis-predicted control flow construct.
  478. ///
  479. /// We include this as an alternative mostly for the purpose of comparison. The
  480. /// performance impact of this is expected to be extremely severe and not
  481. /// practical for any real-world users.
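/// Concretely, for a block ending in a conditional branch this amounts to the
/// following kind of sequence (illustrative sketch only):
/// ```
///   jne  .LBB0_2
/// .LBB0_1:
///   lfence                       # speculation cannot proceed past here
///   ...
/// .LBB0_2:
///   lfence
///   ...
/// ```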
  482. void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
  483. MachineFunction &MF) {
  484. // First, we scan the function looking for blocks that are reached along edges
  485. // that we might want to harden.
  486. SmallSetVector<MachineBasicBlock *, 8> Blocks;
  487. for (MachineBasicBlock &MBB : MF) {
  488. // If there is no successor, or only one, there is nothing to do here.
  489. if (MBB.succ_size() <= 1)
  490. continue;
  491. // Skip blocks unless their terminators start with a branch. Other
  492. // terminators don't seem interesting for guarding against misspeculation.
  493. auto TermIt = MBB.getFirstTerminator();
  494. if (TermIt == MBB.end() || !TermIt->isBranch())
  495. continue;
  496. // Add all the non-EH-pad successors to the blocks we want to harden. We
  497. // skip EH pads because there isn't really a condition of interest on
  498. // entering.
  499. for (MachineBasicBlock *SuccMBB : MBB.successors())
  500. if (!SuccMBB->isEHPad())
  501. Blocks.insert(SuccMBB);
  502. }
  503. for (MachineBasicBlock *MBB : Blocks) {
  504. auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
  505. BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
  506. ++NumInstsInserted;
  507. ++NumLFENCEsInserted;
  508. }
  509. }
  510. SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
  511. X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
  512. SmallVector<BlockCondInfo, 16> Infos;
  513. // Walk the function and build up a summary for each block's conditions that
  514. // we need to trace through.
  515. for (MachineBasicBlock &MBB : MF) {
  516. // If there is no successor, or only one, there is nothing to do here.
  517. if (MBB.succ_size() <= 1)
  518. continue;
  519. // We want to reliably handle any conditional branch terminators in the
  520. // MBB, so we manually analyze the branch. We can handle all of the
  521. // permutations here, including ones that analyzeBranch cannot.
  522. //
  523. // The approach is to walk backwards across the terminators, resetting at
  524. // any unconditional non-indirect branch, and track all conditional edges
  525. // to basic blocks as well as the fallthrough or unconditional successor
  526. // edge. For each conditional edge, we track the target and the opposite
  527. // condition code in order to inject a "no-op" cmov into that successor
  528. // that will harden the predicate. For the fallthrough/unconditional
  529. // edge, we inject a separate cmov for each conditional branch with
  530. // matching condition codes. This effectively implements an "and" of the
  531. // condition flags, even if there isn't a single condition flag that would
  532. // directly implement that. We don't bother trying to optimize either of
  533. // these cases because if such an optimization is possible, LLVM should
  534. // have optimized the conditional *branches* in that way already to reduce
  535. // instruction count. This late, we simply assume the minimal number of
  536. // branch instructions is being emitted and use that to guide our cmov
  537. // insertion.
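// For instance (illustrative only), with terminators `jl .L1; jg .L2; jmp .L3`
// the fallthrough/unconditional successor .L3 is only reached correctly when
// neither L nor G held, so its checking block receives one cmov per
// conditional branch, using the matching condition codes:
// ```
//   cmovlq %poison, %state        # L held: we should have gone to .L1
//   cmovgq %poison, %state        # G held: we should have gone to .L2
// ```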
  538. BlockCondInfo Info = {&MBB, {}, nullptr};
  539. // Now walk backwards through the terminators and build up successors they
  540. // reach and the conditions.
  541. for (MachineInstr &MI : llvm::reverse(MBB)) {
  542. // Once we've handled all the terminators, we're done.
  543. if (!MI.isTerminator())
  544. break;
  545. // If we see a non-branch terminator, we can't handle anything so bail.
  546. if (!MI.isBranch()) {
  547. Info.CondBrs.clear();
  548. break;
  549. }
  550. // If we see an unconditional branch, reset our state, clear any
  551. // fallthrough, and set this as the "else" successor.
  552. if (MI.getOpcode() == X86::JMP_1) {
  553. Info.CondBrs.clear();
  554. Info.UncondBr = &MI;
  555. continue;
  556. }
  557. // If we get an invalid condition, we have an indirect branch or some
  558. // other unanalyzable "fallthrough" case. We model this as a nullptr for
  559. // the destination so we can still guard any conditional successors.
  560. // Consider code sequences like:
  561. // ```
  562. // jCC L1
  563. // jmpq *%rax
  564. // ```
  565. // We still want to harden the edge to `L1`.
  566. if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
  567. Info.CondBrs.clear();
  568. Info.UncondBr = &MI;
  569. continue;
  570. }
  571. // We have a vanilla conditional branch, add it to our list.
  572. Info.CondBrs.push_back(&MI);
  573. }
  574. if (Info.CondBrs.empty()) {
  575. ++NumBranchesUntraced;
  576. LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
  577. MBB.dump());
  578. continue;
  579. }
  580. Infos.push_back(Info);
  581. }
  582. return Infos;
  583. }
  584. /// Trace the predicate state through the CFG, instrumenting each conditional
  585. /// branch such that misspeculation through an edge will poison the predicate
  586. /// state.
  587. ///
  588. /// Returns the list of inserted CMov instructions so that they can have their
  589. /// uses of the predicate state rewritten into proper SSA form once it is
  590. /// complete.
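/// As an illustrative sketch of the emitted pattern (register names made up),
/// for a block ending in `jne .LBB0_2`:
/// ```
///   testl %edi, %edi
///   jne   .LBB0_2
///   # fallthrough block, architecturally reached only when ZF is set:
///   cmovneq %poison, %state      # same condition as the branch
///   ...
/// .LBB0_2:                       # taken target, reached only when ZF is clear
///   cmoveq  %poison, %state      # opposite condition of the branch
/// ```
/// On the correct path the selected cmov is a no-op; under misspeculation the
/// flags contradict the path taken and the predicate state becomes all-ones.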
  591. SmallVector<MachineInstr *, 16>
  592. X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
  593. MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
  594. // Collect the inserted cmov instructions so we can rewrite their uses of the
  595. // predicate state into SSA form.
  596. SmallVector<MachineInstr *, 16> CMovs;
  597. // Now walk all of the basic blocks looking for ones that end in conditional
  598. // jumps where we need to update this register along each edge.
  599. for (const BlockCondInfo &Info : Infos) {
  600. MachineBasicBlock &MBB = *Info.MBB;
  601. const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
  602. MachineInstr *UncondBr = Info.UncondBr;
  603. LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
  604. << "\n");
  605. ++NumCondBranchesTraced;
  606. // Compute the non-conditional successor as either the target of any
  607. // unconditional branch or the layout successor.
  608. MachineBasicBlock *UncondSucc =
  609. UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
  610. ? UncondBr->getOperand(0).getMBB()
  611. : nullptr)
  612. : &*std::next(MachineFunction::iterator(&MBB));
  613. // Count how many edges there are to any given successor.
  614. SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
  615. if (UncondSucc)
  616. ++SuccCounts[UncondSucc];
  617. for (auto *CondBr : CondBrs)
  618. ++SuccCounts[CondBr->getOperand(0).getMBB()];
  619. // A lambda to insert cmov instructions into a block checking all of the
  620. // condition codes in a sequence.
  621. auto BuildCheckingBlockForSuccAndConds =
  622. [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
  623. MachineInstr *Br, MachineInstr *&UncondBr,
  624. ArrayRef<X86::CondCode> Conds) {
  625. // First, we split the edge to insert the checking block into a safe
  626. // location.
  627. auto &CheckingMBB =
  628. (SuccCount == 1 && Succ.pred_size() == 1)
  629. ? Succ
  630. : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
  631. bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
  632. if (!LiveEFLAGS)
  633. CheckingMBB.addLiveIn(X86::EFLAGS);
  634. // Now insert the cmovs to implement the checks.
  635. auto InsertPt = CheckingMBB.begin();
  636. assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
  637. "Should never have a PHI in the initial checking block as it "
  638. "always has a single predecessor!");
  639. // We will wire each cmov to each other, but need to start with the
  640. // incoming pred state.
  641. unsigned CurStateReg = PS->InitialReg;
  642. for (X86::CondCode Cond : Conds) {
  643. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  644. auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
  645. Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  646. // Note that we intentionally use an empty debug location so that
  647. // this picks up the preceding location.
  648. auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
  649. TII->get(CMovOp), UpdatedStateReg)
  650. .addReg(CurStateReg)
  651. .addReg(PS->PoisonReg)
  652. .addImm(Cond);
  653. // If this is the last cmov and the EFLAGS weren't originally
  654. // live-in, mark them as killed.
  655. if (!LiveEFLAGS && Cond == Conds.back())
  656. CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  657. ++NumInstsInserted;
  658. LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
  659. dbgs() << "\n");
  660. // The first one of the cmovs will be using the top level
  661. // `PredStateReg` and needs to get rewritten into SSA form.
  662. if (CurStateReg == PS->InitialReg)
  663. CMovs.push_back(&*CMovI);
  664. // The next cmov should start from this one's def.
  665. CurStateReg = UpdatedStateReg;
  666. }
  667. // And put the last one into the available values for SSA form of our
  668. // predicate state.
  669. PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
  670. };
  671. std::vector<X86::CondCode> UncondCodeSeq;
  672. for (auto *CondBr : CondBrs) {
  673. MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
  674. int &SuccCount = SuccCounts[&Succ];
  675. X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
  676. X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
  677. UncondCodeSeq.push_back(Cond);
  678. BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
  679. {InvCond});
  680. // Decrement the successor count now that we've split one of the edges.
  681. // We need to keep the count of edges to the successor accurate in order
  682. // to know above when to *replace* the successor in the CFG vs. just
  683. // adding the new successor.
  684. --SuccCount;
  685. }
  686. // Since we may have split edges and changed the number of successors,
  687. // normalize the probabilities. This avoids doing it each time we split an
  688. // edge.
  689. MBB.normalizeSuccProbs();
  690. // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
  691. // need to intersect the other condition codes. We can do this by just
  692. // doing a cmov for each one.
  693. if (!UncondSucc)
  694. // If we have no fallthrough to protect (perhaps it is an indirect jump?)
  695. // just skip this and continue.
  696. continue;
  697. assert(SuccCounts[UncondSucc] == 1 &&
  698. "We should never have more than one edge to the unconditional "
  699. "successor at this point because every other edge must have been "
  700. "split above!");
  701. // Sort and unique the codes to minimize them.
  702. llvm::sort(UncondCodeSeq);
  703. UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
  704. UncondCodeSeq.end());
  705. // Build a checking version of the successor.
  706. BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
  707. UncondBr, UncondBr, UncondCodeSeq);
  708. }
  709. return CMovs;
  710. }
  711. /// Compute the register class for the unfolded load.
  712. ///
  713. /// FIXME: This should probably live in X86InstrInfo, potentially by adding
  714. /// a way to unfold into a newly created vreg rather than requiring a register
  715. /// input.
  716. static const TargetRegisterClass *
  717. getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
  718. unsigned Opcode) {
  719. unsigned Index;
  720. unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
  721. Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
  722. const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
  723. return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
  724. }
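// As a sketch of what the unfolding below produces (MIR-style, operands
// abbreviated and purely illustrative), a memory-form indirect call such as:
// ```
//   CALL64m $rip, 1, $noreg, @fptr, $noreg
// ```
// is rewritten into an explicit load feeding a register-form call:
// ```
//   %ptr:gr64 = MOV64rm $rip, 1, $noreg, @fptr, $noreg
//   CALL64r killed %ptr:gr64
// ```
// so that the loaded pointer can be hardened like any other loaded value
// before the call or jump uses it.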
  725. void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
  726. MachineFunction &MF) {
  727. for (MachineBasicBlock &MBB : MF)
  728. // We use make_early_inc_range here so we can remove instructions if needed
  729. // without disturbing the iteration.
  730. for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
  731. // Must either be a call or a branch.
  732. if (!MI.isCall() && !MI.isBranch())
  733. continue;
  734. // We only care about loading variants of these instructions.
  735. if (!MI.mayLoad())
  736. continue;
  737. switch (MI.getOpcode()) {
  738. default: {
  739. LLVM_DEBUG(
  740. dbgs() << "ERROR: Found an unexpected loading branch or call "
  741. "instruction:\n";
  742. MI.dump(); dbgs() << "\n");
  743. report_fatal_error("Unexpected loading branch or call!");
  744. }
  745. case X86::FARCALL16m:
  746. case X86::FARCALL32m:
  747. case X86::FARCALL64m:
  748. case X86::FARJMP16m:
  749. case X86::FARJMP32m:
  750. case X86::FARJMP64m:
  751. // We cannot mitigate far jumps or calls, but we also don't expect them
  752. // to be vulnerable to Spectre v1.2 style attacks.
  753. continue;
  754. case X86::CALL16m:
  755. case X86::CALL16m_NT:
  756. case X86::CALL32m:
  757. case X86::CALL32m_NT:
  758. case X86::CALL64m:
  759. case X86::CALL64m_NT:
  760. case X86::JMP16m:
  761. case X86::JMP16m_NT:
  762. case X86::JMP32m:
  763. case X86::JMP32m_NT:
  764. case X86::JMP64m:
  765. case X86::JMP64m_NT:
  766. case X86::TAILJMPm64:
  767. case X86::TAILJMPm64_REX:
  768. case X86::TAILJMPm:
  769. case X86::TCRETURNmi64:
  770. case X86::TCRETURNmi: {
  771. // Use the generic unfold logic now that we know we're dealing with
  772. // expected instructions.
  773. // FIXME: We don't have test coverage for all of these!
  774. auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
  775. if (!UnfoldedRC) {
  776. LLVM_DEBUG(dbgs()
  777. << "ERROR: Unable to unfold load from instruction:\n";
  778. MI.dump(); dbgs() << "\n");
  779. report_fatal_error("Unable to unfold load!");
  780. }
  781. Register Reg = MRI->createVirtualRegister(UnfoldedRC);
  782. SmallVector<MachineInstr *, 2> NewMIs;
  783. // If we were able to compute an unfolded reg class, any failure here
  784. // is just a programming error so just assert.
  785. bool Unfolded =
  786. TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
  787. /*UnfoldStore*/ false, NewMIs);
  788. (void)Unfolded;
  789. assert(Unfolded &&
  790. "Computed unfolded register class but failed to unfold");
  791. // Now stitch the new instructions into place and erase the old one.
  792. for (auto *NewMI : NewMIs)
  793. MBB.insert(MI.getIterator(), NewMI);
  794. // Update the call site info.
  795. if (MI.isCandidateForCallSiteEntry())
  796. MF.eraseCallSiteInfo(&MI);
  797. MI.eraseFromParent();
  798. LLVM_DEBUG({
  799. dbgs() << "Unfolded load successfully into:\n";
  800. for (auto *NewMI : NewMIs) {
  801. NewMI->dump();
  802. dbgs() << "\n";
  803. }
  804. });
  805. continue;
  806. }
  807. }
  808. llvm_unreachable("Escaped switch with default!");
  809. }
  810. }
  811. /// Trace the predicate state through indirect branches, instrumenting them to
  812. /// poison the state if a target is reached that does not match the expected
  813. /// target.
  814. ///
  815. /// This is designed to mitigate Spectre variant 1 attacks where an indirect
  816. /// branch is trained to predict a particular target and then mispredicts that
  817. /// target in a way that can leak data. Despite using an indirect branch, this
  818. /// is really a variant 1 style attack: it does not steer execution to an
  819. /// arbitrary or attacker controlled address, and it does not require any
  820. /// special code executing next to the victim. This attack can also be mitigated
  821. /// through retpolines, but those require either replacing indirect branches
  822. /// with conditional direct branches or lowering them through a device that
  823. /// blocks speculation. This mitigation can replace these retpoline-style
  824. /// mitigations for jump tables and other indirect branches within a function
  825. /// when variant 2 isn't a risk while allowing limited speculation. Indirect
  826. /// calls, however, cannot be mitigated through this technique without changing
  827. /// the ABI in a fundamental way.
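/// As an illustrative sketch of the check inserted at each target block
/// (small code model, names made up), where a predecessor ends in
/// `jmpq *%rcx`, each target begins with:
/// ```
/// .LBB0_tgt:
///   cmpq  $.LBB0_tgt, %rcx        # does the runtime target match this block?
///   cmovneq %poison, %state       # if not, we got here by misspeculation
/// ```
/// The comparison uses either a direct immediate (small, non-PIC code model)
/// or a RIP-relative LEA of the block address, mirroring the code below.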
  828. SmallVector<MachineInstr *, 16>
  829. X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
  830. MachineFunction &MF) {
  831. // We use the SSAUpdater to insert PHI nodes for the target addresses of
  832. // indirect branches. We don't actually need the full power of the SSA updater
  833. // in this particular case as we always have immediately available values, but
  834. // this avoids us having to re-implement the PHI construction logic.
  835. MachineSSAUpdater TargetAddrSSA(MF);
  836. TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
  837. // Track which blocks were terminated with an indirect branch.
  838. SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;
  839. // We need to know what blocks end up reached via indirect branches. We
  840. // expect this to be a subset of those whose address is taken and so track it
  841. // directly via the CFG.
  842. SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;
  843. // Walk all the blocks which end in an indirect branch and make the
  844. // target address available.
  845. for (MachineBasicBlock &MBB : MF) {
  846. // Find the last terminator.
  847. auto MII = MBB.instr_rbegin();
  848. while (MII != MBB.instr_rend() && MII->isDebugInstr())
  849. ++MII;
  850. if (MII == MBB.instr_rend())
  851. continue;
  852. MachineInstr &TI = *MII;
  853. if (!TI.isTerminator() || !TI.isBranch())
  854. // No terminator or non-branch terminator.
  855. continue;
  856. unsigned TargetReg;
  857. switch (TI.getOpcode()) {
  858. default:
  859. // Direct branch or conditional branch (leading to fallthrough).
  860. continue;
  861. case X86::FARJMP16m:
  862. case X86::FARJMP32m:
  863. case X86::FARJMP64m:
  864. // We cannot mitigate far jumps or calls, but we also don't expect them
  865. // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
  866. continue;
  867. case X86::JMP16m:
  868. case X86::JMP16m_NT:
  869. case X86::JMP32m:
  870. case X86::JMP32m_NT:
  871. case X86::JMP64m:
  872. case X86::JMP64m_NT:
  873. // Mostly as documentation.
  874. report_fatal_error("Memory operand jumps should have been unfolded!");
  875. case X86::JMP16r:
  876. report_fatal_error(
  877. "Support for 16-bit indirect branches is not implemented.");
  878. case X86::JMP32r:
  879. report_fatal_error(
  880. "Support for 32-bit indirect branches is not implemented.");
  881. case X86::JMP64r:
  882. TargetReg = TI.getOperand(0).getReg();
  883. }
  884. // We have definitely found an indirect branch. Verify that there are no
  885. // preceding conditional branches as we don't yet support that.
  886. if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
  887. return !OtherTI.isDebugInstr() && &OtherTI != &TI;
  888. })) {
  889. LLVM_DEBUG({
  890. dbgs() << "ERROR: Found other terminators in a block with an indirect "
  891. "branch! This is not yet supported! Terminator sequence:\n";
  892. for (MachineInstr &MI : MBB.terminators()) {
  893. MI.dump();
  894. dbgs() << '\n';
  895. }
  896. });
  897. report_fatal_error("Unimplemented terminator sequence!");
  898. }
  899. // Make the target register an available value for this block.
  900. TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
  901. IndirectTerminatedMBBs.insert(&MBB);
  902. // Add all the successors to our target candidates.
  903. for (MachineBasicBlock *Succ : MBB.successors())
  904. IndirectTargetMBBs.insert(Succ);
  905. }
  906. // Keep track of the cmov instructions we insert so we can return them.
  907. SmallVector<MachineInstr *, 16> CMovs;
  908. // If we didn't find any indirect branches with targets, nothing to do here.
  909. if (IndirectTargetMBBs.empty())
  910. return CMovs;
  911. // We found indirect branches and targets that need to be instrumented to
  912. // harden loads within them. Walk the blocks of the function (to get a stable
  913. // ordering) and instrument each target of an indirect branch.
  914. for (MachineBasicBlock &MBB : MF) {
  915. // Skip the blocks that aren't candidate targets.
  916. if (!IndirectTargetMBBs.count(&MBB))
  917. continue;
  918. // We don't expect EH pads to ever be reached via an indirect branch. If
  919. // this is desired for some reason, we could simply skip them here rather
  920. // than asserting.
  921. assert(!MBB.isEHPad() &&
  922. "Unexpected EH pad as target of an indirect branch!");
  923. // We should never end up threading EFLAGS into a block to harden
  924. // conditional jumps as there would be an additional successor via the
  925. // indirect branch. As a consequence, all such edges would be split before
  926. // reaching here, and the inserted block will handle the EFLAGS-based
  927. // hardening.
  928. assert(!MBB.isLiveIn(X86::EFLAGS) &&
  929. "Cannot check within a block that already has live-in EFLAGS!");
  930. // We can't handle having non-indirect edges into this block unless this is
  931. // the only successor and we can synthesize the necessary target address.
  932. for (MachineBasicBlock *Pred : MBB.predecessors()) {
  933. // If we've already handled this by extracting the target directly,
  934. // nothing to do.
  935. if (IndirectTerminatedMBBs.count(Pred))
  936. continue;
  937. // Otherwise, we have to be the only successor. We generally expect this
  938. // to be true as conditional branches should have had a critical edge
  939. // split already. We don't however need to worry about EH pad successors
  940. // as they'll happily ignore the target and their hardening strategy is
  941. // resilient to all ways in which they could be reached speculatively.
  942. if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
  943. return Succ->isEHPad() || Succ == &MBB;
  944. })) {
  945. LLVM_DEBUG({
  946. dbgs() << "ERROR: Found conditional entry to target of indirect "
  947. "branch!\n";
  948. Pred->dump();
  949. MBB.dump();
  950. });
  951. report_fatal_error("Cannot harden a conditional entry to a target of "
  952. "an indirect branch!");
  953. }
  954. // Now we need to compute the address of this block and install it as a
  955. // synthetic target in the predecessor. We do this at the bottom of the
  956. // predecessor.
  957. auto InsertPt = Pred->getFirstTerminator();
  958. Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
  959. if (MF.getTarget().getCodeModel() == CodeModel::Small &&
  960. !Subtarget->isPositionIndependent()) {
  961. // Directly materialize it into an immediate.
  962. auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
  963. TII->get(X86::MOV64ri32), TargetReg)
  964. .addMBB(&MBB);
  965. ++NumInstsInserted;
  966. (void)AddrI;
  967. LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
  968. dbgs() << "\n");
  969. } else {
  970. auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
  971. TargetReg)
  972. .addReg(/*Base*/ X86::RIP)
  973. .addImm(/*Scale*/ 1)
  974. .addReg(/*Index*/ 0)
  975. .addMBB(&MBB)
  976. .addReg(/*Segment*/ 0);
  977. ++NumInstsInserted;
  978. (void)AddrI;
  979. LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
  980. dbgs() << "\n");
  981. }
  982. // And make this available.
  983. TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
  984. }
  985. // Materialize the needed SSA value of the target. Note that we need the
  986. // middle of the block as this block might at the bottom have an indirect
  987. // branch back to itself. We can do this here because at this point, every
  988. // predecessor of this block has an available value. This is basically just
  989. // automating the construction of a PHI node for this target.
  990. Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
  991. // Insert a comparison of the incoming target register with this block's
  992. // address. This also requires us to mark the block as having its address
  993. // taken explicitly.
  994. MBB.setHasAddressTaken();
  995. auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
  996. if (MF.getTarget().getCodeModel() == CodeModel::Small &&
  997. !Subtarget->isPositionIndependent()) {
  998. // Check directly against a relocated immediate when we can.
  999. auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
  1000. .addReg(TargetReg, RegState::Kill)
  1001. .addMBB(&MBB);
  1002. ++NumInstsInserted;
  1003. (void)CheckI;
  1004. LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
  1005. } else {
  1006. // Otherwise compute the address into a register first.
  1007. Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
  1008. auto AddrI =
  1009. BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
  1010. .addReg(/*Base*/ X86::RIP)
  1011. .addImm(/*Scale*/ 1)
  1012. .addReg(/*Index*/ 0)
  1013. .addMBB(&MBB)
  1014. .addReg(/*Segment*/ 0);
  1015. ++NumInstsInserted;
  1016. (void)AddrI;
  1017. LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");
  1018. auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
  1019. .addReg(TargetReg, RegState::Kill)
  1020. .addReg(AddrReg, RegState::Kill);
  1021. ++NumInstsInserted;
  1022. (void)CheckI;
  1023. LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
  1024. }
  1025. // Now cmov over the predicate if the comparison wasn't equal.
  1026. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  1027. auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
  1028. Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  1029. auto CMovI =
  1030. BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
  1031. .addReg(PS->InitialReg)
  1032. .addReg(PS->PoisonReg)
  1033. .addImm(X86::COND_NE);
  1034. CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  1035. ++NumInstsInserted;
  1036. LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
  1037. CMovs.push_back(&*CMovI);
  1038. // And put the new value into the available values for SSA form of our
  1039. // predicate state.
  1040. PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
  1041. }
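// For each such block, the sequence emitted above amounts to (an illustrative
// sketch, small non-PIC code model; register names are placeholders):
//   cmpq $.LBB_<this block>, %target   # incoming (PHI'd) branch target
//   cmovneq %poison, %state            # poison the state on a mismatch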
  1042. // Return all the newly inserted cmov instructions of the predicate state.
  1043. return CMovs;
  1044. }
1045. // Returns true if the MI has EFLAGS as a register def operand and it's live;
1046. // returns false otherwise.
  1047. static bool isEFLAGSDefLive(const MachineInstr &MI) {
  1048. if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
  1049. return !DefOp->isDead();
  1050. }
  1051. return false;
  1052. }
  1053. static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
  1054. const TargetRegisterInfo &TRI) {
  1055. // Check if EFLAGS are alive by seeing if there is a def of them or they
  1056. // live-in, and then seeing if that def is in turn used.
  1057. for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
  1058. if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
  1059. // If the def is dead, then EFLAGS is not live.
  1060. if (DefOp->isDead())
  1061. return false;
  1062. // Otherwise we've def'ed it, and it is live.
  1063. return true;
  1064. }
  1065. // While at this instruction, also check if we use and kill EFLAGS
  1066. // which means it isn't live.
  1067. if (MI.killsRegister(X86::EFLAGS, &TRI))
  1068. return false;
  1069. }
1070. // If we didn't find anything conclusive (neither definitely alive nor
1071. // definitely dead), return whether it lives into the block.
  1072. return MBB.isLiveIn(X86::EFLAGS);
  1073. }
  1074. /// Trace the predicate state through each of the blocks in the function,
  1075. /// hardening everything necessary along the way.
  1076. ///
  1077. /// We call this routine once the initial predicate state has been established
  1078. /// for each basic block in the function in the SSA updater. This routine traces
  1079. /// it through the instructions within each basic block, and for non-returning
  1080. /// blocks informs the SSA updater about the final state that lives out of the
  1081. /// block. Along the way, it hardens any vulnerable instruction using the
  1082. /// currently valid predicate state. We have to do these two things together
  1083. /// because the SSA updater only works across blocks. Within a block, we track
  1084. /// the current predicate state directly and update it as it changes.
  1085. ///
  1086. /// This operates in two passes over each block. First, we analyze the loads in
  1087. /// the block to determine which strategy will be used to harden them: hardening
  1088. /// the address or hardening the loaded value when loaded into a register
  1089. /// amenable to hardening. We have to process these first because the two
  1090. /// strategies may interact -- later hardening may change what strategy we wish
  1091. /// to use. We also will analyze data dependencies between loads and avoid
  1092. /// hardening those loads that are data dependent on a load with a hardened
  1093. /// address. We also skip hardening loads already behind an LFENCE as that is
  1094. /// sufficient to harden them against misspeculation.
  1095. ///
  1096. /// Second, we actively trace the predicate state through the block, applying
  1097. /// the hardening steps we determined necessary in the first pass as we go.
  1098. ///
  1099. /// These two passes are applied to each basic block. We operate one block at a
  1100. /// time to simplify reasoning about reachability and sequencing.
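///
/// A rough sketch of the per-block structure (illustrative, not exhaustive):
///   for each basic block:
///     pass 1: classify loads as "harden the loaded value" vs. "harden the
///             address", propagating load-dependence and stopping at LFENCE
///     pass 2: apply the chosen strategy, then handle indirect calls/jumps,
///             returns, and calls as needed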
  1101. void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
  1102. MachineFunction &MF) {
  1103. SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
  1104. SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
  1105. SmallSet<unsigned, 16> HardenedAddrRegs;
  1106. SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
  1107. // Track the set of load-dependent registers through the basic block. Because
  1108. // the values of these registers have an existing data dependency on a loaded
  1109. // value which we would have checked, we can omit any checks on them.
  1110. SparseBitVector<> LoadDepRegs;
  1111. for (MachineBasicBlock &MBB : MF) {
  1112. // The first pass over the block: collect all the loads which can have their
  1113. // loaded value hardened and all the loads that instead need their address
  1114. // hardened. During this walk we propagate load dependence for address
  1115. // hardened loads and also look for LFENCE to stop hardening wherever
1116. possible. When deciding whether or not to harden the loaded value,
  1117. // we check to see if any registers used in the address will have been
  1118. // hardened at this point and if so, harden any remaining address registers
  1119. // as that often successfully re-uses hardened addresses and minimizes
  1120. // instructions.
  1121. //
  1122. // FIXME: We should consider an aggressive mode where we continue to keep as
1123. // many loads value-hardened as possible even when some address register
1124. // hardening would be free (due to reuse).
  1125. //
  1126. // Note that we only need this pass if we are actually hardening loads.
  1127. if (HardenLoads)
  1128. for (MachineInstr &MI : MBB) {
  1129. // We naively assume that all def'ed registers of an instruction have
  1130. // a data dependency on all of their operands.
  1131. // FIXME: Do a more careful analysis of x86 to build a conservative
  1132. // model here.
  1133. if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
  1134. return Op.isReg() && LoadDepRegs.test(Op.getReg());
  1135. }))
  1136. for (MachineOperand &Def : MI.defs())
  1137. if (Def.isReg())
  1138. LoadDepRegs.set(Def.getReg());
  1139. // Both Intel and AMD are guiding that they will change the semantics of
  1140. // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
  1141. // no more need to guard things in this block.
  1142. if (MI.getOpcode() == X86::LFENCE)
  1143. break;
  1144. // If this instruction cannot load, nothing to do.
  1145. if (!MI.mayLoad())
  1146. continue;
  1147. // Some instructions which "load" are trivially safe or unimportant.
  1148. if (MI.getOpcode() == X86::MFENCE)
  1149. continue;
  1150. // Extract the memory operand information about this instruction.
  1151. // FIXME: This doesn't handle loading pseudo instructions which we often
  1152. // could handle with similarly generic logic. We probably need to add an
  1153. // MI-layer routine similar to the MC-layer one we use here which maps
  1154. // pseudos much like this maps real instructions.
  1155. const MCInstrDesc &Desc = MI.getDesc();
  1156. int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
  1157. if (MemRefBeginIdx < 0) {
  1158. LLVM_DEBUG(dbgs()
  1159. << "WARNING: unable to harden loading instruction: ";
  1160. MI.dump());
  1161. continue;
  1162. }
  1163. MemRefBeginIdx += X86II::getOperandBias(Desc);
  1164. MachineOperand &BaseMO =
  1165. MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
  1166. MachineOperand &IndexMO =
  1167. MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
  1168. // If we have at least one (non-frame-index, non-RIP) register operand,
  1169. // and neither operand is load-dependent, we need to check the load.
  1170. unsigned BaseReg = 0, IndexReg = 0;
  1171. if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
  1172. BaseMO.getReg() != X86::NoRegister)
  1173. BaseReg = BaseMO.getReg();
  1174. if (IndexMO.getReg() != X86::NoRegister)
  1175. IndexReg = IndexMO.getReg();
  1176. if (!BaseReg && !IndexReg)
  1177. // No register operands!
  1178. continue;
  1179. // If any register operand is dependent, this load is dependent and we
  1180. // needn't check it.
  1181. // FIXME: Is this true in the case where we are hardening loads after
  1182. // they complete? Unclear, need to investigate.
  1183. if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
  1184. (IndexReg && LoadDepRegs.test(IndexReg)))
  1185. continue;
  1186. // If post-load hardening is enabled, this load is compatible with
  1187. // post-load hardening, and we aren't already going to harden one of the
  1188. // address registers, queue it up to be hardened post-load. Notably,
  1189. // even once hardened this won't introduce a useful dependency that
  1190. // could prune out subsequent loads.
  1191. if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
  1192. !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
  1193. MI.getOperand(0).isReg() &&
  1194. canHardenRegister(MI.getOperand(0).getReg()) &&
  1195. !HardenedAddrRegs.count(BaseReg) &&
  1196. !HardenedAddrRegs.count(IndexReg)) {
  1197. HardenPostLoad.insert(&MI);
  1198. HardenedAddrRegs.insert(MI.getOperand(0).getReg());
  1199. continue;
  1200. }
  1201. // Record this instruction for address hardening and record its register
  1202. // operands as being address-hardened.
  1203. HardenLoadAddr.insert(&MI);
  1204. if (BaseReg)
  1205. HardenedAddrRegs.insert(BaseReg);
  1206. if (IndexReg)
  1207. HardenedAddrRegs.insert(IndexReg);
  1208. for (MachineOperand &Def : MI.defs())
  1209. if (Def.isReg())
  1210. LoadDepRegs.set(Def.getReg());
  1211. }
  1212. // Now re-walk the instructions in the basic block, and apply whichever
  1213. // hardening strategy we have elected. Note that we do this in a second
  1214. // pass specifically so that we have the complete set of instructions for
  1215. // which we will do post-load hardening and can defer it in certain
  1216. // circumstances.
  1217. for (MachineInstr &MI : MBB) {
  1218. if (HardenLoads) {
  1219. // We cannot both require hardening the def of a load and its address.
  1220. assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
  1221. "Requested to harden both the address and def of a load!");
  1222. // Check if this is a load whose address needs to be hardened.
  1223. if (HardenLoadAddr.erase(&MI)) {
  1224. const MCInstrDesc &Desc = MI.getDesc();
  1225. int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
  1226. assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
  1227. MemRefBeginIdx += X86II::getOperandBias(Desc);
  1228. MachineOperand &BaseMO =
  1229. MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
  1230. MachineOperand &IndexMO =
  1231. MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
  1232. hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
  1233. continue;
  1234. }
  1235. // Test if this instruction is one of our post load instructions (and
  1236. // remove it from the set if so).
  1237. if (HardenPostLoad.erase(&MI)) {
  1238. assert(!MI.isCall() && "Must not try to post-load harden a call!");
  1239. // If this is a data-invariant load and there is no EFLAGS
  1240. // interference, we want to try and sink any hardening as far as
  1241. // possible.
  1242. if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
  1243. // Sink the instruction we'll need to harden as far as we can down
  1244. // the graph.
  1245. MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
  1246. // If we managed to sink this instruction, update everything so we
  1247. // harden that instruction when we reach it in the instruction
  1248. // sequence.
  1249. if (SunkMI != &MI) {
  1250. // If in sinking there was no instruction needing to be hardened,
  1251. // we're done.
  1252. if (!SunkMI)
  1253. continue;
  1254. // Otherwise, add this to the set of defs we harden.
  1255. HardenPostLoad.insert(SunkMI);
  1256. continue;
  1257. }
  1258. }
  1259. unsigned HardenedReg = hardenPostLoad(MI);
  1260. // Mark the resulting hardened register as such so we don't re-harden.
  1261. AddrRegToHardenedReg[HardenedReg] = HardenedReg;
  1262. continue;
  1263. }
  1264. // Check for an indirect call or branch that may need its input hardened
  1265. // even if we couldn't find the specific load used, or were able to
  1266. // avoid hardening it for some reason. Note that here we cannot break
  1267. // out afterward as we may still need to handle any call aspect of this
  1268. // instruction.
  1269. if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
  1270. hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
  1271. }
  1272. // After we finish hardening loads we handle interprocedural hardening if
  1273. // enabled and relevant for this instruction.
  1274. if (!HardenInterprocedurally)
  1275. continue;
  1276. if (!MI.isCall() && !MI.isReturn())
  1277. continue;
1278. // If this is a direct return (i.e., not a tail call), just directly harden
  1279. // it.
  1280. if (MI.isReturn() && !MI.isCall()) {
  1281. hardenReturnInstr(MI);
  1282. continue;
  1283. }
  1284. // Otherwise we have a call. We need to handle transferring the predicate
  1285. // state into a call and recovering it after the call returns (unless this
  1286. // is a tail call).
  1287. assert(MI.isCall() && "Should only reach here for calls!");
  1288. tracePredStateThroughCall(MI);
  1289. }
  1290. HardenPostLoad.clear();
  1291. HardenLoadAddr.clear();
  1292. HardenedAddrRegs.clear();
  1293. AddrRegToHardenedReg.clear();
  1294. // Currently, we only track data-dependent loads within a basic block.
  1295. // FIXME: We should see if this is necessary or if we could be more
  1296. // aggressive here without opening up attack avenues.
  1297. LoadDepRegs.clear();
  1298. }
  1299. }
  1300. /// Save EFLAGS into the returned GPR. This can in turn be restored with
  1301. /// `restoreEFLAGS`.
  1302. ///
  1303. /// Note that LLVM can only lower very simple patterns of saved and restored
  1304. /// EFLAGS registers. The restore should always be within the same basic block
  1305. /// as the save so that no PHI nodes are inserted.
  1306. unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
  1307. MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
  1308. DebugLoc Loc) {
  1309. // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
  1310. // what instruction selection does.
  1311. Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
  1312. // We directly copy the FLAGS register and rely on later lowering to clean
  1313. // this up into the appropriate setCC instructions.
  1314. BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
  1315. ++NumInstsInserted;
  1316. return Reg;
  1317. }
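// Illustrative pairing with `restoreEFLAGS` below, within a single block
// (this mirrors how hardenLoadAddr uses the two helpers):
//   unsigned FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
//   ... emit EFLAGS-clobbering hardening instructions ...
//   restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);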
  1318. /// Restore EFLAGS from the provided GPR. This should be produced by
  1319. /// `saveEFLAGS`.
  1320. ///
  1321. /// This must be done within the same basic block as the save in order to
  1322. /// reliably lower.
  1323. void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
  1324. MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
  1325. Register Reg) {
  1326. BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
  1327. ++NumInstsInserted;
  1328. }
  1329. /// Takes the current predicate state (in a register) and merges it into the
  1330. /// stack pointer. The state is essentially a single bit, but we merge this in
  1331. /// a way that won't form non-canonical pointers and also will be preserved
  1332. /// across normal stack adjustments.
  1333. void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
  1334. MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
  1335. unsigned PredStateReg) {
  1336. Register TmpReg = MRI->createVirtualRegister(PS->RC);
  1337. // FIXME: This hard codes a shift distance based on the number of bits needed
  1338. // to stay canonical on 64-bit. We should compute this somehow and support
  1339. // 32-bit as part of that.
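// The emitted sequence amounts to (illustrative, 64-bit only):
//   shlq $47, %state   # an all-ones state lands in the top 17 bits, which
//                      # keeps the resulting pointer canonical
//   orq  %state, %rsp  # merge it into the stack pointer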
  1340. auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
  1341. .addReg(PredStateReg, RegState::Kill)
  1342. .addImm(47);
  1343. ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  1344. ++NumInstsInserted;
  1345. auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
  1346. .addReg(X86::RSP)
  1347. .addReg(TmpReg, RegState::Kill);
  1348. OrI->addRegisterDead(X86::EFLAGS, TRI);
  1349. ++NumInstsInserted;
  1350. }
  1351. /// Extracts the predicate state stored in the high bits of the stack pointer.
  1352. unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
  1353. MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
  1354. DebugLoc Loc) {
  1355. Register PredStateReg = MRI->createVirtualRegister(PS->RC);
  1356. Register TmpReg = MRI->createVirtualRegister(PS->RC);
  1357. // We know that the stack pointer will have any preserved predicate state in
  1358. // its high bit. We just want to smear this across the other bits. Turns out,
  1359. // this is exactly what an arithmetic right shift does.
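// The emitted sequence amounts to (illustrative):
//   movq %rsp, %tmp
//   sarq $63, %tmp     # smear the preserved high bit across all 64 bits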
  1360. BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
  1361. .addReg(X86::RSP);
  1362. auto ShiftI =
  1363. BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
  1364. .addReg(TmpReg, RegState::Kill)
  1365. .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
  1366. ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  1367. ++NumInstsInserted;
  1368. return PredStateReg;
  1369. }
  1370. void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
  1371. MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
  1372. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  1373. MachineBasicBlock &MBB = *MI.getParent();
  1374. const DebugLoc &Loc = MI.getDebugLoc();
  1375. // Check if EFLAGS are alive by seeing if there is a def of them or they
  1376. // live-in, and then seeing if that def is in turn used.
  1377. bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
  1378. SmallVector<MachineOperand *, 2> HardenOpRegs;
  1379. if (BaseMO.isFI()) {
  1380. // A frame index is never a dynamically controllable load, so only
  1381. // harden it if we're covering fixed address loads as well.
  1382. LLVM_DEBUG(
  1383. dbgs() << " Skipping hardening base of explicit stack frame load: ";
  1384. MI.dump(); dbgs() << "\n");
  1385. } else if (BaseMO.getReg() == X86::RSP) {
  1386. // Some idempotent atomic operations are lowered directly to a locked
1387. // OR with 0 to the top of stack (or slightly offset from top) which uses an
  1388. // explicit RSP register as the base.
  1389. assert(IndexMO.getReg() == X86::NoRegister &&
  1390. "Explicit RSP access with dynamic index!");
  1391. LLVM_DEBUG(
  1392. dbgs() << " Cannot harden base of explicit RSP offset in a load!");
  1393. } else if (BaseMO.getReg() == X86::RIP ||
  1394. BaseMO.getReg() == X86::NoRegister) {
  1395. // For both RIP-relative addressed loads or absolute loads, we cannot
  1396. // meaningfully harden them because the address being loaded has no
  1397. // dynamic component.
  1398. //
  1399. // FIXME: When using a segment base (like TLS does) we end up with the
  1400. // dynamic address being the base plus -1 because we can't mutate the
  1401. // segment register here. This allows the signed 32-bit offset to point at
  1402. // valid segment-relative addresses and load them successfully.
  1403. LLVM_DEBUG(
  1404. dbgs() << " Cannot harden base of "
  1405. << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
  1406. << " address in a load!");
  1407. } else {
  1408. assert(BaseMO.isReg() &&
  1409. "Only allowed to have a frame index or register base.");
  1410. HardenOpRegs.push_back(&BaseMO);
  1411. }
  1412. if (IndexMO.getReg() != X86::NoRegister &&
  1413. (HardenOpRegs.empty() ||
  1414. HardenOpRegs.front()->getReg() != IndexMO.getReg()))
  1415. HardenOpRegs.push_back(&IndexMO);
  1416. assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
  1417. "Should have exactly one or two registers to harden!");
  1418. assert((HardenOpRegs.size() == 1 ||
  1419. HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
  1420. "Should not have two of the same registers!");
1421. // Remove any registers that have already been checked.
  1422. llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
  1423. // See if this operand's register has already been checked.
  1424. auto It = AddrRegToHardenedReg.find(Op->getReg());
  1425. if (It == AddrRegToHardenedReg.end())
  1426. // Not checked, so retain this one.
  1427. return false;
  1428. // Otherwise, we can directly update this operand and remove it.
  1429. Op->setReg(It->second);
  1430. return true;
  1431. });
  1432. // If there are none left, we're done.
  1433. if (HardenOpRegs.empty())
  1434. return;
  1435. // Compute the current predicate state.
  1436. Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  1437. auto InsertPt = MI.getIterator();
  1438. // If EFLAGS are live and we don't have access to instructions that avoid
  1439. // clobbering EFLAGS we need to save and restore them. This in turn makes
  1440. // the EFLAGS no longer live.
  1441. unsigned FlagsReg = 0;
  1442. if (EFLAGSLive && !Subtarget->hasBMI2()) {
  1443. EFLAGSLive = false;
  1444. FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  1445. }
  1446. for (MachineOperand *Op : HardenOpRegs) {
  1447. Register OpReg = Op->getReg();
  1448. auto *OpRC = MRI->getRegClass(OpReg);
  1449. Register TmpReg = MRI->createVirtualRegister(OpRC);
  1450. // If this is a vector register, we'll need somewhat custom logic to handle
  1451. // hardening it.
  1452. if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
  1453. OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
  1454. assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
  1455. bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
  1456. // Move our state into a vector register.
  1457. // FIXME: We could skip this at the cost of longer encodings with AVX-512
1458. // but that doesn't seem likely to be worth it.
  1459. Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
  1460. auto MovI =
  1461. BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
  1462. .addReg(StateReg);
  1463. (void)MovI;
  1464. ++NumInstsInserted;
  1465. LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
  1466. // Broadcast it across the vector register.
  1467. Register VBStateReg = MRI->createVirtualRegister(OpRC);
  1468. auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
  1469. TII->get(Is128Bit ? X86::VPBROADCASTQrr
  1470. : X86::VPBROADCASTQYrr),
  1471. VBStateReg)
  1472. .addReg(VStateReg);
  1473. (void)BroadcastI;
  1474. ++NumInstsInserted;
  1475. LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
  1476. dbgs() << "\n");
  1477. // Merge our potential poison state into the value with a vector or.
  1478. auto OrI =
  1479. BuildMI(MBB, InsertPt, Loc,
  1480. TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
  1481. .addReg(VBStateReg)
  1482. .addReg(OpReg);
  1483. (void)OrI;
  1484. ++NumInstsInserted;
  1485. LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  1486. } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
  1487. OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
  1488. OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
  1489. assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
  1490. bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
  1491. bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
  1492. if (Is128Bit || Is256Bit)
  1493. assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
  1494. // Broadcast our state into a vector register.
  1495. Register VStateReg = MRI->createVirtualRegister(OpRC);
  1496. unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
  1497. : Is256Bit ? X86::VPBROADCASTQrZ256rr
  1498. : X86::VPBROADCASTQrZrr;
  1499. auto BroadcastI =
  1500. BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
  1501. .addReg(StateReg);
  1502. (void)BroadcastI;
  1503. ++NumInstsInserted;
  1504. LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
  1505. dbgs() << "\n");
  1506. // Merge our potential poison state into the value with a vector or.
  1507. unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
  1508. : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
  1509. auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
  1510. .addReg(VStateReg)
  1511. .addReg(OpReg);
  1512. (void)OrI;
  1513. ++NumInstsInserted;
  1514. LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  1515. } else {
  1516. // FIXME: Need to support GR32 here for 32-bit code.
  1517. assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
  1518. "Not a supported register class for address hardening!");
  1519. if (!EFLAGSLive) {
  1520. // Merge our potential poison state into the value with an or.
  1521. auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
  1522. .addReg(StateReg)
  1523. .addReg(OpReg);
  1524. OrI->addRegisterDead(X86::EFLAGS, TRI);
  1525. ++NumInstsInserted;
  1526. LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  1527. } else {
1528. // We need to avoid touching EFLAGS, so shift the address right by the state
1529. // (0, or 63 when poisoned) using an instruction that doesn't update flags.
  1530. auto ShiftI =
  1531. BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
  1532. .addReg(OpReg)
  1533. .addReg(StateReg);
  1534. (void)ShiftI;
  1535. ++NumInstsInserted;
  1536. LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
  1537. dbgs() << "\n");
  1538. }
  1539. }
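// In the common GR64 case the hardening above amounts to (an illustrative
// sketch; register names are placeholders):
//   orq   %state, %addr             # when EFLAGS are dead
//   shrxq %state, %addr, %hardened  # when EFLAGS are live (BMI2); the shift
//                                   # count is 0, or 63 under misspeculation,
//                                   # collapsing the address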
  1540. // Record this register as checked and update the operand.
  1541. assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
  1542. "Should not have checked this register yet!");
  1543. AddrRegToHardenedReg[Op->getReg()] = TmpReg;
  1544. Op->setReg(TmpReg);
  1545. ++NumAddrRegsHardened;
  1546. }
  1547. // And restore the flags if needed.
  1548. if (FlagsReg)
  1549. restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
  1550. }
  1551. MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
  1552. MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
  1553. assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
  1554. "Cannot get here with a non-invariant load!");
  1555. assert(!isEFLAGSDefLive(InitialMI) &&
  1556. "Cannot get here with a data invariant load "
  1557. "that interferes with EFLAGS!");
  1558. // See if we can sink hardening the loaded value.
  1559. auto SinkCheckToSingleUse =
  1560. [&](MachineInstr &MI) -> Optional<MachineInstr *> {
  1561. Register DefReg = MI.getOperand(0).getReg();
1562. // We need to find a single use to which we can sink the check. We can
  1563. // primarily do this because many uses may already end up checked on their
  1564. // own.
  1565. MachineInstr *SingleUseMI = nullptr;
  1566. for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
  1567. // If we're already going to harden this use, it is data invariant, it
1568. // does not interfere with EFLAGS, and it is within our block.
  1569. if (HardenedInstrs.count(&UseMI)) {
  1570. if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) {
  1571. // If we've already decided to harden a non-load, we must have sunk
  1572. // some other post-load hardened instruction to it and it must itself
  1573. // be data-invariant.
  1574. assert(X86InstrInfo::isDataInvariant(UseMI) &&
  1575. "Data variant instruction being hardened!");
  1576. continue;
  1577. }
  1578. // Otherwise, this is a load and the load component can't be data
  1579. // invariant so check how this register is being used.
  1580. const MCInstrDesc &Desc = UseMI.getDesc();
  1581. int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
  1582. assert(MemRefBeginIdx >= 0 &&
  1583. "Should always have mem references here!");
  1584. MemRefBeginIdx += X86II::getOperandBias(Desc);
  1585. MachineOperand &BaseMO =
  1586. UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
  1587. MachineOperand &IndexMO =
  1588. UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
  1589. if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
  1590. (IndexMO.isReg() && IndexMO.getReg() == DefReg))
  1591. // The load uses the register as part of its address making it not
  1592. // invariant.
  1593. return {};
  1594. continue;
  1595. }
  1596. if (SingleUseMI)
  1597. // We already have a single use, this would make two. Bail.
  1598. return {};
  1599. // If this single use isn't data invariant, isn't in this block, or has
  1600. // interfering EFLAGS, we can't sink the hardening to it.
  1601. if (!X86InstrInfo::isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent() ||
  1602. isEFLAGSDefLive(UseMI))
  1603. return {};
  1604. // If this instruction defines multiple registers bail as we won't harden
  1605. // all of them.
  1606. if (UseMI.getDesc().getNumDefs() > 1)
  1607. return {};
  1608. // If this register isn't a virtual register we can't walk uses of sanely,
  1609. // just bail. Also check that its register class is one of the ones we
  1610. // can harden.
  1611. Register UseDefReg = UseMI.getOperand(0).getReg();
  1612. if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
  1613. return {};
  1614. SingleUseMI = &UseMI;
  1615. }
  1616. // If SingleUseMI is still null, there is no use that needs its own
  1617. // checking. Otherwise, it is the single use that needs checking.
  1618. return {SingleUseMI};
  1619. };
  1620. MachineInstr *MI = &InitialMI;
  1621. while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
  1622. // Update which MI we're checking now.
  1623. MI = *SingleUse;
  1624. if (!MI)
  1625. break;
  1626. }
  1627. return MI;
  1628. }
  1629. bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
  1630. auto *RC = MRI->getRegClass(Reg);
  1631. int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
  1632. if (RegBytes > 8)
  1633. // We don't support post-load hardening of vectors.
  1634. return false;
  1635. unsigned RegIdx = Log2_32(RegBytes);
  1636. assert(RegIdx < 4 && "Unsupported register size");
  1637. // If this register class is explicitly constrained to a class that doesn't
  1638. // require REX prefix, we may not be able to satisfy that constraint when
  1639. // emitting the hardening instructions, so bail out here.
  1640. // FIXME: This seems like a pretty lame hack. The way this comes up is when we
  1641. // end up both with a NOREX and REX-only register as operands to the hardening
  1642. // instructions. It would be better to fix that code to handle this situation
  1643. // rather than hack around it in this way.
  1644. const TargetRegisterClass *NOREXRegClasses[] = {
  1645. &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
  1646. &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
  1647. if (RC == NOREXRegClasses[RegIdx])
  1648. return false;
  1649. const TargetRegisterClass *GPRRegClasses[] = {
  1650. &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
  1651. &X86::GR64RegClass};
  1652. return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
  1653. }
  1654. /// Harden a value in a register.
  1655. ///
  1656. /// This is the low-level logic to fully harden a value sitting in a register
  1657. /// against leaking during speculative execution.
  1658. ///
  1659. /// Unlike hardening an address that is used by a load, this routine is required
  1660. /// to hide *all* incoming bits in the register.
  1661. ///
  1662. /// `Reg` must be a virtual register. Currently, it is required to be a GPR no
  1663. /// larger than the predicate state register. FIXME: We should support vector
  1664. /// registers here by broadcasting the predicate state.
  1665. ///
  1666. /// The new, hardened virtual register is returned. It will have the same
  1667. /// register class as `Reg`.
  1668. unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
  1669. Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
  1670. DebugLoc Loc) {
  1671. assert(canHardenRegister(Reg) && "Cannot harden this register!");
  1672. assert(Reg.isVirtual() && "Cannot harden a physical register!");
  1673. auto *RC = MRI->getRegClass(Reg);
  1674. int Bytes = TRI->getRegSizeInBits(*RC) / 8;
  1675. Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  1676. assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
  1677. "Unknown register size");
  1678. // FIXME: Need to teach this about 32-bit mode.
  1679. if (Bytes != 8) {
  1680. unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
  1681. unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
  1682. Register NarrowStateReg = MRI->createVirtualRegister(RC);
  1683. BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
  1684. .addReg(StateReg, 0, SubRegImm);
  1685. StateReg = NarrowStateReg;
  1686. }
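// E.g. (illustrative) for a 32-bit value, the state is narrowed first and the
// `or` below becomes:
//   %state32 = COPY %state.sub_32bit
//   orl %state32, %value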
  1687. unsigned FlagsReg = 0;
  1688. if (isEFLAGSLive(MBB, InsertPt, *TRI))
  1689. FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  1690. Register NewReg = MRI->createVirtualRegister(RC);
  1691. unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  1692. unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
  1693. auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
  1694. .addReg(StateReg)
  1695. .addReg(Reg);
  1696. OrI->addRegisterDead(X86::EFLAGS, TRI);
  1697. ++NumInstsInserted;
  1698. LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  1699. if (FlagsReg)
  1700. restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
  1701. return NewReg;
  1702. }
  1703. /// Harden a load by hardening the loaded value in the defined register.
  1704. ///
  1705. /// We can harden a non-leaking load into a register without touching the
  1706. /// address by just hiding all of the loaded bits during misspeculation. We use
  1707. /// an `or` instruction to do this because we set up our poison value as all
  1708. /// ones. And the goal is just for the loaded bits to not be exposed to
  1709. /// execution and coercing them to one is sufficient.
  1710. ///
  1711. /// Returns the newly hardened register.
  1712. unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
  1713. MachineBasicBlock &MBB = *MI.getParent();
  1714. const DebugLoc &Loc = MI.getDebugLoc();
  1715. auto &DefOp = MI.getOperand(0);
  1716. Register OldDefReg = DefOp.getReg();
  1717. auto *DefRC = MRI->getRegClass(OldDefReg);
  1718. // Because we want to completely replace the uses of this def'ed value with
  1719. // the hardened value, create a dedicated new register that will only be used
  1720. // to communicate the unhardened value to the hardening.
  1721. Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
  1722. DefOp.setReg(UnhardenedReg);
  1723. // Now harden this register's value, getting a hardened reg that is safe to
  1724. // use. Note that we insert the instructions to compute this *after* the
  1725. // defining instruction, not before it.
  1726. unsigned HardenedReg = hardenValueInRegister(
  1727. UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
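// The net effect for a simple 64-bit load is roughly (illustrative):
//   movq (%ptr), %val
//   orq  %state, %val   # the all-ones poison hides every loaded bit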
  1728. // Finally, replace the old register (which now only has the uses of the
  1729. // original def) with the hardened register.
  1730. MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
  1731. ++NumPostLoadRegsHardened;
  1732. return HardenedReg;
  1733. }
  1734. /// Harden a return instruction.
  1735. ///
  1736. /// Returns implicitly perform a load which we need to harden. Without hardening
1737. /// this load, an attacker may speculatively write over the return address to
  1738. /// steer speculation of the return to an attacker controlled address. This is
  1739. /// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
  1740. /// this paper:
  1741. /// https://people.csail.mit.edu/vlk/spectre11.pdf
  1742. ///
  1743. /// We can harden this by introducing an LFENCE that will delay any load of the
  1744. /// return address until prior instructions have retired (and thus are not being
  1745. /// speculated), or we can harden the address used by the implicit load: the
  1746. /// stack pointer.
  1747. ///
  1748. /// If we are not using an LFENCE, hardening the stack pointer has an additional
  1749. /// benefit: it allows us to pass the predicate state accumulated in this
  1750. /// function back to the caller. In the absence of a BCBS attack on the return,
  1751. /// the caller will typically be resumed and speculatively executed due to the
  1752. /// Return Stack Buffer (RSB) prediction which is very accurate and has a high
  1753. /// priority. It is possible that some code from the caller will be executed
  1754. /// speculatively even during a BCBS-attacked return until the steering takes
  1755. /// effect. Whenever this happens, the caller can recover the (poisoned)
  1756. /// predicate state from the stack pointer and continue to harden loads.
  1757. void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
  1758. MachineBasicBlock &MBB = *MI.getParent();
  1759. const DebugLoc &Loc = MI.getDebugLoc();
  1760. auto InsertPt = MI.getIterator();
  1761. if (FenceCallAndRet)
  1762. // No need to fence here as we'll fence at the return site itself. That
  1763. // handles more cases than we can handle here.
  1764. return;
  1765. // Take our predicate state, shift it to the high 17 bits (so that we keep
  1766. // pointers canonical) and merge it into RSP. This will allow the caller to
  1767. // extract it when we return (speculatively).
  1768. mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
  1769. }
  1770. /// Trace the predicate state through a call.
  1771. ///
  1772. /// There are several layers of this needed to handle the full complexity of
  1773. /// calls.
  1774. ///
  1775. /// First, we need to send the predicate state into the called function. We do
  1776. /// this by merging it into the high bits of the stack pointer.
  1777. ///
  1778. /// For tail calls, this is all we need to do.
  1779. ///
  1780. /// For calls where we might return and resume the control flow, we need to
  1781. /// extract the predicate state from the high bits of the stack pointer after
  1782. /// control returns from the called function.
  1783. ///
  1784. /// We also need to verify that we intended to return to this location in the
  1785. /// code. An attacker might arrange for the processor to mispredict the return
  1786. /// to this valid but incorrect return address in the program rather than the
  1787. /// correct one. See the paper on this attack, called "ret2spec" by the
  1788. /// researchers, here:
  1789. /// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
  1790. ///
  1791. /// The way we verify that we returned to the correct location is by preserving
  1792. /// the expected return address across the call. One technique involves taking
1793. /// advantage of the red-zone to load the return address from `-8(%rsp)` where it
  1794. /// was left by the RET instruction when it popped `%rsp`. Alternatively, we can
  1795. /// directly save the address into a register that will be preserved across the
  1796. /// call. We compare this intended return address against the address
  1797. /// immediately following the call (the observed return address). If these
  1798. /// mismatch, we have detected misspeculation and can poison our predicate
  1799. /// state.
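///
/// An illustrative sketch of what gets emitted around a non-tail call in the
/// small, non-PIC code model with a red zone (register names and the label are
/// placeholders):
///   shlq $47, %state ; orq %state, %rsp  # merge state into RSP
///   callq callee                         # post-instruction symbol follows
///   movq -8(%rsp), %expected             # red-zone copy of the return address
///   movq %rsp, %tmp ; sarq $63, %tmp     # re-extract the predicate state
///   cmpq $.Lslh_ret_addr<N>, %expected
///   cmovneq %poison, %state              # poison on a mispredicted return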
  1800. void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
  1801. MachineInstr &MI) {
  1802. MachineBasicBlock &MBB = *MI.getParent();
  1803. MachineFunction &MF = *MBB.getParent();
  1804. auto InsertPt = MI.getIterator();
  1805. const DebugLoc &Loc = MI.getDebugLoc();
  1806. if (FenceCallAndRet) {
  1807. if (MI.isReturn())
  1808. // Tail call, we don't return to this function.
  1809. // FIXME: We should also handle noreturn calls.
  1810. return;
  1811. // We don't need to fence before the call because the function should fence
  1812. // in its entry. However, we do need to fence after the call returns.
  1813. // Fencing before the return doesn't correctly handle cases where the return
  1814. // itself is mispredicted.
  1815. BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
  1816. ++NumInstsInserted;
  1817. ++NumLFENCEsInserted;
  1818. return;
  1819. }
  1820. // First, we transfer the predicate state into the called function by merging
  1821. // it into the stack pointer. This will kill the current def of the state.
  1822. Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  1823. mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
  1824. // If this call is also a return, it is a tail call and we don't need anything
  1825. // else to handle it so just return. Also, if there are no further
  1826. // instructions and no successors, this call does not return so we can also
  1827. // bail.
  1828. if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
  1829. return;
  1830. // Create a symbol to track the return address and attach it to the call
  1831. // machine instruction. We will lower extra symbols attached to call
1832. // instructions as a label immediately following the call.
  1833. MCSymbol *RetSymbol =
  1834. MF.getContext().createTempSymbol("slh_ret_addr",
  1835. /*AlwaysAddSuffix*/ true);
  1836. MI.setPostInstrSymbol(MF, RetSymbol);
  1837. const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
  1838. unsigned ExpectedRetAddrReg = 0;
  1839. // If we have no red zones or if the function returns twice (possibly without
  1840. // using the `ret` instruction) like setjmp, we need to save the expected
  1841. // return address prior to the call.
  1842. if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
  1843. MF.exposesReturnsTwice()) {
  1844. // If we don't have red zones, we need to compute the expected return
  1845. // address prior to the call and store it in a register that lives across
  1846. // the call.
  1847. //
  1848. // In some ways, this is doubly satisfying as a mitigation because it will
  1849. // also successfully detect stack smashing bugs in some cases (typically,
  1850. // when a callee-saved register is used and the callee doesn't push it onto
  1851. // the stack). But that isn't our primary goal, so we only use it as
  1852. // a fallback.
  1853. //
  1854. // FIXME: It isn't clear that this is reliable in the face of
  1855. // rematerialization in the register allocator. We somehow need to force
  1856. // that to not occur for this particular instruction, and instead to spill
  1857. // or otherwise preserve the value computed *prior* to the call.
  1858. //
  1859. // FIXME: It is even less clear why MachineCSE can't just fold this when we
  1860. // end up having to use identical instructions both before and after the
  1861. // call to feed the comparison.
  1862. ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
  1863. if (MF.getTarget().getCodeModel() == CodeModel::Small &&
  1864. !Subtarget->isPositionIndependent()) {
  1865. BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
  1866. .addSym(RetSymbol);
  1867. } else {
  1868. BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
  1869. .addReg(/*Base*/ X86::RIP)
  1870. .addImm(/*Scale*/ 1)
  1871. .addReg(/*Index*/ 0)
  1872. .addSym(RetSymbol)
  1873. .addReg(/*Segment*/ 0);
  1874. }
  1875. }
  1876. // Step past the call to handle when it returns.
  1877. ++InsertPt;
  1878. // If we didn't pre-compute the expected return address into a register, then
  1879. // red zones are enabled and the return address is still available on the
  1880. // stack immediately after the call. As the very first instruction, we load it
  1881. // into a register.
  1882. if (!ExpectedRetAddrReg) {
  1883. ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
  1884. BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
  1885. .addReg(/*Base*/ X86::RSP)
  1886. .addImm(/*Scale*/ 1)
  1887. .addReg(/*Index*/ 0)
  1888. .addImm(/*Displacement*/ -8) // The stack pointer has been popped, so
  1889. // the return address is 8-bytes past it.
  1890. .addReg(/*Segment*/ 0);
  1891. }
  1892. // Now we extract the callee's predicate state from the stack pointer.
  1893. unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
  1894. // Test the expected return address against our actual address. If we can
  1895. // form this basic block's address as an immediate, this is easy. Otherwise
  1896. // we compute it.
  1897. if (MF.getTarget().getCodeModel() == CodeModel::Small &&
  1898. !Subtarget->isPositionIndependent()) {
  1899. // FIXME: Could we fold this with the load? It would require careful EFLAGS
  1900. // management.
  1901. BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
  1902. .addReg(ExpectedRetAddrReg, RegState::Kill)
  1903. .addSym(RetSymbol);
  1904. } else {
  1905. Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
  1906. BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
  1907. .addReg(/*Base*/ X86::RIP)
  1908. .addImm(/*Scale*/ 1)
  1909. .addReg(/*Index*/ 0)
  1910. .addSym(RetSymbol)
  1911. .addReg(/*Segment*/ 0);
  1912. BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
  1913. .addReg(ExpectedRetAddrReg, RegState::Kill)
  1914. .addReg(ActualRetAddrReg, RegState::Kill);
  1915. }
  1916. // Now conditionally update the predicate state we just extracted if we ended
  1917. // up at a different return address than expected.
  1918. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  1919. auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
  1920. Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  1921. auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
  1922. .addReg(NewStateReg, RegState::Kill)
  1923. .addReg(PS->PoisonReg)
  1924. .addImm(X86::COND_NE);
  1925. CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  1926. ++NumInstsInserted;
  1927. LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
  1928. PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
  1929. }
  1930. /// An attacker may speculatively store over a value that is then speculatively
  1931. /// loaded and used as the target of an indirect call or jump instruction. This
  1932. /// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
  1933. /// in this paper:
  1934. /// https://people.csail.mit.edu/vlk/spectre11.pdf
  1935. ///
  1936. /// When this happens, the speculative execution of the call or jump will end up
  1937. /// being steered to this attacker controlled address. While most such loads
  1938. /// will be adequately hardened already, we want to ensure that they are
  1939. /// definitively treated as needing post-load hardening. While address hardening
  1940. /// is sufficient to prevent secret data from leaking to the attacker, it may
  1941. /// not be sufficient to prevent an attacker from steering speculative
  1942. /// execution. We forcibly unfolded all relevant loads above and so will always
1943. /// have an opportunity to post-load harden here; we just need to scan for cases
  1944. /// not already flagged and add them.
  1945. void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
  1946. MachineInstr &MI,
  1947. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  1948. switch (MI.getOpcode()) {
  1949. case X86::FARCALL16m:
  1950. case X86::FARCALL32m:
  1951. case X86::FARCALL64m:
  1952. case X86::FARJMP16m:
  1953. case X86::FARJMP32m:
  1954. case X86::FARJMP64m:
  1955. // We don't need to harden either far calls or far jumps as they are
  1956. // safe from Spectre.
  1957. return;
  1958. default:
  1959. break;
  1960. }
  1961. // We should never see a loading instruction at this point, as those should
  1962. // have been unfolded.
  1963. assert(!MI.mayLoad() && "Found a lingering loading instruction!");
  1964. // If the first operand isn't a register, this is a branch or call
  1965. // instruction with an immediate operand which doesn't need to be hardened.
  1966. if (!MI.getOperand(0).isReg())
  1967. return;
  1968. // For all of these, the target register is the first operand of the
  1969. // instruction.
  1970. auto &TargetOp = MI.getOperand(0);
  1971. Register OldTargetReg = TargetOp.getReg();
  1972. // Try to lookup a hardened version of this register. We retain a reference
  1973. // here as we want to update the map to track any newly computed hardened
  1974. // register.
  1975. unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];
  1976. // If we don't have a hardened register yet, compute one. Otherwise, just use
  1977. // the already hardened register.
  1978. //
  1979. // FIXME: It is a little suspect that we use partially hardened registers that
  1980. // only feed addresses. The complexity of partial hardening with SHRX
  1981. // continues to pile up. Should definitively measure its value and consider
  1982. // eliminating it.
  1983. if (!HardenedTargetReg)
  1984. HardenedTargetReg = hardenValueInRegister(
  1985. OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
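// Illustrative effect on e.g. an indirect jump (register names are
// placeholders):
//   orq  %state, %target
//   jmpq *%target   # collapses to an all-ones address under misspeculation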
  1986. // Set the target operand to the hardened register.
  1987. TargetOp.setReg(HardenedTargetReg);
  1988. ++NumCallsOrJumpsHardened;
  1989. }
  1990. INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
  1991. "X86 speculative load hardener", false, false)
  1992. INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
  1993. "X86 speculative load hardener", false, false)
  1994. FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  1995. return new X86SpeculativeLoadHardeningPass();
  1996. }