  1. //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. ///
  10. /// Provide a pass which mitigates speculative execution attacks which operate
  11. /// by speculating incorrectly past some predicate (a type check, bounds check,
  12. /// or other condition) to reach a load with invalid inputs and leak the data
  13. /// accessed by that load using a side channel out of the speculative domain.
  14. ///
  15. /// For details on the attacks, see the first variant in both the Project Zero
  16. /// writeup and the Spectre paper:
  17. /// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
  18. /// https://spectreattack.com/spectre.pdf
  19. ///
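/// As a condensed illustration (adapted from the Spectre paper, not taken from
/// this pass's output), the vulnerable pattern looks like:
///
/// ```
/// if (idx < array1_size)            // bounds check may be mispredicted
///   y = array2[array1[idx] * 4096]; // leaks array1[idx] via the cache
/// ```
///
/// The mitigation tracks a "predicate state" value that is all-zeros on
/// correctly predicted paths and all-ones under misspeculation, and uses it to
/// poison load addresses (or loaded values) so misspeculated loads cannot leak.
///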
  20. //===----------------------------------------------------------------------===//
  21. #include "X86.h"
  22. #include "X86InstrBuilder.h"
  23. #include "X86InstrInfo.h"
  24. #include "X86Subtarget.h"
  25. #include "llvm/ADT/ArrayRef.h"
  26. #include "llvm/ADT/DenseMap.h"
  27. #include "llvm/ADT/STLExtras.h"
  28. #include "llvm/ADT/ScopeExit.h"
  29. #include "llvm/ADT/SmallPtrSet.h"
  30. #include "llvm/ADT/SmallSet.h"
  31. #include "llvm/ADT/SmallVector.h"
  32. #include "llvm/ADT/SparseBitVector.h"
  33. #include "llvm/ADT/Statistic.h"
  34. #include "llvm/CodeGen/MachineBasicBlock.h"
  35. #include "llvm/CodeGen/MachineConstantPool.h"
  36. #include "llvm/CodeGen/MachineFunction.h"
  37. #include "llvm/CodeGen/MachineFunctionPass.h"
  38. #include "llvm/CodeGen/MachineInstr.h"
  39. #include "llvm/CodeGen/MachineInstrBuilder.h"
  40. #include "llvm/CodeGen/MachineModuleInfo.h"
  41. #include "llvm/CodeGen/MachineOperand.h"
  42. #include "llvm/CodeGen/MachineRegisterInfo.h"
  43. #include "llvm/CodeGen/MachineSSAUpdater.h"
  44. #include "llvm/CodeGen/TargetInstrInfo.h"
  45. #include "llvm/CodeGen/TargetRegisterInfo.h"
  46. #include "llvm/CodeGen/TargetSchedule.h"
  47. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  48. #include "llvm/IR/DebugLoc.h"
  49. #include "llvm/MC/MCSchedule.h"
  50. #include "llvm/Pass.h"
  51. #include "llvm/Support/CommandLine.h"
  52. #include "llvm/Support/Debug.h"
  53. #include "llvm/Support/raw_ostream.h"
  54. #include "llvm/Target/TargetMachine.h"
  55. #include <algorithm>
  56. #include <cassert>
  57. #include <iterator>
  58. #include <optional>
  59. #include <utility>
  60. using namespace llvm;
  61. #define PASS_KEY "x86-slh"
  62. #define DEBUG_TYPE PASS_KEY
  63. STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
  64. STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
  65. STATISTIC(NumAddrRegsHardened,
  66. "Number of address mode used registers hardaned");
  67. STATISTIC(NumPostLoadRegsHardened,
  68. "Number of post-load register values hardened");
  69. STATISTIC(NumCallsOrJumpsHardened,
  70. "Number of calls or jumps requiring extra hardening");
  71. STATISTIC(NumInstsInserted, "Number of instructions inserted");
  72. STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
  73. static cl::opt<bool> EnableSpeculativeLoadHardening(
  74. "x86-speculative-load-hardening",
  75. cl::desc("Force enable speculative load hardening"), cl::init(false),
  76. cl::Hidden);
  77. static cl::opt<bool> HardenEdgesWithLFENCE(
  78. PASS_KEY "-lfence",
  79. cl::desc(
  80. "Use LFENCE along each conditional edge to harden against speculative "
  81. "loads rather than conditional movs and poisoned pointers."),
  82. cl::init(false), cl::Hidden);
  83. static cl::opt<bool> EnablePostLoadHardening(
  84. PASS_KEY "-post-load",
  85. cl::desc("Harden the value loaded *after* it is loaded by "
  86. "flushing the loaded bits to 1. This is hard to do "
  87. "in general but can be done easily for GPRs."),
  88. cl::init(true), cl::Hidden);
  89. static cl::opt<bool> FenceCallAndRet(
  90. PASS_KEY "-fence-call-and-ret",
  91. cl::desc("Use a full speculation fence to harden both call and ret edges "
  92. "rather than a lighter weight mitigation."),
  93. cl::init(false), cl::Hidden);
  94. static cl::opt<bool> HardenInterprocedurally(
  95. PASS_KEY "-ip",
  96. cl::desc("Harden interprocedurally by passing our state in and out of "
  97. "functions in the high bits of the stack pointer."),
  98. cl::init(true), cl::Hidden);
  99. static cl::opt<bool>
  100. HardenLoads(PASS_KEY "-loads",
101. cl::desc("Sanitize loads from memory. When disabled, no "
  102. "significant security is provided."),
  103. cl::init(true), cl::Hidden);
  104. static cl::opt<bool> HardenIndirectCallsAndJumps(
  105. PASS_KEY "-indirect",
  106. cl::desc("Harden indirect calls and jumps against using speculatively "
  107. "stored attacker controlled addresses. This is designed to "
  108. "mitigate Spectre v1.2 style attacks."),
  109. cl::init(true), cl::Hidden);
  110. namespace {
  111. class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
  112. public:
  113. X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }
  114. StringRef getPassName() const override {
  115. return "X86 speculative load hardening";
  116. }
  117. bool runOnMachineFunction(MachineFunction &MF) override;
  118. void getAnalysisUsage(AnalysisUsage &AU) const override;
  119. /// Pass identification, replacement for typeid.
  120. static char ID;
  121. private:
  122. /// The information about a block's conditional terminators needed to trace
  123. /// our predicate state through the exiting edges.
  124. struct BlockCondInfo {
  125. MachineBasicBlock *MBB;
  126. // We mostly have one conditional branch, and in extremely rare cases have
  127. // two. Three and more are so rare as to be unimportant for compile time.
  128. SmallVector<MachineInstr *, 2> CondBrs;
  129. MachineInstr *UncondBr;
  130. };
  131. /// Manages the predicate state traced through the program.
  132. struct PredState {
  133. unsigned InitialReg = 0;
  134. unsigned PoisonReg = 0;
  135. const TargetRegisterClass *RC;
  136. MachineSSAUpdater SSA;
  137. PredState(MachineFunction &MF, const TargetRegisterClass *RC)
  138. : RC(RC), SSA(MF) {}
  139. };
  140. const X86Subtarget *Subtarget = nullptr;
  141. MachineRegisterInfo *MRI = nullptr;
  142. const X86InstrInfo *TII = nullptr;
  143. const TargetRegisterInfo *TRI = nullptr;
  144. std::optional<PredState> PS;
  145. void hardenEdgesWithLFENCE(MachineFunction &MF);
  146. SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
  147. SmallVector<MachineInstr *, 16>
  148. tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
  149. void unfoldCallAndJumpLoads(MachineFunction &MF);
  150. SmallVector<MachineInstr *, 16>
  151. tracePredStateThroughIndirectBranches(MachineFunction &MF);
  152. void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
  153. unsigned saveEFLAGS(MachineBasicBlock &MBB,
  154. MachineBasicBlock::iterator InsertPt,
  155. const DebugLoc &Loc);
  156. void restoreEFLAGS(MachineBasicBlock &MBB,
  157. MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
  158. Register Reg);
  159. void mergePredStateIntoSP(MachineBasicBlock &MBB,
  160. MachineBasicBlock::iterator InsertPt,
  161. const DebugLoc &Loc, unsigned PredStateReg);
  162. unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
  163. MachineBasicBlock::iterator InsertPt,
  164. const DebugLoc &Loc);
  165. void
  166. hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
  167. MachineOperand &IndexMO,
  168. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
  169. MachineInstr *
  170. sinkPostLoadHardenedInst(MachineInstr &MI,
  171. SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
  172. bool canHardenRegister(Register Reg);
  173. unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
  174. MachineBasicBlock::iterator InsertPt,
  175. const DebugLoc &Loc);
  176. unsigned hardenPostLoad(MachineInstr &MI);
  177. void hardenReturnInstr(MachineInstr &MI);
  178. void tracePredStateThroughCall(MachineInstr &MI);
  179. void hardenIndirectCallOrJumpInstr(
  180. MachineInstr &MI,
  181. SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
  182. };
  183. } // end anonymous namespace
  184. char X86SpeculativeLoadHardeningPass::ID = 0;
  185. void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
  186. AnalysisUsage &AU) const {
  187. MachineFunctionPass::getAnalysisUsage(AU);
  188. }
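/// Split the edge from \p MBB to \p Succ by inserting a new block between
/// them, updating branches, CFG successor lists, PHI operands, and live-ins
/// along the way.
///
/// A rough sketch of the transformation (block names are illustrative only):
///
/// ```
///   MBB --(cond edge)--> Succ      becomes      MBB --> NewMBB --> Succ
/// ```
///
/// The new block is where the caller will place hardening code for this edge.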
  189. static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
  190. MachineBasicBlock &Succ, int SuccCount,
  191. MachineInstr *Br, MachineInstr *&UncondBr,
  192. const X86InstrInfo &TII) {
  193. assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
  194. MachineFunction &MF = *MBB.getParent();
  195. MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
  196. // We have to insert the new block immediately after the current one as we
  197. // don't know what layout-successor relationships the successor has and we
  198. // may not be able to (and generally don't want to) try to fix those up.
  199. MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
  200. // Update the branch instruction if necessary.
  201. if (Br) {
  202. assert(Br->getOperand(0).getMBB() == &Succ &&
  203. "Didn't start with the right target!");
  204. Br->getOperand(0).setMBB(&NewMBB);
  205. // If this successor was reached through a branch rather than fallthrough,
  206. // we might have *broken* fallthrough and so need to inject a new
  207. // unconditional branch.
  208. if (!UncondBr) {
  209. MachineBasicBlock &OldLayoutSucc =
  210. *std::next(MachineFunction::iterator(&NewMBB));
  211. assert(MBB.isSuccessor(&OldLayoutSucc) &&
  212. "Without an unconditional branch, the old layout successor should "
  213. "be an actual successor!");
  214. auto BrBuilder =
  215. BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
  216. // Update the unconditional branch now that we've added one.
  217. UncondBr = &*BrBuilder;
  218. }
  219. // Insert unconditional "jump Succ" instruction in the new block if
  220. // necessary.
  221. if (!NewMBB.isLayoutSuccessor(&Succ)) {
  222. SmallVector<MachineOperand, 4> Cond;
  223. TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
  224. }
  225. } else {
  226. assert(!UncondBr &&
  227. "Cannot have a branchless successor and an unconditional branch!");
  228. assert(NewMBB.isLayoutSuccessor(&Succ) &&
  229. "A non-branch successor must have been a layout successor before "
  230. "and now is a layout successor of the new block.");
  231. }
  232. // If this is the only edge to the successor, we can just replace it in the
  233. // CFG. Otherwise we need to add a new entry in the CFG for the new
  234. // successor.
  235. if (SuccCount == 1) {
  236. MBB.replaceSuccessor(&Succ, &NewMBB);
  237. } else {
  238. MBB.splitSuccessor(&Succ, &NewMBB);
  239. }
  240. // Hook up the edge from the new basic block to the old successor in the CFG.
  241. NewMBB.addSuccessor(&Succ);
  242. // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
  243. for (MachineInstr &MI : Succ) {
  244. if (!MI.isPHI())
  245. break;
  246. for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  247. OpIdx += 2) {
  248. MachineOperand &OpV = MI.getOperand(OpIdx);
  249. MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
  250. assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
  251. if (OpMBB.getMBB() != &MBB)
  252. continue;
253. // If this is the last edge to the successor, just replace MBB in the PHI.
  254. if (SuccCount == 1) {
  255. OpMBB.setMBB(&NewMBB);
  256. break;
  257. }
  258. // Otherwise, append a new pair of operands for the new incoming edge.
  259. MI.addOperand(MF, OpV);
  260. MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
  261. break;
  262. }
  263. }
  264. // Inherit live-ins from the successor
  265. for (auto &LI : Succ.liveins())
  266. NewMBB.addLiveIn(LI);
  267. LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
  268. << Succ.getName() << "'.\n");
  269. return NewMBB;
  270. }
271. /// Remove duplicate PHI operands to leave the PHI in a canonical and
  272. /// predictable form.
  273. ///
  274. /// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
  275. /// isn't what you might expect. We may have multiple entries in PHI nodes for
  276. /// a single predecessor. This makes CFG-updating extremely complex, so here we
  277. /// simplify all PHI nodes to a model even simpler than the IR's model: exactly
  278. /// one entry per predecessor, regardless of how many edges there are.
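///
/// For example (illustrative MIR, not actual output), a PHI carrying two
/// entries for the same predecessor:
///
/// ```
///   %v = PHI %a, %bb.1, %b, %bb.2, %b, %bb.2
/// ```
///
/// is rewritten so that %bb.2 contributes exactly one entry:
///
/// ```
///   %v = PHI %a, %bb.1, %b, %bb.2
/// ```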
  279. static void canonicalizePHIOperands(MachineFunction &MF) {
  280. SmallPtrSet<MachineBasicBlock *, 4> Preds;
  281. SmallVector<int, 4> DupIndices;
  282. for (auto &MBB : MF)
  283. for (auto &MI : MBB) {
  284. if (!MI.isPHI())
  285. break;
286. // First we scan the operands of the PHI looking for duplicate entries for
  287. // a particular predecessor. We retain the operand index of each duplicate
  288. // entry found.
  289. for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  290. OpIdx += 2)
  291. if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
  292. DupIndices.push_back(OpIdx);
  293. // Now walk the duplicate indices, removing both the block and value. Note
  294. // that these are stored as a vector making this element-wise removal
  296. // potentially quadratic.
  297. //
  298. // FIXME: It is really frustrating that we have to use a quadratic
  299. // removal algorithm here. There should be a better way, but the use-def
  300. // updates required make that impossible using the public API.
  301. //
  302. // Note that we have to process these backwards so that we don't
  303. // invalidate other indices with each removal.
  304. while (!DupIndices.empty()) {
  305. int OpIdx = DupIndices.pop_back_val();
  306. // Remove both the block and value operand, again in reverse order to
  307. // preserve indices.
  308. MI.removeOperand(OpIdx + 1);
  309. MI.removeOperand(OpIdx);
  310. }
  311. Preds.clear();
  312. }
  313. }
  314. /// Helper to scan a function for loads vulnerable to misspeculation that we
  315. /// want to harden.
  316. ///
  317. /// We use this to avoid making changes to functions where there is nothing we
  318. /// need to do to harden against misspeculation.
  319. static bool hasVulnerableLoad(MachineFunction &MF) {
  320. for (MachineBasicBlock &MBB : MF) {
  321. for (MachineInstr &MI : MBB) {
  322. // Loads within this basic block after an LFENCE are not at risk of
  323. // speculatively executing with invalid predicates from prior control
  324. // flow. So break out of this block but continue scanning the function.
  325. if (MI.getOpcode() == X86::LFENCE)
  326. break;
  327. // Looking for loads only.
  328. if (!MI.mayLoad())
  329. continue;
  330. // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
  331. if (MI.getOpcode() == X86::MFENCE)
  332. continue;
  333. // We found a load.
  334. return true;
  335. }
  336. }
  337. // No loads found.
  338. return false;
  339. }
  340. bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
  341. MachineFunction &MF) {
  342. LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
  343. << " **********\n");
344. // Only run if this pass is force-enabled or we detect the relevant function
  345. // attribute requesting SLH.
  346. if (!EnableSpeculativeLoadHardening &&
  347. !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
  348. return false;
  349. Subtarget = &MF.getSubtarget<X86Subtarget>();
  350. MRI = &MF.getRegInfo();
  351. TII = Subtarget->getInstrInfo();
  352. TRI = Subtarget->getRegisterInfo();
  353. // FIXME: Support for 32-bit.
  354. PS.emplace(MF, &X86::GR64_NOSPRegClass);
  355. if (MF.begin() == MF.end())
  356. // Nothing to do for a degenerate empty function...
  357. return false;
  358. // We support an alternative hardening technique based on a debug flag.
  359. if (HardenEdgesWithLFENCE) {
  360. hardenEdgesWithLFENCE(MF);
  361. return true;
  362. }
  363. // Create a dummy debug loc to use for all the generated code here.
  364. DebugLoc Loc;
  365. MachineBasicBlock &Entry = *MF.begin();
  366. auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
  367. // Do a quick scan to see if we have any checkable loads.
  368. bool HasVulnerableLoad = hasVulnerableLoad(MF);
  369. // See if we have any conditional branching blocks that we will need to trace
  370. // predicate state through.
  371. SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
  372. // If we have no interesting conditions or loads, nothing to do here.
  373. if (!HasVulnerableLoad && Infos.empty())
  374. return true;
  375. // The poison value is required to be an all-ones value for many aspects of
  376. // this mitigation.
  377. const int PoisonVal = -1;
  378. PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
  379. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
  380. .addImm(PoisonVal);
  381. ++NumInstsInserted;
  382. // If we have loads being hardened and we've asked for call and ret edges to
  383. // get a full fence-based mitigation, inject that fence.
  384. if (HasVulnerableLoad && FenceCallAndRet) {
  385. // We need to insert an LFENCE at the start of the function to suspend any
  386. // incoming misspeculation from the caller. This helps two-fold: the caller
  387. // may not have been protected as this code has been, and this code gets to
  388. // not take any specific action to protect across calls.
  389. // FIXME: We could skip this for functions which unconditionally return
  390. // a constant.
  391. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
  392. ++NumInstsInserted;
  393. ++NumLFENCEsInserted;
  394. }
  395. // If we guarded the entry with an LFENCE and have no conditionals to protect
  396. // in blocks, then we're done.
  397. if (FenceCallAndRet && Infos.empty())
  398. // We may have changed the function's code at this point to insert fences.
  399. return true;
400. // Establish the initial predicate state for the entry block.
  401. if (HardenInterprocedurally && !FenceCallAndRet) {
  402. // Set up the predicate state by extracting it from the incoming stack
  403. // pointer so we pick up any misspeculation in our caller.
  404. PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
  405. } else {
  406. // Otherwise, just build the predicate state itself by zeroing a register
  407. // as we don't need any initial state.
  408. PS->InitialReg = MRI->createVirtualRegister(PS->RC);
  409. Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  410. auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
  411. PredStateSubReg);
  412. ++NumInstsInserted;
  413. MachineOperand *ZeroEFLAGSDefOp =
  414. ZeroI->findRegisterDefOperand(X86::EFLAGS);
  415. assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
  416. "Must have an implicit def of EFLAGS!");
  417. ZeroEFLAGSDefOp->setIsDead(true);
  418. BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
  419. PS->InitialReg)
  420. .addImm(0)
  421. .addReg(PredStateSubReg)
  422. .addImm(X86::sub_32bit);
  423. }
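// Roughly speaking (illustrative assembly; the pass actually emits MIR with
// virtual registers), the two initialization forms above correspond to:
//
// ```
//   movq %rsp, %rax
//   sarq $63, %rax      # interprocedural: recover the state from RSP's high bit
// ```
// versus
// ```
//   xorl %eax, %eax     # local-only: start from a zero (non-poisoned) state
// ```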
  424. // We're going to need to trace predicate state throughout the function's
  425. // CFG. Prepare for this by setting up our initial state of PHIs with unique
  426. // predecessor entries and all the initial predicate state.
  427. canonicalizePHIOperands(MF);
  428. // Track the updated values in an SSA updater to rewrite into SSA form at the
  429. // end.
  430. PS->SSA.Initialize(PS->InitialReg);
  431. PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
  432. // Trace through the CFG.
  433. auto CMovs = tracePredStateThroughCFG(MF, Infos);
  434. // We may also enter basic blocks in this function via exception handling
  435. // control flow. Here, if we are hardening interprocedurally, we need to
  436. // re-capture the predicate state from the throwing code. In the Itanium ABI,
  437. // the throw will always look like a call to __cxa_throw and will have the
  438. // predicate state in the stack pointer, so extract fresh predicate state from
  439. // the stack pointer and make it available in SSA.
  440. // FIXME: Handle non-itanium ABI EH models.
  441. if (HardenInterprocedurally) {
  442. for (MachineBasicBlock &MBB : MF) {
  443. assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
  444. assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
  445. assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
  446. if (!MBB.isEHPad())
  447. continue;
  448. PS->SSA.AddAvailableValue(
  449. &MBB,
  450. extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
  451. }
  452. }
  453. if (HardenIndirectCallsAndJumps) {
  454. // If we are going to harden calls and jumps we need to unfold their memory
  455. // operands.
  456. unfoldCallAndJumpLoads(MF);
  457. // Then we trace predicate state through the indirect branches.
  458. auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
  459. CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
  460. }
  461. // Now that we have the predicate state available at the start of each block
  462. // in the CFG, trace it through each block, hardening vulnerable instructions
  463. // as we go.
  464. tracePredStateThroughBlocksAndHarden(MF);
  465. // Now rewrite all the uses of the pred state using the SSA updater to insert
  466. // PHIs connecting the state between blocks along the CFG edges.
  467. for (MachineInstr *CMovI : CMovs)
  468. for (MachineOperand &Op : CMovI->operands()) {
  469. if (!Op.isReg() || Op.getReg() != PS->InitialReg)
  470. continue;
  471. PS->SSA.RewriteUse(Op);
  472. }
  473. LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
  474. dbgs() << "\n"; MF.verify(this));
  475. return true;
  476. }
  477. /// Implements the naive hardening approach of putting an LFENCE after every
  478. /// potentially mis-predicted control flow construct.
  479. ///
  480. /// We include this as an alternative mostly for the purpose of comparison. The
  481. /// performance impact of this is expected to be extremely severe and not
  482. /// practical for any real-world users.
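///
/// As a sketch of the effect (assembly is illustrative), for a block ending in
/// `jne .LBB0_2`, both the fallthrough block and `.LBB0_2` are made to begin
/// with an `lfence`, so no load in either successor can execute until the
/// branch condition has actually resolved.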
  483. void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
  484. MachineFunction &MF) {
  485. // First, we scan the function looking for blocks that are reached along edges
  486. // that we might want to harden.
  487. SmallSetVector<MachineBasicBlock *, 8> Blocks;
  488. for (MachineBasicBlock &MBB : MF) {
  489. // If there are no or only one successor, nothing to do here.
  490. if (MBB.succ_size() <= 1)
  491. continue;
  492. // Skip blocks unless their terminators start with a branch. Other
  493. // terminators don't seem interesting for guarding against misspeculation.
  494. auto TermIt = MBB.getFirstTerminator();
  495. if (TermIt == MBB.end() || !TermIt->isBranch())
  496. continue;
497. // Add all the non-EH-pad successors to the blocks we want to harden. We
  498. // skip EH pads because there isn't really a condition of interest on
  499. // entering.
  500. for (MachineBasicBlock *SuccMBB : MBB.successors())
  501. if (!SuccMBB->isEHPad())
  502. Blocks.insert(SuccMBB);
  503. }
  504. for (MachineBasicBlock *MBB : Blocks) {
  505. auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
  506. BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
  507. ++NumInstsInserted;
  508. ++NumLFENCEsInserted;
  509. }
  510. }
  511. SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
  512. X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
  513. SmallVector<BlockCondInfo, 16> Infos;
  514. // Walk the function and build up a summary for each block's conditions that
  515. // we need to trace through.
  516. for (MachineBasicBlock &MBB : MF) {
  517. // If there are no or only one successor, nothing to do here.
  518. if (MBB.succ_size() <= 1)
  519. continue;
  520. // We want to reliably handle any conditional branch terminators in the
  521. // MBB, so we manually analyze the branch. We can handle all of the
522. // permutations here, including ones that analyzeBranch cannot.
  523. //
  524. // The approach is to walk backwards across the terminators, resetting at
  525. // any unconditional non-indirect branch, and track all conditional edges
  526. // to basic blocks as well as the fallthrough or unconditional successor
  527. // edge. For each conditional edge, we track the target and the opposite
  528. // condition code in order to inject a "no-op" cmov into that successor
  529. // that will harden the predicate. For the fallthrough/unconditional
  530. // edge, we inject a separate cmov for each conditional branch with
  531. // matching condition codes. This effectively implements an "and" of the
  532. // condition flags, even if there isn't a single condition flag that would
  533. // directly implement that. We don't bother trying to optimize either of
  534. // these cases because if such an optimization is possible, LLVM should
  535. // have optimized the conditional *branches* in that way already to reduce
  536. // instruction count. This late, we simply assume the minimal number of
  537. // branch instructions is being emitted and use that to guide our cmov
  538. // insertion.
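//
// As a concrete (illustrative) example, for a block whose terminators are:
//
// ```
//   jb  .LBB0_2
//   jmp .LBB0_3
// ```
//
// we would record the `jb` in `CondBrs` and the `jmp` in `UncondBr`.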
  539. BlockCondInfo Info = {&MBB, {}, nullptr};
  540. // Now walk backwards through the terminators and build up successors they
  541. // reach and the conditions.
  542. for (MachineInstr &MI : llvm::reverse(MBB)) {
  543. // Once we've handled all the terminators, we're done.
  544. if (!MI.isTerminator())
  545. break;
  546. // If we see a non-branch terminator, we can't handle anything so bail.
  547. if (!MI.isBranch()) {
  548. Info.CondBrs.clear();
  549. break;
  550. }
  551. // If we see an unconditional branch, reset our state, clear any
552. // fallthrough, and set this as the "else" successor.
  553. if (MI.getOpcode() == X86::JMP_1) {
  554. Info.CondBrs.clear();
  555. Info.UncondBr = &MI;
  556. continue;
  557. }
  558. // If we get an invalid condition, we have an indirect branch or some
  559. // other unanalyzable "fallthrough" case. We model this as a nullptr for
  560. // the destination so we can still guard any conditional successors.
  561. // Consider code sequences like:
  562. // ```
  563. // jCC L1
  564. // jmpq *%rax
  565. // ```
  566. // We still want to harden the edge to `L1`.
  567. if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
  568. Info.CondBrs.clear();
  569. Info.UncondBr = &MI;
  570. continue;
  571. }
  572. // We have a vanilla conditional branch, add it to our list.
  573. Info.CondBrs.push_back(&MI);
  574. }
  575. if (Info.CondBrs.empty()) {
  576. ++NumBranchesUntraced;
  577. LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
  578. MBB.dump());
  579. continue;
  580. }
  581. Infos.push_back(Info);
  582. }
  583. return Infos;
  584. }
  585. /// Trace the predicate state through the CFG, instrumenting each conditional
  586. /// branch such that misspeculation through an edge will poison the predicate
  587. /// state.
  588. ///
  589. /// Returns the list of inserted CMov instructions so that they can have their
  590. /// uses of the predicate state rewritten into proper SSA form once it is
  591. /// complete.
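///
/// For instance (illustrative), if a block ends in:
///
/// ```
///   jb  .LBB0_2
///   ja  .LBB0_3
///   jmp .LBB0_4
/// ```
///
/// then .LBB0_2 receives a CMOVAE of the poison value, .LBB0_3 a CMOVBE, and
/// the unconditional successor .LBB0_4 receives both a CMOVB and a CMOVA,
/// which together poison the state whenever either taken-condition actually
/// held and .LBB0_4 was only reached by misspeculating.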
  592. SmallVector<MachineInstr *, 16>
  593. X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
  594. MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
  595. // Collect the inserted cmov instructions so we can rewrite their uses of the
  596. // predicate state into SSA form.
  597. SmallVector<MachineInstr *, 16> CMovs;
  598. // Now walk all of the basic blocks looking for ones that end in conditional
  599. // jumps where we need to update this register along each edge.
  600. for (const BlockCondInfo &Info : Infos) {
  601. MachineBasicBlock &MBB = *Info.MBB;
  602. const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
  603. MachineInstr *UncondBr = Info.UncondBr;
  604. LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
  605. << "\n");
  606. ++NumCondBranchesTraced;
  607. // Compute the non-conditional successor as either the target of any
  608. // unconditional branch or the layout successor.
  609. MachineBasicBlock *UncondSucc =
  610. UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
  611. ? UncondBr->getOperand(0).getMBB()
  612. : nullptr)
  613. : &*std::next(MachineFunction::iterator(&MBB));
  614. // Count how many edges there are to any given successor.
  615. SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
  616. if (UncondSucc)
  617. ++SuccCounts[UncondSucc];
  618. for (auto *CondBr : CondBrs)
  619. ++SuccCounts[CondBr->getOperand(0).getMBB()];
  620. // A lambda to insert cmov instructions into a block checking all of the
  621. // condition codes in a sequence.
  622. auto BuildCheckingBlockForSuccAndConds =
  623. [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
  624. MachineInstr *Br, MachineInstr *&UncondBr,
  625. ArrayRef<X86::CondCode> Conds) {
  626. // First, we split the edge to insert the checking block into a safe
  627. // location.
  628. auto &CheckingMBB =
  629. (SuccCount == 1 && Succ.pred_size() == 1)
  630. ? Succ
  631. : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
  632. bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
  633. if (!LiveEFLAGS)
  634. CheckingMBB.addLiveIn(X86::EFLAGS);
  635. // Now insert the cmovs to implement the checks.
  636. auto InsertPt = CheckingMBB.begin();
  637. assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
  638. "Should never have a PHI in the initial checking block as it "
  639. "always has a single predecessor!");
  640. // We will wire each cmov to each other, but need to start with the
  641. // incoming pred state.
  642. unsigned CurStateReg = PS->InitialReg;
  643. for (X86::CondCode Cond : Conds) {
  644. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  645. auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
  646. Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  647. // Note that we intentionally use an empty debug location so that
  648. // this picks up the preceding location.
  649. auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
  650. TII->get(CMovOp), UpdatedStateReg)
  651. .addReg(CurStateReg)
  652. .addReg(PS->PoisonReg)
  653. .addImm(Cond);
  654. // If this is the last cmov and the EFLAGS weren't originally
  655. // live-in, mark them as killed.
  656. if (!LiveEFLAGS && Cond == Conds.back())
  657. CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  658. ++NumInstsInserted;
  659. LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
  660. dbgs() << "\n");
  661. // The first one of the cmovs will be using the top level
662. // `PredStateReg` and needs to get rewritten into SSA form.
  663. if (CurStateReg == PS->InitialReg)
  664. CMovs.push_back(&*CMovI);
  665. // The next cmov should start from this one's def.
  666. CurStateReg = UpdatedStateReg;
  667. }
  668. // And put the last one into the available values for SSA form of our
  669. // predicate state.
  670. PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
  671. };
  672. std::vector<X86::CondCode> UncondCodeSeq;
  673. for (auto *CondBr : CondBrs) {
  674. MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
  675. int &SuccCount = SuccCounts[&Succ];
  676. X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
  677. X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
  678. UncondCodeSeq.push_back(Cond);
  679. BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
  680. {InvCond});
  681. // Decrement the successor count now that we've split one of the edges.
  682. // We need to keep the count of edges to the successor accurate in order
  683. // to know above when to *replace* the successor in the CFG vs. just
  684. // adding the new successor.
  685. --SuccCount;
  686. }
  687. // Since we may have split edges and changed the number of successors,
  688. // normalize the probabilities. This avoids doing it each time we split an
  689. // edge.
  690. MBB.normalizeSuccProbs();
  691. // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
  692. // need to intersect the other condition codes. We can do this by just
  693. // doing a cmov for each one.
  694. if (!UncondSucc)
  695. // If we have no fallthrough to protect (perhaps it is an indirect jump?)
  696. // just skip this and continue.
  697. continue;
  698. assert(SuccCounts[UncondSucc] == 1 &&
  699. "We should never have more than one edge to the unconditional "
  700. "successor at this point because every other edge must have been "
  701. "split above!");
  702. // Sort and unique the codes to minimize them.
  703. llvm::sort(UncondCodeSeq);
  704. UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
  705. UncondCodeSeq.end());
  706. // Build a checking version of the successor.
  707. BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
  708. UncondBr, UncondBr, UncondCodeSeq);
  709. }
  710. return CMovs;
  711. }
  712. /// Compute the register class for the unfolded load.
  713. ///
  714. /// FIXME: This should probably live in X86InstrInfo, potentially by adding
  715. /// a way to unfold into a newly created vreg rather than requiring a register
  716. /// input.
  717. static const TargetRegisterClass *
  718. getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
  719. unsigned Opcode) {
  720. unsigned Index;
  721. unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
  722. Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
  723. const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
  724. return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
  725. }
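/// Unfold any memory-operand calls and jumps so that the load becomes a
/// separate, hardenable instruction.
///
/// Sketch of the rewrite (illustrative assembly; the pass works on MIR and
/// uses a fresh virtual register rather than a fixed one):
///
/// ```
///   callq *8(%rdi)
/// ```
/// becomes
/// ```
///   movq 8(%rdi), %rcx
///   callq *%rcx
/// ```
///
/// The loaded target then flows through a register and can be hardened like
/// any other loaded value before the indirect transfer of control.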
  726. void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
  727. MachineFunction &MF) {
  728. for (MachineBasicBlock &MBB : MF)
  729. // We use make_early_inc_range here so we can remove instructions if needed
  730. // without disturbing the iteration.
  731. for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
  732. // Must either be a call or a branch.
  733. if (!MI.isCall() && !MI.isBranch())
  734. continue;
  735. // We only care about loading variants of these instructions.
  736. if (!MI.mayLoad())
  737. continue;
  738. switch (MI.getOpcode()) {
  739. default: {
  740. LLVM_DEBUG(
  741. dbgs() << "ERROR: Found an unexpected loading branch or call "
  742. "instruction:\n";
  743. MI.dump(); dbgs() << "\n");
  744. report_fatal_error("Unexpected loading branch or call!");
  745. }
  746. case X86::FARCALL16m:
  747. case X86::FARCALL32m:
  748. case X86::FARCALL64m:
  749. case X86::FARJMP16m:
  750. case X86::FARJMP32m:
  751. case X86::FARJMP64m:
  752. // We cannot mitigate far jumps or calls, but we also don't expect them
  753. // to be vulnerable to Spectre v1.2 style attacks.
  754. continue;
  755. case X86::CALL16m:
  756. case X86::CALL16m_NT:
  757. case X86::CALL32m:
  758. case X86::CALL32m_NT:
  759. case X86::CALL64m:
  760. case X86::CALL64m_NT:
  761. case X86::JMP16m:
  762. case X86::JMP16m_NT:
  763. case X86::JMP32m:
  764. case X86::JMP32m_NT:
  765. case X86::JMP64m:
  766. case X86::JMP64m_NT:
  767. case X86::TAILJMPm64:
  768. case X86::TAILJMPm64_REX:
  769. case X86::TAILJMPm:
  770. case X86::TCRETURNmi64:
  771. case X86::TCRETURNmi: {
  772. // Use the generic unfold logic now that we know we're dealing with
  773. // expected instructions.
  774. // FIXME: We don't have test coverage for all of these!
  775. auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
  776. if (!UnfoldedRC) {
  777. LLVM_DEBUG(dbgs()
  778. << "ERROR: Unable to unfold load from instruction:\n";
  779. MI.dump(); dbgs() << "\n");
  780. report_fatal_error("Unable to unfold load!");
  781. }
  782. Register Reg = MRI->createVirtualRegister(UnfoldedRC);
  783. SmallVector<MachineInstr *, 2> NewMIs;
  784. // If we were able to compute an unfolded reg class, any failure here
  785. // is just a programming error so just assert.
  786. bool Unfolded =
  787. TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
  788. /*UnfoldStore*/ false, NewMIs);
  789. (void)Unfolded;
  790. assert(Unfolded &&
  791. "Computed unfolded register class but failed to unfold");
  792. // Now stitch the new instructions into place and erase the old one.
  793. for (auto *NewMI : NewMIs)
  794. MBB.insert(MI.getIterator(), NewMI);
  795. // Update the call site info.
  796. if (MI.isCandidateForCallSiteEntry())
  797. MF.eraseCallSiteInfo(&MI);
  798. MI.eraseFromParent();
  799. LLVM_DEBUG({
  800. dbgs() << "Unfolded load successfully into:\n";
  801. for (auto *NewMI : NewMIs) {
  802. NewMI->dump();
  803. dbgs() << "\n";
  804. }
  805. });
  806. continue;
  807. }
  808. }
  809. llvm_unreachable("Escaped switch with default!");
  810. }
  811. }
  812. /// Trace the predicate state through indirect branches, instrumenting them to
  813. /// poison the state if a target is reached that does not match the expected
  814. /// target.
  815. ///
  816. /// This is designed to mitigate Spectre variant 1 attacks where an indirect
  817. /// branch is trained to predict a particular target and then mispredicts that
  818. /// target in a way that can leak data. Despite using an indirect branch, this
  819. /// is really a variant 1 style attack: it does not steer execution to an
  820. /// arbitrary or attacker controlled address, and it does not require any
  821. /// special code executing next to the victim. This attack can also be mitigated
  822. /// through retpolines, but those require either replacing indirect branches
  823. /// with conditional direct branches or lowering them through a device that
  824. /// blocks speculation. This mitigation can replace these retpoline-style
  825. /// mitigations for jump tables and other indirect branches within a function
  826. /// when variant 2 isn't a risk while allowing limited speculation. Indirect
  827. /// calls, however, cannot be mitigated through this technique without changing
  828. /// the ABI in a fundamental way.
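///
/// The check inserted at each target block looks roughly like (illustrative
/// assembly; register names are placeholders for virtual registers):
///
/// ```
/// .LBB0_target:
///   leaq .LBB0_target(%rip), %rcx   # or a direct immediate when possible
///   cmpq %rcx, %rdx                 # %rdx holds the branch's actual target
///   cmovneq %poison, %state         # wrong target reached => poison the state
/// ```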
  829. SmallVector<MachineInstr *, 16>
  830. X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
  831. MachineFunction &MF) {
  832. // We use the SSAUpdater to insert PHI nodes for the target addresses of
  833. // indirect branches. We don't actually need the full power of the SSA updater
  834. // in this particular case as we always have immediately available values, but
  835. // this avoids us having to re-implement the PHI construction logic.
  836. MachineSSAUpdater TargetAddrSSA(MF);
  837. TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
  838. // Track which blocks were terminated with an indirect branch.
  839. SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;
  840. // We need to know what blocks end up reached via indirect branches. We
  841. // expect this to be a subset of those whose address is taken and so track it
  842. // directly via the CFG.
  843. SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;
  844. // Walk all the blocks which end in an indirect branch and make the
  845. // target address available.
  846. for (MachineBasicBlock &MBB : MF) {
  847. // Find the last terminator.
  848. auto MII = MBB.instr_rbegin();
  849. while (MII != MBB.instr_rend() && MII->isDebugInstr())
  850. ++MII;
  851. if (MII == MBB.instr_rend())
  852. continue;
  853. MachineInstr &TI = *MII;
  854. if (!TI.isTerminator() || !TI.isBranch())
  855. // No terminator or non-branch terminator.
  856. continue;
  857. unsigned TargetReg;
  858. switch (TI.getOpcode()) {
  859. default:
  860. // Direct branch or conditional branch (leading to fallthrough).
  861. continue;
  862. case X86::FARJMP16m:
  863. case X86::FARJMP32m:
  864. case X86::FARJMP64m:
  865. // We cannot mitigate far jumps or calls, but we also don't expect them
  866. // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
  867. continue;
  868. case X86::JMP16m:
  869. case X86::JMP16m_NT:
  870. case X86::JMP32m:
  871. case X86::JMP32m_NT:
  872. case X86::JMP64m:
  873. case X86::JMP64m_NT:
  874. // Mostly as documentation.
  875. report_fatal_error("Memory operand jumps should have been unfolded!");
  876. case X86::JMP16r:
  877. report_fatal_error(
  878. "Support for 16-bit indirect branches is not implemented.");
  879. case X86::JMP32r:
  880. report_fatal_error(
  881. "Support for 32-bit indirect branches is not implemented.");
  882. case X86::JMP64r:
  883. TargetReg = TI.getOperand(0).getReg();
  884. }
  885. // We have definitely found an indirect branch. Verify that there are no
  886. // preceding conditional branches as we don't yet support that.
  887. if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
  888. return !OtherTI.isDebugInstr() && &OtherTI != &TI;
  889. })) {
  890. LLVM_DEBUG({
  891. dbgs() << "ERROR: Found other terminators in a block with an indirect "
  892. "branch! This is not yet supported! Terminator sequence:\n";
  893. for (MachineInstr &MI : MBB.terminators()) {
  894. MI.dump();
  895. dbgs() << '\n';
  896. }
  897. });
  898. report_fatal_error("Unimplemented terminator sequence!");
  899. }
  900. // Make the target register an available value for this block.
  901. TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
  902. IndirectTerminatedMBBs.insert(&MBB);
  903. // Add all the successors to our target candidates.
  904. for (MachineBasicBlock *Succ : MBB.successors())
  905. IndirectTargetMBBs.insert(Succ);
  906. }
  907. // Keep track of the cmov instructions we insert so we can return them.
  908. SmallVector<MachineInstr *, 16> CMovs;
  909. // If we didn't find any indirect branches with targets, nothing to do here.
  910. if (IndirectTargetMBBs.empty())
  911. return CMovs;
  912. // We found indirect branches and targets that need to be instrumented to
  913. // harden loads within them. Walk the blocks of the function (to get a stable
  914. // ordering) and instrument each target of an indirect branch.
  915. for (MachineBasicBlock &MBB : MF) {
  916. // Skip the blocks that aren't candidate targets.
  917. if (!IndirectTargetMBBs.count(&MBB))
  918. continue;
  919. // We don't expect EH pads to ever be reached via an indirect branch. If
  920. // this is desired for some reason, we could simply skip them here rather
  921. // than asserting.
  922. assert(!MBB.isEHPad() &&
  923. "Unexpected EH pad as target of an indirect branch!");
  924. // We should never end up threading EFLAGS into a block to harden
  925. // conditional jumps as there would be an additional successor via the
  926. // indirect branch. As a consequence, all such edges would be split before
  927. // reaching here, and the inserted block will handle the EFLAGS-based
  928. // hardening.
  929. assert(!MBB.isLiveIn(X86::EFLAGS) &&
  930. "Cannot check within a block that already has live-in EFLAGS!");
  931. // We can't handle having non-indirect edges into this block unless this is
  932. // the only successor and we can synthesize the necessary target address.
  933. for (MachineBasicBlock *Pred : MBB.predecessors()) {
  934. // If we've already handled this by extracting the target directly,
  935. // nothing to do.
  936. if (IndirectTerminatedMBBs.count(Pred))
  937. continue;
  938. // Otherwise, we have to be the only successor. We generally expect this
  939. // to be true as conditional branches should have had a critical edge
  940. // split already. We don't however need to worry about EH pad successors
  941. // as they'll happily ignore the target and their hardening strategy is
  942. // resilient to all ways in which they could be reached speculatively.
  943. if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
  944. return Succ->isEHPad() || Succ == &MBB;
  945. })) {
  946. LLVM_DEBUG({
  947. dbgs() << "ERROR: Found conditional entry to target of indirect "
  948. "branch!\n";
  949. Pred->dump();
  950. MBB.dump();
  951. });
  952. report_fatal_error("Cannot harden a conditional entry to a target of "
  953. "an indirect branch!");
  954. }
  955. // Now we need to compute the address of this block and install it as a
  956. // synthetic target in the predecessor. We do this at the bottom of the
  957. // predecessor.
  958. auto InsertPt = Pred->getFirstTerminator();
  959. Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
  960. if (MF.getTarget().getCodeModel() == CodeModel::Small &&
  961. !Subtarget->isPositionIndependent()) {
  962. // Directly materialize it into an immediate.
  963. auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
  964. TII->get(X86::MOV64ri32), TargetReg)
  965. .addMBB(&MBB);
  966. ++NumInstsInserted;
  967. (void)AddrI;
  968. LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
  969. dbgs() << "\n");
  970. } else {
  971. auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
  972. TargetReg)
  973. .addReg(/*Base*/ X86::RIP)
  974. .addImm(/*Scale*/ 1)
  975. .addReg(/*Index*/ 0)
  976. .addMBB(&MBB)
  977. .addReg(/*Segment*/ 0);
  978. ++NumInstsInserted;
  979. (void)AddrI;
  980. LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
  981. dbgs() << "\n");
  982. }
  983. // And make this available.
  984. TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
  985. }

    // Materialize the needed SSA value of the target. Note that we need the
    // middle of the block as this block might at the bottom have an indirect
    // branch back to itself. We can do this here because at this point, every
    // predecessor of this block has an available value. This is basically just
    // automating the construction of a PHI node for this target.
    Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);

    // Insert a comparison of the incoming target register with this block's
    // address. This also requires us to mark the block as having its address
    // taken explicitly.
    MBB.setMachineBlockAddressTaken();
    auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
        !Subtarget->isPositionIndependent()) {
      // Check directly against a relocated immediate when we can.
      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
                        .addReg(TargetReg, RegState::Kill)
                        .addMBB(&MBB);
      ++NumInstsInserted;
      (void)CheckI;
      LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    } else {
      // Otherwise compute the address into a register first.
      Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
      auto AddrI =
          BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
              .addReg(/*Base*/ X86::RIP)
              .addImm(/*Scale*/ 1)
              .addReg(/*Index*/ 0)
              .addMBB(&MBB)
              .addReg(/*Segment*/ 0);
      ++NumInstsInserted;
      (void)AddrI;
      LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");

      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
                        .addReg(TargetReg, RegState::Kill)
                        .addReg(AddrReg, RegState::Kill);
      ++NumInstsInserted;
      (void)CheckI;
      LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    }

    // Now cmov over the predicate if the comparison wasn't equal.
    int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
    auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
    Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
    auto CMovI =
        BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
            .addReg(PS->InitialReg)
            .addReg(PS->PoisonReg)
            .addImm(X86::COND_NE);
    CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
    CMovs.push_back(&*CMovI);

    // And put the new value into the available values for SSA form of our
    // predicate state.
    PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
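
    // Putting this block's pieces together, the sequence emitted at the top of
    // each indirect-branch target is roughly (register names illustrative):
    //   cmpq $<this block's address>, %target   # or cmpq %addr, %target
    //   cmovneq %poison, %state
    // so the predicate state becomes poisoned whenever we arrive here even
    // though the predecessor's indirect branch actually targeted some other
    // block.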
  }

  // Return all the newly inserted cmov instructions of the predicate state.
  return CMovs;
}

// Returns true if the MI has EFLAGS as a register def operand and it is live;
// otherwise it returns false.
static bool isEFLAGSDefLive(const MachineInstr &MI) {
  if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
    return !DefOp->isDead();
  }
  return false;
}

static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const TargetRegisterInfo &TRI) {
  // Check if EFLAGS are alive by seeing if there is a def of them or they
  // live-in, and then seeing if that def is in turn used.
  for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
    if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
      // If the def is dead, then EFLAGS is not live.
      if (DefOp->isDead())
        return false;

      // Otherwise we've def'ed it, and it is live.
      return true;
    }
    // While at this instruction, also check if we use and kill EFLAGS
    // which means it isn't live.
    if (MI.killsRegister(X86::EFLAGS, &TRI))
      return false;
  }

  // If we didn't find anything conclusive (neither definitely alive nor
  // definitely dead), return whether it lives into the block.
  return MBB.isLiveIn(X86::EFLAGS);
}

/// Trace the predicate state through each of the blocks in the function,
/// hardening everything necessary along the way.
///
/// We call this routine once the initial predicate state has been established
/// for each basic block in the function in the SSA updater. This routine traces
/// it through the instructions within each basic block, and for non-returning
/// blocks informs the SSA updater about the final state that lives out of the
/// block. Along the way, it hardens any vulnerable instruction using the
/// currently valid predicate state. We have to do these two things together
/// because the SSA updater only works across blocks. Within a block, we track
/// the current predicate state directly and update it as it changes.
///
/// This operates in two passes over each block. First, we analyze the loads in
/// the block to determine which strategy will be used to harden them: hardening
/// the address or hardening the loaded value when loaded into a register
/// amenable to hardening. We have to process these first because the two
/// strategies may interact -- later hardening may change what strategy we wish
/// to use. We also will analyze data dependencies between loads and avoid
/// hardening those loads that are data dependent on a load with a hardened
/// address. We also skip hardening loads already behind an LFENCE as that is
/// sufficient to harden them against misspeculation.
///
/// Second, we actively trace the predicate state through the block, applying
/// the hardening steps we determined necessary in the first pass as we go.
///
/// These two passes are applied to each basic block. We operate one block at a
/// time to simplify reasoning about reachability and sequencing.
void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
    MachineFunction &MF) {
  SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
  SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
  SmallSet<unsigned, 16> HardenedAddrRegs;
  SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;

  // Track the set of load-dependent registers through the basic block. Because
  // the values of these registers have an existing data dependency on a loaded
  // value which we would have checked, we can omit any checks on them.
  SparseBitVector<> LoadDepRegs;

  for (MachineBasicBlock &MBB : MF) {
    // The first pass over the block: collect all the loads which can have their
    // loaded value hardened and all the loads that instead need their address
    // hardened. During this walk we propagate load dependence for address
    // hardened loads and also look for LFENCE to stop hardening wherever
    // possible. When deciding whether or not to harden the loaded value, we
    // check to see if any registers used in the address will have been
    // hardened at this point and if so, harden any remaining address registers
    // as that often successfully re-uses hardened addresses and minimizes
    // instructions.
    //
    // FIXME: We should consider an aggressive mode where we continue to keep
    // as many loads value-hardened as possible even when some address register
    // hardening would be free (due to reuse).
    //
    // Note that we only need this pass if we are actually hardening loads.
    if (HardenLoads)
      for (MachineInstr &MI : MBB) {
        // We naively assume that all def'ed registers of an instruction have
        // a data dependency on all of their operands.
        // FIXME: Do a more careful analysis of x86 to build a conservative
        // model here.
        if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
              return Op.isReg() && LoadDepRegs.test(Op.getReg());
            }))
          for (MachineOperand &Def : MI.defs())
            if (Def.isReg())
              LoadDepRegs.set(Def.getReg());

        // Both Intel and AMD are guiding that they will change the semantics of
        // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
        // no more need to guard things in this block.
        if (MI.getOpcode() == X86::LFENCE)
          break;

        // If this instruction cannot load, nothing to do.
        if (!MI.mayLoad())
          continue;

        // Some instructions which "load" are trivially safe or unimportant.
        if (MI.getOpcode() == X86::MFENCE)
          continue;

        // Extract the memory operand information about this instruction.
        // FIXME: This doesn't handle loading pseudo instructions which we often
        // could handle with similarly generic logic. We probably need to add an
        // MI-layer routine similar to the MC-layer one we use here which maps
        // pseudos much like this maps real instructions.
        const MCInstrDesc &Desc = MI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        if (MemRefBeginIdx < 0) {
          LLVM_DEBUG(dbgs()
                         << "WARNING: unable to harden loading instruction: ";
                     MI.dump());
          continue;
        }

        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);

        // If we have at least one (non-frame-index, non-RIP) register operand,
        // and neither operand is load-dependent, we need to check the load.
        unsigned BaseReg = 0, IndexReg = 0;
        if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
            BaseMO.getReg() != X86::NoRegister)
          BaseReg = BaseMO.getReg();
        if (IndexMO.getReg() != X86::NoRegister)
          IndexReg = IndexMO.getReg();

        if (!BaseReg && !IndexReg)
          // No register operands!
          continue;

        // If any register operand is dependent, this load is dependent and we
        // needn't check it.
        // FIXME: Is this true in the case where we are hardening loads after
        // they complete? Unclear, need to investigate.
        if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
            (IndexReg && LoadDepRegs.test(IndexReg)))
          continue;

        // If post-load hardening is enabled, this load is compatible with
        // post-load hardening, and we aren't already going to harden one of the
        // address registers, queue it up to be hardened post-load. Notably,
        // even once hardened this won't introduce a useful dependency that
        // could prune out subsequent loads.
        if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
            !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
            MI.getOperand(0).isReg() &&
            canHardenRegister(MI.getOperand(0).getReg()) &&
            !HardenedAddrRegs.count(BaseReg) &&
            !HardenedAddrRegs.count(IndexReg)) {
          HardenPostLoad.insert(&MI);
          HardenedAddrRegs.insert(MI.getOperand(0).getReg());
          continue;
        }

        // Record this instruction for address hardening and record its register
        // operands as being address-hardened.
        HardenLoadAddr.insert(&MI);

        if (BaseReg)
          HardenedAddrRegs.insert(BaseReg);
        if (IndexReg)
          HardenedAddrRegs.insert(IndexReg);

        for (MachineOperand &Def : MI.defs())
          if (Def.isReg())
            LoadDepRegs.set(Def.getReg());
      }

    // Now re-walk the instructions in the basic block, and apply whichever
    // hardening strategy we have elected. Note that we do this in a second
    // pass specifically so that we have the complete set of instructions for
    // which we will do post-load hardening and can defer it in certain
    // circumstances.
    for (MachineInstr &MI : MBB) {
      if (HardenLoads) {
        // We cannot both require hardening the def of a load and its address.
        assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
               "Requested to harden both the address and def of a load!");

        // Check if this is a load whose address needs to be hardened.
        if (HardenLoadAddr.erase(&MI)) {
          const MCInstrDesc &Desc = MI.getDesc();
          int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
          assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");

          MemRefBeginIdx += X86II::getOperandBias(Desc);

          MachineOperand &BaseMO =
              MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
          MachineOperand &IndexMO =
              MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
          hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
          continue;
        }

        // Test if this instruction is one of our post load instructions (and
        // remove it from the set if so).
        if (HardenPostLoad.erase(&MI)) {
          assert(!MI.isCall() && "Must not try to post-load harden a call!");

          // If this is a data-invariant load and there is no EFLAGS
          // interference, we want to try and sink any hardening as far as
          // possible.
          if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
            // Sink the instruction we'll need to harden as far as we can down
            // the graph.
            MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);

            // If we managed to sink this instruction, update everything so we
            // harden that instruction when we reach it in the instruction
            // sequence.
            if (SunkMI != &MI) {
              // If in sinking there was no instruction needing to be hardened,
              // we're done.
              if (!SunkMI)
                continue;

              // Otherwise, add this to the set of defs we harden.
              HardenPostLoad.insert(SunkMI);
              continue;
            }
          }

          unsigned HardenedReg = hardenPostLoad(MI);

          // Mark the resulting hardened register as such so we don't re-harden.
          AddrRegToHardenedReg[HardenedReg] = HardenedReg;

          continue;
        }

        // Check for an indirect call or branch that may need its input hardened
        // even if we couldn't find the specific load used, or were able to
        // avoid hardening it for some reason. Note that here we cannot break
        // out afterward as we may still need to handle any call aspect of this
        // instruction.
        if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
          hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
      }

      // After we finish hardening loads we handle interprocedural hardening if
      // enabled and relevant for this instruction.
      if (!HardenInterprocedurally)
        continue;
      if (!MI.isCall() && !MI.isReturn())
        continue;

      // If this is a direct return (i.e., not a tail call), just directly
      // harden it.
      if (MI.isReturn() && !MI.isCall()) {
        hardenReturnInstr(MI);
        continue;
      }

      // Otherwise we have a call. We need to handle transferring the predicate
      // state into a call and recovering it after the call returns (unless this
      // is a tail call).
      assert(MI.isCall() && "Should only reach here for calls!");
      tracePredStateThroughCall(MI);
    }

    HardenPostLoad.clear();
    HardenLoadAddr.clear();
    HardenedAddrRegs.clear();
    AddrRegToHardenedReg.clear();

    // Currently, we only track data-dependent loads within a basic block.
    // FIXME: We should see if this is necessary or if we could be more
    // aggressive here without opening up attack avenues.
    LoadDepRegs.clear();
  }
}

/// Save EFLAGS into the returned GPR. This can in turn be restored with
/// `restoreEFLAGS`.
///
/// Note that LLVM can only lower very simple patterns of saved and restored
/// EFLAGS registers. The restore should always be within the same basic block
/// as the save so that no PHI nodes are inserted.
unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
  // what instruction selection does.
  Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
  // We directly copy the FLAGS register and rely on later lowering to clean
  // this up into the appropriate setCC instructions.
  BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
  ++NumInstsInserted;
  return Reg;
}

/// Restore EFLAGS from the provided GPR. This should be produced by
/// `saveEFLAGS`.
///
/// This must be done within the same basic block as the save in order to
/// reliably lower.
void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc, Register Reg) {
  BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
  ++NumInstsInserted;
}

/// Takes the current predicate state (in a register) and merges it into the
/// stack pointer. The state is essentially a single bit, but we merge this in
/// a way that won't form non-canonical pointers and also will be preserved
/// across normal stack adjustments.
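///
/// Roughly, the emitted sequence is (register name illustrative):
///   shlq $47, %state    # an all-ones poison value becomes 0xffff800000000000
///   orq  %state, %rsp   # a zero (valid) state leaves RSP untouched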
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc, unsigned PredStateReg) {
  Register TmpReg = MRI->createVirtualRegister(PS->RC);
  // FIXME: This hard codes a shift distance based on the number of bits needed
  // to stay canonical on 64-bit. We should compute this somehow and support
  // 32-bit as part of that.
  auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
                    .addReg(PredStateReg, RegState::Kill)
                    .addImm(47);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
                 .addReg(X86::RSP)
                 .addReg(TmpReg, RegState::Kill);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
}

/// Extracts the predicate state stored in the high bits of the stack pointer.
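///
/// Because the state was merged in as either all-zero or with the high bits
/// set, an arithmetic right shift by 63 of a copy of RSP smears the top bit
/// across the whole register, recovering the all-zero / all-ones state value.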
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  Register PredStateReg = MRI->createVirtualRegister(PS->RC);
  Register TmpReg = MRI->createVirtualRegister(PS->RC);

  // We know that the stack pointer will have any preserved predicate state in
  // its high bit. We just want to smear this across the other bits. Turns out,
  // this is exactly what an arithmetic right shift does.
  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
      .addReg(X86::RSP);
  auto ShiftI =
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
          .addReg(TmpReg, RegState::Kill)
          .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;

  return PredStateReg;
}

void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
    MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  // Check if EFLAGS are alive by seeing if there is a def of them or they
  // live-in, and then seeing if that def is in turn used.
  bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);

  SmallVector<MachineOperand *, 2> HardenOpRegs;

  if (BaseMO.isFI()) {
    // A frame index is never a dynamically controllable load, so only
    // harden it if we're covering fixed address loads as well.
    LLVM_DEBUG(
        dbgs() << " Skipping hardening base of explicit stack frame load: ";
        MI.dump(); dbgs() << "\n");
  } else if (BaseMO.getReg() == X86::RSP) {
    // Some idempotent atomic operations are lowered directly to a locked
    // OR with 0 to the top of stack (or slightly offset from top) which uses
    // an explicit RSP register as the base.
    assert(IndexMO.getReg() == X86::NoRegister &&
           "Explicit RSP access with dynamic index!");
    LLVM_DEBUG(
        dbgs() << " Cannot harden base of explicit RSP offset in a load!");
  } else if (BaseMO.getReg() == X86::RIP ||
             BaseMO.getReg() == X86::NoRegister) {
    // For both RIP-relative addressed loads or absolute loads, we cannot
    // meaningfully harden them because the address being loaded has no
    // dynamic component.
    //
    // FIXME: When using a segment base (like TLS does) we end up with the
    // dynamic address being the base plus -1 because we can't mutate the
    // segment register here. This allows the signed 32-bit offset to point at
    // valid segment-relative addresses and load them successfully.
    LLVM_DEBUG(
        dbgs() << " Cannot harden base of "
               << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
               << " address in a load!");
  } else {
    assert(BaseMO.isReg() &&
           "Only allowed to have a frame index or register base.");
    HardenOpRegs.push_back(&BaseMO);
  }

  if (IndexMO.getReg() != X86::NoRegister &&
      (HardenOpRegs.empty() ||
       HardenOpRegs.front()->getReg() != IndexMO.getReg()))
    HardenOpRegs.push_back(&IndexMO);

  assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
         "Should have exactly one or two registers to harden!");
  assert((HardenOpRegs.size() == 1 ||
          HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
         "Should not have two of the same registers!");

  // Remove any registers that have already been checked.
  llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
    // See if this operand's register has already been checked.
    auto It = AddrRegToHardenedReg.find(Op->getReg());
    if (It == AddrRegToHardenedReg.end())
      // Not checked, so retain this one.
      return false;

    // Otherwise, we can directly update this operand and remove it.
    Op->setReg(It->second);
    return true;
  });
  // If there are none left, we're done.
  if (HardenOpRegs.empty())
    return;

  // Compute the current predicate state.
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

  auto InsertPt = MI.getIterator();

  // If EFLAGS are live and we don't have access to instructions that avoid
  // clobbering EFLAGS we need to save and restore them. This in turn makes
  // the EFLAGS no longer live.
  unsigned FlagsReg = 0;
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    EFLAGSLive = false;
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  }
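  // Note: when BMI2 is available we can leave EFLAGS live here because the
  // SHRX-based hardening used below does not modify the flags.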

  for (MachineOperand *Op : HardenOpRegs) {
    Register OpReg = Op->getReg();
    auto *OpRC = MRI->getRegClass(OpReg);
    Register TmpReg = MRI->createVirtualRegister(OpRC);

    // If this is a vector register, we'll need somewhat custom logic to handle
    // hardening it.
    if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
                                 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
      assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);

      // Move our state into a vector register.
      // FIXME: We could skip this at the cost of longer encodings with AVX-512
      // but that doesn't seem likely worth it.
      Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
      auto MovI =
          BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
              .addReg(StateReg);
      (void)MovI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");

      // Broadcast it across the vector register.
      Register VBStateReg = MRI->createVirtualRegister(OpRC);
      auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
                                TII->get(Is128Bit ? X86::VPBROADCASTQrr
                                                  : X86::VPBROADCASTQYrr),
                                VBStateReg)
                            .addReg(VStateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      auto OrI =
          BuildMI(MBB, InsertPt, Loc,
                  TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
              .addReg(VBStateReg)
              .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
      assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
      bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
      if (Is128Bit || Is256Bit)
        assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");

      // Broadcast our state into a vector register.
      Register VStateReg = MRI->createVirtualRegister(OpRC);
      unsigned BroadcastOp = Is128Bit   ? X86::VPBROADCASTQrZ128rr
                             : Is256Bit ? X86::VPBROADCASTQrZ256rr
                                        : X86::VPBROADCASTQrZrr;
      auto BroadcastI =
          BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
              .addReg(StateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      unsigned OrOp = Is128Bit   ? X86::VPORQZ128rr
                      : Is256Bit ? X86::VPORQZ256rr
                                 : X86::VPORQZrr;
      auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
                     .addReg(VStateReg)
                     .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else {
      // FIXME: Need to support GR32 here for 32-bit code.
      assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
             "Not a supported register class for address hardening!");

      if (!EFLAGSLive) {
        // Merge our potential poison state into the value with an or.
        auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
                       .addReg(StateReg)
                       .addReg(OpReg);
        OrI->addRegisterDead(X86::EFLAGS, TRI);
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
      } else {
        // We need to avoid touching EFLAGS so shift out all but the least
        // significant bit using the instruction that doesn't update flags.
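        // (Note: the state value is either zero or all-ones, and SHRX masks
        // its shift count to six bits, so the shift amount below is either 0,
        // leaving the address unchanged, or 63, destroying it.)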
        auto ShiftI =
            BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
                .addReg(OpReg)
                .addReg(StateReg);
        (void)ShiftI;
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
                   dbgs() << "\n");
      }
    }

    // Record this register as checked and update the operand.
    assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
           "Should not have checked this register yet!");
    AddrRegToHardenedReg[Op->getReg()] = TmpReg;
    Op->setReg(TmpReg);
    ++NumAddrRegsHardened;
  }

  // And restore the flags if needed.
  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
}

MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
    MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
  assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
         "Cannot get here with a non-invariant load!");
  assert(!isEFLAGSDefLive(InitialMI) &&
         "Cannot get here with a data invariant load "
         "that interferes with EFLAGS!");

  // See if we can sink hardening the loaded value.
  auto SinkCheckToSingleUse =
      [&](MachineInstr &MI) -> std::optional<MachineInstr *> {
    Register DefReg = MI.getOperand(0).getReg();

    // We need to find a single use to which we can sink the check. We can
    // primarily do this because many uses may already end up checked on their
    // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant, it
      // does not interfere with EFLAGS, and within our block.
      if (HardenedInstrs.count(&UseMI)) {
        if (!X86InstrInfo::isDataInvariantLoad(UseMI) ||
            isEFLAGSDefLive(UseMI)) {
          // If we've already decided to harden a non-load, we must have sunk
          // some other post-load hardened instruction to it and it must itself
          // be data-invariant.
          assert(X86InstrInfo::isDataInvariant(UseMI) &&
                 "Data variant instruction being hardened!");
          continue;
        }

        // Otherwise, this is a load and the load component can't be data
        // invariant so check how this register is being used.
        const MCInstrDesc &Desc = UseMI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");
        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant, isn't in this block, or has
      // interfering EFLAGS, we can't sink the hardening to it.
      if (!X86InstrInfo::isDataInvariant(UseMI) ||
          UseMI.getParent() != MI.getParent() || isEFLAGSDefLive(UseMI))
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register we can't walk uses of sanely,
      // just bail. Also check that its register class is one of the ones we
      // can harden.
      Register UseDefReg = UseMI.getOperand(0).getReg();
      if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

  MachineInstr *MI = &InitialMI;
  while (std::optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
  auto *RC = MRI->getRegClass(Reg);
  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
  if (RegBytes > 8)
    // We don't support post-load hardening of vectors.
    return false;

  unsigned RegIdx = Log2_32(RegBytes);
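  // RegIdx indexes the per-size register class tables below:
  // 1 byte -> 0, 2 bytes -> 1, 4 bytes -> 2, 8 bytes -> 3.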
  assert(RegIdx < 4 && "Unsupported register size");

  // If this register class is explicitly constrained to a class that doesn't
  // require REX prefix, we may not be able to satisfy that constraint when
  // emitting the hardening instructions, so bail out here.
  // FIXME: This seems like a pretty lame hack. The way this comes up is when we
  // end up both with a NOREX and REX-only register as operands to the hardening
  // instructions. It would be better to fix that code to handle this situation
  // rather than hack around it in this way.
  const TargetRegisterClass *NOREXRegClasses[] = {
      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
  if (RC == NOREXRegClasses[RegIdx])
    return false;

  const TargetRegisterClass *GPRRegClasses[] = {
      &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
      &X86::GR64RegClass};
  return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
}

/// Harden a value in a register.
///
/// This is the low-level logic to fully harden a value sitting in a register
/// against leaking during speculative execution.
///
/// Unlike hardening an address that is used by a load, this routine is required
/// to hide *all* incoming bits in the register.
///
/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
/// larger than the predicate state register. FIXME: We should support vector
/// registers here by broadcasting the predicate state.
///
/// The new, hardened virtual register is returned. It will have the same
/// register class as `Reg`.
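///
/// For example, hardening a 32-bit value looks roughly like (virtual register
/// names are illustrative):
///   %state32 = COPY %state.sub_32bit
///   %hardened = OR32rr %state32, %value   ; EFLAGS def is marked dead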
unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
    Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  assert(canHardenRegister(Reg) && "Cannot harden this register!");
  assert(Reg.isVirtual() && "Cannot harden a physical register!");

  auto *RC = MRI->getRegClass(Reg);
  int Bytes = TRI->getRegSizeInBits(*RC) / 8;
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
         "Unknown register size");

  // FIXME: Need to teach this about 32-bit mode.
  if (Bytes != 8) {
    unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
    unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
    Register NarrowStateReg = MRI->createVirtualRegister(RC);
    BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
        .addReg(StateReg, 0, SubRegImm);
    StateReg = NarrowStateReg;
  }

  unsigned FlagsReg = 0;
  if (isEFLAGSLive(MBB, InsertPt, *TRI))
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);

  Register NewReg = MRI->createVirtualRegister(RC);
  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
                 .addReg(StateReg)
                 .addReg(Reg);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  return NewReg;
}

/// Harden a load by hardening the loaded value in the defined register.
///
/// We can harden a non-leaking load into a register without touching the
/// address by just hiding all of the loaded bits during misspeculation. We use
/// an `or` instruction to do this because we set up our poison value as all
/// ones. And the goal is just for the loaded bits to not be exposed to
/// execution and coercing them to one is sufficient.
///
/// Returns the newly hardened register.
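///
/// Schematically (with illustrative register names), a load such as
///   %val = MOV64rm ...
/// becomes
///   %unhardened = MOV64rm ...
///   %hardened = OR64rr %state, %unhardened
/// and every original use of %val is rewritten to use %hardened.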
unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  auto &DefOp = MI.getOperand(0);
  Register OldDefReg = DefOp.getReg();
  auto *DefRC = MRI->getRegClass(OldDefReg);

  // Because we want to completely replace the uses of this def'ed value with
  // the hardened value, create a dedicated new register that will only be used
  // to communicate the unhardened value to the hardening.
  Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
  DefOp.setReg(UnhardenedReg);

  // Now harden this register's value, getting a hardened reg that is safe to
  // use. Note that we insert the instructions to compute this *after* the
  // defining instruction, not before it.
  unsigned HardenedReg = hardenValueInRegister(
      UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);

  // Finally, replace the old register (which now only has the uses of the
  // original def) with the hardened register.
  MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);

  ++NumPostLoadRegsHardened;
  return HardenedReg;
}

/// Harden a return instruction.
///
/// Returns implicitly perform a load which we need to harden. Without hardening
/// this load, an attacker may speculatively write over the return address to
/// steer speculation of the return to an attacker controlled address. This is
/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
/// this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// We can harden this by introducing an LFENCE that will delay any load of the
/// return address until prior instructions have retired (and thus are not being
/// speculated), or we can harden the address used by the implicit load: the
/// stack pointer.
///
/// If we are not using an LFENCE, hardening the stack pointer has an additional
/// benefit: it allows us to pass the predicate state accumulated in this
/// function back to the caller. In the absence of a BCBS attack on the return,
/// the caller will typically be resumed and speculatively executed due to the
/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
/// priority. It is possible that some code from the caller will be executed
/// speculatively even during a BCBS-attacked return until the steering takes
/// effect. Whenever this happens, the caller can recover the (poisoned)
/// predicate state from the stack pointer and continue to harden loads.
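///
/// When not fencing, the sequence emitted just before the return is roughly:
///   shlq $47, %state
///   orq  %state, %rsp
///   retq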
void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet)
    // No need to fence here as we'll fence at the return site itself. That
    // handles more cases than we can handle here.
    return;

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
}

/// Trace the predicate state through a call.
///
/// There are several layers of this needed to handle the full complexity of
/// calls.
///
/// First, we need to send the predicate state into the called function. We do
/// this by merging it into the high bits of the stack pointer.
///
/// For tail calls, this is all we need to do.
///
/// For calls where we might return and resume the control flow, we need to
/// extract the predicate state from the high bits of the stack pointer after
/// control returns from the called function.
///
/// We also need to verify that we intended to return to this location in the
/// code. An attacker might arrange for the processor to mispredict the return
/// to this valid but incorrect return address in the program rather than the
/// correct one. See the paper on this attack, called "ret2spec" by the
/// researchers, here:
/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
///
/// The way we verify that we returned to the correct location is by preserving
/// the expected return address across the call. One technique involves taking
/// advantage of the red-zone to load the return address from `-8(%rsp)` where
/// it was left by the RET instruction when it popped `%rsp`. Alternatively, we
/// can directly save the address into a register that will be preserved across
/// the call. We compare this intended return address against the address
/// immediately following the call (the observed return address). If these
/// mismatch, we have detected misspeculation and can poison our predicate
/// state.
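///
/// Ignoring the fenced mode, the emitted sequence around a non-tail call is
/// roughly the following (register names and the temporary label are
/// illustrative):
///   movq $.Lslh_ret_addr, %expected    # or an LEA / red-zone reload
///   shlq $47, %state
///   orq  %state, %rsp
///   callq callee
/// .Lslh_ret_addr:
///   movq %rsp, %newstate
///   sarq $63, %newstate
///   cmpq $.Lslh_ret_addr, %expected
///   cmovneq %poison, %newstate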
void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
    MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  auto InsertPt = MI.getIterator();
  const DebugLoc &Loc = MI.getDebugLoc();

  if (FenceCallAndRet) {
    if (MI.isReturn())
      // Tail call, we don't return to this function.
      // FIXME: We should also handle noreturn calls.
      return;

    // We don't need to fence before the call because the function should fence
    // in its entry. However, we do need to fence after the call returns.
    // Fencing before the return doesn't correctly handle cases where the return
    // itself is mispredicted.
    BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // First, we transfer the predicate state into the called function by merging
  // it into the stack pointer. This will kill the current def of the state.
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);

  // If this call is also a return, it is a tail call and we don't need anything
  // else to handle it so just return. Also, if there are no further
  // instructions and no successors, this call does not return so we can also
  // bail.
  if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
    return;

  // Create a symbol to track the return address and attach it to the call
  // machine instruction. We will lower extra symbols attached to call
  // instructions as labels immediately following the call.
  MCSymbol *RetSymbol =
      MF.getContext().createTempSymbol("slh_ret_addr",
                                       /*AlwaysAddSuffix*/ true);
  MI.setPostInstrSymbol(MF, RetSymbol);

  const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
  unsigned ExpectedRetAddrReg = 0;

  // If we have no red zones or if the function returns twice (possibly without
  // using the `ret` instruction) like setjmp, we need to save the expected
  // return address prior to the call.
  if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
      MF.exposesReturnsTwice()) {
    // If we don't have red zones, we need to compute the expected return
    // address prior to the call and store it in a register that lives across
    // the call.
    //
    // In some ways, this is doubly satisfying as a mitigation because it will
    // also successfully detect stack smashing bugs in some cases (typically,
    // when a callee-saved register is used and the callee doesn't push it onto
    // the stack). But that isn't our primary goal, so we only use it as
    // a fallback.
    //
    // FIXME: It isn't clear that this is reliable in the face of
    // rematerialization in the register allocator. We somehow need to force
    // that to not occur for this particular instruction, and instead to spill
    // or otherwise preserve the value computed *prior* to the call.
    //
    // FIXME: It is even less clear why MachineCSE can't just fold this when we
    // end up having to use identical instructions both before and after the
    // call to feed the comparison.
    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
        !Subtarget->isPositionIndependent()) {
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
          .addSym(RetSymbol);
    } else {
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
          .addReg(/*Base*/ X86::RIP)
          .addImm(/*Scale*/ 1)
          .addReg(/*Index*/ 0)
          .addSym(RetSymbol)
          .addReg(/*Segment*/ 0);
    }
  }

  // Step past the call to handle when it returns.
  ++InsertPt;

  // If we didn't pre-compute the expected return address into a register, then
  // red zones are enabled and the return address is still available on the
  // stack immediately after the call. As the very first instruction, we load it
  // into a register.
  if (!ExpectedRetAddrReg) {
    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
        .addReg(/*Base*/ X86::RSP)
        .addImm(/*Scale*/ 1)
        .addReg(/*Index*/ 0)
        .addImm(/*Displacement*/ -8) // The stack pointer has been popped, so
                                     // the return address is 8-bytes past it.
        .addReg(/*Segment*/ 0);
  }

  // Now we extract the callee's predicate state from the stack pointer.
  unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);

  // Test the expected return address against our actual address. If we can
  // form this basic block's address as an immediate, this is easy. Otherwise
  // we compute it.
  if (MF.getTarget().getCodeModel() == CodeModel::Small &&
      !Subtarget->isPositionIndependent()) {
    // FIXME: Could we fold this with the load? It would require careful EFLAGS
    // management.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
        .addReg(ExpectedRetAddrReg, RegState::Kill)
        .addSym(RetSymbol);
  } else {
    Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
        .addReg(/*Base*/ X86::RIP)
        .addImm(/*Scale*/ 1)
        .addReg(/*Index*/ 0)
        .addSym(RetSymbol)
        .addReg(/*Segment*/ 0);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
        .addReg(ExpectedRetAddrReg, RegState::Kill)
        .addReg(ActualRetAddrReg, RegState::Kill);
  }

  // Now conditionally update the predicate state we just extracted if we ended
  // up at a different return address than expected.
  int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);

  Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
                   .addReg(NewStateReg, RegState::Kill)
                   .addReg(PS->PoisonReg)
                   .addImm(X86::COND_NE);
  CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");

  PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
}

/// An attacker may speculatively store over a value that is then speculatively
/// loaded and used as the target of an indirect call or jump instruction. This
/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
/// in this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// When this happens, the speculative execution of the call or jump will end up
/// being steered to this attacker controlled address. While most such loads
/// will be adequately hardened already, we want to ensure that they are
/// definitively treated as needing post-load hardening. While address hardening
/// is sufficient to prevent secret data from leaking to the attacker, it may
/// not be sufficient to prevent an attacker from steering speculative
/// execution. We forcibly unfolded all relevant loads above and so will always
/// have an opportunity to post-load harden here; we just need to scan for cases
/// not already flagged and add them.
void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
    MachineInstr &MI,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  switch (MI.getOpcode()) {
  case X86::FARCALL16m:
  case X86::FARCALL32m:
  case X86::FARCALL64m:
  case X86::FARJMP16m:
  case X86::FARJMP32m:
  case X86::FARJMP64m:
    // We don't need to harden either far calls or far jumps as they are
    // safe from Spectre.
    return;

  default:
    break;
  }

  // We should never see a loading instruction at this point, as those should
  // have been unfolded.
  assert(!MI.mayLoad() && "Found a lingering loading instruction!");

  // If the first operand isn't a register, this is a branch or call
  // instruction with an immediate operand which doesn't need to be hardened.
  if (!MI.getOperand(0).isReg())
    return;

  // For all of these, the target register is the first operand of the
  // instruction.
  auto &TargetOp = MI.getOperand(0);
  Register OldTargetReg = TargetOp.getReg();

  // Try to lookup a hardened version of this register. We retain a reference
  // here as we want to update the map to track any newly computed hardened
  // register.
  unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];

  // If we don't have a hardened register yet, compute one. Otherwise, just use
  // the already hardened register.
  //
  // FIXME: It is a little suspect that we use partially hardened registers that
  // only feed addresses. The complexity of partial hardening with SHRX
  // continues to pile up. Should definitively measure its value and consider
  // eliminating it.
  if (!HardenedTargetReg)
    HardenedTargetReg = hardenValueInRegister(
        OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());

  // Set the target operand to the hardened register.
  TargetOp.setReg(HardenedTargetReg);

  ++NumCallsOrJumpsHardened;
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}