//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids the transition latency
// penalty when transferring control between AVX encoded instructions and old
// SSE encoding mode.
//
//===----------------------------------------------------------------------===//

- #include "X86.h"
- #include "X86InstrInfo.h"
- #include "X86Subtarget.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/Statistic.h"
- #include "llvm/CodeGen/MachineBasicBlock.h"
- #include "llvm/CodeGen/MachineFunction.h"
- #include "llvm/CodeGen/MachineFunctionPass.h"
- #include "llvm/CodeGen/MachineInstr.h"
- #include "llvm/CodeGen/MachineInstrBuilder.h"
- #include "llvm/CodeGen/MachineOperand.h"
- #include "llvm/CodeGen/MachineRegisterInfo.h"
- #include "llvm/CodeGen/TargetInstrInfo.h"
- #include "llvm/CodeGen/TargetRegisterInfo.h"
- #include "llvm/IR/CallingConv.h"
- #include "llvm/IR/DebugLoc.h"
- #include "llvm/IR/Function.h"
- #include "llvm/Support/Debug.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Support/raw_ostream.h"
- #include <cassert>
- using namespace llvm;
#define DEBUG_TYPE "x86-vzeroupper"

static cl::opt<bool>
    UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
                  cl::desc("Minimize AVX to SSE transition penalty"),
                  cl::init(true));

STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");

namespace {

class VZeroUpperInserter : public MachineFunctionPass {
public:
  VZeroUpperInserter() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return "X86 vzeroupper inserter"; }

private:
  void processBasicBlock(MachineBasicBlock &MBB);
  void insertVZeroUpper(MachineBasicBlock::iterator I,
                        MachineBasicBlock &MBB);
  void addDirtySuccessor(MachineBasicBlock &MBB);

  using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

  static const char* getBlockExitStateName(BlockExitState ST);

  // Core algorithm state:
  // BlockState - Each block is either:
  //   - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
  //                   vzeroupper instructions in this block.
  //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
  //                  block that will ensure that YMM/ZMM is clean on exit.
  //   - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
  //                  subsequent vzeroupper in the block clears it.
  //
  // AddedToDirtySuccessors - This flag is raised when a block is added to the
  //                          DirtySuccessors list to ensure that it's not
  //                          added multiple times.
  //
  // FirstUnguardedCall - Records the location of the first unguarded call in
  //                      each basic block that may need to be guarded by a
  //                      vzeroupper. We won't know whether it actually needs
  //                      to be guarded until we discover a predecessor that
  //                      is EXITS_DIRTY.

  struct BlockState {
    BlockExitState ExitState = PASS_THROUGH;
    bool AddedToDirtySuccessors = false;
    MachineBasicBlock::iterator FirstUnguardedCall;

    BlockState() = default;
  };

  using BlockStateMap = SmallVector<BlockState, 8>;
  using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>;

  BlockStateMap BlockStates;
  DirtySuccessorsWorkList DirtySuccessors;
  bool EverMadeChange;
  bool IsX86INTR;

  const TargetInstrInfo *TII;

  static char ID;
};

} // end anonymous namespace

char VZeroUpperInserter::ID = 0;

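/// Public interface to this file: create a pass that inserts VZEROUPPER
/// before calls and returns that may transfer control to SSE-encoded code
/// while the upper YMM/ZMM state is dirty.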
FunctionPass *llvm::createX86IssueVZeroUpperPass() {
  return new VZeroUpperInserter();
}

#ifndef NDEBUG
const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
  switch (ST) {
    case PASS_THROUGH: return "Pass-through";
    case EXITS_DIRTY: return "Exits-dirty";
    case EXITS_CLEAN: return "Exits-clean";
  }
  llvm_unreachable("Invalid block exit state.");
}
#endif

/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
/// Thus, there is no need to check for Y/ZMM16 and above.
static bool isYmmOrZmmReg(unsigned Reg) {
  return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
         (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
}

static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
  for (std::pair<unsigned, unsigned> LI : MRI.liveins())
    if (isYmmOrZmmReg(LI.first))
      return true;

  return false;
}

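/// Return true if the RegMask operand MO clobbers all of YMM0-15 and ZMM0-15.
/// A call whose RegMask preserves any of these registers may carry YMM/ZMM
/// state across the call.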
static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
    if (!MO.clobbersPhysReg(reg))
      return false;
  }
  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
    if (!MO.clobbersPhysReg(reg))
      return false;
  }

  return true;
}

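/// Return true if MI touches YMM/ZMM state in the 0-15 range: either one of
/// its non-debug register operands is a YMM/ZMM register, or it is a call
/// whose RegMask does not clobber all of YMM0-15 and ZMM0-15.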
static bool hasYmmOrZmmReg(MachineInstr &MI) {
  for (const MachineOperand &MO : MI.operands()) {
    if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
      return true;
    if (!MO.isReg())
      continue;
    if (MO.isDebug())
      continue;
    if (isYmmOrZmmReg(MO.getReg()))
      return true;
  }
  return false;
}

/// Check if the given call instruction has a RegMask operand.
static bool callHasRegMask(MachineInstr &MI) {
  assert(MI.isCall() && "Can only be called on call instructions.");
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isRegMask())
      return true;
  }
  return false;
}

/// Insert a vzeroupper instruction before I.
void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
                                          MachineBasicBlock &MBB) {
  BuildMI(MBB, I, I->getDebugLoc(), TII->get(X86::VZEROUPPER));
  ++NumVZU;
  EverMadeChange = true;
}

/// Add MBB to the DirtySuccessors list if it hasn't already been added.
void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
    DirtySuccessors.push_back(&MBB);
    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
  }
}

/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
  // Start by assuming that the block is PASS_THROUGH, which implies no
  // unguarded calls.
  BlockExitState CurState = PASS_THROUGH;
  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();

  for (MachineInstr &MI : MBB) {
    bool IsCall = MI.isCall();
    bool IsReturn = MI.isReturn();
    bool IsControlFlow = IsCall || IsReturn;

    // No need for a vzeroupper before iret in an interrupt handler function;
    // the epilogue will restore YMM/ZMM registers if needed.
    if (IsX86INTR && IsReturn)
      continue;

    // An existing VZERO* instruction resets the state.
    if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
      CurState = EXITS_CLEAN;
      continue;
    }

    // Shortcut: don't need to check regular instructions in dirty state.
    if (!IsControlFlow && CurState == EXITS_DIRTY)
      continue;

    if (hasYmmOrZmmReg(MI)) {
      // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
      // instruction, or it could be control flow.
      CurState = EXITS_DIRTY;
      continue;
    }

    // Check for control-flow out of the current function (which might
    // indirectly execute SSE instructions).
    if (!IsControlFlow)
      continue;

    // If the call has no RegMask, skip it as well. This usually happens with
    // helper function calls (such as '_chkstk' or '_ftol2') where the standard
    // calling convention is not used: the RegMask is not used to mark the
    // clobbered registers, and register usage (def/implicit-def/use) is
    // well-defined and explicitly specified.
    if (IsCall && !callHasRegMask(MI))
      continue;

    // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
    // registers. In addition, the processor changes back to the Clean state,
    // after which execution of SSE instructions or AVX instructions has no
    // transition penalty. Add the VZEROUPPER instruction before any function
    // call/return that might execute SSE code.
    // FIXME: In some cases, we may want to move the VZEROUPPER into a
    // predecessor block.
    if (CurState == EXITS_DIRTY) {
      // After the inserted VZEROUPPER the state becomes clean again, but
      // other YMM/ZMM-using instructions may appear before subsequent calls
      // or even before the end of the block.
      insertVZeroUpper(MI, MBB);
      CurState = EXITS_CLEAN;
    } else if (CurState == PASS_THROUGH) {
      // If this block is currently in pass-through state and we encounter a
      // call then whether we need a vzeroupper or not depends on whether this
      // block has successors that exit dirty. Record the location of the
      // call, and set the state to EXITS_CLEAN, but do not insert the
      // vzeroupper yet. It will be inserted later if necessary.
      BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
      CurState = EXITS_CLEAN;
    }
  }

  LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
                    << getBlockExitStateName(CurState) << '\n');

  if (CurState == EXITS_DIRTY)
    for (MachineBasicBlock *Succ : MBB.successors())
      addDirtySuccessor(*Succ);

  BlockStates[MBB.getNumber()].ExitState = CurState;
}

/// Loop over all of the basic blocks, inserting vzeroupper instructions before
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
  if (!UseVZeroUpper)
    return false;

  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  if (!ST.hasAVX() || !ST.insertVZEROUPPER())
    return false;
  TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  EverMadeChange = false;
  IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR;

  bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);

  // Fast check: if the function doesn't use any ymm/zmm registers, we don't
  // need to insert any VZEROUPPER instructions. This is constant-time, so it
  // is cheap in the common case of no ymm/zmm use.
  bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
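  // VR256 covers YMM0-15 and VR512_0_15 covers ZMM0-15; any non-debug use or
  // def of a register in these classes means the function touches YMM/ZMM
  // state and must be analyzed.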
  for (const auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) {
    if (!YmmOrZmmUsed) {
      for (MCPhysReg R : *RC) {
        if (!MRI.reg_nodbg_empty(R)) {
          YmmOrZmmUsed = true;
          break;
        }
      }
    }
  }
  if (!YmmOrZmmUsed)
    return false;

  assert(BlockStates.empty() && DirtySuccessors.empty() &&
         "X86VZeroUpper state should be clear");
  BlockStates.resize(MF.getNumBlockIDs());

  // Process all blocks. This will compute block exit states, record the first
  // unguarded call in each block, and add successors of dirty blocks to the
  // DirtySuccessors list.
  for (MachineBasicBlock &MBB : MF)
    processBasicBlock(MBB);

  // If any YMM/ZMM regs are live-in to this function, add the entry block to
  // the DirtySuccessors list.
  if (FnHasLiveInYmmOrZmm)
    addDirtySuccessor(MF.front());

  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
  // through PASS_THROUGH blocks.
  while (!DirtySuccessors.empty()) {
    MachineBasicBlock &MBB = *DirtySuccessors.back();
    DirtySuccessors.pop_back();
    BlockState &BBState = BlockStates[MBB.getNumber()];

    // MBB is a successor of a dirty block, so its first call needs to be
    // guarded.
    if (BBState.FirstUnguardedCall != MBB.end())
      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);

    // If this successor was a pass-through block, then it is now dirty. Its
    // successors need to be added to the worklist (if they haven't been
    // already).
    if (BBState.ExitState == PASS_THROUGH) {
      LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber()
                        << " was Pass-through, is now Dirty-out.\n");
      for (MachineBasicBlock *Succ : MBB.successors())
        addDirtySuccessor(*Succ);
    }
  }

  BlockStates.clear();
  return EverMadeChange;
}