//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file contains implementations for different VPlan recipes. /// //===----------------------------------------------------------------------===// #include "VPlan.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include using namespace llvm; using VectorParts = SmallVector; extern cl::opt EnableVPlanNativePath; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { return cast(this)->isStore(); } case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) ->mayWriteToMemory(); case VPBranchOnMaskSC: case VPScalarIVStepsSC: return false; case VPWidenIntOrFpInductionSC: case VPWidenCanonicalIVSC: case VPWidenPHISC: case VPBlendSC: case VPWidenSC: case VPWidenGEPSC: case VPReductionSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; assert((!I || !I->mayWriteToMemory()) && "underlying instruction may write to memory"); return false; } default: return true; } } bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { return !cast(this)->isStore(); } case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) ->mayReadFromMemory(); case VPBranchOnMaskSC: case VPScalarIVStepsSC: return false; case VPWidenIntOrFpInductionSC: case VPWidenCanonicalIVSC: case VPWidenPHISC: case VPBlendSC: case VPWidenSC: case VPWidenGEPSC: case VPReductionSC: case VPWidenSelectSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; assert((!I || !I->mayReadFromMemory()) && "underlying instruction may read from memory"); return false; } default: return true; } } bool VPRecipeBase::mayHaveSideEffects() const { switch (getVPDefID()) { case VPDerivedIVSC: case VPPredInstPHISC: return false; case VPWidenIntOrFpInductionSC: case VPWidenPointerInductionSC: case VPWidenCanonicalIVSC: case VPWidenPHISC: case VPBlendSC: case VPWidenSC: case VPWidenGEPSC: case VPReductionSC: case VPWidenSelectSC: case VPScalarIVStepsSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; assert((!I || !I->mayHaveSideEffects()) && "underlying instruction has side-effects"); return false; } case VPReplicateSC: { auto *R = cast(this); return R->getUnderlyingInstr()->mayHaveSideEffects(); } default: return true; } } void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { auto Lane = VPLane::getLastLaneForVF(State.VF); VPValue *ExitValue = getOperand(0); if (vputils::isUniformAfterVectorization(ExitValue)) Lane = VPLane::getFirstLane(); Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)), State.Builder.GetInsertBlock()); } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } void VPRecipeBase::insertBefore(VPBasicBlock &BB, iplist::iterator I) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(I == BB.end() || I->getParent() == &BB); Parent = &BB; BB.getRecipeList().insert(I, this); } void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); } void VPRecipeBase::removeFromParent() { assert(getParent() && "Recipe not in any VPBasicBlock"); getParent()->getRecipeList().remove(getIterator()); Parent = nullptr; } iplist::iterator VPRecipeBase::eraseFromParent() { assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { removeFromParent(); insertAfter(InsertPos); } void VPRecipeBase::moveBefore(VPBasicBlock &BB, iplist::iterator I) { removeFromParent(); insertBefore(BB, I); } void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilderBase &Builder = State.Builder; Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); State.set(this, V, Part); return; } switch (getOpcode()) { case VPInstruction::Not: { Value *A = State.get(getOperand(0), Part); Value *V = Builder.CreateNot(A, Name); State.set(this, V, Part); break; } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); Value *V = Builder.CreateICmpULE(IV, TC, Name); State.set(this, V, Part); break; } case Instruction::Select: { Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); State.set(this, V, Part); break; } case VPInstruction::ActiveLaneMask: { // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. Value *ScalarTC = State.get(getOperand(1), Part); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); State.set(this, Call, Part); break; } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. // // vector.ph: // v_init = vector(..., ..., ..., a[-1]) // br vector.body // // vector.body // i = phi [0, vector.ph], [i+4, vector.body] // v1 = phi [v_init, vector.ph], [v2, vector.body] // v2 = a[i, i+1, i+2, i+3]; // v3 = vector(v1(3), v2(0, 1, 2)) // For the first part, use the recurrence phi (v1), otherwise v2. auto *V1 = State.get(getOperand(0), 0); Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); if (!PartMinus1->getType()->isVectorTy()) { State.set(this, PartMinus1, Part); } else { Value *V2 = State.get(getOperand(1), Part); State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), Part); } break; } case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: { Value *Next = nullptr; if (Part == 0) { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; auto *Phi = State.get(getOperand(0), 0); // The loop step is equal to the vectorization factor (num of SIMD // elements) times the unroll factor (num of SIMD instructions). Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); } State.set(this, Next, Part); break; } case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::CanonicalIVIncrementForPartNUW: { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; auto *IV = State.get(getOperand(0), VPIteration(0, 0)); if (Part == 0) { State.set(this, IV, Part); break; } // The canonical IV is incremented by the vectorization factor (num of SIMD // elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); State.set(this, Next, Part); break; } case VPInstruction::BranchOnCond: { if (Part != 0) break; Value *Cond = State.get(getOperand(0), VPIteration(Part, 0)); VPRegionBlock *ParentRegion = getParent()->getParent(); VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); // Replace the temporary unreachable terminator with a new conditional // branch, hooking it up to backward destination for exiting blocks now and // to forward destination(s) later when they are created. BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr); if (getParent()->isExiting()) CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); CondBr->setSuccessor(0, nullptr); Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); break; } case VPInstruction::BranchOnCount: { if (Part != 0) break; // First create the compare. Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); Value *Cond = Builder.CreateICmpEQ(IV, TC); // Now create the branch. auto *Plan = getParent()->getPlan(); VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); // Replace the temporary unreachable terminator with a new conditional // branch, hooking it up to backward destination (the header) now and to the // forward destination (the exit/middle block) later when it is created. // Note that CreateCondBr expects a valid BB as first argument, so we need // to set it to nullptr later. BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), State.CFG.VPBB2IRBB[Header]); CondBr->setSuccessor(0, nullptr); Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); break; } default: llvm_unreachable("Unsupported opcode for instruction"); } } void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInstruction::dump() const { VPSlotTracker SlotTracker(getParent()->getPlan()); print(dbgs(), "", SlotTracker); } void VPInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; if (hasResult()) { printAsOperand(O, SlotTracker); O << " = "; } switch (getOpcode()) { case VPInstruction::Not: O << "not"; break; case VPInstruction::ICmpULE: O << "icmp ule"; break; case VPInstruction::SLPLoad: O << "combined load"; break; case VPInstruction::SLPStore: O << "combined store"; break; case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; case VPInstruction::CanonicalIVIncrement: O << "VF * UF + "; break; case VPInstruction::CanonicalIVIncrementNUW: O << "VF * UF +(nuw) "; break; case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; case VPInstruction::CanonicalIVIncrementForPart: O << "VF * Part + "; break; case VPInstruction::CanonicalIVIncrementForPartNUW: O << "VF * Part +(nuw) "; break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; default: O << Instruction::getOpcodeName(getOpcode()); } O << FMF; for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); } if (DL) { O << ", !dbg "; DL.print(O); } } #endif void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { // Make sure the VPInstruction is a floating-point operation. assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || Opcode == Instruction::FNeg || Opcode == Instruction::FSub || Opcode == Instruction::FDiv || Opcode == Instruction::FRem || Opcode == Instruction::FCmp) && "this op can't take fast-math flags"); FMF = FMFNew; } void VPWidenCallRecipe::execute(VPTransformState &State) { auto &CI = *cast(getUnderlyingInstr()); assert(!isa(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); State.setDebugLocFromInst(&CI); SmallVector Tys; for (Value *ArgOperand : CI.args()) Tys.push_back( ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue())); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector TysForDecl = {CI.getType()}; SmallVector Args; for (const auto &I : enumerate(operands())) { // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; if (VectorIntrinsicID == Intrinsic::not_intrinsic || !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), Part); else Arg = State.get(I.value(), VPIteration(0, 0)); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } Function *VectorF; if (VectorIntrinsicID != Intrinsic::not_intrinsic) { // Use vector version of the intrinsic. if (State.VF.isVector()) TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), State.VF); Module *M = State.Builder.GetInsertBlock()->getModule(); VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); } else { // Use vector version of the function call. const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/); #ifndef NDEBUG assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr && "Can't create vector function."); #endif VectorF = VFDatabase(CI).getVectorizedFunction(Shape); } SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); if (isa(V)) V->copyFastMathFlags(&CI); State.set(this, V, Part); State.addMetadata(V, &CI); } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CALL "; auto *CI = cast(getUnderlyingInstr()); if (CI->getType()->isVoidTy()) O << "void "; else { printAsOperand(O, SlotTracker); O << " = "; } O << "call @" << CI->getCalledFunction()->getName() << "("; printOperands(O, SlotTracker); O << ")"; if (VectorIntrinsicID) O << " (using vector intrinsic)"; else O << " (using library function)"; } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-SELECT "; printAsOperand(O, SlotTracker); O << " = select "; getOperand(0)->printAsOperand(O, SlotTracker); O << ", "; getOperand(1)->printAsOperand(O, SlotTracker); O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); O << (InvariantCond ? " (condition is loop invariant)" : ""); } #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { auto &I = *cast(getUnderlyingInstr()); State.setDebugLocFromInst(&I); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. auto *InvarCond = InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; for (unsigned Part = 0; Part < State.UF; ++Part) { Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); Value *Op0 = State.get(getOperand(1), Part); Value *Op1 = State.get(getOperand(2), Part); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel, Part); State.addMetadata(Sel, &I); } } void VPWidenRecipe::execute(VPTransformState &State) { auto &I = *cast(getUnderlyingValue()); auto &Builder = State.Builder; switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: case Instruction::PHI: case Instruction::GetElementPtr: case Instruction::Select: llvm_unreachable("This instruction is handled by a different recipe."); case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::FNeg: case Instruction::Mul: case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { // Just widen unops and binops. State.setDebugLocFromInst(&I); for (unsigned Part = 0; Part < State.UF; ++Part) { SmallVector Ops; for (VPValue *VPOp : operands()) Ops.push_back(State.get(VPOp, Part)); Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); if (auto *VecOp = dyn_cast(V)) { VecOp->copyIRFlags(&I); // If the instruction is vectorized and was in a basic block that needed // predication, we can't propagate poison-generating flags (nuw/nsw, // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } // Use this vector value for all users of the original instruction. State.set(this, V, Part); State.addMetadata(V, &I); } break; } case Instruction::Freeze: { State.setDebugLocFromInst(&I); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *Op = State.get(getOperand(0), Part); Value *Freeze = Builder.CreateFreeze(Op); State.set(this, Freeze, Part); } break; } case Instruction::ICmp: case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = cast(&I); State.setDebugLocFromInst(Cmp); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); Value *C = nullptr; if (FCmp) { // Propagate fast math flags. IRBuilder<>::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(Cmp->getFastMathFlags()); C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); } else { C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } State.set(this, C, Part); State.addMetadata(C, &I); } break; } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { auto *CI = cast(&I); State.setDebugLocFromInst(CI); /// Vectorize casts. Type *DestTy = (State.VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), State.VF); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *A = State.get(getOperand(0), Part); Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); State.set(this, Cast, Part); State.addMetadata(Cast, &I); } break; } default: // This instruction is not vectorized by simple widening. LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); llvm_unreachable("Unhandled instruction!"); } // end of switch. } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); const Instruction *UI = getUnderlyingInstr(); O << " = " << UI->getOpcodeName() << " "; if (auto *Cmp = dyn_cast(UI)) O << CmpInst::getPredicateName(Cmp->getPredicate()) << " "; printOperands(O, SlotTracker); } void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; if (getTruncInst()) { O << "\\l\""; O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; O << " +\n" << Indent << "\" "; getVPValue(0)->printAsOperand(O, SlotTracker); } else O << " " << VPlanIngredient(IV); O << ", "; getStepValue()->printAsOperand(O, SlotTracker); } #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); auto *StepC = dyn_cast(getInductionDescriptor().getStep()); return StartC && StartC->isZero() && StepC && StepC->isOne(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << Indent << "= DERIVED-IV "; getStartValue()->printAsOperand(O, SlotTracker); O << " + "; getCanonicalIV()->printAsOperand(O, SlotTracker); O << " * "; getStepValue()->printAsOperand(O, SlotTracker); if (IndDesc.getStep()->getType() != ResultTy) O << " (truncated to " << *ResultTy << ")"; } #endif #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << Indent << "= SCALAR-STEPS "; printOperands(O, SlotTracker); } #endif void VPWidenGEPRecipe::execute(VPTransformState &State) { auto *GEP = cast(getUnderlyingInstr()); // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP // results in a vector of pointers when at least one operand of the GEP // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { // If we are vectorizing, but the GEP has only loop-invariant operands, // the GEP we build (by only using vector-typed operands for // loop-varying values) would be a scalar pointer. Thus, to ensure we // produce a vector of pointers, we need to either arbitrarily pick an // operand to broadcast, or broadcast a clone of the original GEP. // Here, we broadcast a clone of the original. // // TODO: If at some point we decide to scalarize instructions having // loop-invariant operands, this special case will no longer be // required. We would add the scalarization decision to // collectLoopScalars() and teach getVectorValue() to broadcast // the lane-zero scalar value. auto *Clone = State.Builder.Insert(GEP->clone()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); State.set(this, EntryPart, Part); State.addMetadata(EntryPart, GEP); } } else { // If the GEP has at least one loop-varying operand, we are sure to // produce a vector of pointers. But if we are only unrolling, we want // to produce a scalar GEP for each unroll part. Thus, the GEP we // produce with the code below will be scalar (if VF == 1) or vector // (otherwise). Note that for the unroll-only case, we still maintain // values in the vector mapping with initVector, as we do for other // instructions. for (unsigned Part = 0; Part < State.UF; ++Part) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. auto *Ptr = IsPtrLoopInvariant ? State.get(getOperand(0), VPIteration(0, 0)) : State.get(getOperand(0), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. SmallVector Indices; for (unsigned I = 1, E = getNumOperands(); I < E; I++) { VPValue *Operand = getOperand(I); if (IsIndexLoopInvariant[I - 1]) Indices.push_back(State.get(Operand, VPIteration(0, 0))); else Indices.push_back(State.get(Operand, Part)); } // If the GEP instruction is vectorized and was in a basic block that // needed predication, we can't propagate the poison-generating 'inbounds' // flag. The control flow has been linearized and the GEP is no longer // guarded by the predicate, which could make the 'inbounds' properties to // no longer hold. bool IsInBounds = GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; // Create the new GEP. Note that this GEP may be a scalar if VF == 1, // but it should be a vector, otherwise. auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices, "", IsInBounds); assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); State.set(this, NewGEP, Part); State.addMetadata(NewGEP, GEP); } } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; O << (IsPtrLoopInvariant ? "Inv" : "Var"); size_t IndicesNumber = IsIndexLoopInvariant.size(); for (size_t I = 0; I < IndicesNumber; ++I) O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; O << " "; printAsOperand(O, SlotTracker); O << " = getelementptr "; printOperands(O, SlotTracker); } #endif void VPBlendRecipe::execute(VPTransformState &State) { State.setDebugLocFromInst(Phi); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. unsigned NumIncoming = getNumIncomingValues(); // Generate a sequence of selects of the form: // SELECT(Mask3, In3, // SELECT(Mask2, In2, // SELECT(Mask1, In1, // In0))) // Note that Mask0 is never used: lanes for which no path reaches this phi and // are essentially undef are taken from In0. VectorParts Entry(State.UF); for (unsigned In = 0; In < NumIncoming; ++In) { for (unsigned Part = 0; Part < State.UF; ++Part) { // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. Value *In0 = State.get(getIncomingValue(In), Part); if (In == 0) Entry[Part] = In0; // Initialize with the first incoming value. else { // Select between the current value and the previous incoming edge // based on the incoming mask. Value *Cond = State.get(getMask(In), Part); Entry[Part] = State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); } } } for (unsigned Part = 0; Part < State.UF; ++Part) State.set(this, Entry[Part], Part); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; Phi->printAsOperand(O, false); O << " ="; if (getNumIncomingValues() == 1) { // Not a User of any mask: not really blending, this is a // single-predecessor phi. O << " "; getIncomingValue(0)->printAsOperand(O, SlotTracker); } else { for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { O << " "; getIncomingValue(I)->printAsOperand(O, SlotTracker); O << "/"; getMask(I)->printAsOperand(O, SlotTracker); } } } void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "REDUCE "; printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); O << " +"; if (isa(getUnderlyingInstr())) O << getUnderlyingInstr()->getFastMathFlags(); O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); } O << ")"; if (RdxDesc->IntermediateStore) O << " (with final reduction value stored in invariant address sank " "outside of loop)"; } void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); if (!getUnderlyingInstr()->getType()->isVoidTy()) { printAsOperand(O, SlotTracker); O << " = "; } if (auto *CB = dyn_cast(getUnderlyingInstr())) { O << "call @" << CB->getCalledFunction()->getName() << "("; interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), O, [&O, &SlotTracker](VPValue *Op) { Op->printAsOperand(O, SlotTracker); }); O << ")"; } else { O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; printOperands(O, SlotTracker); } if (AlsoPack) O << " (S->V)"; } #endif void VPBranchOnMaskRecipe::execute(VPTransformState &State) { assert(State.Instance && "Branch on Mask works only on single instance."); unsigned Part = State.Instance->Part; unsigned Lane = State.Instance->Lane.getKnownLane(); Value *ConditionBit = nullptr; VPValue *BlockInMask = getMask(); if (BlockInMask) { ConditionBit = State.get(BlockInMask, Part); if (ConditionBit->getType()->isVectorTy()) ConditionBit = State.Builder.CreateExtractElement( ConditionBit, State.Builder.getInt32(Lane)); } else // Block in mask is all-one. ConditionBit = State.Builder.getTrue(); // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); assert(isa(CurrentTerminator) && "Expected to replace unreachable terminator with conditional branch."); auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); CondBr->setSuccessor(0, nullptr); ReplaceInstWithInst(CurrentTerminator, CondBr); } void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = cast(State.get(getOperand(0), *State.Instance)); BasicBlock *PredicatedBB = ScalarPredInst->getParent(); BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); assert(PredicatingBB && "Predicated block has no single predecessor."); assert(isa(getOperand(0)) && "operand must be VPReplicateRecipe"); // By current pack/unpack logic we need to generate only a single phi node: if // a vector value for the predicated instruction exists at this point it means // the instruction has vector users only, and a phi for the vector value is // needed. In this case the recipe of the predicated instruction is marked to // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. unsigned Part = State.Instance->Part; if (State.hasVectorValue(getOperand(0), Part)) { Value *VectorValue = State.get(getOperand(0), Part); InsertElementInst *IEI = cast(VectorValue); PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. if (State.hasVectorValue(this, Part)) State.reset(this, VPhi, Part); else State.set(this, VPhi, Part); // NOTE: Currently we need to update the value of the operand, so the next // predicated iteration inserts its generated value in the correct vector. State.reset(getOperand(0), VPhi, Part); } else { Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); Phi->addIncoming(ScalarPredInst, PredicatedBB); if (State.hasScalarValue(this, *State.Instance)) State.reset(this, Phi, *State.Instance); else State.set(this, Phi, *State.Instance); // NOTE: Currently we need to update the value of the operand, so the next // predicated iteration inserts its generated value in the correct vector. State.reset(getOperand(0), Phi, *State.Instance); } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; printAsOperand(O, SlotTracker); O << " = "; printOperands(O, SlotTracker); } void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; if (!isStore()) { getVPSingleValue()->printAsOperand(O, SlotTracker); O << " = "; } O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; printOperands(O, SlotTracker); } #endif void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { Value *Start = getStartValue()->getLiveInIRValue(); PHINode *EntryPart = PHINode::Create( Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); EntryPart->addIncoming(Start, VectorPH); EntryPart->setDebugLoc(DL); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(this, EntryPart, Part); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = CANONICAL-INDUCTION"; } #endif bool VPCanonicalIVPHIRecipe::isCanonical(const InductionDescriptor &ID, Type *Ty) const { if (Ty != getScalarType()) return false; // The start value of ID must match the start value of this canonical // induction. if (getStartValue()->getLiveInIRValue() != ID.getStartValue()) return false; ConstantInt *Step = ID.getConstIntStepValue(); // ID must also be incremented by one. IK_IntInduction always increment the // induction by Step, but the binary op may not be set. return ID.getKind() == InductionDescriptor::IK_IntInduction && Step && Step->isOne(); } bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) { return IsScalarAfterVectorization && (!VF.isScalable() || vputils::onlyFirstLaneUsed(this)); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = WIDEN-POINTER-INDUCTION "; getStartValue()->printAsOperand(O, SlotTracker); O << ", " << *IndDesc.getStep(); } #endif void VPExpandSCEVRecipe::execute(VPTransformState &State) { assert(!State.Instance && "cannot be used in per-lane"); const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); SCEVExpander Exp(SE, DL, "induction"); Value *Res = Exp.expandCodeFor(Expr, Expr->getType(), &*State.Builder.GetInsertPoint()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) State.set(this, Res, Part); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; getVPSingleValue()->printAsOperand(O, SlotTracker); O << " = EXPAND SCEV " << *Expr; } #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.get(getOperand(0), 0); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; Value *VStart = VF.isScalar() ? CanonicalIV : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { Value *VStep = createStepForVF(Builder, STy, VF, Part); if (VF.isVector()) { VStep = Builder.CreateVectorSplat(VF, VStep); VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); } Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(this, CanonicalVectorIV, Part); } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION "; printOperands(O, SlotTracker); } #endif void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; // Create a vector from the initial value. auto *VectorInit = getStartValue()->getLiveInIRValue(); Type *VecTy = State.VF.isScalar() ? VectorInit->getType() : VectorType::get(VectorInit->getType(), State.VF); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); if (State.VF.isVector()) { auto *IdxTy = Builder.getInt32Ty(); auto *One = ConstantInt::get(IdxTy, 1); IRBuilder<>::InsertPointGuard Guard(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); auto *LastIdx = Builder.CreateSub(RuntimeVF, One); VectorInit = Builder.CreateInsertElement( PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); } // Create a phi node for the new recurrence. PHINode *EntryPart = PHINode::Create( VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); EntryPart->addIncoming(VectorInit, VectorPH); State.set(this, EntryPart, 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); } #endif void VPReductionPHIRecipe::execute(VPTransformState &State) { PHINode *PN = cast(getUnderlyingValue()); auto &Builder = State.Builder; // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; Type *VecTy = ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentVectorLoop->getHeader() == HeaderBB && "recipe must be in the vector loop header"); unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { Value *EntryPart = PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); State.set(this, EntryPart, Part); } BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); Value *StartV = StartVPV->getLiveInIRValue(); Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { // MinMax reduction have the start value as their identify. if (ScalarPHI) { Iden = StartV; } else { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); StartV = Iden = Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); } } else { Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); if (!ScalarPHI) { Iden = Builder.CreateVectorSplat(State.VF, Iden); IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); Constant *Zero = Builder.getInt32(0); StartV = Builder.CreateInsertElement(Iden, StartV, Zero); } } for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { Value *EntryPart = State.get(this, Part); // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? StartV : Iden; cast(EntryPart)->addIncoming(StartVal, VectorPH); } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-REDUCTION-PHI "; printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); } #endif void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); // Currently we enter here in the VPlan-native path for non-induction // PHIs where all control flow is uniform. We simply widen these PHIs. // Create a vector phi with no operands - the vector phi operands will be // set at the end of vector code generation. VPBasicBlock *Parent = getParent(); VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion(); unsigned StartIdx = 0; // For phis in header blocks of loop regions, use the index of the value // coming from the preheader. if (LoopRegion->getEntryBasicBlock() == Parent) { for (unsigned I = 0; I < getNumOperands(); ++I) { if (getIncomingBlock(I) == LoopRegion->getSinglePredecessor()->getExitingBasicBlock()) StartIdx = I; } } Value *Op0 = State.get(getOperand(StartIdx), 0); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi"); State.set(this, VecPhi, 0); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-PHI "; auto *OriginalPhi = cast(getUnderlyingValue()); // Unless all incoming values are modeled in VPlan print the original PHI // directly. // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming // values as VPValues. if (getNumOperands() != OriginalPhi->getNumOperands()) { O << VPlanIngredient(OriginalPhi); return; } printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); } #endif // TODO: It would be good to use the existing VPWidenPHIRecipe instead and // remove VPActiveLaneMaskPHIRecipe. void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { Value *StartMask = State.get(getOperand(0), Part); PHINode *EntryPart = State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); EntryPart->addIncoming(StartMask, VectorPH); EntryPart->setDebugLoc(DL); State.set(this, EntryPart, Part); } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "ACTIVE-LANE-MASK-PHI "; printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); } #endif