//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass implements the Bottom Up SLP vectorizer. It detects consecutive // stores that can be put together into vector-stores. Next, it attempts to // construct vectorizable tree using the use-def chains. If a profitable tree // was found, the SLP vectorizer performs vectorization on the tree. // // The pass is inspired by the work described in the paper: // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include 
"llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #ifdef EXPENSIVE_CHECKS #include "llvm/IR/Verifier.h" #endif #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DOTGraphTraits.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/InstructionCost.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; using namespace llvm::PatternMatch; using namespace slpvectorizer; #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); cl::opt RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes")); static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 
cl::desc("Only vectorize if you gain more than this " "number ")); static cl::opt ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); static cl::opt ShouldStartVectorizeHorAtStore( "slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); static cl::opt MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)")); static cl::opt MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. /// This limit is way higher than needed by real-world functions. static cl::opt ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block")); static cl::opt MinVectorRegSizeOption( "slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); static cl::opt RecursionMaxDepth( "slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree")); static cl::opt MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); // The maximum depth that the look-ahead score heuristic will explore. // The higher this value, the higher the compilation time overhead. 
static cl::opt LookAheadMaxDepth( "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores")); // The maximum depth that the look-ahead score heuristic will explore // when it probing among candidates for vectorization tree roots. // The higher this value, the higher the compilation time overhead but unlike // similar limit for operands ordering this is less frequently used, hence // impact of higher value is less noticeable. static cl::opt RootLookAheadMaxDepth( "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option")); static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; // Another limit for the alias checks: The maximum distance between load/store // instructions where alias checks are done. // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling /// regions to be handled. static const int MinScheduleRegionSize = 16; /// Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM /// vectors. We also filter target specific types which have absolutely no /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just /// avoids spending time checking the cost model and realizing that they will /// be inevitably scalarized. static bool isValidElementType(Type *Ty) { return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty(); } /// \returns True if the value is a constant (but not globals/constant /// expressions). 
static bool isConstant(Value *V) { return isa(V) && !isa(V); } /// Checks if \p V is one of vector-like instructions, i.e. undef, /// insertelement/extractelement with constant indices for fixed vector type or /// extractvalue instruction. static bool isVectorLikeInstWithConstOps(Value *V) { if (!isa(V) && !isa(V)) return false; auto *I = dyn_cast(V); if (!I || isa(I)) return true; if (!isa(I->getOperand(0)->getType())) return false; if (isa(I)) return isConstant(I->getOperand(1)); assert(isa(V) && "Expected only insertelement."); return isConstant(I->getOperand(2)); } /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. static bool allSameBlock(ArrayRef VL) { Instruction *I0 = dyn_cast(VL[0]); if (!I0) return false; if (all_of(VL, isVectorLikeInstWithConstOps)) return true; BasicBlock *BB = I0->getParent(); for (int I = 1, E = VL.size(); I < E; I++) { auto *II = dyn_cast(VL[I]); if (!II) return false; if (BB != II->getParent()) return false; } return true; } /// \returns True if all of the values in \p VL are constants (but not /// globals/constant expressions). static bool allConstant(ArrayRef VL) { // Constant expressions and globals can't be vectorized like normal integer/FP // constants. return all_of(VL, isConstant); } /// \returns True if all of the values in \p VL are identical or some of them /// are UndefValue. static bool isSplat(ArrayRef VL) { Value *FirstNonUndef = nullptr; for (Value *V : VL) { if (isa(V)) continue; if (!FirstNonUndef) { FirstNonUndef = V; continue; } if (V != FirstNonUndef) return false; } return FirstNonUndef != nullptr; } /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. 
static bool isCommutative(Instruction *I) { if (auto *Cmp = dyn_cast(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast(I)) return BO->isCommutative(); // TODO: This should check for generic Instruction::isCommutative(), but // we need to confirm that the caller code correctly handles Intrinsics // for example (does not have 2 operands). return false; } /// \returns inserting index of InsertElement or InsertValue instruction, /// using Offset as base offset for index. static std::optional getInsertIndex(const Value *InsertInst, unsigned Offset = 0) { int Index = Offset; if (const auto *IE = dyn_cast(InsertInst)) { const auto *VT = dyn_cast(IE->getType()); if (!VT) return std::nullopt; const auto *CI = dyn_cast(IE->getOperand(2)); if (!CI) return std::nullopt; if (CI->getValue().uge(VT->getNumElements())) return std::nullopt; Index *= VT->getNumElements(); Index += CI->getZExtValue(); return Index; } const auto *IV = cast(InsertInst); Type *CurrentType = IV->getType(); for (unsigned I : IV->indices()) { if (const auto *ST = dyn_cast(CurrentType)) { Index *= ST->getNumElements(); CurrentType = ST->getElementType(I); } else if (const auto *AT = dyn_cast(CurrentType)) { Index *= AT->getNumElements(); CurrentType = AT->getElementType(); } else { return std::nullopt; } Index += I; } return Index; } namespace { /// Specifies the way the mask should be analyzed for undefs/poisonous elements /// in the shuffle mask. enum class UseMask { FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors, ///< check for the mask elements for the first argument (mask ///< indices are in range [0:VF)). SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check ///< for the mask elements for the second argument (mask indices ///< are in range [VF:2*VF)) UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for ///< future shuffle elements and mark them as ones as being used ///< in future. 
Non-undef elements are considered as unused since ///< they're already marked as used in the mask. }; } // namespace /// Prepares a use bitset for the given mask either for the first argument or /// for the second. static SmallBitVector buildUseMask(int VF, ArrayRef Mask, UseMask MaskArg) { SmallBitVector UseMask(VF, true); for (auto P : enumerate(Mask)) { if (P.value() == UndefMaskElem) { if (MaskArg == UseMask::UndefsAsMask) UseMask.reset(P.index()); continue; } if (MaskArg == UseMask::FirstArg && P.value() < VF) UseMask.reset(P.value()); else if (MaskArg == UseMask::SecondArg && P.value() >= VF) UseMask.reset(P.value() - VF); } return UseMask; } /// Checks if the given value is actually an undefined constant vector. /// Also, if the \p UseMask is not empty, tries to check if the non-masked /// elements actually mask the insertelement buildvector, if any. template static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask = {}) { SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true); using T = std::conditional_t; if (isa(V)) return Res; auto *VecTy = dyn_cast(V->getType()); if (!VecTy) return Res.reset(); auto *C = dyn_cast(V); if (!C) { if (!UseMask.empty()) { const Value *Base = V; while (auto *II = dyn_cast(Base)) { if (isa(II->getOperand(1))) continue; Base = II->getOperand(0); std::optional Idx = getInsertIndex(II); if (!Idx) continue; if (*Idx < UseMask.size() && !UseMask.test(*Idx)) Res.reset(*Idx); } // TODO: Add analysis for shuffles here too. 
if (V == Base) { Res.reset(); } else { SmallBitVector SubMask(UseMask.size(), false); Res &= isUndefVector(Base, SubMask); } } else { Res.reset(); } return Res; } for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { if (Constant *Elem = C->getAggregateElement(I)) if (!isa(Elem) && (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I)))) Res.reset(I); } return Res; } /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 /// %x0x0 = mul i8 %x0, %x0 /// %x3x3 = mul i8 %x3, %x3 /// %y1y1 = mul i8 %y1, %y1 /// %y2y2 = mul i8 %y2, %y2 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 /// ret <4 x i8> %ins4 /// can be transformed into: /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 /// We convert this initially to something like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 /// %y1 = extractelement <4 x i8> %y, i32 1 /// %y2 = extractelement <4 x i8> %y, i32 2 /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 /// %5 = mul <4 x i8> %4, %4 /// %6 = extractelement <4 x i8> %5, i32 0 /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 /// %7 = extractelement <4 x i8> %5, i32 1 /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 /// %8 = extractelement <4 x i8> %5, i32 2 /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 /// %9 = extractelement <4 x i8> %5, i32 3 /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 
/// ret <4 x i8> %ins4 /// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? static std::optional isFixedVectorShuffle(ArrayRef VL, SmallVectorImpl &Mask) { const auto *It = find_if(VL, [](Value *V) { return isa(V); }); if (It == VL.end()) return std::nullopt; auto *EI0 = cast(*It); if (isa(EI0->getVectorOperandType())) return std::nullopt; unsigned Size = cast(EI0->getVectorOperandType())->getNumElements(); Value *Vec1 = nullptr; Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { // Undef can be represented as an undef element in a vector. if (isa(VL[I])) continue; auto *EI = cast(VL[I]); if (isa(EI->getVectorOperandType())) return std::nullopt; auto *Vec = EI->getVectorOperand(); // We can extractelement from undef or poison vector. if (isUndefVector(Vec).all()) continue; // All vector operands must have the same number of vector elements. if (cast(Vec->getType())->getNumElements() != Size) return std::nullopt; if (isa(EI->getIndexOperand())) continue; auto *Idx = dyn_cast(EI->getIndexOperand()); if (!Idx) return std::nullopt; // Undefined behavior if Idx is negative or >= Size. if (Idx->getValue().uge(Size)) continue; unsigned IntIdx = Idx->getValue().getZExtValue(); Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; Mask[I] += Size; } else { return std::nullopt; } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a // permutation. 
if (IntIdx != I) { CommonShuffleMode = Permute; continue; } CommonShuffleMode = Select; } // If we're not crossing lanes in different vectors, consider it as blending. if (CommonShuffleMode == Select && Vec2) return TargetTransformInfo::SK_Select; // If Vec2 was never used, we have a permutation of a single vector, otherwise // we have permutation of 2 vectors. return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc : TargetTransformInfo::SK_PermuteSingleSrc; } /// \returns True if Extract{Value,Element} instruction extracts element Idx. static std::optional getExtractIndex(Instruction *E) { unsigned Opcode = E->getOpcode(); assert((Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && "Expected extractelement or extractvalue instruction."); if (Opcode == Instruction::ExtractElement) { auto *CI = dyn_cast(E->getOperand(1)); if (!CI) return std::nullopt; return CI->getZExtValue(); } auto *EI = cast(E); if (EI->getNumIndices() != 1) return std::nullopt; return *EI->idx_begin(); } namespace { /// Main data required for vectorization of instructions. struct InstructionsState { /// The very first instruction in the list with the main opcode. Value *OpValue = nullptr; /// The main/alternate instruction. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; /// The main/alternate opcodes for the list of instructions. unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; } unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; } /// Some of the instructions in the list have alternate opcodes. 
bool isAltShuffle() const { return AltOp != MainOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } InstructionsState() = delete; InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} }; } // end anonymous namespace /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p /// OpValue. static Value *isOneOf(const InstructionsState &S, Value *Op) { auto *I = dyn_cast(Op); if (I && S.isOpcodeOrAlt(I)) return Op; return S.OpValue; } /// \returns true if \p Opcode is allowed as part of of the main/alternate /// instruction for SLP vectorization. /// /// Example of unsupported opcode is SDIV that can potentially cause UB if the /// "shuffled out" lane would result in division by zero. static bool isValidForAlternation(unsigned Opcode) { if (Instruction::isIntDivRem(Opcode)) return false; return true; } static InstructionsState getSameOpcode(ArrayRef VL, const TargetLibraryInfo &TLI, unsigned BaseIndex = 0); /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. /// compatible instructions or constants, or just some other regular values. static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI) { return (isConstant(BaseOp0) && isConstant(Op0)) || (isConstant(BaseOp1) && isConstant(Op1)) || (!isa(BaseOp0) && !isa(Op0) && !isa(BaseOp1) && !isa(Op1)) || BaseOp0 == Op0 || BaseOp1 == Op1 || getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() || getSameOpcode({BaseOp1, Op1}, TLI).getOpcode(); } /// \returns true if a compare instruction \p CI has similar "look" and /// same predicate as \p BaseCI, "as is" or with its operands and predicate /// swapped, false otherwise. 
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI) { assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && "Assessing comparisons of different types?"); CmpInst::Predicate BasePred = BaseCI->getPredicate(); CmpInst::Predicate Pred = CI->getPredicate(); CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred); Value *BaseOp0 = BaseCI->getOperand(0); Value *BaseOp1 = BaseCI->getOperand(1); Value *Op0 = CI->getOperand(0); Value *Op1 = CI->getOperand(1); return (BasePred == Pred && areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) || (BasePred == SwappedPred && areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI)); } /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, const TargetLibraryInfo &TLI, unsigned BaseIndex) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); bool IsCmpOp = isa(VL[BaseIndex]); CmpInst::Predicate BasePred = IsCmpOp ? cast(VL[BaseIndex])->getPredicate() : CmpInst::BAD_ICMP_PREDICATE; unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). 
auto *IBase = cast(VL[BaseIndex]); Intrinsic::ID BaseID = 0; SmallVector BaseMappings; if (auto *CallBase = dyn_cast(IBase)) { BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI); BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase); if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) return InstructionsState(VL[BaseIndex], nullptr, nullptr); } for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { auto *I = cast(VL[Cnt]); unsigned InstOpcode = I->getOpcode(); if (IsBinOp && isa(I)) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && isValidForAlternation(Opcode)) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; } } else if (IsCastOp && isa(I)) { Value *Op0 = IBase->getOperand(0); Type *Ty0 = Op0->getType(); Value *Op1 = I->getOperand(0); Type *Ty1 = Op1->getType(); if (Ty0 == Ty1) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { assert(isValidForAlternation(Opcode) && isValidForAlternation(InstOpcode) && "Cast isn't safe for alternation, logic needs to be updated!"); AltOpcode = InstOpcode; AltIndex = Cnt; continue; } } } else if (auto *Inst = dyn_cast(VL[Cnt]); Inst && IsCmpOp) { auto *BaseInst = cast(VL[BaseIndex]); Type *Ty0 = BaseInst->getOperand(0)->getType(); Type *Ty1 = Inst->getOperand(0)->getType(); if (Ty0 == Ty1) { assert(InstOpcode == Opcode && "Expected same CmpInst opcode."); // Check for compatible operands. If the corresponding operands are not // compatible - need to perform alternate vectorization. 
CmpInst::Predicate CurrentPred = Inst->getPredicate(); CmpInst::Predicate SwappedCurrentPred = CmpInst::getSwappedPredicate(CurrentPred); if (E == 2 && (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) continue; if (isCmpSameOrSwapped(BaseInst, Inst, TLI)) continue; auto *AltInst = cast(VL[AltIndex]); if (AltIndex != BaseIndex) { if (isCmpSameOrSwapped(AltInst, Inst, TLI)) continue; } else if (BasePred != CurrentPred) { assert( isValidForAlternation(InstOpcode) && "CmpInst isn't safe for alternation, logic needs to be updated!"); AltIndex = Cnt; continue; } CmpInst::Predicate AltPred = AltInst->getPredicate(); if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || AltPred == CurrentPred || AltPred == SwappedCurrentPred) continue; } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) { if (auto *Gep = dyn_cast(I)) { if (Gep->getNumOperands() != 2 || Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType()) return InstructionsState(VL[BaseIndex], nullptr, nullptr); } else if (auto *EI = dyn_cast(I)) { if (!isVectorLikeInstWithConstOps(EI)) return InstructionsState(VL[BaseIndex], nullptr, nullptr); } else if (auto *LI = dyn_cast(I)) { auto *BaseLI = cast(IBase); if (!LI->isSimple() || !BaseLI->isSimple()) return InstructionsState(VL[BaseIndex], nullptr, nullptr); } else if (auto *Call = dyn_cast(I)) { auto *CallBase = cast(IBase); if (Call->getCalledFunction() != CallBase->getCalledFunction()) return InstructionsState(VL[BaseIndex], nullptr, nullptr); if (Call->hasOperandBundles() && !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), Call->op_begin() + Call->getBundleOperandsEndIndex(), CallBase->op_begin() + CallBase->getBundleOperandsStartIndex())) return InstructionsState(VL[BaseIndex], nullptr, nullptr); Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI); if (ID != BaseID) return InstructionsState(VL[BaseIndex], nullptr, nullptr); if (!ID) { SmallVector Mappings = VFDatabase(*Call).getMappings(*Call); 
if (Mappings.size() != BaseMappings.size() || Mappings.front().ISA != BaseMappings.front().ISA || Mappings.front().ScalarName != BaseMappings.front().ScalarName || Mappings.front().VectorName != BaseMappings.front().VectorName || Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || Mappings.front().Shape.Parameters != BaseMappings.front().Shape.Parameters) return InstructionsState(VL[BaseIndex], nullptr, nullptr); } } continue; } return InstructionsState(VL[BaseIndex], nullptr, nullptr); } return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), cast(VL[AltIndex])); } /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) return false; return true; } /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI) { unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { LoadInst *LI = cast(UserInst); return (LI->getPointerOperand() == Scalar); } case Instruction::Store: { StoreInst *SI = cast(UserInst); return (SI->getPointerOperand() == Scalar); } case Instruction::Call: { CallInst *CI = cast(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) { if (isVectorIntrinsicWithScalarOpAtArg(ID, i)) return (CI->getArgOperand(i) == Scalar); } [[fallthrough]]; } default: return false; } } /// \returns the AA location that is being access by the instruction. static MemoryLocation getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast(I)) return MemoryLocation::get(LI); return MemoryLocation(); } /// \returns True if the instruction is not a volatile or atomic load/store. 
static bool isSimple(Instruction *I) { if (LoadInst *LI = dyn_cast(I)) return LI->isSimple(); if (StoreInst *SI = dyn_cast(I)) return SI->isSimple(); if (MemIntrinsic *MI = dyn_cast(I)) return !MI->isVolatile(); return true; } /// Shuffles \p Mask in accordance with the given \p SubMask. static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask) { if (SubMask.empty()) return; if (Mask.empty()) { Mask.append(SubMask.begin(), SubMask.end()); return; } SmallVector NewMask(SubMask.size(), UndefMaskElem); int TermValue = std::min(Mask.size(), SubMask.size()); for (int I = 0, E = SubMask.size(); I < E; ++I) { if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || Mask[SubMask[I]] >= TermValue) continue; NewMask[I] = Mask[SubMask[I]]; } Mask.swap(NewMask); } /// Order may have elements assigned special value (size) which is out of /// bounds. Such indices only appear on places which correspond to undef values /// (see canReuseExtract for details) and used in order to avoid undef values /// have effect on operands ordering. /// The first loop below simply finds all unused indices and then the next loop /// nest assigns these indices for undef values positions. 
/// As an example below Order has two undef positions and they have assigned /// values 3 and 7 respectively: /// before: 6 9 5 4 9 2 1 0 /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl &Order) { const unsigned Sz = Order.size(); SmallBitVector UnusedIndices(Sz, /*t=*/true); SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) UnusedIndices.reset(Order[I]); else MaskedIndices.set(I); } if (MaskedIndices.none()) return; assert(UnusedIndices.count() == MaskedIndices.count() && "Non-synced masked/available indices."); int Idx = UnusedIndices.find_first(); int MIdx = MaskedIndices.find_first(); while (MIdx >= 0) { assert(Idx >= 0 && "Indices must be synced."); Order[MIdx] = Idx; Idx = UnusedIndices.find_next(Idx); MIdx = MaskedIndices.find_next(MIdx); } } namespace llvm { static void inversePermutation(ArrayRef Indices, SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } /// Reorders the list of scalars in accordance with the given \p Mask. static void reorderScalars(SmallVectorImpl &Scalars, ArrayRef Mask) { assert(!Mask.empty() && "Expected non-empty mask."); SmallVector Prev(Scalars.size(), UndefValue::get(Scalars.front()->getType())); Prev.swap(Scalars); for (unsigned I = 0, E = Prev.size(); I < E; ++I) if (Mask[I] != UndefMaskElem) Scalars[Mask[I]] = Prev[I]; } /// Checks if the provided value does not require scheduling. It does not /// require scheduling if this is not an instruction or it is an instruction /// that does not read/write memory and all operands are either not instructions /// or phi nodes or instructions from different blocks. 
static bool areAllOperandsNonInsts(Value *V) { auto *I = dyn_cast(V); if (!I) return true; return !mayHaveNonDefUseDependency(*I) && all_of(I->operands(), [I](Value *V) { auto *IO = dyn_cast(V); if (!IO) return true; return isa(IO) || IO->getParent() != I->getParent(); }); } /// Checks if the provided value does not require scheduling. It does not /// require scheduling if this is not an instruction or it is an instruction /// that does not read/write memory and all users are phi nodes or instructions /// from the different blocks. static bool isUsedOutsideBlock(Value *V) { auto *I = dyn_cast(V); if (!I) return true; // Limits the number of uses to save compile time. constexpr int UsesLimit = 8; return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && all_of(I->users(), [I](User *U) { auto *IU = dyn_cast(U); if (!IU) return true; return IU->getParent() != I->getParent() || isa(IU); }); } /// Checks if the specified value does not require scheduling. It does not /// require scheduling if all operands and all users do not need to be scheduled /// in the current basic block. static bool doesNotNeedToBeScheduled(Value *V) { return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); } /// Checks if the specified array of instructions does not require scheduling. /// It is so if all either instructions have operands that do not require /// scheduling or their users do not require scheduling since they are phis or /// in other basic blocks. static bool doesNotNeedToSchedule(ArrayRef VL) { return !VL.empty() && (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); } namespace slpvectorizer { /// Bottom Up SLP Vectorizer. 
class BoUpSLP {
  // Forward declarations of nested helper types.
  struct TreeEntry;
  struct ScheduleData;
  class ShuffleInstructionBuilder;

public:
  // Container aliases used by the public interface below.
  using ValueList = SmallVector;
  using InstrList = SmallVector;
  using ValueSet = SmallPtrSet;
  using StoreList = SmallVector;
  using ExtraValueToDebugLocsMap = MapVector>;
  using OrdersType = SmallVector;

  /// Stores the function and the analyses, collects the ephemeral values of
  /// \p Func into EphValues, and initializes the maximum and minimum vector
  /// register sizes.
  ///
  /// \p Aa and \p Se are dereferenced here, so they must not be null.
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    // Likewise for the minimum: an explicitly given command-line option takes
    // precedence over the value reported by the target.
    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       Instruction *ReductionRoot = nullptr);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
InstructionCost getSpillCost() const;

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  InstructionCost getTreeCost(ArrayRef VectorizedVals = std::nullopt);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef Roots, const SmallDenseSet &UserIgnoreLst);

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef Roots);

  /// Checks if the very first tree node is going to be vectorized.
  /// \returns false when the tree is empty.
  bool isVectorizedFirstNode() const {
    return !VectorizableTree.empty() &&
           VectorizableTree.front()->State == TreeEntry::Vectorize;
  }

  /// Returns the main instruction for the very first node.
  /// Asserts that the tree is not empty.
  Instruction *getFirstNodeMainOp() const {
    assert(!VectorizableTree.empty() && "No tree to get the first node from");
    return VectorizableTree.front()->getMainOp();
  }

  /// Returns whether the root node has in-tree uses, i.e. whether the first
  /// tree entry has a non-empty UserTreeIndices list.
  /// \returns false when the tree is empty.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users. \p
  /// ExternallyUsedValues contains additional list of external uses to handle
  /// vectorization of reductions.
  void
  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    // The BlockScheduling objects stay in BlocksSchedules; only their state
    // is cleared.
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
  }

  /// \returns the number of entries in VectorizableTree.
  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence(); /// Checks if the specified gather tree entry \p TE can be represented as a /// shuffled vector entry + (possibly) permutation with other gathers. It /// implements the checks only for possibly ordered scalars (Loads, /// ExtractElement, ExtractValue), which can be part of the graph. std::optional findReusedOrderedScalars(const TreeEntry &TE); /// Sort loads into increasing pointers offsets to allow greater clustering. std::optional findPartiallyOrderedLoads(const TreeEntry &TE); /// Gets reordering data for the given tree entry. If the entry is vectorized /// - just return ReorderIndices, otherwise check if the scalars can be /// reordered and return the most optimal order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. std::optional getReorderingData(const TreeEntry &TE, bool TopToBottom); /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered /// parts of subgraph with smaller VF and they are reordered independently. We /// can make it because we still need to extend smaller nodes to the wider VF /// and we can merge reordering shuffles with the widening shuffles. void reorderTopToBottom(); /// Reorders the current graph to the most profitable order starting from /// leaves to the root. It allows to rotate small subgraphs and reduce the /// number of reshuffles if the leaf nodes use the same order. In this case we /// can merge the orders and just shuffle user node instead of shuffling its /// operands. Plus, even the leaf nodes have different orders, it allows to /// sink reordering in the graph closer to the root node and merge it later /// during analysis. 
void reorderBottomToTop(bool IgnoreReorder = false);

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  /// \returns maximum vector register size as set by TTI or overridden by
  /// cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  /// \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \returns the larger of 2 and getMinVecRegSize() / \p Sz.
  /// NOTE(review): \p Sz is used as a divisor and is not checked for zero
  /// here; presumably callers never pass 0 - confirm.
  unsigned getMinVF(unsigned Sz) const {
    return std::max(2U, getMinVecRegSize() / Sz);
  }

  /// \returns MaxVFOption when it was given on the command line, otherwise
  /// the value of TTI->getMaximumVF(ElemWidth, Opcode). A result of 0 is
  /// reported as UINT_MAX.
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Check if homogeneous aggregate is isomorphic to some VectorType.
  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
  /// can be load combined in the backend. Load combining may not be allowed in
  /// the IR optimizer, so we do not want to alter the pattern. For example,
  /// partially transforming a scalar bswap() pattern into vector code is
  /// effectively impossible for the backend to undo.
  /// TODO: If load combining is allowed in the IR optimizer, this analysis
  ///       may not be necessary.
  bool isLoadCombineCandidate() const;

  /// \returns the optimization remark emitter stored at construction time.
  OptimizationRemarkEmitter *getORE() { return ORE; }

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    /// Streams \p EI to \p OS using EdgeInfo::dump.
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print. Prints the index of the user entry (or "null" when there
    /// is no user entry) followed by the edge index.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    /// Debug print to dbgs().
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.
public:
    /// Stores the analyses, the owning vectorizer \p R, the number of lanes
    /// and the maximum recursion depth used when scoring.
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it shall
    // be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if all
    // scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in the different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86 for a 2-lane vector we can represent it
    /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
    /// a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      // The same value in both lanes.
      if (V1 == V2) {
        if (isa(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            static constexpr unsigned Limit = 8;
            if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
              return false;

            // A user counts as vectorized if it is U1 or U2 or if the
            // vectorizer already has a tree entry for it.
            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto *LI1 = dyn_cast(V1);
      auto *LI2 = dyn_cast(V2);
      if (LI1 && LI2) {
        // Only simple loads from the same block are considered.
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return LookAheadHeuristics::ScoreFail;

        std::optional Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        // Unknown or zero distance: the pair can still be a masked gather
        // candidate when both pointers share the underlying object and the
        // target reports a masked gather of this shape as legal.
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  FixedVectorType::get(LI1->getType(), NumLanes),
                  LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return LookAheadHeuristics::ScoreFail;
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may produce
        // better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast(V1);
      auto *C2 = dyn_cast(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2,
                  m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                         m_Undef())))) {
          // Undefs are always profitable for extractelements.
          // Ex2Idx stays null when the index matched m_Undef() instead of a
          // constant integer.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The same element of the same vector in both lanes.
            if (std::abs(Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          // Extracts from different vectors.
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return LookAheadHeuristics::ScoreFail;
      }

      auto *I1 = dyn_cast(V1);
      auto *I2 = dyn_cast(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return LookAheadHeuristics::ScoreFail;
        // Check I1 and I2 together with the main/alternate instructions that
        // were already chosen.
        SmallVector Ops(MainAltOps.begin(), MainAltOps.end());
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (isa(V2))
        return LookAheadHeuristics::ScoreUndef;

      return LookAheadHeuristics::ScoreFail;
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef MainAltOps) const {

      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      //  or if LHS and RHS are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive,
      //  or if profitable to vectorize loads or extractelements, early return
      //  the current cost.
      auto *I1 = dyn_cast(LHS);
      auto *I2 = dyn_cast(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa(I1) && isa(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa(I1) && isa(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possible
      // operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair the I1 operand at OpIdx1 with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations. Otherwise only the
        // operand at the same index is considered (an empty range if I2 has
        // fewer operands).
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip I2 operands that are already paired with an I1 operand.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, std::nullopt);
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };

  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///                         0 Op1
    ///                         |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
      /// (e.g., Add/Mul)
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For example,
    /// if the neighboring lane has a load, we need to look for a load that is
    /// accessing a consecutive address. These strategies are summarized in the
    /// 'ReorderingMode' enumerator.
enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector;

    /// A vector of operand vectors. It is indexed as OpsVec[OpIdx][Lane].
    SmallVector OpsVec;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2, within lane
    /// \p Lane only. The whole OperandData (value, APO and used flag) is
    /// swapped.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have power-of-2 unique
    /// elements in the lane, it will be vectorized with higher probability
    /// after removing duplicates. Currently the SLP vectorizer supports only
    /// vectorization of the power-of-2 number of unique scalars.
int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      // No extra score if the candidate fails the isa check or is the value
      // that already sits at OpIdx in this lane.
      if (!isa(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      // Collect the distinct values found at OpIdx in all the other lanes.
      SmallPtrSet Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      // Number of distinct values if the candidate / the current operand is
      // placed at OpIdx in this lane.
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      // Distance to the next power of two with the current operand minus the
      // same distance with the candidate.
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar which users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast(IdxLaneV);
      if (!IdxLaneI || !isa(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Allows better selection of the
    /// instructions with less external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score. This helps break ties in an informed way when we cannot decide on
    /// the order of the operands by just considering the immediate
    /// predecessors.
    ///
    /// \p IsUsed is set to true only when a non-zero score was not clamped
    /// to the minimum score of 1; otherwise it is left unchanged.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          // Set the minimum score for splat-like sequence to avoid setting
          // failed state.
          Score = 1;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best
          // compatible operand in general, just allows to select the
          // operand with all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }

    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
SmallDenseMap, unsigned, 8> BestScoresPerLanes;

    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                                 ArrayRef ReorderingModes,
                                 ArrayRef MainAltOps) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
      // are using the score to differentiate between the two.
      struct BestOpData {
        std::optional Idx;
        unsigned Score = 0;
      } BestOp;
      // Start from the best score recorded so far for {OpIdx, Lane}; an entry
      // with score 0 is created if there is none yet.
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non power-of-2 unique scalars, we may
      // want to reestimate the operands again on the following iterations).
      bool IsUsed =
          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Constant:
        case ReorderingMode::Opcode: {
          // Score the pair in lane order: the value from the lower lane is
          // passed as the left operand.
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast(BestOp.Score)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Splat:
          // The loop does not stop at the first match, so the last matching
          // operand index wins.
          if (Op == OpLastLane)
            BestOp.Idx = Idx;
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }

    /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about or
    /// less profitable because it already has the most optimal set of operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // lowest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          // New strict minimum: restart the voting from this lane.
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          // Same cost as the current best: add a vote for this ordering hash,
          // or register the lane if the hash was not seen yet.
          auto It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
          else
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }

    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      /// Used to count operands, actually their position id and opcode
      /// value. It is used in the voting mechanism to find the lane with the
      /// least number of operands that can freely move about or less profitable
      /// because it already has the most optimal set of operands. Can be
      /// replaced with SmallVector instead but hash code is faster
      /// and requires less memory.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be reordered
    /// for \p Lane and the number of compatible instructions (with the same
    /// parent/opcode). This is used as a heuristic for selecting the first lane
    /// to start operand reordering.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. We therefore need to count
      // how many of them we have for each APO, like this: Cnt[APO] = x.
      // Since we only have two APOs, namely true and false, we can avoid using
      // a map. Instead we can simply count the number of operands that
      // correspond to one of them (in this case the 'true' APO), and calculate
      // the other by subtracting it from the total number of operands.
      // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases, with a high
      // probability such lane already can be vectorized effectively.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        // Mix the operand position and the value ID of the operand into the
        // hash.
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa(OpData.V);
      }
      // Return default-constructed data (NumOfAPOs == UINT_MAX) if AllUndefs
      // is still set after the loop.
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }

    /// Go through the instructions in VL and append their operands.
void appendOperandsOfVL(ArrayRef VL) { assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); assert(isa(VL[0]) && "Expected instruction"); unsigned NumOperands = cast(VL[0])->getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { OpsVec[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { assert(isa(VL[Lane]) && "Expected instruction"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or // RHS operand. The LHS operand of both add and sub is never attached // to an inversese operation in the linearized form, therefore its APO // is false. The RHS is true only if VL[Lane] is an inverse operation. // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely // tell the inverse operations by checking commutativity. bool IsInverseOperation = !isCommutative(cast(VL[Lane])); bool APO = (OpIdx == 0) ? false : IsInverseOperation; OpsVec[OpIdx][Lane] = {cast(VL[Lane])->getOperand(OpIdx), APO, false}; } } } /// \returns the number of operands. unsigned getNumOperands() const { return OpsVec.size(); } /// \returns the number of lanes. unsigned getNumLanes() const { return OpsVec[0].size(); } /// \returns the operand value at \p OpIdx and \p Lane. Value *getValue(unsigned OpIdx, unsigned Lane) const { return getData(OpIdx, Lane).V; } /// \returns true if the data structure is empty. bool empty() const { return OpsVec.empty(); } /// Clears the data. void clear() { OpsVec.clear(); } /// \Returns true if there are enough operands identical to \p Op to fill /// the whole vector. /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. 
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { bool OpAPO = getData(OpIdx, Lane).APO; for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { if (Ln == Lane) continue; // This is set to true if we found a candidate for broadcast at Lane. bool FoundCandidate = false; for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { OperandData &Data = getData(OpI, Ln); if (Data.APO != OpAPO || Data.IsUsed) continue; if (Data.V == Op) { FoundCandidate = true; Data.IsUsed = true; break; } } if (!FoundCandidate) return false; } return true; } public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef RootVL, const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) : TLI(TLI), DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } /// \Returns a value vector with the operands across all lanes for the /// opearnd at \p OpIdx. ValueList getVL(unsigned OpIdx) const { ValueList OpVL(OpsVec[OpIdx].size()); assert(OpsVec[OpIdx].size() == getNumLanes() && "Expected same num of lanes across all operands"); for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) OpVL[Lane] = OpsVec[OpIdx][Lane].V; return OpVL; } // Performs operand reordering for 2 or more operands. // The original operands are in OrigOps[OpIdx][Lane]. // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. void reorder() { unsigned NumOperands = getNumOperands(); unsigned NumLanes = getNumLanes(); // Each operand has its own mode. We are using this mode to help us select // the instructions for each lane, so that they match best with the ones // we have selected so far. SmallVector ReorderingModes(NumOperands); // This is a greedy single-pass algorithm. We are going over each lane // once and deciding on the best order right away with no back-tracking. 
// However, in order to increase its effectiveness, we start with the lane // that has operands that can move the least. For example, given the // following lanes: // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd // Lane 1 : A[1] = C[1] - B[1] // Visited 1st // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd // Lane 3 : A[3] = C[3] - B[3] // Visited 4th // we will start at Lane 1, since the operands of the subtraction cannot // be reordered. Then we will visit the rest of the lanes in a circular // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. // Find the first lane that we will start our search from. unsigned FirstLane = getBestLaneToStartReordering(); // Initialize the modes. for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { Value *OpLane0 = getValue(OpIdx, FirstLane); // Keep track if we have instructions with all the same opcode on one // side. if (isa(OpLane0)) ReorderingModes[OpIdx] = ReorderingMode::Load; else if (isa(OpLane0)) { // Check if OpLane0 should be broadcast. if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) ReorderingModes[OpIdx] = ReorderingMode::Splat; else ReorderingModes[OpIdx] = ReorderingMode::Opcode; } else if (isa(OpLane0)) ReorderingModes[OpIdx] = ReorderingMode::Constant; else if (isa(OpLane0)) // Our best hope is a Splat. It may save some cost in some cases. ReorderingModes[OpIdx] = ReorderingMode::Splat; else // NOTE: This should be unreachable. ReorderingModes[OpIdx] = ReorderingMode::Failed; } // Check that we don't have same operands. No need to reorder if operands // are just perfect diamond or shuffled diamond match. Do not do it only // for possible broadcasts or non-power of 2 number of scalars (just for // now). 
auto &&SkipReordering = [this]() { SmallPtrSet UniqueValues; ArrayRef Op0 = OpsVec.front(); for (const OperandData &Data : Op0) UniqueValues.insert(Data.V); for (ArrayRef Op : drop_begin(OpsVec, 1)) { if (any_of(Op, [&UniqueValues](const OperandData &Data) { return !UniqueValues.contains(Data.V); })) return false; } // TODO: Check if we can remove a check for non-power-2 number of // scalars after full support of non-power-2 vectorization. return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); }; // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. for (int Pass = 0; Pass != 2; ++Pass) { // Check if no need to reorder operands since they're are perfect or // shuffled diamond match. // Need to to do it to avoid extra external use cost counting for // shuffled matches, which may cause regressions. if (SkipReordering()) break; // Skip the second pass if the first pass did not fail. bool StrategyFailed = false; // Mark all operand data as free to use. clearUsed(); // We keep the original operand order for the FirstLane, so reorder the // rest of the lanes. We are visiting the nodes in a circular fashion, // using FirstLane as the center point and increasing the radius // distance. SmallVector> MainAltOps(NumOperands); for (unsigned I = 0; I < NumOperands; ++I) MainAltOps[I].push_back(getData(I, FirstLane).V); for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { // Visit the lane on the right and then the lane on the left. for (int Direction : {+1, -1}) { int Lane = FirstLane + Direction * Distance; if (Lane < 0 || Lane >= (int)NumLanes) continue; int LastLane = Lane - Direction; assert(LastLane >= 0 && LastLane < (int)NumLanes && "Out of bounds"); // Look for a good match for each operand. 
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { // Search for the operand that matches SortedOps[OpIdx][Lane-1]. std::optional BestIdx = getBestOperand( OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]); // By not selecting a value, we allow the operands that follow to // select a better matching value. We will get a non-null value in // the next run of getBestOperand(). if (BestIdx) { // Swap the current operand with the one returned by // getBestOperand(). swap(OpIdx, *BestIdx, Lane); } else { // We failed to find a best operand, set mode to 'Failed'. ReorderingModes[OpIdx] = ReorderingMode::Failed; // Enable the second pass. StrategyFailed = true; } // Try to get the alternate opcode and follow it during analysis. if (MainAltOps[OpIdx].size() != 2) { OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); if (OpS.getOpcode() && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } } } } // Skip second pass if the strategy did not fail. if (!StrategyFailed) break; } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { switch (RMode) { case ReorderingMode::Load: return "Load"; case ReorderingMode::Opcode: return "Opcode"; case ReorderingMode::Constant: return "Constant"; case ReorderingMode::Splat: return "Splat"; case ReorderingMode::Failed: return "Failed"; } llvm_unreachable("Unimplemented Reordering Type"); } LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, raw_ostream &OS) { return OS << getModeStr(RMode); } /// Debug print. 
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { printMode(RMode, dbgs()); } friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { return printMode(RMode, OS); } LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { const unsigned Indent = 2; unsigned Cnt = 0; for (const OperandDataVec &OpDataVec : OpsVec) { OS << "Operand " << Cnt++ << "\n"; for (const OperandData &OpData : OpDataVec) { OS.indent(Indent) << "{"; if (Value *V = OpData.V) OS << *V; else OS << "null"; OS << ", APO:" << OpData.APO << "}\n"; } OS << "\n"; } return OS; } /// Debug print. LLVM_DUMP_METHOD void dump() const { print(dbgs()); } #endif }; /// Evaluate each pair in \p Candidates and return index into \p Candidates /// for a pair which have highest score deemed to have best chance to form /// root of profitable tree to vectorize. Return std::nullopt if no candidate /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit /// of the cost, considered to be good enough score. std::optional findBestRootPair(ArrayRef> Candidates, int Limit = LookAheadHeuristics::ScoreFail) { LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, RootLookAheadMaxDepth); int BestScore = Limit; std::optional Index; for (int I : seq(0, Candidates.size())) { int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, Candidates[I].second, /*U1=*/nullptr, /*U2=*/nullptr, /*Level=*/1, std::nullopt); if (Score > BestScore) { BestScore = Score; Index = I; } } return Index; } /// Checks if the instruction is marked for deletion. bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } /// Removes an instruction from its block and eventually deletes it. /// It's like Instruction::eraseFromParent() except that the actual deletion /// is delayed until BoUpSLP is destructed. void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); } /// Checks if the instruction was already analyzed for being possible /// reduction root. 
bool isAnalyzedReductionRoot(Instruction *I) const { return AnalyzedReductionsRoots.count(I); } /// Register given instruction as already analyzed for being possible /// reduction root. void analyzedReductionRoot(Instruction *I) { AnalyzedReductionsRoots.insert(I); } /// Checks if the provided list of reduced values was checked already for /// vectorization. bool areAnalyzedReductionVals(ArrayRef VL) const { return AnalyzedReductionVals.contains(hash_value(VL)); } /// Adds the list of reduced values to list of already checked values for the /// vectorization. void analyzedReductionVals(ArrayRef VL) { AnalyzedReductionVals.insert(hash_value(VL)); } /// Clear the list of the analyzed reduction root instructions. void clearReductionData() { AnalyzedReductionsRoots.clear(); AnalyzedReductionVals.clear(); } /// Checks if the given value is gathered in one of the nodes. bool isAnyGathered(const SmallDenseSet &Vals) const { return any_of(MustGather, [&](Value *V) { return Vals.contains(V); }); } /// Check if the value is vectorized in the tree. bool isVectorized(Value *V) const { return getTreeEntry(V); } ~BoUpSLP(); private: /// Check if the operands on the edges \p Edges of the \p UserTE allows /// reordering (i.e. the operands can be reordered because they have only one /// user and reordarable). /// \param ReorderableGathers List of all gather nodes that require reordering /// (e.g., gather of extractlements or partially vectorizable loads). /// \param GatherOps List of gather operand nodes for \p UserTE that require /// reordering, subset of \p NonVectorized. bool canReorderOperands(TreeEntry *UserTE, SmallVectorImpl> &Edges, ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps); /// Checks if the given \p TE is a gather node with clustered reused scalars /// and reorders it per given \p Mask. void reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const; /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, /// if any. 
If it is not vectorized (gather node), returns nullptr. TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { ArrayRef VL = UserTE->getOperand(OpIdx); TreeEntry *TE = nullptr; const auto *It = find_if(VL, [this, &TE](Value *V) { TE = getTreeEntry(V); return TE; }); if (It != VL.end() && TE->isSame(VL)) return TE; return nullptr; } /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, /// if any. If it is not vectorized (gather node), returns nullptr. const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE, unsigned OpIdx) const { return const_cast(this)->getVectorizedOperand( const_cast(UserTE), OpIdx); } /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I, ArrayRef VectorizedVals) const; /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef VL, unsigned OpIdx); /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef Roots, unsigned Depth, const EdgeInfo &EI); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. bool canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); /// Create a new vector from a list of scalar values. 
Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. Value *createBuildVector(const TreeEntry *E); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. If \p /// NeedToShuffle is true, need to add a cost of reshuffling some of the /// vector elements. InstructionCost getGatherCost(FixedVectorType *Ty, const APInt &ShuffledIndices, bool NeedToShuffle) const; /// Returns the instruction in the bundle, which can be used as a base point /// for scheduling. Usually it is the last instruction in the bundle, except /// for the case when all operands are external (in this case, it is the first /// instruction in the list). Instruction &getLastInstructionInBundle(const TreeEntry *E); /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Mask is filled with the shuffle mask. std::optional isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, SmallVectorImpl &Entries); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. InstructionCost getGatherCost(ArrayRef VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle void setInsertPointAfterBundle(const TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *gather(ArrayRef VL); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. 
bool isFullyVectorizableTinyTree(bool ForReduction) const; /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode( ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right, const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R); /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store /// pointers to the collected stores. DenseMap> collectUserStores(const BoUpSLP::TreeEntry *TE) const; /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the /// stores in \p StoresVec can form a vector instruction. If so it returns true /// and populates \p ReorderIndices with the shuffle indices of the the stores /// when compared to the sorted vector. bool canFormVector(const SmallVector &StoresVec, OrdersType &ReorderIndices) const; /// Iterates through the users of \p TE, looking for scalar stores that can be /// potentially vectorized in a future SLP-tree. If found, it keeps track of /// their order and builds an order index vector for each store bundle. It /// returns all these order vectors found. /// We run this after the tree has formed, otherwise we may come across user /// instructions that are not yet in the tree. SmallVector findExternalStoreUsersReorderIndices(TreeEntry *TE) const; struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. 
bool isSame(ArrayRef VL) const { auto &&IsSame = [VL](ArrayRef Scalars, ArrayRef Mask) { if (Mask.size() != VL.size() && VL.size() == Scalars.size()) return std::equal(VL.begin(), VL.end(), Scalars.begin()); return VL.size() == Mask.size() && std::equal(VL.begin(), VL.end(), Mask.begin(), [Scalars](Value *V, int Idx) { return (isa(V) && Idx == UndefMaskElem) || (Idx != UndefMaskElem && V == Scalars[Idx]); }); }; if (!ReorderIndices.empty()) { // TODO: implement matching if the nodes are just reordered, still can // treat the vector as the same if the list of scalars matches VL // directly, without reordering. SmallVector Mask; inversePermutation(ReorderIndices, Mask); if (VL.size() == Scalars.size()) return IsSame(Scalars, Mask); if (VL.size() == ReuseShuffleIndices.size()) { ::addMask(Mask, ReuseShuffleIndices); return IsSame(Scalars, Mask); } return false; } return IsSame(Scalars, ReuseShuffleIndices); } bool isOperandGatherNode(const EdgeInfo &UserEI) const { return State == TreeEntry::NeedToGather && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && UserTreeIndices.front().UserTE == UserEI.UserTE; } /// \returns true if current entry has same operands as \p TE. bool hasEqualOperands(const TreeEntry &TE) const { if (TE.getNumOperands() != getNumOperands()) return false; SmallBitVector Used(getNumOperands()); for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { unsigned PrevCount = Used.count(); for (unsigned K = 0; K < E; ++K) { if (Used.test(K)) continue; if (getOperand(K) == TE.getOperand(I)) { Used.set(K); break; } } // Check if we actually found the matching operand. if (PrevCount == Used.count()) return false; } return true; } /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. 
unsigned getVectorFactor() const { if (!ReuseShuffleIndices.empty()) return ReuseShuffleIndices.size(); return Scalars.size(); }; /// A vector of scalars. ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; EntryState State; /// Does this sequence require some shuffling? SmallVector ReuseShuffleIndices; /// Does this entry require reordering? SmallVector ReorderIndices; /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has /// to be a pointer and needs to be able to initialize the child iterator. /// Thus we need a reference back to the container to translate the indices /// to entries. VecTreeTy &Container; /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; /// The index of this treeEntry in VectorizableTree. int Idx = -1; private: /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector Operands; /// The main/alternate instruction. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. 
void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); assert(OpVL.size() <= Scalars.size() && "Number of operands is greater than the number of scalars."); Operands[OpIdx].resize(OpVL.size()); copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. void setOperandsInOrder() { assert(Operands.empty() && "Already initialized?"); auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); OpIdx != NumOperands; ++OpIdx) { Operands[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { auto *I = cast(Scalars[Lane]); assert(I->getNumOperands() == NumOperands && "Expected same number of operands"); Operands[OpIdx][Lane] = I->getOperand(OpIdx); } } } /// Reorders operands of the node to the given mask \p Mask. void reorderOperands(ArrayRef Mask) { for (ValueList &Operand : Operands) reorderScalars(Operand, Mask); } /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); return Operands[OpIdx]; } /// \returns the \p OpIdx operand of this TreeEntry. ArrayRef getOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); return Operands[OpIdx]; } /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); assert(!Operands[OpIdx].empty() && "No operand available"); return Operands[OpIdx][0]; } /// Some of the instructions in the list have alternate opcodes. 
bool isAltShuffle() const { return MainOp != AltOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return (getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode); } /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is /// \p OpValue. Value *isOneOf(Value *Op) const { auto *I = dyn_cast(Op); if (I && isOpcodeOrAlt(I)) return Op; return MainOp; } void setOperations(const InstructionsState &S) { MainOp = S.MainOp; AltOp = S.AltOp; } Instruction *getMainOp() const { return MainOp; } Instruction *getAltOp() const { return AltOp; } /// The main/alternate opcodes for the list of instructions. unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; } unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; } /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReorderIndices.empty()) FoundLane = ReorderIndices[FoundLane]; assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReuseShuffleIndices.empty()) { FoundLane = std::distance(ReuseShuffleIndices.begin(), find(ReuseShuffleIndices, FoundLane)); } return FoundLane; } #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dump() const { dbgs() << Idx << ".\n"; for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { dbgs() << "Operand " << OpI << ":\n"; for (const Value *V : Operands[OpI]) dbgs().indent(2) << *V << "\n"; } dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "State: "; switch (State) { case Vectorize: dbgs() << "Vectorize\n"; break; case ScatterVectorize: dbgs() << "ScatterVectorize\n"; break; case NeedToGather: dbgs() << "NeedToGather\n"; break; } dbgs() << "MainOp: "; if (MainOp) dbgs() << *MainOp << "\n"; else dbgs() << "NULL\n"; dbgs() << "AltOp: "; if (AltOp) dbgs() << *AltOp << "\n"; else dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue << "\n"; else dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; for (unsigned ReorderIdx : ReorderIndices) dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) dbgs() << EInfo << ", "; dbgs() << "\n"; } #endif }; #ifndef NDEBUG void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, InstructionCost VecCost, InstructionCost ScalarCost) const { dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); dbgs() << "SLP: Costs:\n"; dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; dbgs() << "SLP: VectorCost = " << VecCost << "\n"; dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << ReuseShuffleCost + VecCost - ScalarCost << "\n"; } #endif /// Create a new VectorizableTree entry. 
TreeEntry *newTreeEntry(ArrayRef VL, std::optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = std::nullopt, ArrayRef ReorderIndices = std::nullopt) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, ReuseShuffleIndices, ReorderIndices); } TreeEntry *newTreeEntry(ArrayRef VL, TreeEntry::EntryState EntryState, std::optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = std::nullopt, ArrayRef ReorderIndices = std::nullopt) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { Last->Scalars.assign(VL.begin(), VL.end()); Last->setOperations(S); } else { // Reorder scalars and build final mask. Last->Scalars.assign(VL.size(), nullptr); transform(ReorderIndices, Last->Scalars.begin(), [VL](unsigned Idx) -> Value * { if (Idx >= VL.size()) return UndefValue::get(VL.front()->getType()); return VL[Idx]; }); InstructionsState S = getSameOpcode(Last->Scalars, *TLI); Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. 
ScheduleData *BundleMember = *Bundle; assert((BundleMember || isa(S.MainOp) || isVectorLikeInstWithConstOps(S.MainOp) || doesNotNeedToSchedule(VL)) && "Bundle and VL out of sync"); if (BundleMember) { for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; assert(BundleMember && "Unexpected end of bundle."); BundleMember->TE = Last; BundleMember = BundleMember->NextInBundle; } } assert(!BundleMember && "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); return Last; } /// -- Vectorization State -- /// Holds all of the tree entries. TreeEntry::VecTreeTy VectorizableTree; #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dumpVectorizableTree() const { for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { VectorizableTree[Id]->dump(); dbgs() << "\n"; } } #endif TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } const TreeEntry *getTreeEntry(Value *V) const { return ScalarToTreeEntry.lookup(V); } /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; /// A map between the vectorized entries and the last instructions in the /// bundles. The bundles are built in use order, not in the def order of the /// instructions. So, we cannot rely directly on the last instruction in the /// bundle being the last instruction in the program order during /// vectorization process since the basic blocks are affected, need to /// pre-gather them before. DenseMap EntryToLastInstruction; /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) : Scalar(S), User(U), Lane(L) {} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. 
llvm::User *User; // Which lane does the scalar belong to. int Lane; }; using UserList = SmallVector; /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it /// is invariant in the calling loop. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { // First check if the result is already in the cache. AliasCacheKey key = std::make_pair(Inst1, Inst2); std::optional &result = AliasCache[key]; if (result) { return *result; } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. result = aliased; return aliased; } using AliasCacheKey = std::pair; /// Cache for alias results. /// TODO: consider moving this to the AliasAnalysis itself. DenseMap> AliasCache; // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which // invalidates capture results. BatchAAResults BatchAA; /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. The deferral is required to /// ensure that there are no incorrect collisions in the AliasCache, which /// can happen if a new instruction is allocated at the same address as a /// previously deleted instruction. DenseSet DeletedInstructions; /// Set of the instruction, being analyzed already for reductions. SmallPtrSet AnalyzedReductionsRoots; /// Set of hashes for the list of reduction values already being analyzed. DenseSet AnalyzedReductionVals; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User /// can be nullptr, it means that this Internal Scalar will be used later, /// after vectorization. UserList ExternalUses; /// Values used only by @llvm.assume calls. 
SmallPtrSet EphValues; /// Holds all of the instructions that we gathered, shuffle instructions and /// extractelements. SetVector GatherShuffleExtractSeq; /// A list of blocks that we are going to CSE. SetVector CSEBlocks; /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a /// vector instruction). struct ScheduleData { // The initial value for the dependency counters. It means that the // dependencies are not calculated yet. enum { InvalidDeps = -1 }; ScheduleData() = default; void init(int BlockSchedulingRegionID, Value *OpVal) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; SchedulingRegionID = BlockSchedulingRegionID; clearDependencies(); OpValue = OpVal; TE = nullptr; } /// Verify basic self consistency properties void verify() { if (hasValidDependencies()) { assert(UnscheduledDeps <= Dependencies && "invariant"); } else { assert(UnscheduledDeps == Dependencies && "invariant"); } if (IsScheduled) { assert(isSchedulingEntity() && "unexpected scheduled state"); for (const ScheduleData *BundleMember = this; BundleMember; BundleMember = BundleMember->NextInBundle) { assert(BundleMember->hasValidDependencies() && BundleMember->UnscheduledDeps == 0 && "unexpected scheduled state"); assert((BundleMember == this || !BundleMember->IsScheduled) && "only bundle is marked scheduled"); } } assert(Inst->getParent() == FirstInBundle->Inst->getParent() && "all bundle members must be in same basic block"); } /// Returns true if the dependency information has been calculated. /// Note that depenendency validity can vary between instructions within /// a single bundle. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } /// Returns true for single instructions and for bundle representatives /// (= the head of a bundle). 
bool isSchedulingEntity() const { return FirstInBundle == this; } /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { return NextInBundle != nullptr || FirstInBundle != this || TE; } /// Returns true if it is ready for scheduling, i.e. it has no more /// unscheduled depending instructions/bundles. bool isReady() const { assert(isSchedulingEntity() && "can't consider non-scheduling entity for ready list"); return unscheduledDepsInBundle() == 0 && !IsScheduled; } /// Modifies the number of unscheduled dependencies for this instruction, /// and returns the number of remaining dependencies for the containing /// bundle. int incrementUnscheduledDeps(int Incr) { assert(hasValidDependencies() && "increment of unscheduled deps would be meaningless"); UnscheduledDeps += Incr; return FirstInBundle->unscheduledDepsInBundle(); } /// Sets the number of unscheduled dependencies to the number of /// dependencies. void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } /// Clears all dependency information. void clearDependencies() { Dependencies = InvalidDeps; resetUnscheduledDeps(); MemoryDependencies.clear(); ControlDependencies.clear(); } int unscheduledDepsInBundle() const { assert(isSchedulingEntity() && "only meaningful on the bundle"); int Sum = 0; for (const ScheduleData *BundleMember = this; BundleMember; BundleMember = BundleMember->NextInBundle) { if (BundleMember->UnscheduledDeps == InvalidDeps) return InvalidDeps; Sum += BundleMember->UnscheduledDeps; } return Sum; } void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { os << "/ " << *Inst; } else if (NextInBundle) { os << '[' << *Inst; ScheduleData *SD = NextInBundle; while (SD) { os << ';' << *SD->Inst; SD = SD->NextInBundle; } os << ']'; } else { os << *Inst; } } Instruction *Inst = nullptr; /// Opcode of the current instruction in the schedule data. 
Value *OpValue = nullptr; /// The TreeEntry that this instruction corresponds to. TreeEntry *TE = nullptr; /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; /// Single linked list of all instructions in a bundle. Null if it is a /// single instruction. ScheduleData *NextInBundle = nullptr; /// Single linked list of all memory instructions (e.g. load, store, call) /// in the block - until the end of the scheduling region. ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. /// This list is derived on demand in calculateDependencies(). SmallVector MemoryDependencies; /// List of instructions which this instruction could be control dependent /// on. Allowing such nodes to be scheduled below this one could introduce /// a runtime fault which didn't exist in the original program. /// ex: this is a load or udiv following a readonly call which inf loops SmallVector ControlDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. int SchedulingRegionID = 0; /// Used for getting a "good" final ordering of instructions. int SchedulingPriority = 0; /// The number of dependencies. Constitutes of the number of users of the /// instruction plus the number of dependent memory instructions (if any). /// This value is calculated on demand. /// If InvalidDeps, the number of dependencies is not calculated yet. int Dependencies = InvalidDeps; /// The number of dependencies minus the number of dependencies of scheduled /// instructions. As soon as this is zero, the instruction/bundle gets ready /// for scheduling. /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). 
bool IsScheduled = false; }; #ifndef NDEBUG friend inline raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { SD.dump(os); return os; } #endif friend struct GraphTraits; friend struct DOTGraphTraits; /// Contains all scheduling data for a basic block. /// It does not schedules instructions, which are not memory read/write /// instructions and their operands are either constants, or arguments, or /// phis, or instructions from others blocks, or their users are phis or from /// the other blocks. The resulting vector instructions can be placed at the /// beginning of the basic block without scheduling (if operands does not need /// to be scheduled) or at the end of the block (if users are outside of the /// block). It allows to save some compile time and memory used by the /// compiler. /// ScheduleData is assigned for each instruction in between the boundaries of /// the tree entry, even for those, which are not part of the graph. It is /// required to correctly follow the dependencies between the instructions and /// their correct scheduling. The ScheduleData is not allocated for the /// instructions, which do not require scheduling, like phis, nodes with /// extractelements/insertelements only or nodes with instructions, with /// uses/operands outside of the block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} void clear() { ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; RegionHasStackSave = false; // Reduce the maximum schedule region size by the size of the // previous scheduling run. ScheduleRegionSizeLimit -= ScheduleRegionSize; if (ScheduleRegionSizeLimit < MinScheduleRegionSize) ScheduleRegionSizeLimit = MinScheduleRegionSize; ScheduleRegionSize = 0; // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. 
++SchedulingRegionID; } ScheduleData *getScheduleData(Instruction *I) { if (BB != I->getParent()) // Avoid lookup if can't possibly be in map. return nullptr; ScheduleData *SD = ScheduleDataMap.lookup(I); if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; } ScheduleData *getScheduleData(Value *V) { if (auto *I = dyn_cast(V)) return getScheduleData(I); return nullptr; } ScheduleData *getScheduleData(Value *V, Value *Key) { if (V == Key) return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { ScheduleData *SD = I->second.lookup(Key); if (SD && isInSchedulingRegion(SD)) return SD; } return nullptr; } bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template void schedule(ScheduleData *SD, ReadyListType &ReadyList) { SD->IsScheduled = true; LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); for (ScheduleData *BundleMember = SD; BundleMember; BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. ScheduleData *DepBundle = OpDef->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); LLVM_DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); }; // If BundleMember is a vector bundle, its operands may have been // reordered during buildTree(). 
We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { // Need to search for the lane since the tree entry can be reordered. int Lane = std::distance(TE->Scalars.begin(), find(TE->Scalars, BundleMember->Inst)); assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion // ensures that the tree entry has all operands set before reaching // this code. Couple of exceptions known at the moment are extracts // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. auto *In = BundleMember->Inst; assert(In && (isa(In) || In->getNumOperands() == TE->getNumOperands()) && "Missed TreeEntry operands?"); (void)In; // fake use to avoid build failure when assertions disabled for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); OpIdx != NumOperands; ++OpIdx) if (auto *I = dyn_cast(TE->getOperand(OpIdx)[Lane])) DecrUnsched(I); } else { // If BundleMember is a stand-alone instruction, no operand reordering // has taken place, so we directly access its operands. for (Use &U : BundleMember->Inst->operands()) if (auto *I = dyn_cast(U.get())) DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { if (MemoryDepSD->hasValidDependencies() && MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); LLVM_DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } // Handle the control dependencies. 
for (ScheduleData *DepSD : BundleMember->ControlDependencies) { if (DepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = DepSD->FirstInBundle; assert(!DepBundle->IsScheduled && "already scheduled bundle gets ready"); ReadyList.insert(DepBundle); LLVM_DEBUG(dbgs() << "SLP: gets ready (ctl): " << *DepBundle << "\n"); } } } } /// Verify basic self consistency properties of the data structure. void verify() { if (!ScheduleStart) return; assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && ScheduleStart->comesBefore(ScheduleEnd) && "Not a valid scheduling region?"); for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { auto *SD = getScheduleData(I); if (!SD) continue; assert(isInSchedulingRegion(SD) && "primary schedule data not in window?"); assert(isInSchedulingRegion(SD->FirstInBundle) && "entire bundle in window!"); (void)SD; doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); }); } for (auto *SD : ReadyInsts) { assert(SD->isSchedulingEntity() && SD->isReady() && "item in ready list not ready?"); (void)SD; } } void doForAllOpcodes(Value *V, function_ref Action) { if (ScheduleData *SD = getScheduleData(V)) Action(SD); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) if (isInSchedulingRegion(P.second)) Action(P.second); } /// Put all instructions into the ReadyList which are ready for scheduling. 
template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { if (SD->isSchedulingEntity() && SD->hasValidDependencies() && SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *SD << "\n"); } }); } } /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. ScheduleData *buildBundle(ArrayRef VL); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. /// \returns the scheduling bundle. The returned Optional value is not /// std::nullopt if \p VL is allowed to be scheduled. std::optional tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. bool extendSchedulingRegion(Value *V, const InstructionsState &S); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. void initScheduleData(Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, ScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); BasicBlock *BB; /// Simple memory allocation for ScheduleData. std::vector> ScheduleDataChunks; /// The size of a ScheduleData array in ScheduleDataChunks. 
int ChunkSize; /// The allocator position in the current chunk, which is the last entry /// of ScheduleDataChunks. int ChunkPos; /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. /// ScheduleData structures are recycled. DenseMap ScheduleDataMap; /// Attaches ScheduleData to Instruction with the leading key. DenseMap> ExtraScheduleDataMap; /// The ready-list for scheduling (only used for the dry-run). SetVector ReadyInsts; /// The first instruction of the scheduling region. Instruction *ScheduleStart = nullptr; /// The first instruction _after_ the scheduling region. Instruction *ScheduleEnd = nullptr; /// The first memory accessing instruction in the scheduling region /// (can be null). ScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). ScheduleData *LastLoadStoreInRegion = nullptr; /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling /// region? Used to optimize the dependence calculation for the /// common case where there isn't. bool RegionHasStackSave = false; /// The current size of the scheduling region. int ScheduleRegionSize = 0; /// The maximum size allowed for the scheduling region. int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. /// Make sure that the initial SchedulingRegionID is greater than the /// initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. 
const SmallDenseSet *UserIgnoreList = nullptr; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. struct OrdersTypeDenseMapInfo { static OrdersType getEmptyKey() { OrdersType V; V.push_back(~1U); return V; } static OrdersType getTombstoneKey() { OrdersType V; V.push_back(~2U); return V; } static unsigned getHashValue(const OrdersType &V) { return static_cast(hash_combine_range(V.begin(), V.end())); } static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { return LHS == RHS; } }; // Analysis and block reference. Function *F; ScalarEvolution *SE; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; OptimizationRemarkEmitter *ORE; unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; /// A map of scalar integer values to the smallest bit width with which they /// can legally be represented. The values map to (width, signed) pairs, /// where "width" indicates the minimum bit width and "signed" is True if the /// value must be signed-extended, rather than zero-extended, back to its /// original width. MapVector> MinBWs; }; } // end namespace slpvectorizer template <> struct GraphTraits { using TreeEntry = BoUpSLP::TreeEntry; /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; /// Add the VectorizableTree to the index iterator to be able to return /// TreeEntry pointers. 
struct ChildIteratorType : public iterator_adaptor_base< ChildIteratorType, SmallVector::iterator> { ContainerTy &VectorizableTree; ChildIteratorType(SmallVector::iterator W, ContainerTy &VT) : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} NodeRef operator*() { return I->UserTE; } }; static NodeRef getEntryNode(BoUpSLP &R) { return R.VectorizableTree[0].get(); } static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; } static ChildIteratorType child_end(NodeRef N) { return {N->UserTreeIndices.end(), N->Container}; } /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. class nodes_iterator { using ItTy = ContainerTy::iterator; ItTy It; public: nodes_iterator(const ItTy &It2) : It(It2) {} NodeRef operator*() { return It->get(); } nodes_iterator operator++() { ++It; return *this; } bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } }; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); } static nodes_iterator nodes_end(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.end()); } static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } }; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { using TreeEntry = BoUpSLP::TreeEntry; DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); OS << Entry->Idx << ".\n"; if (isSplat(Entry->Scalars)) OS << " "; for (auto *V : Entry->Scalars) { OS << *V; if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) OS << " "; OS << "\n"; } return Str; } static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; if (Entry->State == 
TreeEntry::ScatterVectorize) return "color=blue"; return ""; } }; } // end namespace llvm BoUpSLP::~BoUpSLP() { SmallVector DeadInsts; for (auto *I : DeletedInstructions) { for (Use &U : I->operands()) { auto *Op = dyn_cast(U.get()); if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() && wouldInstructionBeTriviallyDead(Op, TLI)) DeadInsts.emplace_back(Op); } I->dropAllReferences(); } for (auto *I : DeletedInstructions) { assert(I->use_empty() && "trying to erase instruction with users."); I->eraseFromParent(); } // Cleanup any dead scalar code feeding the vectorized instructions RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). assert(!verifyFunction(*F, &dbgs())); #endif } /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses /// contains original mask for the scalars reused in the node. Procedure /// transform this mask in accordance with the given \p Mask. static void reorderReuses(SmallVectorImpl &Reuses, ArrayRef Mask) { assert(!Mask.empty() && Reuses.size() == Mask.size() && "Expected non-empty mask."); SmallVector Prev(Reuses.begin(), Reuses.end()); Prev.swap(Reuses); for (unsigned I = 0, E = Prev.size(); I < E; ++I) if (Mask[I] != UndefMaskElem) Reuses[Mask[I]] = Prev[I]; } /// Reorders the given \p Order according to the given \p Mask. \p Order - is /// the original order of the scalars. Procedure transforms the provided order /// in accordance with the given \p Mask. If the resulting \p Order is just an /// identity order, \p Order is cleared. 
static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask) { assert(!Mask.empty() && "Expected non-empty mask."); SmallVector MaskOrder; if (Order.empty()) { MaskOrder.resize(Mask.size()); std::iota(MaskOrder.begin(), MaskOrder.end(), 0); } else { inversePermutation(Order, MaskOrder); } reorderReuses(MaskOrder, Mask); if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { Order.clear(); return; } Order.assign(Mask.size(), Mask.size()); for (unsigned I = 0, E = Mask.size(); I < E; ++I) if (MaskOrder[I] != UndefMaskElem) Order[MaskOrder[I]] = I; fixupOrderingIndices(Order); } std::optional BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); unsigned NumScalars = TE.Scalars.size(); OrdersType CurrentOrder(NumScalars, NumScalars); SmallVector Positions; SmallBitVector UsedPositions(NumScalars); const TreeEntry *STE = nullptr; // Try to find all gathered scalars that are gets vectorized in other // vectorize node. Here we can have only one single tree vector node to // correctly identify order of the gathered scalars. for (unsigned I = 0; I < NumScalars; ++I) { Value *V = TE.Scalars[I]; if (!isa(V)) continue; if (const auto *LocalSTE = getTreeEntry(V)) { if (!STE) STE = LocalSTE; else if (STE != LocalSTE) // Take the order only from the single vector node. return std::nullopt; unsigned Lane = std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); if (Lane >= NumScalars) return std::nullopt; if (CurrentOrder[Lane] != NumScalars) { if (Lane != I) continue; UsedPositions.reset(CurrentOrder[Lane]); } // The partial identity (where only some elements of the gather node are // in the identity order) is good. CurrentOrder[Lane] = I; UsedPositions.set(I); } } // Need to keep the order if we have a vector entry and at least 2 scalars or // the vectorized entry has just 2 scalars. 
if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { auto &&IsIdentityOrder = [NumScalars](ArrayRef CurrentOrder) { for (unsigned I = 0; I < NumScalars; ++I) if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) return false; return true; }; if (IsIdentityOrder(CurrentOrder)) { CurrentOrder.clear(); return CurrentOrder; } auto *It = CurrentOrder.begin(); for (unsigned I = 0; I < NumScalars;) { if (UsedPositions.test(I)) { ++I; continue; } if (*It == NumScalars) { *It = I; ++I; } ++It; } return CurrentOrder; } return std::nullopt; } namespace { /// Tracks the state we can represent the loads in the given sequence. enum class LoadsState { Gather, Vectorize, ScatterVectorize }; } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes = true) { if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) return false; auto *GEP1 = dyn_cast(Ptr1); if (!GEP1) return false; auto *GEP2 = dyn_cast(Ptr2); if (!GEP2) return false; return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 && ((isConstant(GEP1->getOperand(1)) && isConstant(GEP2->getOperand(1))) || !CompareOpcodes || getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) .getOpcode()); } /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, LoopInfo &LI, const TargetLibraryInfo &TLI, SmallVectorImpl &Order, SmallVectorImpl &PointerOps) { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
Type *ScalarTy = VL0->getType(); if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) return LoadsState::Gather; // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. PointerOps.clear(); PointerOps.resize(VL.size()); auto *POIter = PointerOps.begin(); for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) return LoadsState::Gather; *POIter = L->getPointerOperand(); ++POIter; } Order.clear(); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { if (IsSorted) { Value *Ptr0; Value *PtrN; if (Order.empty()) { Ptr0 = PointerOps.front(); PtrN = PointerOps.back(); } else { Ptr0 = PointerOps[Order.front()]; PtrN = PointerOps[Order.back()]; } std::optional Diff = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); // Check that the sorted loads are consecutive. if (static_cast(*Diff) == VL.size() - 1) return LoadsState::Vectorize; } // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just // increases the cost. 
Loop *L = LI.getLoopFor(cast(VL0)->getParent());
    bool ProfitableGatherPointers =
        static_cast(count_if(PointerOps, [L](Value *V) {
                      return L && L->isLoopInvariant(V);
                    })) <= VL.size() / 2 &&
        VL.size() > 2;
    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast(P);
          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                 (GEP && GEP->getNumOperands() == 2);
        })) {
      Align CommonAlignment = cast(VL0)->getAlign();
      for (Value *V : VL)
        CommonAlignment =
            std::min(CommonAlignment, cast(V)->getAlign());
      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
      if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
        return LoadsState::ScatterVectorize;
    }
  }

  return LoadsState::Gather;
}

// NOTE(review): in this text the template argument lists (on ArrayRef,
// SmallVectorImpl, MapVector, std::optional, std::tuple, cast/dyn_cast/isa,
// ...) appear to be missing - confirm against the original file before
// relying on any of the declarations below.

/// Groups the pointers in \p VL by base and, if at least one group is
/// consecutive, stores in \p SortedIndices the original positions of the
/// pointers, listed group by group.
///
/// A pointer joins the first already known base for which getPointersDiff()
/// (called with StrictCheck=true) produces a value; otherwise it starts a new
/// base. The entries of every base with more than one pointer are stably
/// sorted by that difference.
///
/// \param VL pointers to cluster; VL[0] is read unconditionally.
/// \param ElemTy element type passed to getPointersDiff() for both pointers.
/// \param SortedIndices output; written only as described under \returns.
/// \returns true if some base with more than one pointer has differences that
/// increase by exactly one from entry to entry after sorting. Returns false
/// with \p SortedIndices left untouched when a new base would be needed while
/// Bases.size() > VL.size() / 2 - 1, and false with \p SortedIndices cleared
/// when no base is consecutive.
bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy,
                            const DataLayout &DL, ScalarEvolution &SE,
                            SmallVectorImpl &SortedIndices) {
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx). Each Ptr is inserted
  // into the vector of its base; the vectors are then sorted so that the
  // returned indices keep related pointers next to one another.
  MapVector>> Bases;
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));

  // Original position (in VL) of the pointer currently being placed.
  unsigned Cnt = 1;
  for (Value *Ptr : VL.drop_front()) {
    // any_of stops at the first base that accepts Ptr, so Cnt is incremented
    // at most once per pointer.
    bool Found = any_of(Bases, [&](auto &Base) {
      std::optional Diff =
          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
                          /*StrictCheck=*/true);
      if (!Diff)
        return false;

      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      return true;
    });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
    }
  }

  // For each base, sort its pointers by Offset and check whether any base
  // turns out to be consecutive.
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
      llvm::stable_sort(Vec, [](const std::tuple &X,
                                const std::tuple &Y) {
        return std::get<1>(X) < std::get<1>(Y);
      });
      // Consecutive means: the I-th sorted entry has offset InitialOffset + I.
      int InitialOffset = std::get<1>(Vec[0]);
      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
      });
    }
  }

  // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
  SortedIndices.clear();
  if (!AnyConsecutive)
    return false;

  for (auto &Base : Bases) {
    for (auto &T : Base.second)
      SortedIndices.push_back(std::get<2>(T));
  }

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}

/// Tries to find an order for the gather node \p TE that clusters the pointer
/// operands of its scalars, using clusterSortPtrAccesses().
///
/// Every scalar must pass the dyn_cast below and be isSimple(); judging by the
/// function name the scalars are expected to be loads (the cast target type
/// is not visible in this text - TODO confirm).
///
/// \returns the order produced by clusterSortPtrAccesses(), or std::nullopt if
/// a scalar is rejected or clustering fails.
std::optional
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
  }

  BoUpSLP::OrdersType Order;
  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
    return Order;
  return std::nullopt;
}

/// Check if two insertelement instructions are from the same buildvector.
///
/// Starting from \p VU and from \p V, both chains are followed through
/// \p GetBaseOperand in lock-step until one chain reaches the other start.
///
/// \param GetBaseOperand callback giving the value to continue the walk with
/// for a given insertelement (the caller in getReorderingData() passes
/// operand 0).
/// \returns VU->hasOneUse() if the chain starting at \p V reaches \p VU,
/// V->hasOneUse() if the chain starting at \p VU reaches \p V, and false if
/// the instructions differ in parent block or type, if neither has a single
/// use, if an insert index is unknown, or if both chains end first.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional Idx1 = getInsertIndex(IE1);
  std::optional Idx2 = getInsertIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  do {
    if (IE2 == VU)
      return VU->hasOneUse();
    if (IE1 == V)
      return V->hasOneUse();
    // A chain ends at an intermediate insert with more than one use, or at an
    // insert whose index is unknown or equal to the other start's index.
    if (IE1) {
      if ((IE1 != VU && !IE1->hasOneUse()) ||
          getInsertIndex(IE1).value_or(*Idx2) == *Idx2)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null(GetBaseOperand(IE1));
    }
    if (IE2) {
      if ((IE2 != V && !IE2->hasOneUse()) ||
          getInsertIndex(IE2).value_or(*Idx1) == *Idx1)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null(GetBaseOperand(IE2));
    }
  } while (IE1 || IE2);
  return false;
}

/// Determines the preferred ordering of the scalars of \p TE, if any.
///
/// The cases are tried in this order:
///  - \p TE has reuse shuffle indices: an order of getVectorFactor() elements
///    is derived from the reuse mask, provided the mask passes
///    ShuffleVectorInst::isOneUseSingleSourceMask(); otherwise std::nullopt.
///  - \p TE is a vectorized, non-alternate node whose main operation passes
///    one of the two isa checks (the second one is considered only when
///    \p TopToBottom is set): TE.ReorderIndices is returned unchanged.
///  - \p TE is a vectorized PHI node: TE.ReorderIndices if it is non-empty,
///    otherwise the order obtained by stably sorting the scalars with
///    PHICompare.
///  - \p TE is a gather node: the result of canReuseExtract(),
///    findReusedOrderedScalars() or, for at least 4 scalars,
///    findPartiallyOrderedLoads(), whichever succeeds first.
///
/// \returns the order, or std::nullopt if none of the cases produced one.
///
/// NOTE(review): with a std::optional return type, `return {};`
/// value-initializes the optional, i.e. it yields a disengaged optional just
/// like std::nullopt and not an engaged empty order. Confirm that this is the
/// intent of the "identity order" paths below.
std::optional
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size Sz (the number of scalars).
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
    //                           element 3 is used twice in the second submask.
    unsigned Sz = TE.Scalars.size();
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz))
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(),
                           TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, [Sz](Value *V) {
          std::optional Idx = getExtractIndex(cast(V));
          return Idx && *Idx < Sz;
        })) {
      SmallVector ReorderMask(Sz, UndefMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      // Replace each defined reuse index by the position in ReorderMask of the
      // extract index of the scalar it refers to.
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == UndefMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional EI = getExtractIndex(cast(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    // Process the reuse mask one Sz-wide submask at a time; each submask
    // contributes Sz entries to ResOrder, offset by its start position K.
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == UndefMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder),
               [](const auto &Data) { return Data.index() == Data.value(); }))
      return {}; // Use identity order.
    return ResOrder;
  }
  if (TE.State == TreeEntry::Vectorize &&
      (isa(TE.getMainOp()) ||
       (TopToBottom && isa(TE.getMainOp()))) &&
      !TE.isAltShuffle())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    // Orders two single-use values by the insert/extract index of their only
    // user; every other pair is reported as "not less".
    // NOTE(review): many pairs compare as unordered here - confirm that this
    // still meets the strict-weak-ordering requirement of stable_sort.
    auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
      if (!V1->hasOneUse() || !V2->hasOneUse())
        return false;
      auto *FirstUserOfPhi1 = cast(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast(*V2->user_begin());
      if (auto *IE1 = dyn_cast(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast(FirstUserOfPhi2)) {
          if (!areTwoInsertFromSameBuildVector(
                  IE1, IE2,
                  [](InsertElementInst *II) { return II->getOperand(0); }))
            return false;
          std::optional Idx1 = getInsertIndex(IE1);
          std::optional Idx2 = getInsertIndex(IE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      if (auto *EE1 = dyn_cast(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            return false;
          std::optional Idx1 = getExtractIndex(EE1);
          std::optional Idx2 = getExtractIndex(EE2);
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
            return false;
          return *Idx1 < *Idx2;
        }
      return false;
    };
    // True if Order[I] == I for every position I.
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq(0, Order.size()))
        if (Idx != Order[Idx])
          return false;
      return true;
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    DenseMap PhiToId;
    SmallVector Phis;
    OrdersType ResOrder(TE.Scalars.size());
    // Remember the original lane of every scalar, then sort a copy of the
    // scalars and read the lanes back in sorted order.
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
      PhiToId[TE.Scalars[Id]] = Id;
      Phis.push_back(TE.Scalars[Id]);
    }
    llvm::stable_sort(Phis, PHICompare);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return {};
    return ResOrder;
  }
  if (TE.State == TreeEntry::NeedToGather) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
         (all_of(TE.Scalars, [](Value *V) { return isa(V); }) &&
          any_of(TE.Scalars, [](Value *V) { return isa(V); }))) &&
        all_of(TE.Scalars,
               [](Value *V) {
                 auto *EE = dyn_cast(V);
                 return !EE || isa(EE->getVectorOperandType());
               }) &&
        allSameType(TE.Scalars)) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
          fixupOrderingIndices(CurrentOrder);
        return CurrentOrder;
      }
    }
    if (std::optional CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
    // Clustering loads is attempted only for nodes with at least 4 scalars.
    if (TE.Scalars.size() >= 4)
      if (std::optional Order = findPartiallyOrderedLoads(TE))
        return Order;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
///
/// \returns false if the first \p Sz elements of \p Mask form an identity
/// mask or if any later \p Sz-wide slice differs from the first one; true
/// otherwise.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef Mask,
                                               unsigned Sz) {
  ArrayRef FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster))
    return false;
  // Every following cluster has to repeat the first one exactly.
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

/// Applies \p Mask to the reuse shuffle mask of \p TE.
///
/// If \p TE is a gather node and the updated reuse mask passes both
/// ShuffleVectorInst::isOneUseSingleSourceMask() and
/// isRepeatedNonIdentityClusteredMask(), the function additionally clears
/// TE.ReorderIndices, reorders TE.Scalars using a mask built from the former
/// reorder indices and the first cluster of the reuse mask, and resets every
/// Sz-wide cluster of the reuse mask to 0, 1, ..., Sz - 1, where Sz is the
/// number of scalars.
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // Vectorized nodes and nodes with non-clustered reuses need nothing else.
  if (TE.State != TreeEntry::NeedToGather ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector NewOrder(Slice.begin(), Slice.end());
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}

void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users
  DenseMap PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap>
      ExternalUserReorderMap;
  // FIXME: Workaround for syntax error reported by MSVC buildbots.
  TargetTransformInfo &TTIRef = *TTI;
  // Find all reorderable nodes with the given VF.
  // Currently the are vectorized stores,loads,extracts + some gathering of
  // extracts.
  for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
                              &GathersToOrders, &ExternalUserReorderMap,
                              &AltShufflesToOrders, &PhisToOrders](
                                 const std::unique_ptr &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(TE.get(),
                                         std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern.
So we need // to take into account their order when looking for the most used order. if (TE->isAltShuffle()) { VectorType *VecTy = FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size()); unsigned Opcode0 = TE->getOpcode(); unsigned Opcode1 = TE->getAltOpcode(); // The opcode mask selects between the two opcodes. SmallBitVector OpcodeMask(TE->Scalars.size(), false); for (unsigned Lane : seq(0, TE->Scalars.size())) if (cast(TE->Scalars[Lane])->getOpcode() == Opcode1) OpcodeMask.set(Lane); // If this pattern is supported by the target then we consider the order. if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); } // TODO: Check the reverse order too. } if (std::optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/true)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. If follow the order // here, it causes reordering of the whole graph though actually it is // profitable just to reorder the subgraph that starts from the alternate // opcode vectorization node. Such nodes already end-up with the shuffle // instruction and it is just enough to change this shuffle rather than // rotate the scalars for the whole graph. 
unsigned Cnt = 0; const TreeEntry *UserTE = TE.get(); while (UserTE && Cnt < RecursionMaxDepth) { if (UserTE->UserTreeIndices.size() != 1) break; if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { return EI.UserTE->State == TreeEntry::Vectorize && EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; })) return; UserTE = UserTE->UserTreeIndices.back().UserTE; ++Cnt; } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && TE->getOpcode() == Instruction::PHI) PhisToOrders.try_emplace(TE.get(), *CurrentOrder); } }); // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; VF /= 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; // Try to find the most profitable order. We just are looking for the most // used order and reorder scalar elements in the nodes according to this // mostly used order. ArrayRef OrderedEntries = It->second.getArrayRef(); // All operands are reordered and used only in this node - propagate the // most used order to the user node. MapVector> OrdersUses; SmallPtrSet VisitedOps; for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, // just need to merge reordering shuffle and the reuse shuffle. if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; // Count number of orders uses. 
const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, &PhisToOrders]() -> const OrdersType & { if (OpTE->State == TreeEntry::NeedToGather || !OpTE->ReuseShuffleIndices.empty()) { auto It = GathersToOrders.find(OpTE); if (It != GathersToOrders.end()) return It->second; } if (OpTE->isAltShuffle()) { auto It = AltShufflesToOrders.find(OpTE); if (It != AltShufflesToOrders.end()) return It->second; } if (OpTE->State == TreeEntry::Vectorize && OpTE->getOpcode() == Instruction::PHI) { auto It = PhisToOrders.find(OpTE); if (It != PhisToOrders.end()) return It->second; } return OpTE->ReorderIndices; }(); // First consider the order of the external scalar users. auto It = ExternalUserReorderMap.find(OpTE); if (It != ExternalUserReorderMap.end()) { const auto &ExternalUserReorderIndices = It->second; // If the OpTE vector factor != number of scalars - use natural order, // it is an attempt to reorder node with reused scalars but with // external uses. if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second += ExternalUserReorderIndices.size(); } else { for (const OrdersType &ExtOrder : ExternalUserReorderIndices) ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second; } // No other useful reorder data in this entry. if (Order.empty()) continue; } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { SmallVector Mask; inversePermutation(Order, Mask); unsigned E = Order.size(); OrdersType CurrentOrder(E, E); transform(Mask, CurrentOrder.begin(), [E](int Idx) { return Idx == UndefMaskElem ? E : static_cast(Idx); }); fixupOrderingIndices(CurrentOrder); ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second; } else { ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; } } // Set order of the user node. 
if (OrdersUses.empty()) continue; // Choose the most used order. ArrayRef BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; for (const auto &Pair : drop_begin(OrdersUses)) { if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { BestOrder = Pair.first; Cnt = Pair.second; } } // Set order of the user node. if (BestOrder.empty()) continue; SmallVector Mask; inversePermutation(BestOrder, Mask); SmallVector MaskOrder(BestOrder.size(), UndefMaskElem); unsigned E = BestOrder.size(); transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { return I < E ? static_cast(I) : UndefMaskElem; }); // Do an actual reordering, if profitable. for (std::unique_ptr &TE : VectorizableTree) { // Just do the reordering for the nodes with the given VF. if (TE->Scalars.size() != VF) { if (TE->ReuseShuffleIndices.size() == VF) { // Need to reorder the reuses masks of the operands with smaller VF to // be able to find the match between the graph nodes and scalar // operands of the given node during vectorization/cost estimation. assert(all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && "All users must be of VF size."); // Update ordering of the operands with the smaller VF than the given // one. reorderNodeWithReuses(*TE, Mask); } continue; } if (TE->State == TreeEntry::Vectorize && isa