//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the transformation that optimizes memory intrinsics // such as memcpy using the size value profile. When memory intrinsic size // value profile metadata is available, a single memory intrinsic is expanded // to a sequence of guarded specialized versions that are called with the // hottest size(s), for later expansion into more optimal inline sequences. // //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/ProfileData/InstrProf.h" #define INSTR_PROF_VALUE_PROF_MEMOP_API #include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include #include #include using namespace llvm; #define DEBUG_TYPE "pgo-memop-opt" STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); // The minimum call count to optimize memory intrinsic calls. static cl::opt MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), cl::desc("The minimum count to optimize memory " "intrinsic calls")); // Command line option to disable memory intrinsic optimization. The default is // false. This is for debug purpose. static cl::opt DisableMemOPOPT("disable-memop-opt", cl::init(false), cl::Hidden, cl::desc("Disable optimize")); // The percent threshold to optimize memory intrinsic calls. static cl::opt MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), cl::Hidden, cl::desc("The percentage threshold for the " "memory intrinsic calls optimization")); // Maximum number of versions for optimizing memory intrinsic call. static cl::opt MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, cl::desc("The max version for the optimized memory " " intrinsic calls")); // Scale the counts from the annotation using the BB count value. static cl::opt MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, cl::desc("Scale the memop size counts using the basic " " block count value")); cl::opt MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), cl::Hidden, cl::desc("Size-specialize memcmp and bcmp calls")); static cl::opt MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value")); namespace { static const char *getMIName(const MemIntrinsic *MI) { switch (MI->getIntrinsicID()) { case Intrinsic::memcpy: return "memcpy"; case Intrinsic::memmove: return "memmove"; case Intrinsic::memset: return "memset"; default: return "unknown"; } } // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). struct MemOp { Instruction *I; MemOp(MemIntrinsic *MI) : I(MI) {} MemOp(CallInst *CI) : I(CI) {} MemIntrinsic *asMI() { return dyn_cast(I); } CallInst *asCI() { return cast(I); } MemOp clone() { if (auto MI = asMI()) return MemOp(cast(MI->clone())); return MemOp(cast(asCI()->clone())); } Value *getLength() { if (auto MI = asMI()) return MI->getLength(); return asCI()->getArgOperand(2); } void setLength(Value *Length) { if (auto MI = asMI()) return MI->setLength(Length); asCI()->setArgOperand(2, Length); } StringRef getFuncName() { if (auto MI = asMI()) return MI->getCalledFunction()->getName(); return asCI()->getCalledFunction()->getName(); } bool isMemmove() { if (auto MI = asMI()) if (MI->getIntrinsicID() == Intrinsic::memmove) return true; return false; } bool isMemcmp(TargetLibraryInfo &TLI) { LibFunc Func; if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && Func == LibFunc_memcmp) { return true; } return false; } bool isBcmp(TargetLibraryInfo &TLI) { LibFunc Func; if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && Func == LibFunc_bcmp) { return true; } return false; } const char *getName(TargetLibraryInfo &TLI) { if (auto MI = asMI()) return getMIName(MI); LibFunc Func; if (TLI.getLibFunc(*asCI(), Func)) { if (Func == LibFunc_memcmp) return "memcmp"; if (Func == LibFunc_bcmp) return "bcmp"; } llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); return nullptr; } }; class MemOPSizeOpt : public InstVisitor { public: MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, DominatorTree *DT, TargetLibraryInfo &TLI) : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { ValueDataArray = std::make_unique(INSTR_PROF_NUM_BUCKETS); } bool isChanged() const { return Changed; } void perform() { WorkList.clear(); visit(Func); for (auto &MO : WorkList) { ++NumOfPGOMemOPAnnotate; if (perform(MO)) { Changed = true; ++NumOfPGOMemOPOpt; LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() << "is Transformed.\n"); } } } void visitMemIntrinsic(MemIntrinsic &MI) { Value *Length = MI.getLength(); // Not perform on constant length calls. if (isa(Length)) return; WorkList.push_back(MemOp(&MI)); } void visitCallInst(CallInst &CI) { LibFunc Func; if (TLI.getLibFunc(CI, Func) && (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && !isa(CI.getArgOperand(2))) { WorkList.push_back(MemOp(&CI)); } } private: Function &Func; BlockFrequencyInfo &BFI; OptimizationRemarkEmitter &ORE; DominatorTree *DT; TargetLibraryInfo &TLI; bool Changed; std::vector WorkList; // The space to read the profile annotation. std::unique_ptr ValueDataArray; bool perform(MemOp MO); }; static bool isProfitable(uint64_t Count, uint64_t TotalCount) { assert(Count <= TotalCount); if (Count < MemOPCountThreshold) return false; if (Count < TotalCount * MemOPPercentThreshold / 100) return false; return true; } static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, uint64_t Denom) { if (!MemOPScaleCount) return Count; bool Overflowed; uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); return ScaleCount / Denom; } bool MemOPSizeOpt::perform(MemOp MO) { assert(MO.I); if (MO.isMemmove()) return false; if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) return false; uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; uint64_t TotalCount; if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, ValueDataArray.get(), NumVals, TotalCount)) return false; uint64_t ActualCount = TotalCount; uint64_t SavedTotalCount = TotalCount; if (MemOPScaleCount) { auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); if (!BBEdgeCount) return false; ActualCount = *BBEdgeCount; } ArrayRef VDs(ValueDataArray.get(), NumVals); LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount << "\n"); LLVM_DEBUG( for (auto &VD : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); if (ActualCount < MemOPCountThreshold) return false; // Skip if the total value profiled count is 0, in which case we can't // scale up the counts properly (and there is no profitable transformation). if (TotalCount == 0) return false; TotalCount = ActualCount; if (MemOPScaleCount) LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount << " denominator = " << SavedTotalCount << "\n"); // Keeping track of the count of the default case: uint64_t RemainCount = TotalCount; uint64_t SavedRemainCount = SavedTotalCount; SmallVector SizeIds; SmallVector CaseCounts; SmallDenseSet SeenSizeId; uint64_t MaxCount = 0; unsigned Version = 0; // Default case is in the front -- save the slot here. CaseCounts.push_back(0); SmallVector RemainingVDs; for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { auto &VD = *I; int64_t V = VD.Value; uint64_t C = VD.Count; if (MemOPScaleCount) C = getScaledCount(C, ActualCount, SavedTotalCount); if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { RemainingVDs.push_back(VD); continue; } // ValueCounts are sorted on the count. Break at the first un-profitable // value. if (!isProfitable(C, RemainCount)) { RemainingVDs.insert(RemainingVDs.end(), I, E); break; } if (!SeenSizeId.insert(V).second) { errs() << "Invalid Profile Data in Function " << Func.getName() << ": Two identical values in MemOp value counts.\n"; return false; } SizeIds.push_back(V); CaseCounts.push_back(C); if (C > MaxCount) MaxCount = C; assert(RemainCount >= C); RemainCount -= C; assert(SavedRemainCount >= VD.Count); SavedRemainCount -= VD.Count; if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { RemainingVDs.insert(RemainingVDs.end(), I + 1, E); break; } } if (Version == 0) return false; CaseCounts[0] = RemainCount; if (RemainCount > MaxCount) MaxCount = RemainCount; uint64_t SumForOpt = TotalCount - RemainCount; LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version << " Versions (covering " << SumForOpt << " out of " << TotalCount << ")\n"); // mem_op(..., size) // ==> // switch (size) { // case s1: // mem_op(..., s1); // goto merge_bb; // case s2: // mem_op(..., s2); // goto merge_bb; // ... // default: // mem_op(..., size); // goto merge_bb; // } // merge_bb: BasicBlock *BB = MO.I->getParent(); LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); LLVM_DEBUG(dbgs() << *BB << "\n"); auto OrigBBFreq = BFI.getBlockFreq(BB); BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); BasicBlock::iterator It(*MO.I); ++It; assert(It != DefaultBB->end()); BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); MergeBB->setName("MemOP.Merge"); BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); DefaultBB->setName("MemOP.Default"); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); auto &Ctx = Func.getContext(); IRBuilder<> IRB(BB); BB->getTerminator()->eraseFromParent(); Value *SizeVar = MO.getLength(); SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); Type *MemOpTy = MO.I->getType(); PHINode *PHI = nullptr; if (!MemOpTy->isVoidTy()) { // Insert a phi for the return values at the merge block. IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); MO.I->replaceAllUsesWith(PHI); PHI->addIncoming(MO.I, DefaultBB); } // Clear the value profile data. MO.I->setMetadata(LLVMContext::MD_prof, nullptr); // If all promoted, we don't need the MD.prof metadata. if (SavedRemainCount > 0 || Version != NumVals) { // Otherwise we need update with the un-promoted records back. ArrayRef RemVDs(RemainingVDs); annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, IPVK_MemOPSize, NumVals); } LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); std::vector Updates; if (DT) Updates.reserve(2 * SizeIds.size()); for (uint64_t SizeId : SizeIds) { BasicBlock *CaseBB = BasicBlock::Create( Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); MemOp NewMO = MO.clone(); // Fix the argument. auto *SizeType = dyn_cast(NewMO.getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); NewMO.setLength(CaseSizeId); NewMO.I->insertInto(CaseBB, CaseBB->end()); IRBuilder<> IRBCase(CaseBB); IRBCase.CreateBr(MergeBB); SI->addCase(CaseSizeId, CaseBB); if (!MemOpTy->isVoidTy()) PHI->addIncoming(NewMO.I, CaseBB); if (DT) { Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); Updates.push_back({DominatorTree::Insert, BB, CaseBB}); } LLVM_DEBUG(dbgs() << *CaseBB << "\n"); } DTU.applyUpdates(Updates); Updates.clear(); if (MaxCount) setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); LLVM_DEBUG(dbgs() << *BB << "\n"); LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); LLVM_DEBUG(dbgs() << *MergeBB << "\n"); ORE.emit([&]() { using namespace ore; return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) << " for " << NV("Versions", Version) << " versions"; }); return true; } } // namespace static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, DominatorTree *DT, TargetLibraryInfo &TLI) { if (DisableMemOPOPT) return false; if (F.hasFnAttribute(Attribute::OptimizeForSize)) return false; MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); return MemOPSizeOpt.isChanged(); } PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, FunctionAnalysisManager &FAM) { auto &BFI = FAM.getResult(F); auto &ORE = FAM.getResult(F); auto *DT = FAM.getCachedResult(F); auto &TLI = FAM.getResult(F); bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); if (!Changed) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); PA.preserve(); return PA; }