// PGOMemOPSizeOpt.cpp
  1. //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements the transformation that optimizes memory intrinsics
  10. // such as memcpy using the size value profile. When memory intrinsic size
  11. // value profile metadata is available, a single memory intrinsic is expanded
  12. // to a sequence of guarded specialized versions that are called with the
  13. // hottest size(s), for later expansion into more optimal inline sequences.
  14. //
  15. //===----------------------------------------------------------------------===//
  16. #include "llvm/ADT/ArrayRef.h"
  17. #include "llvm/ADT/Statistic.h"
  18. #include "llvm/ADT/StringRef.h"
  19. #include "llvm/ADT/Twine.h"
  20. #include "llvm/Analysis/BlockFrequencyInfo.h"
  21. #include "llvm/Analysis/DomTreeUpdater.h"
  22. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  23. #include "llvm/Analysis/TargetLibraryInfo.h"
  24. #include "llvm/IR/BasicBlock.h"
  25. #include "llvm/IR/DerivedTypes.h"
  26. #include "llvm/IR/Dominators.h"
  27. #include "llvm/IR/Function.h"
  28. #include "llvm/IR/IRBuilder.h"
  29. #include "llvm/IR/InstVisitor.h"
  30. #include "llvm/IR/Instruction.h"
  31. #include "llvm/IR/Instructions.h"
  32. #include "llvm/IR/LLVMContext.h"
  33. #include "llvm/IR/PassManager.h"
  34. #include "llvm/IR/Type.h"
  35. #include "llvm/ProfileData/InstrProf.h"
  36. #define INSTR_PROF_VALUE_PROF_MEMOP_API
  37. #include "llvm/ProfileData/InstrProfData.inc"
  38. #include "llvm/Support/Casting.h"
  39. #include "llvm/Support/CommandLine.h"
  40. #include "llvm/Support/Debug.h"
  41. #include "llvm/Support/ErrorHandling.h"
  42. #include "llvm/Support/MathExtras.h"
  43. #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
  44. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  45. #include <cassert>
  46. #include <cstdint>
  47. #include <vector>
  48. using namespace llvm;
  49. #define DEBUG_TYPE "pgo-memop-opt"
  50. STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
  51. STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
  52. // The minimum call count to optimize memory intrinsic calls.
  53. static cl::opt<unsigned>
  54. MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
  55. cl::desc("The minimum count to optimize memory "
  56. "intrinsic calls"));
  57. // Command line option to disable memory intrinsic optimization. The default is
  58. // false. This is for debug purpose.
  59. static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
  60. cl::Hidden, cl::desc("Disable optimize"));
  61. // The percent threshold to optimize memory intrinsic calls.
  62. static cl::opt<unsigned>
  63. MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
  64. cl::Hidden,
  65. cl::desc("The percentage threshold for the "
  66. "memory intrinsic calls optimization"));
  67. // Maximum number of versions for optimizing memory intrinsic call.
  68. static cl::opt<unsigned>
  69. MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
  70. cl::desc("The max version for the optimized memory "
  71. " intrinsic calls"));
  72. // Scale the counts from the annotation using the BB count value.
  73. static cl::opt<bool>
  74. MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
  75. cl::desc("Scale the memop size counts using the basic "
  76. " block count value"));
  77. cl::opt<bool>
  78. MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
  79. cl::Hidden,
  80. cl::desc("Size-specialize memcmp and bcmp calls"));
  81. static cl::opt<unsigned>
  82. MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
  83. cl::desc("Optimize the memop size <= this value"));
  84. namespace {
  85. static const char *getMIName(const MemIntrinsic *MI) {
  86. switch (MI->getIntrinsicID()) {
  87. case Intrinsic::memcpy:
  88. return "memcpy";
  89. case Intrinsic::memmove:
  90. return "memmove";
  91. case Intrinsic::memset:
  92. return "memset";
  93. default:
  94. return "unknown";
  95. }
  96. }
  97. // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
  98. struct MemOp {
  99. Instruction *I;
  100. MemOp(MemIntrinsic *MI) : I(MI) {}
  101. MemOp(CallInst *CI) : I(CI) {}
  102. MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
  103. CallInst *asCI() { return cast<CallInst>(I); }
  104. MemOp clone() {
  105. if (auto MI = asMI())
  106. return MemOp(cast<MemIntrinsic>(MI->clone()));
  107. return MemOp(cast<CallInst>(asCI()->clone()));
  108. }
  109. Value *getLength() {
  110. if (auto MI = asMI())
  111. return MI->getLength();
  112. return asCI()->getArgOperand(2);
  113. }
  114. void setLength(Value *Length) {
  115. if (auto MI = asMI())
  116. return MI->setLength(Length);
  117. asCI()->setArgOperand(2, Length);
  118. }
  119. StringRef getFuncName() {
  120. if (auto MI = asMI())
  121. return MI->getCalledFunction()->getName();
  122. return asCI()->getCalledFunction()->getName();
  123. }
  124. bool isMemmove() {
  125. if (auto MI = asMI())
  126. if (MI->getIntrinsicID() == Intrinsic::memmove)
  127. return true;
  128. return false;
  129. }
  130. bool isMemcmp(TargetLibraryInfo &TLI) {
  131. LibFunc Func;
  132. if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
  133. Func == LibFunc_memcmp) {
  134. return true;
  135. }
  136. return false;
  137. }
  138. bool isBcmp(TargetLibraryInfo &TLI) {
  139. LibFunc Func;
  140. if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
  141. Func == LibFunc_bcmp) {
  142. return true;
  143. }
  144. return false;
  145. }
  146. const char *getName(TargetLibraryInfo &TLI) {
  147. if (auto MI = asMI())
  148. return getMIName(MI);
  149. LibFunc Func;
  150. if (TLI.getLibFunc(*asCI(), Func)) {
  151. if (Func == LibFunc_memcmp)
  152. return "memcmp";
  153. if (Func == LibFunc_bcmp)
  154. return "bcmp";
  155. }
  156. llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
  157. return nullptr;
  158. }
  159. };
  160. class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
  161. public:
  162. MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
  163. OptimizationRemarkEmitter &ORE, DominatorTree *DT,
  164. TargetLibraryInfo &TLI)
  165. : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
  166. ValueDataArray =
  167. std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
  168. }
  169. bool isChanged() const { return Changed; }
  170. void perform() {
  171. WorkList.clear();
  172. visit(Func);
  173. for (auto &MO : WorkList) {
  174. ++NumOfPGOMemOPAnnotate;
  175. if (perform(MO)) {
  176. Changed = true;
  177. ++NumOfPGOMemOPOpt;
  178. LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
  179. << "is Transformed.\n");
  180. }
  181. }
  182. }
  183. void visitMemIntrinsic(MemIntrinsic &MI) {
  184. Value *Length = MI.getLength();
  185. // Not perform on constant length calls.
  186. if (isa<ConstantInt>(Length))
  187. return;
  188. WorkList.push_back(MemOp(&MI));
  189. }
  190. void visitCallInst(CallInst &CI) {
  191. LibFunc Func;
  192. if (TLI.getLibFunc(CI, Func) &&
  193. (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
  194. !isa<ConstantInt>(CI.getArgOperand(2))) {
  195. WorkList.push_back(MemOp(&CI));
  196. }
  197. }
  198. private:
  199. Function &Func;
  200. BlockFrequencyInfo &BFI;
  201. OptimizationRemarkEmitter &ORE;
  202. DominatorTree *DT;
  203. TargetLibraryInfo &TLI;
  204. bool Changed;
  205. std::vector<MemOp> WorkList;
  206. // The space to read the profile annotation.
  207. std::unique_ptr<InstrProfValueData[]> ValueDataArray;
  208. bool perform(MemOp MO);
  209. };
  210. static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
  211. assert(Count <= TotalCount);
  212. if (Count < MemOPCountThreshold)
  213. return false;
  214. if (Count < TotalCount * MemOPPercentThreshold / 100)
  215. return false;
  216. return true;
  217. }
  218. static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
  219. uint64_t Denom) {
  220. if (!MemOPScaleCount)
  221. return Count;
  222. bool Overflowed;
  223. uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
  224. return ScaleCount / Denom;
  225. }
// Attempt to size-specialize one profiled memop. Reads the IPVK_MemOPSize
// value profile attached to the instruction, selects up to MemOPMaxVersion
// profitable size buckets, and rewrites the call into a switch over the size
// operand with one constant-size clone per selected size and the original
// call as the default case. Returns true iff the IR was changed.
bool MemOPSizeOpt::perform(MemOp MO) {
  assert(MO.I);
  // memmove is never specialized here; memcmp/bcmp only when enabled.
  if (MO.isMemmove())
    return false;
  if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
    return false;

  uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
  uint64_t TotalCount;
  // No size value profile on this instruction -> nothing to do.
  if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
                                ValueDataArray.get(), NumVals, TotalCount))
    return false;

  uint64_t ActualCount = TotalCount;
  uint64_t SavedTotalCount = TotalCount;
  // When scaling, the block's BFI count replaces the raw profile total so the
  // per-size counts can be rescaled consistently below.
  if (MemOPScaleCount) {
    auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
    if (!BBEdgeCount)
      return false;
    ActualCount = *BBEdgeCount;
  }

  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
                    << ActualCount << "\n");
  LLVM_DEBUG(
      for (auto &VD
           : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });

  if (ActualCount < MemOPCountThreshold)
    return false;
  // Skip if the total value profiled count is 0, in which case we can't
  // scale up the counts properly (and there is no profitable transformation).
  if (TotalCount == 0)
    return false;

  TotalCount = ActualCount;
  if (MemOPScaleCount)
    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
                      << " denominator = " << SavedTotalCount << "\n");

  // Keeping track of the count of the default case:
  // RemainCount tracks the scaled count left for the default case;
  // SavedRemainCount tracks the same in unscaled (raw profile) units, used
  // when re-annotating the leftover records.
  uint64_t RemainCount = TotalCount;
  uint64_t SavedRemainCount = SavedTotalCount;
  SmallVector<uint64_t, 16> SizeIds;
  SmallVector<uint64_t, 16> CaseCounts;
  SmallDenseSet<uint64_t, 16> SeenSizeId;
  uint64_t MaxCount = 0;
  unsigned Version = 0;
  // Default case is in the front -- save the slot here.
  CaseCounts.push_back(0);
  // Records not promoted to a switch case; re-attached as metadata below.
  SmallVector<InstrProfValueData, 24> RemainingVDs;
  for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
    auto &VD = *I;
    int64_t V = VD.Value;
    uint64_t C = VD.Count;
    if (MemOPScaleCount)
      C = getScaledCount(C, ActualCount, SavedTotalCount);

    // Only single-value size buckets up to the opt-size cap are candidates.
    if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
      RemainingVDs.push_back(VD);
      continue;
    }

    // ValueCounts are sorted on the count. Break at the first un-profitable
    // value.
    if (!isProfitable(C, RemainCount)) {
      RemainingVDs.insert(RemainingVDs.end(), I, E);
      break;
    }

    // Duplicate sizes indicate corrupt profile data; bail out entirely.
    if (!SeenSizeId.insert(V).second) {
      errs() << "Invalid Profile Data in Function " << Func.getName()
             << ": Two identical values in MemOp value counts.\n";
      return false;
    }

    SizeIds.push_back(V);
    CaseCounts.push_back(C);
    if (C > MaxCount)
      MaxCount = C;

    assert(RemainCount >= C);
    RemainCount -= C;
    assert(SavedRemainCount >= VD.Count);
    SavedRemainCount -= VD.Count;

    // Stop once the version cap is hit (0 means unlimited).
    if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
      RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
      break;
    }
  }

  if (Version == 0)
    return false;

  // Whatever was not promoted stays with the default case.
  CaseCounts[0] = RemainCount;
  if (RemainCount > MaxCount)
    MaxCount = RemainCount;

  uint64_t SumForOpt = TotalCount - RemainCount;

  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
                    << " Versions (covering " << SumForOpt << " out of "
                    << TotalCount << ")\n");

  // mem_op(..., size)
  // ==>
  // switch (size) {
  // case s1:
  //    mem_op(..., s1);
  //    goto merge_bb;
  // case s2:
  //    mem_op(..., s2);
  //    goto merge_bb;
  // ...
  // default:
  //    mem_op(..., size);
  //    goto merge_bb;
  // }
  // merge_bb:

  BasicBlock *BB = MO.I->getParent();
  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
  LLVM_DEBUG(dbgs() << *BB << "\n");
  auto OrigBBFreq = BFI.getBlockFreq(BB);

  // Split twice: [BB | memop...] then [DefaultBB=memop | MergeBB=rest], so
  // BB can end in the new switch and every case branches to MergeBB.
  BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
  BasicBlock::iterator It(*MO.I);
  ++It;
  assert(It != DefaultBB->end());
  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
  MergeBB->setName("MemOP.Merge");
  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
  DefaultBB->setName("MemOP.Default");

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  auto &Ctx = Func.getContext();
  IRBuilder<> IRB(BB);
  // Replace the unconditional branch created by SplitBlock with the switch.
  BB->getTerminator()->eraseFromParent();
  Value *SizeVar = MO.getLength();
  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
  Type *MemOpTy = MO.I->getType();
  PHINode *PHI = nullptr;
  if (!MemOpTy->isVoidTy()) {
    // Insert a phi for the return values at the merge block.
    IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
    PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
    MO.I->replaceAllUsesWith(PHI);
    PHI->addIncoming(MO.I, DefaultBB);
  }

  // Clear the value profile data.
  MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
  // If all promoted, we don't need the MD.prof metadata.
  if (SavedRemainCount > 0 || Version != NumVals) {
    // Otherwise we need update with the un-promoted records back.
    ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
    annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
                      IPVK_MemOPSize, NumVals);
  }

  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");

  std::vector<DominatorTree::UpdateType> Updates;
  if (DT)
    Updates.reserve(2 * SizeIds.size());

  // Materialize one case block per selected size, each holding a clone of
  // the memop with the size operand replaced by the constant.
  for (uint64_t SizeId : SizeIds) {
    BasicBlock *CaseBB = BasicBlock::Create(
        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
    MemOp NewMO = MO.clone();
    // Fix the argument.
    auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
    assert(SizeType && "Expected integer type size argument.");
    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
    NewMO.setLength(CaseSizeId);
    NewMO.I->insertInto(CaseBB, CaseBB->end());
    IRBuilder<> IRBCase(CaseBB);
    IRBCase.CreateBr(MergeBB);
    SI->addCase(CaseSizeId, CaseBB);
    if (!MemOpTy->isVoidTy())
      PHI->addIncoming(NewMO.I, CaseBB);
    if (DT) {
      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
    }
    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
  }
  DTU.applyUpdates(Updates);
  Updates.clear();

  // Attach branch weights to the switch from the per-case counts.
  if (MaxCount)
    setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);

  LLVM_DEBUG(dbgs() << *BB << "\n");
  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
  LLVM_DEBUG(dbgs() << *MergeBB << "\n");

  ORE.emit([&]() {
    using namespace ore;
    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
           << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
           << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
           << " for " << NV("Versions", Version) << " versions";
  });

  return true;
}
  407. } // namespace
  408. static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
  409. OptimizationRemarkEmitter &ORE,
  410. DominatorTree *DT, TargetLibraryInfo &TLI) {
  411. if (DisableMemOPOPT)
  412. return false;
  413. if (F.hasFnAttribute(Attribute::OptimizeForSize))
  414. return false;
  415. MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
  416. MemOPSizeOpt.perform();
  417. return MemOPSizeOpt.isChanged();
  418. }
  419. PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
  420. FunctionAnalysisManager &FAM) {
  421. auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
  422. auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  423. auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  424. auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  425. bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
  426. if (!Changed)
  427. return PreservedAnalyses::all();
  428. auto PA = PreservedAnalyses();
  429. PA.preserve<DominatorTreeAnalysis>();
  430. return PA;
  431. }