CSPreInliner.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "CSPreInliner.h"
  9. #include "ProfiledBinary.h"
  10. #include "llvm/ADT/SCCIterator.h"
  11. #include "llvm/ADT/Statistic.h"
  12. #include <cstdint>
  13. #include <queue>
  14. #define DEBUG_TYPE "cs-preinliner"
  15. using namespace llvm;
  16. using namespace sampleprof;
  17. STATISTIC(PreInlNumCSInlined,
  18. "Number of functions inlined with context sensitive profile");
  19. STATISTIC(PreInlNumCSNotInlined,
  20. "Number of functions not inlined with context sensitive profile");
  21. STATISTIC(PreInlNumCSInlinedHitMinLimit,
  22. "Number of functions with FDO inline stopped due to min size limit");
  23. STATISTIC(PreInlNumCSInlinedHitMaxLimit,
  24. "Number of functions with FDO inline stopped due to max size limit");
  25. STATISTIC(
  26. PreInlNumCSInlinedHitGrowthLimit,
  27. "Number of functions with FDO inline stopped due to growth size limit");
  28. // The switches specify inline thresholds used in SampleProfileLoader inlining.
  29. // TODO: the actual threshold to be tuned here because the size here is based
  30. // on machine code not LLVM IR.
  31. extern cl::opt<int> SampleHotCallSiteThreshold;
  32. extern cl::opt<int> SampleColdCallSiteThreshold;
  33. extern cl::opt<int> ProfileInlineGrowthLimit;
  34. extern cl::opt<int> ProfileInlineLimitMin;
  35. extern cl::opt<int> ProfileInlineLimitMax;
  36. extern cl::opt<bool> SortProfiledSCC;
  37. cl::opt<bool> EnableCSPreInliner(
  38. "csspgo-preinliner", cl::Hidden, cl::init(true),
  39. cl::desc("Run a global pre-inliner to merge context profile based on "
  40. "estimated global top-down inline decisions"));
  41. cl::opt<bool> UseContextCostForPreInliner(
  42. "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
  43. cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
  44. static cl::opt<bool> SamplePreInlineReplay(
  45. "csspgo-replay-preinline", cl::Hidden, cl::init(false),
  46. cl::desc(
  47. "Replay previous inlining and adjust context profile accordingly"));
  48. CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary,
  49. uint64_t HotThreshold, uint64_t ColdThreshold)
  50. : UseContextCost(UseContextCostForPreInliner),
  51. // TODO: Pass in a guid-to-name map in order for
  52. // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
  53. // as their profile context.
  54. ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary),
  55. HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {
  56. // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
  57. // for good performance with reasonable profile size.
  58. if (!SampleHotCallSiteThreshold.getNumOccurrences())
  59. SampleHotCallSiteThreshold = 1500;
  60. if (!SampleColdCallSiteThreshold.getNumOccurrences())
  61. SampleColdCallSiteThreshold = 0;
  62. }
  63. std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  64. std::vector<StringRef> Order;
  65. ProfiledCallGraph ProfiledCG(ContextTracker);
  66. // Now that we have a profiled call graph, construct top-down order
  67. // by building up SCC and reversing SCC order.
  68. scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  69. while (!I.isAtEnd()) {
  70. auto Range = *I;
  71. if (SortProfiledSCC) {
  72. // Sort nodes in one SCC based on callsite hotness.
  73. scc_member_iterator<ProfiledCallGraph *> SI(*I);
  74. Range = *SI;
  75. }
  76. for (auto *Node : Range) {
  77. if (Node != ProfiledCG.getEntryNode())
  78. Order.push_back(Node->Name);
  79. }
  80. ++I;
  81. }
  82. std::reverse(Order.begin(), Order.end());
  83. return Order;
  84. }
  85. bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
  86. const FunctionSamples *CallerSamples) {
  87. assert(CallerSamples && "Expect non-null caller samples");
  88. // Ideally we want to consider everything a function calls, but as far as
  89. // context profile is concerned, only those frames that are children of
  90. // current one in the trie is relavent. So we walk the trie instead of call
  91. // targets from function profile.
  92. ContextTrieNode *CallerNode =
  93. ContextTracker.getContextFor(CallerSamples->getContext());
  94. bool HasNewCandidate = false;
  95. for (auto &Child : CallerNode->getAllChildContext()) {
  96. ContextTrieNode *CalleeNode = &Child.second;
  97. FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
  98. if (!CalleeSamples)
  99. continue;
  100. // Call site count is more reliable, so we look up the corresponding call
  101. // target profile in caller's context profile to retrieve call site count.
  102. uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
  103. uint64_t CallsiteCount = 0;
  104. LineLocation Callsite = CalleeNode->getCallSiteLoc();
  105. if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
  106. SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
  107. auto It = TargetCounts.find(CalleeSamples->getName());
  108. if (It != TargetCounts.end())
  109. CallsiteCount = It->second;
  110. }
  111. // TODO: call site and callee entry count should be mostly consistent, add
  112. // check for that.
  113. HasNewCandidate = true;
  114. uint32_t CalleeSize = getFuncSize(*CalleeSamples);
  115. CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
  116. CalleeSize);
  117. }
  118. return HasNewCandidate;
  119. }
  120. uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) {
  121. if (UseContextCost) {
  122. return Binary.getFuncSizeForContext(FSamples.getContext());
  123. }
  124. return FSamples.getBodySamples().size();
  125. }
  126. bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  127. // If replay inline is requested, simply follow the inline decision of the
  128. // profiled binary.
  129. if (SamplePreInlineReplay)
  130. return Candidate.CalleeSamples->getContext().hasAttribute(
  131. ContextWasInlined);
  132. // Adjust threshold based on call site hotness, only do this for callsite
  133. // prioritized inliner because otherwise cost-benefit check is done earlier.
  134. unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  135. if (Candidate.CallsiteCount > HotCountThreshold)
  136. SampleThreshold = SampleHotCallSiteThreshold;
  137. // TODO: for small cold functions, we may inlined them and we need to keep
  138. // context profile accordingly.
  139. if (Candidate.CallsiteCount < ColdCountThreshold)
  140. SampleThreshold = SampleColdCallSiteThreshold;
  141. return (Candidate.SizeCost < SampleThreshold);
  142. }
  143. void CSPreInliner::processFunction(const StringRef Name) {
  144. FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  145. if (!FSamples)
  146. return;
  147. unsigned FuncSize = getFuncSize(*FSamples);
  148. unsigned FuncFinalSize = FuncSize;
  149. unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  150. SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  151. SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
  152. LLVM_DEBUG(dbgs() << "Process " << Name
  153. << " for context-sensitive pre-inlining (pre-inline size: "
  154. << FuncSize << ", size limit: " << SizeLimit << ")\n");
  155. ProfiledCandidateQueue CQueue;
  156. getInlineCandidates(CQueue, FSamples);
  157. while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
  158. ProfiledInlineCandidate Candidate = CQueue.top();
  159. CQueue.pop();
  160. bool ShouldInline = false;
  161. if ((ShouldInline = shouldInline(Candidate))) {
  162. // We mark context as inlined as the corresponding context profile
  163. // won't be merged into that function's base profile.
  164. ++PreInlNumCSInlined;
  165. ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
  166. Candidate.CalleeSamples->getContext().setAttribute(
  167. ContextShouldBeInlined);
  168. FuncFinalSize += Candidate.SizeCost;
  169. getInlineCandidates(CQueue, Candidate.CalleeSamples);
  170. } else {
  171. ++PreInlNumCSNotInlined;
  172. }
  173. LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined")
  174. << " context profile for: "
  175. << Candidate.CalleeSamples->getContext().toString()
  176. << " (callee size: " << Candidate.SizeCost
  177. << ", call count:" << Candidate.CallsiteCount << ")\n");
  178. }
  179. if (!CQueue.empty()) {
  180. if (SizeLimit == (unsigned)ProfileInlineLimitMax)
  181. ++PreInlNumCSInlinedHitMaxLimit;
  182. else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
  183. ++PreInlNumCSInlinedHitMinLimit;
  184. else
  185. ++PreInlNumCSInlinedHitGrowthLimit;
  186. }
  187. LLVM_DEBUG({
  188. if (!CQueue.empty())
  189. dbgs() << " Inline candidates ignored due to size limit (inliner "
  190. "original size: "
  191. << FuncSize << ", inliner final size: " << FuncFinalSize
  192. << ", size limit: " << SizeLimit << ")\n";
  193. while (!CQueue.empty()) {
  194. ProfiledInlineCandidate Candidate = CQueue.top();
  195. CQueue.pop();
  196. bool WasInlined =
  197. Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  198. dbgs() << " " << Candidate.CalleeSamples->getContext().toString()
  199. << " (candidate size:" << Candidate.SizeCost
  200. << ", call count: " << Candidate.CallsiteCount << ", previously "
  201. << (WasInlined ? "inlined)\n" : "not inlined)\n");
  202. }
  203. });
  204. }
  205. void CSPreInliner::run() {
  206. #ifndef NDEBUG
  207. auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) {
  208. dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
  209. << Profiles.size() << " total):\n";
  210. for (auto &It : Profiles) {
  211. const FunctionSamples &Samples = It.second;
  212. dbgs() << " [" << Samples.getContext().toString() << "] "
  213. << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
  214. << "\n";
  215. }
  216. };
  217. #endif
  218. LLVM_DEBUG(printProfileNames(ProfileMap, true));
  219. // Execute global pre-inliner to estimate a global top-down inline
  220. // decision and merge profiles accordingly. This helps with profile
  221. // merge for ThinLTO otherwise we won't be able to merge profiles back
  222. // to base profile across module/thin-backend boundaries.
  223. // It also helps better compress context profile to control profile
  224. // size, as we now only need context profile for functions going to
  225. // be inlined.
  226. for (StringRef FuncName : buildTopDownOrder()) {
  227. processFunction(FuncName);
  228. }
  229. // Not inlined context profiles are merged into its base, so we can
  230. // trim out such profiles from the output.
  231. std::vector<SampleContext> ProfilesToBeRemoved;
  232. for (auto &It : ProfileMap) {
  233. SampleContext &Context = It.second.getContext();
  234. if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
  235. assert(Context.hasState(MergedContext) &&
  236. "Not inlined context profile should be merged already");
  237. ProfilesToBeRemoved.push_back(It.first);
  238. }
  239. }
  240. for (auto &ContextName : ProfilesToBeRemoved) {
  241. ProfileMap.erase(ContextName);
  242. }
  243. // Make sure ProfileMap's key is consistent with FunctionSamples' name.
  244. SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
  245. LLVM_DEBUG(printProfileNames(ProfileMap, false));
  246. }