CSPreInliner.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "CSPreInliner.h"
  9. #include "ProfiledBinary.h"
  10. #include "llvm/ADT/SCCIterator.h"
  11. #include "llvm/ADT/Statistic.h"
  12. #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
  13. #include <cstdint>
  14. #include <queue>
  15. #define DEBUG_TYPE "cs-preinliner"
  16. using namespace llvm;
  17. using namespace sampleprof;
  18. STATISTIC(PreInlNumCSInlined,
  19. "Number of functions inlined with context sensitive profile");
  20. STATISTIC(PreInlNumCSNotInlined,
  21. "Number of functions not inlined with context sensitive profile");
  22. STATISTIC(PreInlNumCSInlinedHitMinLimit,
  23. "Number of functions with FDO inline stopped due to min size limit");
  24. STATISTIC(PreInlNumCSInlinedHitMaxLimit,
  25. "Number of functions with FDO inline stopped due to max size limit");
  26. STATISTIC(
  27. PreInlNumCSInlinedHitGrowthLimit,
  28. "Number of functions with FDO inline stopped due to growth size limit");
  29. // The switches specify inline thresholds used in SampleProfileLoader inlining.
  30. // TODO: the actual threshold to be tuned here because the size here is based
  31. // on machine code not LLVM IR.
  32. extern cl::opt<int> SampleHotCallSiteThreshold;
  33. extern cl::opt<int> SampleColdCallSiteThreshold;
  34. extern cl::opt<int> ProfileInlineGrowthLimit;
  35. extern cl::opt<int> ProfileInlineLimitMin;
  36. extern cl::opt<int> ProfileInlineLimitMax;
  37. extern cl::opt<bool> SortProfiledSCC;
  38. cl::opt<bool> EnableCSPreInliner(
  39. "csspgo-preinliner", cl::Hidden, cl::init(true),
  40. cl::desc("Run a global pre-inliner to merge context profile based on "
  41. "estimated global top-down inline decisions"));
  42. cl::opt<bool> UseContextCostForPreInliner(
  43. "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
  44. cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
  45. static cl::opt<bool> SamplePreInlineReplay(
  46. "csspgo-replay-preinline", cl::Hidden, cl::init(false),
  47. cl::desc(
  48. "Replay previous inlining and adjust context profile accordingly"));
  49. CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
  50. ProfiledBinary &Binary, ProfileSummary *Summary)
  51. : UseContextCost(UseContextCostForPreInliner),
  52. // TODO: Pass in a guid-to-name map in order for
  53. // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
  54. // as their profile context.
  55. ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  56. // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
  57. // for good performance with reasonable profile size.
  58. if (!SampleHotCallSiteThreshold.getNumOccurrences())
  59. SampleHotCallSiteThreshold = 1500;
  60. if (!SampleColdCallSiteThreshold.getNumOccurrences())
  61. SampleColdCallSiteThreshold = 0;
  62. if (!ProfileInlineLimitMax.getNumOccurrences())
  63. ProfileInlineLimitMax = 3000;
  64. }
  65. std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  66. std::vector<StringRef> Order;
  67. ProfiledCallGraph ProfiledCG(ContextTracker);
  68. // Now that we have a profiled call graph, construct top-down order
  69. // by building up SCC and reversing SCC order.
  70. scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  71. while (!I.isAtEnd()) {
  72. auto Range = *I;
  73. if (SortProfiledSCC) {
  74. // Sort nodes in one SCC based on callsite hotness.
  75. scc_member_iterator<ProfiledCallGraph *> SI(*I);
  76. Range = *SI;
  77. }
  78. for (auto *Node : Range) {
  79. if (Node != ProfiledCG.getEntryNode())
  80. Order.push_back(Node->Name);
  81. }
  82. ++I;
  83. }
  84. std::reverse(Order.begin(), Order.end());
  85. return Order;
  86. }
  87. bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
  88. const FunctionSamples *CallerSamples) {
  89. assert(CallerSamples && "Expect non-null caller samples");
  90. // Ideally we want to consider everything a function calls, but as far as
  91. // context profile is concerned, only those frames that are children of
  92. // current one in the trie is relavent. So we walk the trie instead of call
  93. // targets from function profile.
  94. ContextTrieNode *CallerNode =
  95. ContextTracker.getContextNodeForProfile(CallerSamples);
  96. bool HasNewCandidate = false;
  97. for (auto &Child : CallerNode->getAllChildContext()) {
  98. ContextTrieNode *CalleeNode = &Child.second;
  99. FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
  100. if (!CalleeSamples)
  101. continue;
  102. // Call site count is more reliable, so we look up the corresponding call
  103. // target profile in caller's context profile to retrieve call site count.
  104. uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
  105. uint64_t CallsiteCount = 0;
  106. LineLocation Callsite = CalleeNode->getCallSiteLoc();
  107. if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
  108. SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
  109. auto It = TargetCounts.find(CalleeSamples->getName());
  110. if (It != TargetCounts.end())
  111. CallsiteCount = It->second;
  112. }
  113. // TODO: call site and callee entry count should be mostly consistent, add
  114. // check for that.
  115. HasNewCandidate = true;
  116. uint32_t CalleeSize = getFuncSize(CalleeNode);
  117. CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
  118. CalleeSize);
  119. }
  120. return HasNewCandidate;
  121. }
  122. uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  123. if (UseContextCost)
  124. return Binary.getFuncSizeForContext(ContextNode);
  125. return ContextNode->getFunctionSamples()->getBodySamples().size();
  126. }
  127. bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  128. // If replay inline is requested, simply follow the inline decision of the
  129. // profiled binary.
  130. if (SamplePreInlineReplay)
  131. return Candidate.CalleeSamples->getContext().hasAttribute(
  132. ContextWasInlined);
  133. unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  134. uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
  135. (Summary->getDetailedSummary()));
  136. if (Candidate.CallsiteCount <= ColdCountThreshold)
  137. SampleThreshold = SampleColdCallSiteThreshold;
  138. else {
  139. // Linearly adjust threshold based on normalized hotness, i.e, a value in
  140. // [0,1]. Use 10% cutoff instead of the max count as the normalization
  141. // upperbound for stability.
  142. double NormalizationUpperBound =
  143. ProfileSummaryBuilder::getEntryForPercentile(
  144. Summary->getDetailedSummary(), 100000 /* 10% */)
  145. .MinCount;
  146. double NormalizationLowerBound = ColdCountThreshold;
  147. double NormalizedHotness =
  148. (Candidate.CallsiteCount - NormalizationLowerBound) /
  149. (NormalizationUpperBound - NormalizationLowerBound);
  150. if (NormalizedHotness > 1.0)
  151. NormalizedHotness = 1.0;
  152. // Add 1 to to ensure hot callsites get a non-zero threshold, which could
  153. // happen when SampleColdCallSiteThreshold is 0. This is when we do not
  154. // want any inlining for cold callsites.
  155. SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
  156. SampleColdCallSiteThreshold + 1;
  157. }
  158. return (Candidate.SizeCost < SampleThreshold);
  159. }
  160. void CSPreInliner::processFunction(const StringRef Name) {
  161. FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  162. if (!FSamples)
  163. return;
  164. unsigned FuncSize =
  165. getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  166. unsigned FuncFinalSize = FuncSize;
  167. unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  168. SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  169. SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
  170. LLVM_DEBUG(dbgs() << "Process " << Name
  171. << " for context-sensitive pre-inlining (pre-inline size: "
  172. << FuncSize << ", size limit: " << SizeLimit << ")\n");
  173. ProfiledCandidateQueue CQueue;
  174. getInlineCandidates(CQueue, FSamples);
  175. while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
  176. ProfiledInlineCandidate Candidate = CQueue.top();
  177. CQueue.pop();
  178. bool ShouldInline = false;
  179. if ((ShouldInline = shouldInline(Candidate))) {
  180. // We mark context as inlined as the corresponding context profile
  181. // won't be merged into that function's base profile.
  182. ++PreInlNumCSInlined;
  183. ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
  184. Candidate.CalleeSamples->getContext().setAttribute(
  185. ContextShouldBeInlined);
  186. FuncFinalSize += Candidate.SizeCost;
  187. getInlineCandidates(CQueue, Candidate.CalleeSamples);
  188. } else {
  189. ++PreInlNumCSNotInlined;
  190. }
  191. LLVM_DEBUG(
  192. dbgs() << (ShouldInline ? " Inlined" : " Outlined")
  193. << " context profile for: "
  194. << ContextTracker.getContextString(*Candidate.CalleeSamples)
  195. << " (callee size: " << Candidate.SizeCost
  196. << ", call count:" << Candidate.CallsiteCount << ")\n");
  197. }
  198. if (!CQueue.empty()) {
  199. if (SizeLimit == (unsigned)ProfileInlineLimitMax)
  200. ++PreInlNumCSInlinedHitMaxLimit;
  201. else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
  202. ++PreInlNumCSInlinedHitMinLimit;
  203. else
  204. ++PreInlNumCSInlinedHitGrowthLimit;
  205. }
  206. LLVM_DEBUG({
  207. if (!CQueue.empty())
  208. dbgs() << " Inline candidates ignored due to size limit (inliner "
  209. "original size: "
  210. << FuncSize << ", inliner final size: " << FuncFinalSize
  211. << ", size limit: " << SizeLimit << ")\n";
  212. while (!CQueue.empty()) {
  213. ProfiledInlineCandidate Candidate = CQueue.top();
  214. CQueue.pop();
  215. bool WasInlined =
  216. Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  217. dbgs() << " "
  218. << ContextTracker.getContextString(*Candidate.CalleeSamples)
  219. << " (candidate size:" << Candidate.SizeCost
  220. << ", call count: " << Candidate.CallsiteCount << ", previously "
  221. << (WasInlined ? "inlined)\n" : "not inlined)\n");
  222. }
  223. });
  224. }
  225. void CSPreInliner::run() {
  226. #ifndef NDEBUG
  227. auto printProfileNames = [](SampleContextTracker &ContextTracker,
  228. bool IsInput) {
  229. uint32_t Size = 0;
  230. for (auto *Node : ContextTracker) {
  231. FunctionSamples *FSamples = Node->getFunctionSamples();
  232. if (FSamples) {
  233. Size++;
  234. dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
  235. << FSamples->getTotalSamples() << ":"
  236. << FSamples->getHeadSamples() << "\n";
  237. }
  238. }
  239. dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
  240. << Size << " total):\n";
  241. };
  242. #endif
  243. LLVM_DEBUG(printProfileNames(ContextTracker, true));
  244. // Execute global pre-inliner to estimate a global top-down inline
  245. // decision and merge profiles accordingly. This helps with profile
  246. // merge for ThinLTO otherwise we won't be able to merge profiles back
  247. // to base profile across module/thin-backend boundaries.
  248. // It also helps better compress context profile to control profile
  249. // size, as we now only need context profile for functions going to
  250. // be inlined.
  251. for (StringRef FuncName : buildTopDownOrder()) {
  252. processFunction(FuncName);
  253. }
  254. // Not inlined context profiles are merged into its base, so we can
  255. // trim out such profiles from the output.
  256. for (auto *Node : ContextTracker) {
  257. FunctionSamples *FProfile = Node->getFunctionSamples();
  258. if (FProfile &&
  259. (Node->getParentContext() != &ContextTracker.getRootContext() &&
  260. !FProfile->getContext().hasState(InlinedContext))) {
  261. Node->setFunctionSamples(nullptr);
  262. }
  263. }
  264. FunctionSamples::ProfileIsPreInlined = true;
  265. LLVM_DEBUG(printProfileNames(ContextTracker, false));
  266. }