//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file transforms calls of the current function (self recursion) followed
// by a return instruction with a branch to the entry of the function, creating
// a loop. This pass also implements the following extensions to the basic
// algorithm:
//
//  1. Trivial instructions between the call and return do not prevent the
//     transformation from taking place, though currently the analysis cannot
//     support moving any really useful instructions (only dead ones).
//  2. This pass transforms functions that are prevented from being tail
//     recursive by an associative and commutative expression to use an
//     accumulator variable, thus compiling the typical naive factorial or
//     'fib' implementation into efficient code (see the sketch after this
//     header).
//  3. TRE is performed if the function returns void, if the return
//     returns the result returned by the call, or if the function returns a
//     run-time constant on all exits from the function. It is possible, though
//     unlikely, that the return returns something else (like constant 0), and
//     can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
//     the function return the exact same value.
//  4. If it can prove that callees do not access their caller stack frame,
//     they are marked as eligible for tail call elimination (by the code
//     generator).
//
// There are several improvements that could be made:
//
//  1. If the function has any alloca instructions, these instructions will be
//     moved out of the entry block of the function, causing them to be
//     evaluated each time through the tail recursion. Safely keeping allocas
//     in the entry block requires analysis to prove that the tail-called
//     function does not read or write the stack object.
//  2. Tail recursion is only performed if the call immediately precedes the
//     return instruction. It's possible that there could be a jump between
//     the call and the return.
//  3. There can be intervening operations between the call and the return that
//     prevent the TRE from occurring. For example, there could be GEP's and
//     stores to memory that will not be read or written by the call. This
//     requires some substantial analysis (such as with DSA) to prove safe to
//     move ahead of the call, but doing so could allow many more TREs to be
//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
//  4. The algorithm we use to detect if callees access their caller stack
//     frames is very primitive.
//
//===----------------------------------------------------------------------===//
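//
// Illustrative sketch (not part of the original source): under the
// accumulator extension above, a naive factorial whose multiply follows the
// recursive call, such as
//
//   unsigned fact(unsigned n) {
//     if (n <= 1) return 1;
//     return n * fact(n - 1);    // call is not in tail position
//   }
//
// is rewritten as if it had been written with an explicit accumulator loop:
//
//   unsigned fact(unsigned n) {
//     unsigned accumulator = 1;  // identity of '*'
//     for (; n > 1; --n)
//       accumulator *= n;
//     return accumulator;
//   }
//
// The names here are hypothetical; the pass performs the equivalent rewrite
// on LLVM IR using PHI nodes rather than on C source.
//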
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

#define DEBUG_TYPE "tailcallelim"
STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of returns duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");

/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
  // TODO: We don't do TRE if dynamic allocas are used.
  // Dynamic allocas allocate stack space which should be
  // deallocated before a new iteration starts. That is
  // currently not implemented.
  return llvm::all_of(instructions(F), [](Instruction &I) {
    auto *AI = dyn_cast<AllocaInst>(&I);
    return !AI || AI->isStaticAlloca();
  });
}
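
// Illustrative sketch (hypothetical example, not from the original source):
// a variable-length array lowers to a dynamic alloca, so canTRE rejects a
// function like
//
//   void visit(struct Node *N, int Depth) {
//     int Scratch[Depth];      // dynamic alloca: size unknown at compile time
//     fill(Scratch, Depth);
//     if (N)
//       visit(N->Next, Depth); // TRE is skipped for the whole function
//   }
//
// whereas a fixed-size 'int Scratch[16]' would be a static alloca and would
// not block TRE.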
namespace {
struct AllocaDerivedValueTracker {
  // Start at a root value and walk its use-def chain to mark calls that use the
  // value or a derived value in AllocaUsers, and places where it may escape in
  // EscapePoints.
  void walk(Value *Root) {
    SmallVector<Use *, 32> Worklist;
    SmallPtrSet<Use *, 32> Visited;

    auto AddUsesToWorklist = [&](Value *V) {
      for (auto &U : V->uses()) {
        if (!Visited.insert(&U).second)
          continue;
        Worklist.push_back(&U);
      }
    };

    AddUsesToWorklist(Root);

    while (!Worklist.empty()) {
      Use *U = Worklist.pop_back_val();
      Instruction *I = cast<Instruction>(U->getUser());

      switch (I->getOpcode()) {
      case Instruction::Call:
      case Instruction::Invoke: {
        auto &CB = cast<CallBase>(*I);
        // If the alloca-derived argument is passed byval, it is not an escape
        // point or a use of an alloca. Calling with byval copies the contents
        // of the alloca into argument registers or stack slots, which exist
        // beyond the lifetime of the current frame.
        if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
          continue;
        bool IsNocapture =
            CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
        callUsesLocalStack(CB, IsNocapture);
        if (IsNocapture) {
          // If the alloca-derived argument is passed in as nocapture, then it
          // can't propagate to the call's return. That would be capturing.
          continue;
        }
        break;
      }
      case Instruction::Load: {
        // The result of a load is not alloca-derived (unless an alloca has
        // otherwise escaped, but this is a local analysis).
        continue;
      }
      case Instruction::Store: {
        if (U->getOperandNo() == 0)
          EscapePoints.insert(I);
        continue; // Stores have no users to analyze.
      }
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
      case Instruction::PHI:
      case Instruction::Select:
      case Instruction::AddrSpaceCast:
        break;
      default:
        EscapePoints.insert(I);
        break;
      }

      AddUsesToWorklist(I);
    }
  }

  void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
    // Add it to the list of alloca users.
    AllocaUsers.insert(&CB);

    // If it's nocapture then it can't capture this alloca.
    if (IsNocapture)
      return;

    // If it can write to memory, it can leak the alloca value.
    if (!CB.onlyReadsMemory())
      EscapePoints.insert(&CB);
  }

  SmallPtrSet<Instruction *, 32> AllocaUsers;
  SmallPtrSet<Instruction *, 32> EscapePoints;
};
}
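
// Illustrative sketch (hypothetical IR, not from the original source) of how
// the tracker above classifies uses of a local alloca %buf:
//
//   store ptr %buf, ptr @g             ; escape: the store's value operand
//                                      ; (operand 0) is alloca-derived
//   store i32 0, ptr %buf              ; no escape: %buf is only the address
//   call void @use(ptr nocapture %buf) ; alloca user, but not an escape
//   call void @use2(ptr %buf)          ; escape if @use2 may write to memory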
static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
  if (F.callsFunctionThatReturnsTwice())
    return false;

  // The local stack holds all alloca instructions and all byval arguments.
  AllocaDerivedValueTracker Tracker;
  for (Argument &Arg : F.args()) {
    if (Arg.hasByValAttr())
      Tracker.walk(&Arg);
  }
  for (auto &BB : F) {
    for (auto &I : BB)
      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
        Tracker.walk(AI);
  }

  bool Modified = false;

  // Track whether a block is reachable after an alloca has escaped. Blocks that
  // contain the escaping instruction will be marked as being visited without an
  // escaped alloca, since that is how the block began.
  enum VisitType {
    UNVISITED,
    UNESCAPED,
    ESCAPED
  };
  DenseMap<BasicBlock *, VisitType> Visited;

  // We propagate the fact that an alloca has escaped from block to successor.
  // Visit the blocks that are propagating the escapedness first. To do this, we
  // maintain two worklists.
  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;

  // We may enter a block and visit it thinking that no alloca has escaped yet,
  // then see an escape point and go back around a loop edge and come back to
  // the same block twice. Because of this, we defer setting tail on calls when
  // we first encounter them in a block. Every entry in this list does not
  // statically use an alloca via use-def chain analysis, but may find an alloca
  // through other means if the block turns out to be reachable after an escape
  // point.
  SmallVector<CallInst *, 32> DeferredTails;

  BasicBlock *BB = &F.getEntryBlock();
  VisitType Escaped = UNESCAPED;
  do {
    for (auto &I : *BB) {
      if (Tracker.EscapePoints.count(&I))
        Escaped = ESCAPED;

      CallInst *CI = dyn_cast<CallInst>(&I);
      // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
      // considered accessing memory and will be marked as a tail call if we
      // don't bail out here.
      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
          isa<PseudoProbeInst>(&I))
        continue;

      // Special-case operand bundles "clang.arc.attachedcall", "ptrauth", and
      // "kcfi".
      bool IsNoTail = CI->isNoTailCall() ||
                      CI->hasOperandBundlesOtherThan(
                          {LLVMContext::OB_clang_arc_attachedcall,
                           LLVMContext::OB_ptrauth, LLVMContext::OB_kcfi});

      if (!IsNoTail && CI->doesNotAccessMemory()) {
        // A call to a readnone function whose arguments are all things computed
        // outside this function can be marked tail. Even if you stored the
        // alloca address into a global, a readnone function can't load the
        // global anyhow.
        //
        // Note that this runs whether we know an alloca has escaped or not. If
        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
        bool SafeToTail = true;
        for (auto &Arg : CI->args()) {
          if (isa<Constant>(Arg.getUser()))
            continue;
          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
            if (!A->hasByValAttr())
              continue;
          SafeToTail = false;
          break;
        }
        if (SafeToTail) {
          using namespace ore;
          ORE->emit([&]() {
            return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
                   << "marked as tail call candidate (readnone)";
          });
          CI->setTailCall();
          Modified = true;
          continue;
        }
      }

      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
        DeferredTails.push_back(CI);
    }

    for (auto *SuccBB : successors(BB)) {
      auto &State = Visited[SuccBB];
      if (State < Escaped) {
        State = Escaped;
        if (State == ESCAPED)
          WorklistEscaped.push_back(SuccBB);
        else
          WorklistUnescaped.push_back(SuccBB);
      }
    }

    if (!WorklistEscaped.empty()) {
      BB = WorklistEscaped.pop_back_val();
      Escaped = ESCAPED;
    } else {
      BB = nullptr;
      while (!WorklistUnescaped.empty()) {
        auto *NextBB = WorklistUnescaped.pop_back_val();
        if (Visited[NextBB] == UNESCAPED) {
          BB = NextBB;
          Escaped = UNESCAPED;
          break;
        }
      }
    }
  } while (BB);

  for (CallInst *CI : DeferredTails) {
    if (Visited[CI->getParent()] != ESCAPED) {
      // If the escape point was part way through the block, calls after the
      // escape point wouldn't have been put into DeferredTails.
      LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
      CI->setTailCall();
      Modified = true;
    }
  }

  return Modified;
}
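
// For illustration (not from the original source): after markTails, eligible
// calls carry the 'tail' marker in the IR, e.g.
//
//   %res = tail call i32 @callee(i32 %x)
//
// which records that the callee does not access this frame's stack, making
// the call a candidate for tail call lowering in the code generator.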
/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
  if (isa<DbgInfoIntrinsic>(I))
    return true;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
        llvm::findAllocaForValue(II->getArgOperand(1)))
      return true;

  // FIXME: We can move load/store/call/free instructions above the call if the
  // call does not mod/ref the memory location being processed.
  if (I->mayHaveSideEffects()) // This also handles volatile loads.
    return false;

  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
    // Loads may always be moved above calls without side effects.
    if (CI->mayHaveSideEffects()) {
      // Non-volatile loads may be moved above a call with side effects if it
      // does not write to memory and the load provably won't trap.
      // Writes to memory only matter if they may alias the pointer
      // being loaded from.
      const DataLayout &DL = L->getModule()->getDataLayout();
      if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
          !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
                                       L->getAlign(), DL, L))
        return false;
    }
  }

  // Otherwise, if this is a side-effect free instruction, check to make sure
  // that it does not use the return value of the call. If it doesn't use the
  // return value of the call, it must only use things that are defined before
  // the call, or movable instructions between the call and the instruction
  // itself.
  return !is_contained(I->operands(), CI);
}
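
// Illustrative sketch (hypothetical IR, not from the original source): in
//
//   %r = call i32 @f(i32 %n)   ; recursive call
//   %g = load i32, ptr @g      ; movable if @f doesn't write @g and the load
//                              ; provably cannot trap
//   %s = add i32 %r, 1         ; NOT movable: uses the call's result
//   ret i32 %s
//
// the add would instead have to qualify as an accumulator, checked next.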
static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
  if (!I->isAssociative() || !I->isCommutative())
    return false;

  assert(I->getNumOperands() == 2 &&
         "Associative/commutative operations should have 2 args!");

  // Exactly one operand should be the result of the call instruction.
  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
      (I->getOperand(0) != CI && I->getOperand(1) != CI))
    return false;

  // The only user of this instruction we allow is a single return instruction.
  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
    return false;

  return true;
}
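
// Illustrative sketch (hypothetical IR, not from the original source): in the
// classic factorial pattern
//
//   %call = call i32 @fact(i32 %sub)
//   %mul  = mul i32 %n, %call
//   ret i32 %mul
//
// the mul is associative and commutative, has the call result as exactly one
// operand, and feeds only the return, so it qualifies as the accumulator
// instruction.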
static Instruction *firstNonDbg(BasicBlock::iterator I) {
  while (isa<DbgInfoIntrinsic>(I))
    ++I;
  return &*I;
}
namespace {
class TailRecursionEliminator {
  Function &F;
  const TargetTransformInfo *TTI;
  AliasAnalysis *AA;
  OptimizationRemarkEmitter *ORE;
  DomTreeUpdater &DTU;

  // The below are shared state we want to have available when eliminating any
  // calls in the function. These values should be populated by
  // createTailRecurseLoopHeader the first time we find a call we can eliminate.
  BasicBlock *HeaderBB = nullptr;
  SmallVector<PHINode *, 8> ArgumentPHIs;

  // PHI node to store our return value.
  PHINode *RetPN = nullptr;

  // i1 PHI node to track if we have a valid return value stored in RetPN.
  PHINode *RetKnownPN = nullptr;

  // Vector of select instructions we inserted. These selects use RetKnownPN
  // to either propagate RetPN or select a new return value.
  SmallVector<SelectInst *, 8> RetSelects;

  // The below are shared state needed when performing accumulator recursion.
  // These values should be populated by insertAccumulator the first time we
  // find an elimination that requires an accumulator.

  // PHI node to store our current accumulated value.
  PHINode *AccPN = nullptr;

  // The instruction doing the accumulating.
  Instruction *AccumulatorRecursionInstr = nullptr;

  TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                          AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                          DomTreeUpdater &DTU)
      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}

  CallInst *findTRECandidate(BasicBlock *BB);

  void createTailRecurseLoopHeader(CallInst *CI);

  void insertAccumulator(Instruction *AccRecInstr);

  bool eliminateCall(CallInst *CI);

  void cleanupAndFinalize();

  bool processBlock(BasicBlock &BB);

  void copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx);

  void copyLocalTempOfByValueOperandIntoArguments(CallInst *CI, int OpndIdx);

public:
  static bool eliminate(Function &F, const TargetTransformInfo *TTI,
                        AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                        DomTreeUpdater &DTU);
};
} // namespace
CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
  Instruction *TI = BB->getTerminator();

  if (&BB->front() == TI) // Make sure there is something before the terminator.
    return nullptr;

  // Scan backwards from the return, checking to see if there is a tail call in
  // this block. If so, set CI to it.
  CallInst *CI = nullptr;
  BasicBlock::iterator BBI(TI);
  while (true) {
    CI = dyn_cast<CallInst>(BBI);
    if (CI && CI->getCalledFunction() == &F)
      break;

    if (BBI == BB->begin())
      return nullptr; // Didn't find a potential tail call.
    --BBI;
  }

  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
         "Incompatible call site attributes (Tail, NoTail)");
  if (!CI->isTailCall())
    return nullptr;

  // As a special case, detect code like this:
  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
  // and disable this xform in this case, because the code generator will
  // lower the call to fabs into inline code.
  if (BB == &F.getEntryBlock() &&
      firstNonDbg(BB->front().getIterator()) == CI &&
      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
      !TTI->isLoweredToCall(CI->getCalledFunction())) {
    // A single-block function with just a call and a return. Check that
    // the arguments match.
    auto I = CI->arg_begin(), E = CI->arg_end();
    Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
    for (; I != E && FI != FE; ++I, ++FI)
      if (*I != &*FI) break;
    if (I == E && FI == FE)
      return nullptr;
  }

  return CI;
}
void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
  HeaderBB = &F.getEntryBlock();
  BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
  NewEntry->takeName(HeaderBB);
  HeaderBB->setName("tailrecurse");
  BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
  BI->setDebugLoc(CI->getDebugLoc());

  // Move all fixed sized allocas from HeaderBB to NewEntry.
  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
                            NEBI = NewEntry->begin();
       OEBI != E;)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
      if (isa<ConstantInt>(AI->getArraySize()))
        AI->moveBefore(&*NEBI);

  // Now that we have created a new block, which jumps to the entry
  // block, insert a PHI node for each argument of the function.
  // For now, we initialize each PHI to only have the real arguments
  // which are passed in.
  Instruction *InsertPos = &HeaderBB->front();
  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
    PHINode *PN =
        PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
    I->replaceAllUsesWith(PN); // Everyone uses the PHI node now!
    PN->addIncoming(&*I, NewEntry);
    ArgumentPHIs.push_back(PN);
  }

  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
  // nodes to track our return value. We initialize RetPN with poison and
  // RetKnownPN with false since we can't know our return value at function
  // entry.
  Type *RetType = F.getReturnType();
  if (!RetType->isVoidTy()) {
    Type *BoolType = Type::getInt1Ty(F.getContext());
    RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);

    RetPN->addIncoming(PoisonValue::get(RetType), NewEntry);
    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
  }

  // The entry block was changed from HeaderBB to NewEntry.
  // The forward DominatorTree needs to be recalculated when the EntryBB is
  // changed. In this corner-case we recalculate the entire tree.
  DTU.recalculate(*NewEntry->getParent());
}
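
// Illustrative sketch (hypothetical IR, not from the original source) of the
// CFG shape this produces for a function with one argument %n:
//
//   entry:                       ; NewEntry, holds the static allocas
//     br label %tailrecurse
//   tailrecurse:                 ; HeaderBB, the new loop header
//     %n.tr = phi i32 [ %n, %entry ], ...
//
// Each call eliminated later adds its argument value as an incoming value of
// %n.tr and branches back to %tailrecurse.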
void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
  assert(!AccPN && "Trying to insert multiple accumulators");

  AccumulatorRecursionInstr = AccRecInstr;

  // Start by inserting a new PHI node for the accumulator.
  pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
  AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
                          "accumulator.tr", &HeaderBB->front());

  // Loop over all of the predecessors of the tail recursion block. For the
  // real entry into the function we seed the PHI with the identity constant for
  // the accumulation operation. For any other existing branches to this block
  // (due to other tail recursions eliminated) the accumulator is not modified.
  // Because we haven't added the branch in the current block to HeaderBB yet,
  // it will not show up as a predecessor.
  for (pred_iterator PI = PB; PI != PE; ++PI) {
    BasicBlock *P = *PI;
    if (P == &F.getEntryBlock()) {
      Constant *Identity = ConstantExpr::getBinOpIdentity(
          AccRecInstr->getOpcode(), AccRecInstr->getType());
      AccPN->addIncoming(Identity, P);
    } else {
      AccPN->addIncoming(AccPN, P);
    }
  }

  ++NumAccumAdded;
}
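
// For illustration (not from the original source): the identity constant
// seeded from the entry block depends on the accumulating opcode (0 for
// 'add', 1 for 'mul'), so the first trip through the loop leaves the running
// value unchanged. For the factorial sketch above, the PHI would look like
//
//   %accumulator.tr = phi i32 [ 1, %entry ], [ %mul, %tailrecurse ]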
// Creates a copy of the contents of the ByVal operand of the specified
// call instruction into a newly created temporary variable.
void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
                                                              int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  // Create alloca for temporary byval operands.
  // Put alloca into the entry block.
  Value *NewAlloca = new AllocaInst(
      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
      CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the byval operand into the temporary variable.
  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
  CI->setArgOperand(OpndIdx, NewAlloca);
}

// Creates a copy from the temporary variable (keeping the value of the ByVal
// argument) into the corresponding function argument location.
void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
    CallInst *CI, int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the temporary variable into the corresponding
  // function argument location.
  Builder.CreateMemCpy(F.getArg(OpndIdx), /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
}
bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
  ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());

  // Ok, we found a potential tail call. We can currently only transform the
  // tail call if all of the instructions between the call and the return are
  // movable to above the call itself, leaving the call next to the return.
  // Check that this is the case now.
  Instruction *AccRecInstr = nullptr;
  BasicBlock::iterator BBI(CI);
  for (++BBI; &*BBI != Ret; ++BBI) {
    if (canMoveAboveCall(&*BBI, CI, AA))
      continue;

    // If we can't move the instruction above the call, it might be because it
    // is an associative and commutative operation that could be transformed
    // using accumulator recursion elimination. Check to see if this is the
    // case, and if so, remember which instruction accumulates for later.
    if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
      return false; // We cannot eliminate the tail recursion!

    // Yes, this is accumulator recursion. Remember which instruction
    // accumulates.
    AccRecInstr = &*BBI;
  }

  BasicBlock *BB = Ret->getParent();

  using namespace ore;
  ORE->emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
           << "transforming tail recursion into loop";
  });

  // OK! We can transform this tail call. If this is the first one found,
  // create the new entry block, allowing us to branch back to the old entry.
  if (!HeaderBB)
    createTailRecurseLoopHeader(CI);

  // Copy values of ByVal operands into local temporary variables.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I))
      copyByValueOperandIntoLocalTemp(CI, I);
  }

  // Ok, now that we know we have a pseudo-entry block WITH all of the
  // required PHI nodes, add entries into the PHI node for the actual
  // parameters passed into the tail-recursive call.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I)) {
      copyLocalTempOfByValueOperandIntoArguments(CI, I);
      ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
    } else
      ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB);
  }

  if (AccRecInstr) {
    insertAccumulator(AccRecInstr);

    // Rewrite the accumulator recursion instruction so that it does not use
    // the result of the call anymore; instead, it uses the PHI node we just
    // inserted.
    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
  }

  // Update our return value tracking.
  if (RetPN) {
    if (Ret->getReturnValue() == CI || AccRecInstr) {
      // Defer selecting a return value.
      RetPN->addIncoming(RetPN, BB);
      RetKnownPN->addIncoming(RetKnownPN, BB);
    } else {
      // We found a return value we want to use, insert a select instruction to
      // select it if we don't already know what our return value will be and
      // store the result in our return value PHI node.
      SelectInst *SI = SelectInst::Create(
          RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
      RetSelects.push_back(SI);

      RetPN->addIncoming(SI, BB);
      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
    }

    if (AccPN)
      AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
  }

  // Now that all of the PHI nodes are in place, remove the call and
  // ret instructions, replacing them with an unconditional branch.
  BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
  NewBI->setDebugLoc(CI->getDebugLoc());

  Ret->eraseFromParent();  // Remove return.
  CI->eraseFromParent();   // Remove call.
  DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
  ++NumEliminated;
  return true;
}
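
// For illustration (not from the original source): the operand-index trick
// above relies on the accumulator having the call result as exactly one
// operand, so 'AccRecInstr->getOperand(0) != CI' evaluates to 0 or 1 and
// selects whichever operand held the call. For '%mul = mul i32 %call, %n'
// the rewrite yields '%mul = mul i32 %accumulator.tr, %n'.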
void TailRecursionEliminator::cleanupAndFinalize() {
  // If we eliminated any tail recursions, it's possible that we inserted some
  // silly PHI nodes which just merge an initial value (the incoming operand)
  // with themselves. Check to see if we did and clean up our mess if so. This
  // occurs when a function passes an argument straight through to its tail
  // call.
  for (PHINode *PN : ArgumentPHIs) {
    // If the PHI Node is a dynamic constant, replace it with the value it is.
    if (Value *PNV = simplifyInstruction(PN, F.getParent()->getDataLayout())) {
      PN->replaceAllUsesWith(PNV);
      PN->eraseFromParent();
    }
  }

  if (RetPN) {
    if (RetSelects.empty()) {
      // If we didn't insert any select instructions, then we know we didn't
      // store a return value and we can remove the PHI nodes we inserted.
      RetPN->dropAllReferences();
      RetPN->eraseFromParent();

      RetKnownPN->dropAllReferences();
      RetKnownPN->eraseFromParent();

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // return in the function, and return its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (BasicBlock &BB : F) {
          ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
          if (!RI)
            continue;

          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     RI->getOperand(0));
          AccRecInstrNew->insertBefore(RI);
          RI->setOperand(0, AccRecInstrNew);
        }
      }
    } else {
      // We need to insert a select instruction before any return left in the
      // function to select our stored return value if we have one.
      for (BasicBlock &BB : F) {
        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
        if (!RI)
          continue;

        SelectInst *SI = SelectInst::Create(
            RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
        RetSelects.push_back(SI);
        RI->setOperand(0, SI);
      }

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // of the selects we inserted, and select its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (SelectInst *SI : RetSelects) {
          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     SI->getFalseValue());
          AccRecInstrNew->insertBefore(SI);
          SI->setFalseValue(AccRecInstrNew);
        }
      }
    }
  }
}
bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
  Instruction *TI = BB.getTerminator();

  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
    if (BI->isConditional())
      return false;

    BasicBlock *Succ = BI->getSuccessor(0);
    ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));

    if (!Ret)
      return false;

    CallInst *CI = findTRECandidate(&BB);

    if (!CI)
      return false;

    LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
                      << "INTO UNCOND BRANCH PRED: " << BB);
    FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
    ++NumRetDuped;

    // If all predecessors of Succ have been eliminated by
    // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
    // because the ret instruction in there is still using a value which
    // eliminateCall will attempt to remove. This block can only contain
    // instructions that can't have uses, therefore it is safe to remove.
    if (pred_empty(Succ))
      DTU.deleteBB(Succ);

    eliminateCall(CI);
    return true;
  } else if (isa<ReturnInst>(TI)) {
    CallInst *CI = findTRECandidate(&BB);

    if (CI)
      return eliminateCall(CI);
  }

  return false;
}
bool TailRecursionEliminator::eliminate(Function &F,
                                        const TargetTransformInfo *TTI,
                                        AliasAnalysis *AA,
                                        OptimizationRemarkEmitter *ORE,
                                        DomTreeUpdater &DTU) {
  if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
    return false;

  bool MadeChange = false;
  MadeChange |= markTails(F, ORE);

  // If this function is a varargs function, we won't be able to PHI the args
  // right, so don't even try to convert it...
  if (F.getFunctionType()->isVarArg())
    return MadeChange;

  if (!canTRE(F))
    return MadeChange;

  // Change any tail recursive calls to loops.
  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
  for (BasicBlock &BB : F)
    MadeChange |= TRE.processBlock(BB);

  TRE.cleanupAndFinalize();

  return MadeChange;
}
namespace {
struct TailCallElim : public FunctionPass {
  static char ID; // Pass identification, replacement for typeid
  TailCallElim() : FunctionPass(ID) {
    initializeTailCallElimPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
    // There is no noticeable performance difference here between Lazy and
    // Eager UpdateStrategy based on some test results. It is feasible to
    // switch the UpdateStrategy to Lazy if we find it profitable later.
    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);

    return TailRecursionEliminator::eliminate(
        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
  }
};
}

char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                    false, false)

// Public interface to the TailCallElimination pass
FunctionPass *llvm::createTailCallEliminationPass() {
  return new TailCallElim();
}
PreservedAnalyses TailCallElimPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
  // There is no noticeable performance difference here between Lazy and Eager
  // UpdateStrategy based on some test results. It is feasible to switch the
  // UpdateStrategy to Lazy if we find it profitable later.
  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<PostDominatorTreeAnalysis>();
  return PA;
}
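
// Usage note (not part of the original source): the pass can be exercised in
// isolation with the new pass manager, e.g.
//
//   opt -passes=tailcallelim -S input.ll
//
// or through the legacy pass registration above under the same
// "tailcallelim" name.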