PPCLoopInstrFormPrep.cpp 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493
  1. //===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements a pass to prepare loops for ppc preferred addressing
  10. // modes, leveraging different instruction form. (eg: DS/DQ form, D/DS form with
  11. // update)
  12. // Additional PHIs are created for loop induction variables used by load/store
  13. // instructions so that preferred addressing modes can be used.
  14. //
  15. // 1: DS/DQ form preparation, prepare the load/store instructions so that they
  16. // can satisfy the DS/DQ form displacement requirements.
  17. // Generically, this means transforming loops like this:
  18. // for (int i = 0; i < n; ++i) {
  19. // unsigned long x1 = *(unsigned long *)(p + i + 5);
  20. // unsigned long x2 = *(unsigned long *)(p + i + 9);
  21. // }
  22. //
  23. // to look like this:
  24. //
  25. // unsigned NewP = p + 5;
  26. // for (int i = 0; i < n; ++i) {
  27. // unsigned long x1 = *(unsigned long *)(i + NewP);
  28. // unsigned long x2 = *(unsigned long *)(i + NewP + 4);
  29. // }
  30. //
  31. // 2: D/DS form with update preparation, prepare the load/store instructions so
  32. // that we can use update form to do pre-increment.
  33. // Generically, this means transforming loops like this:
  34. // for (int i = 0; i < n; ++i)
  35. // array[i] = c;
  36. //
  37. // to look like this:
  38. //
  39. // T *p = array[-1];
  40. // for (int i = 0; i < n; ++i)
  41. // *++p = c;
  42. //
  43. // 3: common multiple chains for the load/stores with same offsets in the loop,
  44. // so that we can reuse the offsets and reduce the register pressure in the
  45. // loop. This transformation can also increase the loop ILP as now each chain
  46. // uses its own loop induction add/addi. But this will increase the number of
  47. // add/addi in the loop.
  48. //
  49. // Generically, this means transforming loops like this:
  50. //
  51. // char *p;
  52. // A1 = p + base1
  53. // A2 = p + base1 + offset
  54. // B1 = p + base2
  55. // B2 = p + base2 + offset
  56. //
  57. // for (int i = 0; i < n; i++)
  58. // unsigned long x1 = *(unsigned long *)(A1 + i);
  59. // unsigned long x2 = *(unsigned long *)(A2 + i)
  60. // unsigned long x3 = *(unsigned long *)(B1 + i);
  61. // unsigned long x4 = *(unsigned long *)(B2 + i);
  62. // }
  63. //
  64. // to look like this:
  65. //
  66. // A1_new = p + base1 // chain 1
  67. // B1_new = p + base2 // chain 2, now inside the loop, common offset is
  68. // // reused.
  69. //
  70. // for (long long i = 0; i < n; i+=count) {
  71. // unsigned long x1 = *(unsigned long *)(A1_new + i);
  72. // unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
  73. // unsigned long x3 = *(unsigned long *)(B1_new + i);
  74. // unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
  75. // }
  76. //===----------------------------------------------------------------------===//
  77. #include "PPC.h"
  78. #include "PPCSubtarget.h"
  79. #include "PPCTargetMachine.h"
  80. #include "llvm/ADT/DepthFirstIterator.h"
  81. #include "llvm/ADT/SmallPtrSet.h"
  82. #include "llvm/ADT/SmallSet.h"
  83. #include "llvm/ADT/SmallVector.h"
  84. #include "llvm/ADT/Statistic.h"
  85. #include "llvm/Analysis/LoopInfo.h"
  86. #include "llvm/Analysis/ScalarEvolution.h"
  87. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  88. #include "llvm/IR/BasicBlock.h"
  89. #include "llvm/IR/CFG.h"
  90. #include "llvm/IR/Dominators.h"
  91. #include "llvm/IR/Instruction.h"
  92. #include "llvm/IR/Instructions.h"
  93. #include "llvm/IR/IntrinsicInst.h"
  94. #include "llvm/IR/IntrinsicsPowerPC.h"
  95. #include "llvm/IR/Module.h"
  96. #include "llvm/IR/Type.h"
  97. #include "llvm/IR/Value.h"
  98. #include "llvm/InitializePasses.h"
  99. #include "llvm/Pass.h"
  100. #include "llvm/Support/Casting.h"
  101. #include "llvm/Support/CommandLine.h"
  102. #include "llvm/Support/Debug.h"
  103. #include "llvm/Transforms/Scalar.h"
  104. #include "llvm/Transforms/Utils.h"
  105. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  106. #include "llvm/Transforms/Utils/Local.h"
  107. #include "llvm/Transforms/Utils/LoopUtils.h"
  108. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  109. #include <cassert>
  110. #include <iterator>
  111. #include <utility>
  112. #define DEBUG_TYPE "ppc-loop-instr-form-prep"
  113. using namespace llvm;
  114. static cl::opt<unsigned>
  115. MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24),
  116. cl::ZeroOrMore,
  117. cl::desc("Potential common base number threshold per function "
  118. "for PPC loop prep"));
  119. static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
  120. cl::init(true), cl::Hidden,
  121. cl::desc("prefer update form when ds form is also a update form"));
  122. static cl::opt<bool> EnableUpdateFormForNonConstInc(
  123. "ppc-formprep-update-nonconst-inc", cl::init(false), cl::Hidden,
  124. cl::desc("prepare update form when the load/store increment is a loop "
  125. "invariant non-const value."));
  126. static cl::opt<bool> EnableChainCommoning(
  127. "ppc-formprep-chain-commoning", cl::init(false), cl::Hidden,
  128. cl::desc("Enable chain commoning in PPC loop prepare pass."));
  129. // Sum of following 3 per loop thresholds for all loops can not be larger
  130. // than MaxVarsPrep.
  131. // now the thresholds for each kind prep are exterimental values on Power9.
  132. static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
  133. cl::Hidden, cl::init(3),
  134. cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
  135. "form"));
  136. static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
  137. cl::Hidden, cl::init(3),
  138. cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));
  139. static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
  140. cl::Hidden, cl::init(8),
  141. cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
  142. // Commoning chain will reduce the register pressure, so we don't consider about
  143. // the PHI nodes number.
  144. // But commoning chain will increase the addi/add number in the loop and also
  145. // increase loop ILP. Maximum chain number should be same with hardware
  146. // IssueWidth, because we won't benefit from ILP if the parallel chains number
  147. // is bigger than IssueWidth. We assume there are 2 chains in one bucket, so
  148. // there would be 4 buckets at most on P9(IssueWidth is 8).
  149. static cl::opt<unsigned> MaxVarsChainCommon(
  150. "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
  151. cl::desc("Bucket number per loop for PPC loop chain common"));
  152. // If would not be profitable if the common base has only one load/store, ISEL
  153. // should already be able to choose best load/store form based on offset for
  154. // single load/store. Set minimal profitable value default to 2 and make it as
  155. // an option.
  156. static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
  157. cl::Hidden, cl::init(2),
  158. cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
  159. "preparation"));
  160. static cl::opt<unsigned> ChainCommonPrepMinThreshold(
  161. "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
  162. cl::desc("Minimal common base load/store instructions triggering chain "
  163. "commoning preparation. Must be not smaller than 4"));
  164. STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
  165. STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
  166. STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
  167. STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
  168. STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
  169. STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
  170. STATISTIC(ChainCommoningRewritten, "Num of commoning chains");
  171. namespace {
  172. struct BucketElement {
  173. BucketElement(const SCEV *O, Instruction *I) : Offset(O), Instr(I) {}
  174. BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
  175. const SCEV *Offset;
  176. Instruction *Instr;
  177. };
  178. struct Bucket {
  179. Bucket(const SCEV *B, Instruction *I)
  180. : BaseSCEV(B), Elements(1, BucketElement(I)) {
  181. ChainSize = 0;
  182. }
  183. // The base of the whole bucket.
  184. const SCEV *BaseSCEV;
  185. // All elements in the bucket. In the bucket, the element with the BaseSCEV
  186. // has no offset and all other elements are stored as offsets to the
  187. // BaseSCEV.
  188. SmallVector<BucketElement, 16> Elements;
  189. // The potential chains size. This is used for chain commoning only.
  190. unsigned ChainSize;
  191. // The base for each potential chain. This is used for chain commoning only.
  192. SmallVector<BucketElement, 16> ChainBases;
  193. };
  194. // "UpdateForm" is not a real PPC instruction form, it stands for dform
  195. // load/store with update like ldu/stdu, or Prefetch intrinsic.
  196. // For DS form instructions, their displacements must be multiple of 4.
  197. // For DQ form instructions, their displacements must be multiple of 16.
  198. enum PrepForm { UpdateForm = 1, DSForm = 4, DQForm = 16, ChainCommoning };
  199. class PPCLoopInstrFormPrep : public FunctionPass {
  200. public:
  201. static char ID; // Pass ID, replacement for typeid
  202. PPCLoopInstrFormPrep() : FunctionPass(ID) {
  203. initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
  204. }
  205. PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
  206. initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
  207. }
  208. void getAnalysisUsage(AnalysisUsage &AU) const override {
  209. AU.addPreserved<DominatorTreeWrapperPass>();
  210. AU.addRequired<LoopInfoWrapperPass>();
  211. AU.addPreserved<LoopInfoWrapperPass>();
  212. AU.addRequired<ScalarEvolutionWrapperPass>();
  213. }
  214. bool runOnFunction(Function &F) override;
  215. private:
  216. PPCTargetMachine *TM = nullptr;
  217. const PPCSubtarget *ST;
  218. DominatorTree *DT;
  219. LoopInfo *LI;
  220. ScalarEvolution *SE;
  221. bool PreserveLCSSA;
  222. bool HasCandidateForPrepare;
  223. /// Successful preparation number for Update/DS/DQ form in all inner most
  224. /// loops. One successful preparation will put one common base out of loop,
  225. /// this may leads to register presure like LICM does.
  226. /// Make sure total preparation number can be controlled by option.
  227. unsigned SuccPrepCount;
  228. bool runOnLoop(Loop *L);
  229. /// Check if required PHI node is already exist in Loop \p L.
  230. bool alreadyPrepared(Loop *L, Instruction *MemI,
  231. const SCEV *BasePtrStartSCEV,
  232. const SCEV *BasePtrIncSCEV, PrepForm Form);
  233. /// Get the value which defines the increment SCEV \p BasePtrIncSCEV.
  234. Value *getNodeForInc(Loop *L, Instruction *MemI,
  235. const SCEV *BasePtrIncSCEV);
  236. /// Common chains to reuse offsets for a loop to reduce register pressure.
  237. bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets);
  238. /// Find out the potential commoning chains and their bases.
  239. bool prepareBasesForCommoningChains(Bucket &BucketChain);
  240. /// Rewrite load/store according to the common chains.
  241. bool
  242. rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
  243. SmallSet<BasicBlock *, 16> &BBChanged);
  244. /// Collect condition matched(\p isValidCandidate() returns true)
  245. /// candidates in Loop \p L.
  246. SmallVector<Bucket, 16> collectCandidates(
  247. Loop *L,
  248. std::function<bool(const Instruction *, Value *, const Type *)>
  249. isValidCandidate,
  250. std::function<bool(const SCEV *)> isValidDiff,
  251. unsigned MaxCandidateNum);
  252. /// Add a candidate to candidates \p Buckets if diff between candidate and
  253. /// one base in \p Buckets matches \p isValidDiff.
  254. void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
  255. SmallVector<Bucket, 16> &Buckets,
  256. std::function<bool(const SCEV *)> isValidDiff,
  257. unsigned MaxCandidateNum);
  258. /// Prepare all candidates in \p Buckets for update form.
  259. bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets);
  260. /// Prepare all candidates in \p Buckets for displacement form, now for
  261. /// ds/dq.
  262. bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets, PrepForm Form);
  263. /// Prepare for one chain \p BucketChain, find the best base element and
  264. /// update all other elements in \p BucketChain accordingly.
  265. /// \p Form is used to find the best base element.
  266. /// If success, best base element must be stored as the first element of
  267. /// \p BucketChain.
  268. /// Return false if no base element found, otherwise return true.
  269. bool prepareBaseForDispFormChain(Bucket &BucketChain, PrepForm Form);
  270. /// Prepare for one chain \p BucketChain, find the best base element and
  271. /// update all other elements in \p BucketChain accordingly.
  272. /// If success, best base element must be stored as the first element of
  273. /// \p BucketChain.
  274. /// Return false if no base element found, otherwise return true.
  275. bool prepareBaseForUpdateFormChain(Bucket &BucketChain);
  276. /// Rewrite load/store instructions in \p BucketChain according to
  277. /// preparation.
  278. bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
  279. SmallSet<BasicBlock *, 16> &BBChanged,
  280. PrepForm Form);
  281. /// Rewrite for the base load/store of a chain.
  282. std::pair<Instruction *, Instruction *>
  283. rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
  284. Instruction *BaseMemI, bool CanPreInc, PrepForm Form,
  285. SCEVExpander &SCEVE, SmallPtrSet<Value *, 16> &DeletedPtrs);
  286. /// Rewrite for the other load/stores of a chain according to the new \p
  287. /// Base.
  288. Instruction *
  289. rewriteForBucketElement(std::pair<Instruction *, Instruction *> Base,
  290. const BucketElement &Element, Value *OffToBase,
  291. SmallPtrSet<Value *, 16> &DeletedPtrs);
  292. };
  293. } // end anonymous namespace
  294. char PPCLoopInstrFormPrep::ID = 0;
  295. static const char *name = "Prepare loop for ppc preferred instruction forms";
  296. INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
  297. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  298. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  299. INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
  300. static constexpr StringRef PHINodeNameSuffix = ".phi";
  301. static constexpr StringRef CastNodeNameSuffix = ".cast";
  302. static constexpr StringRef GEPNodeIncNameSuffix = ".inc";
  303. static constexpr StringRef GEPNodeOffNameSuffix = ".off";
  304. FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) {
  305. return new PPCLoopInstrFormPrep(TM);
  306. }
  307. static bool IsPtrInBounds(Value *BasePtr) {
  308. Value *StrippedBasePtr = BasePtr;
  309. while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
  310. StrippedBasePtr = BC->getOperand(0);
  311. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr))
  312. return GEP->isInBounds();
  313. return false;
  314. }
  315. static std::string getInstrName(const Value *I, StringRef Suffix) {
  316. assert(I && "Invalid paramater!");
  317. if (I->hasName())
  318. return (I->getName() + Suffix).str();
  319. else
  320. return "";
  321. }
  322. static Value *getPointerOperandAndType(Value *MemI,
  323. Type **PtrElementType = nullptr) {
  324. Value *PtrValue = nullptr;
  325. Type *PointerElementType = nullptr;
  326. if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
  327. PtrValue = LMemI->getPointerOperand();
  328. PointerElementType = LMemI->getType();
  329. } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
  330. PtrValue = SMemI->getPointerOperand();
  331. PointerElementType = SMemI->getValueOperand()->getType();
  332. } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
  333. PointerElementType = Type::getInt8Ty(MemI->getContext());
  334. if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
  335. IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
  336. PtrValue = IMemI->getArgOperand(0);
  337. } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
  338. PtrValue = IMemI->getArgOperand(1);
  339. }
  340. }
  341. /*Get ElementType if PtrElementType is not null.*/
  342. if (PtrElementType)
  343. *PtrElementType = PointerElementType;
  344. return PtrValue;
  345. }
  346. bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
  347. if (skipFunction(F))
  348. return false;
  349. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  350. SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  351. auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  352. DT = DTWP ? &DTWP->getDomTree() : nullptr;
  353. PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  354. ST = TM ? TM->getSubtargetImpl(F) : nullptr;
  355. SuccPrepCount = 0;
  356. bool MadeChange = false;
  357. for (Loop *I : *LI)
  358. for (Loop *L : depth_first(I))
  359. MadeChange |= runOnLoop(L);
  360. return MadeChange;
  361. }
  362. // Finding the minimal(chain_number + reusable_offset_number) is a complicated
  363. // algorithmic problem.
  364. // For now, the algorithm used here is simply adjusted to handle the case for
  365. // manually unrolling cases.
  366. // FIXME: use a more powerful algorithm to find minimal sum of chain_number and
  367. // reusable_offset_number for one base with multiple offsets.
  368. bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {
  369. // The minimal size for profitable chain commoning:
  370. // A1 = base + offset1
  371. // A2 = base + offset2 (offset2 - offset1 = X)
  372. // A3 = base + offset3
  373. // A4 = base + offset4 (offset4 - offset3 = X)
  374. // ======>
  375. // base1 = base + offset1
  376. // base2 = base + offset3
  377. // A1 = base1
  378. // A2 = base1 + X
  379. // A3 = base2
  380. // A4 = base2 + X
  381. //
  382. // There is benefit because of reuse of offest 'X'.
  383. assert(ChainCommonPrepMinThreshold >= 4 &&
  384. "Thredhold can not be smaller than 4!\n");
  385. if (CBucket.Elements.size() < ChainCommonPrepMinThreshold)
  386. return false;
  387. // We simply select the FirstOffset as the first reusable offset between each
  388. // chain element 1 and element 0.
  389. const SCEV *FirstOffset = CBucket.Elements[1].Offset;
  390. // Figure out how many times above FirstOffset is used in the chain.
  391. // For a success commoning chain candidate, offset difference between each
  392. // chain element 1 and element 0 must be also FirstOffset.
  393. unsigned FirstOffsetReusedCount = 1;
  394. // Figure out how many times above FirstOffset is used in the first chain.
  395. // Chain number is FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain
  396. unsigned FirstOffsetReusedCountInFirstChain = 1;
  397. unsigned EleNum = CBucket.Elements.size();
  398. bool SawChainSeparater = false;
  399. for (unsigned j = 2; j != EleNum; ++j) {
  400. if (SE->getMinusSCEV(CBucket.Elements[j].Offset,
  401. CBucket.Elements[j - 1].Offset) == FirstOffset) {
  402. if (!SawChainSeparater)
  403. FirstOffsetReusedCountInFirstChain++;
  404. FirstOffsetReusedCount++;
  405. } else
  406. // For now, if we meet any offset which is not FirstOffset, we assume we
  407. // find a new Chain.
  408. // This makes us miss some opportunities.
  409. // For example, we can common:
  410. //
  411. // {OffsetA, Offset A, OffsetB, OffsetA, OffsetA, OffsetB}
  412. //
  413. // as two chains:
  414. // {{OffsetA, Offset A, OffsetB}, {OffsetA, OffsetA, OffsetB}}
  415. // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2
  416. //
  417. // But we fail to common:
  418. //
  419. // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA}
  420. // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1
  421. SawChainSeparater = true;
  422. }
  423. // FirstOffset is not reused, skip this bucket.
  424. if (FirstOffsetReusedCount == 1)
  425. return false;
  426. unsigned ChainNum =
  427. FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
  428. // All elements are increased by FirstOffset.
  429. // The number of chains should be sqrt(EleNum).
  430. if (!SawChainSeparater)
  431. ChainNum = (unsigned)sqrt((double)EleNum);
  432. CBucket.ChainSize = (unsigned)(EleNum / ChainNum);
  433. // If this is not a perfect chain(eg: not all elements can be put inside
  434. // commoning chains.), skip now.
  435. if (CBucket.ChainSize * ChainNum != EleNum)
  436. return false;
  437. if (SawChainSeparater) {
  438. // Check that the offset seqs are the same for all chains.
  439. for (unsigned i = 1; i < CBucket.ChainSize; i++)
  440. for (unsigned j = 1; j < ChainNum; j++)
  441. if (CBucket.Elements[i].Offset !=
  442. SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset,
  443. CBucket.Elements[j * CBucket.ChainSize].Offset))
  444. return false;
  445. }
  446. for (unsigned i = 0; i < ChainNum; i++)
  447. CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]);
  448. LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n");
  449. return true;
  450. }
  451. bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
  452. SmallVector<Bucket, 16> &Buckets) {
  453. bool MadeChange = false;
  454. if (Buckets.empty())
  455. return MadeChange;
  456. SmallSet<BasicBlock *, 16> BBChanged;
  457. for (auto &Bucket : Buckets) {
  458. if (prepareBasesForCommoningChains(Bucket))
  459. MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged);
  460. }
  461. if (MadeChange)
  462. for (auto *BB : BBChanged)
  463. DeleteDeadPHIs(BB);
  464. return MadeChange;
  465. }
  466. bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
  467. Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
  468. bool MadeChange = false;
  469. assert(Bucket.Elements.size() ==
  470. Bucket.ChainBases.size() * Bucket.ChainSize &&
  471. "invalid bucket for chain commoning!\n");
  472. SmallPtrSet<Value *, 16> DeletedPtrs;
  473. BasicBlock *Header = L->getHeader();
  474. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  475. SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
  476. "loopprepare-chaincommon");
  477. for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) {
  478. unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx;
  479. const SCEV *BaseSCEV =
  480. ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV,
  481. Bucket.Elements[BaseElemIdx].Offset)
  482. : Bucket.BaseSCEV;
  483. const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
  484. // Make sure the base is able to expand.
  485. if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
  486. return MadeChange;
  487. assert(BasePtrSCEV->isAffine() &&
  488. "Invalid SCEV type for the base ptr for a candidate chain!\n");
  489. std::pair<Instruction *, Instruction *> Base = rewriteForBase(
  490. L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr,
  491. false /* CanPreInc */, ChainCommoning, SCEVE, DeletedPtrs);
  492. if (!Base.first || !Base.second)
  493. return MadeChange;
  494. // Keep track of the replacement pointer values we've inserted so that we
  495. // don't generate more pointer values than necessary.
  496. SmallPtrSet<Value *, 16> NewPtrs;
  497. NewPtrs.insert(Base.first);
  498. for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize;
  499. ++Idx) {
  500. BucketElement &I = Bucket.Elements[Idx];
  501. Value *Ptr = getPointerOperandAndType(I.Instr);
  502. assert(Ptr && "No pointer operand");
  503. if (NewPtrs.count(Ptr))
  504. continue;
  505. const SCEV *OffsetSCEV =
  506. BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset,
  507. Bucket.Elements[BaseElemIdx].Offset)
  508. : Bucket.Elements[Idx].Offset;
  509. // Make sure offset is able to expand. Only need to check one time as the
  510. // offsets are reused between different chains.
  511. if (!BaseElemIdx)
  512. if (!isSafeToExpand(OffsetSCEV, *SE))
  513. return false;
  514. Value *OffsetValue = SCEVE.expandCodeFor(
  515. OffsetSCEV, OffsetSCEV->getType(), LoopPredecessor->getTerminator());
  516. Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx],
  517. OffsetValue, DeletedPtrs);
  518. assert(NewPtr && "Wrong rewrite!\n");
  519. NewPtrs.insert(NewPtr);
  520. }
  521. ++ChainCommoningRewritten;
  522. }
  523. // Clear the rewriter cache, because values that are in the rewriter's cache
  524. // can be deleted below, causing the AssertingVH in the cache to trigger.
  525. SCEVE.clear();
  526. for (auto *Ptr : DeletedPtrs) {
  527. if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
  528. BBChanged.insert(IDel->getParent());
  529. RecursivelyDeleteTriviallyDeadInstructions(Ptr);
  530. }
  531. MadeChange = true;
  532. return MadeChange;
  533. }
  534. // Rewrite the new base according to BasePtrSCEV.
  535. // bb.loop.preheader:
  536. // %newstart = ...
  537. // bb.loop.body:
  538. // %phinode = phi [ %newstart, %bb.loop.preheader ], [ %add, %bb.loop.body ]
  539. // ...
  540. // %add = getelementptr %phinode, %inc
  541. //
  542. // First returned instruciton is %phinode (or a type cast to %phinode), caller
  543. // needs this value to rewrite other load/stores in the same chain.
  544. // Second returned instruction is %add, caller needs this value to rewrite other
  545. // load/stores in the same chain.
  546. std::pair<Instruction *, Instruction *>
  547. PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
  548. Instruction *BaseMemI, bool CanPreInc,
  549. PrepForm Form, SCEVExpander &SCEVE,
  550. SmallPtrSet<Value *, 16> &DeletedPtrs) {
  551. LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
  552. assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
  553. Value *BasePtr = getPointerOperandAndType(BaseMemI);
  554. assert(BasePtr && "No pointer operand");
  555. Type *I8Ty = Type::getInt8Ty(BaseMemI->getParent()->getContext());
  556. Type *I8PtrTy =
  557. Type::getInt8PtrTy(BaseMemI->getParent()->getContext(),
  558. BasePtr->getType()->getPointerAddressSpace());
  559. bool IsConstantInc = false;
  560. const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE);
  561. Value *IncNode = getNodeForInc(L, BaseMemI, BasePtrIncSCEV);
  562. const SCEVConstant *BasePtrIncConstantSCEV =
  563. dyn_cast<SCEVConstant>(BasePtrIncSCEV);
  564. if (BasePtrIncConstantSCEV)
  565. IsConstantInc = true;
  566. // No valid representation for the increment.
  567. if (!IncNode) {
  568. LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n");
  569. return std::make_pair(nullptr, nullptr);
  570. }
  571. if (Form == UpdateForm && !IsConstantInc && !EnableUpdateFormForNonConstInc) {
  572. LLVM_DEBUG(
  573. dbgs()
  574. << "Update form prepare for non-const increment is not enabled!\n");
  575. return std::make_pair(nullptr, nullptr);
  576. }
  577. const SCEV *BasePtrStartSCEV = nullptr;
  578. if (CanPreInc) {
  579. assert(SE->isLoopInvariant(BasePtrIncSCEV, L) &&
  580. "Increment is not loop invariant!\n");
  581. BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(),
  582. IsConstantInc ? BasePtrIncConstantSCEV
  583. : BasePtrIncSCEV);
  584. } else
  585. BasePtrStartSCEV = BasePtrSCEV->getStart();
  586. if (alreadyPrepared(L, BaseMemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) {
  587. LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n");
  588. return std::make_pair(nullptr, nullptr);
  589. }
  590. LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
  591. BasicBlock *Header = L->getHeader();
  592. unsigned HeaderLoopPredCount = pred_size(Header);
  593. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  594. PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
  595. getInstrName(BaseMemI, PHINodeNameSuffix),
  596. Header->getFirstNonPHI());
  597. Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
  598. LoopPredecessor->getTerminator());
  599. // Note that LoopPredecessor might occur in the predecessor list multiple
  600. // times, and we need to add it the right number of times.
  601. for (auto PI : predecessors(Header)) {
  602. if (PI != LoopPredecessor)
  603. continue;
  604. NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
  605. }
  606. Instruction *PtrInc = nullptr;
  607. Instruction *NewBasePtr = nullptr;
  608. if (CanPreInc) {
  609. Instruction *InsPoint = &*Header->getFirstInsertionPt();
  610. PtrInc = GetElementPtrInst::Create(
  611. I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
  612. InsPoint);
  613. cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
  614. for (auto PI : predecessors(Header)) {
  615. if (PI == LoopPredecessor)
  616. continue;
  617. NewPHI->addIncoming(PtrInc, PI);
  618. }
  619. if (PtrInc->getType() != BasePtr->getType())
  620. NewBasePtr =
  621. new BitCastInst(PtrInc, BasePtr->getType(),
  622. getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
  623. else
  624. NewBasePtr = PtrInc;
  625. } else {
  626. // Note that LoopPredecessor might occur in the predecessor list multiple
  627. // times, and we need to make sure no more incoming value for them in PHI.
  628. for (auto PI : predecessors(Header)) {
  629. if (PI == LoopPredecessor)
  630. continue;
  631. // For the latch predecessor, we need to insert a GEP just before the
  632. // terminator to increase the address.
  633. BasicBlock *BB = PI;
  634. Instruction *InsPoint = BB->getTerminator();
  635. PtrInc = GetElementPtrInst::Create(
  636. I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
  637. InsPoint);
  638. cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
  639. NewPHI->addIncoming(PtrInc, PI);
  640. }
  641. PtrInc = NewPHI;
  642. if (NewPHI->getType() != BasePtr->getType())
  643. NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(),
  644. getInstrName(NewPHI, CastNodeNameSuffix),
  645. &*Header->getFirstInsertionPt());
  646. else
  647. NewBasePtr = NewPHI;
  648. }
  649. BasePtr->replaceAllUsesWith(NewBasePtr);
  650. DeletedPtrs.insert(BasePtr);
  651. return std::make_pair(NewBasePtr, PtrInc);
  652. }
  653. Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
  654. std::pair<Instruction *, Instruction *> Base, const BucketElement &Element,
  655. Value *OffToBase, SmallPtrSet<Value *, 16> &DeletedPtrs) {
  656. Instruction *NewBasePtr = Base.first;
  657. Instruction *PtrInc = Base.second;
  658. assert((NewBasePtr && PtrInc) && "base does not exist!\n");
  659. Type *I8Ty = Type::getInt8Ty(PtrInc->getParent()->getContext());
  660. Value *Ptr = getPointerOperandAndType(Element.Instr);
  661. assert(Ptr && "No pointer operand");
  662. Instruction *RealNewPtr;
  663. if (!Element.Offset ||
  664. (isa<SCEVConstant>(Element.Offset) &&
  665. cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) {
  666. RealNewPtr = NewBasePtr;
  667. } else {
  668. Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
  669. if (PtrIP && isa<Instruction>(NewBasePtr) &&
  670. cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
  671. PtrIP = nullptr;
  672. else if (PtrIP && isa<PHINode>(PtrIP))
  673. PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
  674. else if (!PtrIP)
  675. PtrIP = Element.Instr;
  676. assert(OffToBase && "There should be an offset for non base element!\n");
  677. GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
  678. I8Ty, PtrInc, OffToBase,
  679. getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP);
  680. if (!PtrIP)
  681. NewPtr->insertAfter(cast<Instruction>(PtrInc));
  682. NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
  683. RealNewPtr = NewPtr;
  684. }
  685. Instruction *ReplNewPtr;
  686. if (Ptr->getType() != RealNewPtr->getType()) {
  687. ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
  688. getInstrName(Ptr, CastNodeNameSuffix));
  689. ReplNewPtr->insertAfter(RealNewPtr);
  690. } else
  691. ReplNewPtr = RealNewPtr;
  692. Ptr->replaceAllUsesWith(ReplNewPtr);
  693. DeletedPtrs.insert(Ptr);
  694. return ReplNewPtr;
  695. }
  696. void PPCLoopInstrFormPrep::addOneCandidate(
  697. Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets,
  698. std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
  699. assert((MemI && getPointerOperandAndType(MemI)) &&
  700. "Candidate should be a memory instruction.");
  701. assert(LSCEV && "Invalid SCEV for Ptr value.");
  702. bool FoundBucket = false;
  703. for (auto &B : Buckets) {
  704. if (cast<SCEVAddRecExpr>(B.BaseSCEV)->getStepRecurrence(*SE) !=
  705. cast<SCEVAddRecExpr>(LSCEV)->getStepRecurrence(*SE))
  706. continue;
  707. const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
  708. if (isValidDiff(Diff)) {
  709. B.Elements.push_back(BucketElement(Diff, MemI));
  710. FoundBucket = true;
  711. break;
  712. }
  713. }
  714. if (!FoundBucket) {
  715. if (Buckets.size() == MaxCandidateNum) {
  716. LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit "
  717. << MaxCandidateNum << "\n");
  718. return;
  719. }
  720. Buckets.push_back(Bucket(LSCEV, MemI));
  721. }
  722. }
  723. SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
  724. Loop *L,
  725. std::function<bool(const Instruction *, Value *, const Type *)>
  726. isValidCandidate,
  727. std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
  728. SmallVector<Bucket, 16> Buckets;
  729. for (const auto &BB : L->blocks())
  730. for (auto &J : *BB) {
  731. Value *PtrValue = nullptr;
  732. Type *PointerElementType = nullptr;
  733. PtrValue = getPointerOperandAndType(&J, &PointerElementType);
  734. if (!PtrValue)
  735. continue;
  736. if (PtrValue->getType()->getPointerAddressSpace())
  737. continue;
  738. if (L->isLoopInvariant(PtrValue))
  739. continue;
  740. const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
  741. const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
  742. if (!LARSCEV || LARSCEV->getLoop() != L)
  743. continue;
  744. // Mark that we have candidates for preparing.
  745. HasCandidateForPrepare = true;
  746. if (isValidCandidate(&J, PtrValue, PointerElementType))
  747. addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum);
  748. }
  749. return Buckets;
  750. }
  751. bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
  752. PrepForm Form) {
  753. // RemainderOffsetInfo details:
  754. // key: value of (Offset urem DispConstraint). For DSForm, it can
  755. // be [0, 4).
  756. // first of pair: the index of first BucketElement whose remainder is equal
  757. // to key. For key 0, this value must be 0.
  758. // second of pair: number of load/stores with the same remainder.
  759. DenseMap<unsigned, std::pair<unsigned, unsigned>> RemainderOffsetInfo;
  760. for (unsigned j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
  761. if (!BucketChain.Elements[j].Offset)
  762. RemainderOffsetInfo[0] = std::make_pair(0, 1);
  763. else {
  764. unsigned Remainder = cast<SCEVConstant>(BucketChain.Elements[j].Offset)
  765. ->getAPInt()
  766. .urem(Form);
  767. if (RemainderOffsetInfo.find(Remainder) == RemainderOffsetInfo.end())
  768. RemainderOffsetInfo[Remainder] = std::make_pair(j, 1);
  769. else
  770. RemainderOffsetInfo[Remainder].second++;
  771. }
  772. }
  773. // Currently we choose the most profitable base as the one which has the max
  774. // number of load/store with same remainder.
  775. // FIXME: adjust the base selection strategy according to load/store offset
  776. // distribution.
  777. // For example, if we have one candidate chain for DS form preparation, which
  778. // contains following load/stores with different remainders:
  779. // 1: 10 load/store whose remainder is 1;
  780. // 2: 9 load/store whose remainder is 2;
  781. // 3: 1 for remainder 3 and 0 for remainder 0;
  782. // Now we will choose the first load/store whose remainder is 1 as base and
  783. // adjust all other load/stores according to new base, so we will get 10 DS
  784. // form and 10 X form.
  785. // But we should be more clever, for this case we could use two bases, one for
  786. // remainder 1 and the other for remainder 2, thus we could get 19 DS form and
  787. // 1 X form.
  788. unsigned MaxCountRemainder = 0;
  789. for (unsigned j = 0; j < (unsigned)Form; j++)
  790. if ((RemainderOffsetInfo.find(j) != RemainderOffsetInfo.end()) &&
  791. RemainderOffsetInfo[j].second >
  792. RemainderOffsetInfo[MaxCountRemainder].second)
  793. MaxCountRemainder = j;
  794. // Abort when there are too few insts with common base.
  795. if (RemainderOffsetInfo[MaxCountRemainder].second < DispFormPrepMinThreshold)
  796. return false;
  797. // If the first value is most profitable, no needed to adjust BucketChain
  798. // elements as they are substracted the first value when collecting.
  799. if (MaxCountRemainder == 0)
  800. return true;
  801. // Adjust load/store to the new chosen base.
  802. const SCEV *Offset =
  803. BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first].Offset;
  804. BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
  805. for (auto &E : BucketChain.Elements) {
  806. if (E.Offset)
  807. E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
  808. else
  809. E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
  810. }
  811. std::swap(BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first],
  812. BucketChain.Elements[0]);
  813. return true;
  814. }
  815. // FIXME: implement a more clever base choosing policy.
  816. // Currently we always choose an exist load/store offset. This maybe lead to
  817. // suboptimal code sequences. For example, for one DS chain with offsets
  818. // {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp
  819. // for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a
  820. // multipler of 4, it cannot be represented by sint16.
  821. bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
  822. // We have a choice now of which instruction's memory operand we use as the
  823. // base for the generated PHI. Always picking the first instruction in each
  824. // bucket does not work well, specifically because that instruction might
  825. // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
  826. // the choice is somewhat arbitrary, because the backend will happily
  827. // generate direct offsets from both the pre-incremented and
  828. // post-incremented pointer values. Thus, we'll pick the first non-prefetch
  829. // instruction in each bucket, and adjust the recurrence and other offsets
  830. // accordingly.
  831. for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
  832. if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr))
  833. if (II->getIntrinsicID() == Intrinsic::prefetch)
  834. continue;
  835. // If we'd otherwise pick the first element anyway, there's nothing to do.
  836. if (j == 0)
  837. break;
  838. // If our chosen element has no offset from the base pointer, there's
  839. // nothing to do.
  840. if (!BucketChain.Elements[j].Offset ||
  841. cast<SCEVConstant>(BucketChain.Elements[j].Offset)->isZero())
  842. break;
  843. const SCEV *Offset = BucketChain.Elements[j].Offset;
  844. BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
  845. for (auto &E : BucketChain.Elements) {
  846. if (E.Offset)
  847. E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
  848. else
  849. E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
  850. }
  851. std::swap(BucketChain.Elements[j], BucketChain.Elements[0]);
  852. break;
  853. }
  854. return true;
  855. }
  856. bool PPCLoopInstrFormPrep::rewriteLoadStores(
  857. Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged,
  858. PrepForm Form) {
  859. bool MadeChange = false;
  860. const SCEVAddRecExpr *BasePtrSCEV =
  861. cast<SCEVAddRecExpr>(BucketChain.BaseSCEV);
  862. if (!BasePtrSCEV->isAffine())
  863. return MadeChange;
  864. if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
  865. return MadeChange;
  866. SmallPtrSet<Value *, 16> DeletedPtrs;
  867. BasicBlock *Header = L->getHeader();
  868. SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
  869. "loopprepare-formrewrite");
  870. // For some DS form load/store instructions, it can also be an update form,
  871. // if the stride is constant and is a multipler of 4. Use update form if
  872. // prefer it.
  873. bool CanPreInc = (Form == UpdateForm ||
  874. ((Form == DSForm) &&
  875. isa<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)) &&
  876. !cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE))
  877. ->getAPInt()
  878. .urem(4) &&
  879. PreferUpdateForm));
  880. std::pair<Instruction *, Instruction *> Base =
  881. rewriteForBase(L, BasePtrSCEV, BucketChain.Elements.begin()->Instr,
  882. CanPreInc, Form, SCEVE, DeletedPtrs);
  883. if (!Base.first || !Base.second)
  884. return MadeChange;
  885. // Keep track of the replacement pointer values we've inserted so that we
  886. // don't generate more pointer values than necessary.
  887. SmallPtrSet<Value *, 16> NewPtrs;
  888. NewPtrs.insert(Base.first);
  889. for (auto I = std::next(BucketChain.Elements.begin()),
  890. IE = BucketChain.Elements.end(); I != IE; ++I) {
  891. Value *Ptr = getPointerOperandAndType(I->Instr);
  892. assert(Ptr && "No pointer operand");
  893. if (NewPtrs.count(Ptr))
  894. continue;
  895. Instruction *NewPtr = rewriteForBucketElement(
  896. Base, *I,
  897. I->Offset ? cast<SCEVConstant>(I->Offset)->getValue() : nullptr,
  898. DeletedPtrs);
  899. assert(NewPtr && "wrong rewrite!\n");
  900. NewPtrs.insert(NewPtr);
  901. }
  902. // Clear the rewriter cache, because values that are in the rewriter's cache
  903. // can be deleted below, causing the AssertingVH in the cache to trigger.
  904. SCEVE.clear();
  905. for (auto *Ptr : DeletedPtrs) {
  906. if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
  907. BBChanged.insert(IDel->getParent());
  908. RecursivelyDeleteTriviallyDeadInstructions(Ptr);
  909. }
  910. MadeChange = true;
  911. SuccPrepCount++;
  912. if (Form == DSForm && !CanPreInc)
  913. DSFormChainRewritten++;
  914. else if (Form == DQForm)
  915. DQFormChainRewritten++;
  916. else if (Form == UpdateForm || (Form == DSForm && CanPreInc))
  917. UpdFormChainRewritten++;
  918. return MadeChange;
  919. }
  920. bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
  921. SmallVector<Bucket, 16> &Buckets) {
  922. bool MadeChange = false;
  923. if (Buckets.empty())
  924. return MadeChange;
  925. SmallSet<BasicBlock *, 16> BBChanged;
  926. for (auto &Bucket : Buckets)
  927. // The base address of each bucket is transformed into a phi and the others
  928. // are rewritten based on new base.
  929. if (prepareBaseForUpdateFormChain(Bucket))
  930. MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, UpdateForm);
  931. if (MadeChange)
  932. for (auto *BB : BBChanged)
  933. DeleteDeadPHIs(BB);
  934. return MadeChange;
  935. }
  936. bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L,
  937. SmallVector<Bucket, 16> &Buckets,
  938. PrepForm Form) {
  939. bool MadeChange = false;
  940. if (Buckets.empty())
  941. return MadeChange;
  942. SmallSet<BasicBlock *, 16> BBChanged;
  943. for (auto &Bucket : Buckets) {
  944. if (Bucket.Elements.size() < DispFormPrepMinThreshold)
  945. continue;
  946. if (prepareBaseForDispFormChain(Bucket, Form))
  947. MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, Form);
  948. }
  949. if (MadeChange)
  950. for (auto *BB : BBChanged)
  951. DeleteDeadPHIs(BB);
  952. return MadeChange;
  953. }
  954. // Find the loop invariant increment node for SCEV BasePtrIncSCEV.
  955. // bb.loop.preheader:
  956. // %start = ...
  957. // bb.loop.body:
  958. // %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ]
  959. // ...
  960. // %add = add %phinode, %inc ; %inc is what we want to get.
  961. //
  962. Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI,
  963. const SCEV *BasePtrIncSCEV) {
  964. // If the increment is a constant, no definition is needed.
  965. // Return the value directly.
  966. if (isa<SCEVConstant>(BasePtrIncSCEV))
  967. return cast<SCEVConstant>(BasePtrIncSCEV)->getValue();
  968. if (!SE->isLoopInvariant(BasePtrIncSCEV, L))
  969. return nullptr;
  970. BasicBlock *BB = MemI->getParent();
  971. if (!BB)
  972. return nullptr;
  973. BasicBlock *LatchBB = L->getLoopLatch();
  974. if (!LatchBB)
  975. return nullptr;
  976. // Run through the PHIs and check their operands to find valid representation
  977. // for the increment SCEV.
  978. iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
  979. for (auto &CurrentPHI : PHIIter) {
  980. PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
  981. if (!CurrentPHINode)
  982. continue;
  983. if (!SE->isSCEVable(CurrentPHINode->getType()))
  984. continue;
  985. const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
  986. const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
  987. if (!PHIBasePtrSCEV)
  988. continue;
  989. const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE);
  990. if (!PHIBasePtrIncSCEV || (PHIBasePtrIncSCEV != BasePtrIncSCEV))
  991. continue;
  992. // Get the incoming value from the loop latch and check if the value has
  993. // the add form with the required increment.
  994. if (Instruction *I = dyn_cast<Instruction>(
  995. CurrentPHINode->getIncomingValueForBlock(LatchBB))) {
  996. Value *StrippedBaseI = I;
  997. while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBaseI))
  998. StrippedBaseI = BC->getOperand(0);
  999. Instruction *StrippedI = dyn_cast<Instruction>(StrippedBaseI);
  1000. if (!StrippedI)
  1001. continue;
  1002. // LSR pass may add a getelementptr instruction to do the loop increment,
  1003. // also search in that getelementptr instruction.
  1004. if (StrippedI->getOpcode() == Instruction::Add ||
  1005. (StrippedI->getOpcode() == Instruction::GetElementPtr &&
  1006. StrippedI->getNumOperands() == 2)) {
  1007. if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV)
  1008. return StrippedI->getOperand(0);
  1009. if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV)
  1010. return StrippedI->getOperand(1);
  1011. }
  1012. }
  1013. }
  1014. return nullptr;
  1015. }
  1016. // In order to prepare for the preferred instruction form, a PHI is added.
  1017. // This function will check to see if that PHI already exists and will return
  1018. // true if it found an existing PHI with the matched start and increment as the
  1019. // one we wanted to create.
  1020. bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI,
  1021. const SCEV *BasePtrStartSCEV,
  1022. const SCEV *BasePtrIncSCEV,
  1023. PrepForm Form) {
  1024. BasicBlock *BB = MemI->getParent();
  1025. if (!BB)
  1026. return false;
  1027. BasicBlock *PredBB = L->getLoopPredecessor();
  1028. BasicBlock *LatchBB = L->getLoopLatch();
  1029. if (!PredBB || !LatchBB)
  1030. return false;
  1031. // Run through the PHIs and see if we have some that looks like a preparation
  1032. iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
  1033. for (auto & CurrentPHI : PHIIter) {
  1034. PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
  1035. if (!CurrentPHINode)
  1036. continue;
  1037. if (!SE->isSCEVable(CurrentPHINode->getType()))
  1038. continue;
  1039. const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
  1040. const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
  1041. if (!PHIBasePtrSCEV)
  1042. continue;
  1043. const SCEVConstant *PHIBasePtrIncSCEV =
  1044. dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE));
  1045. if (!PHIBasePtrIncSCEV)
  1046. continue;
  1047. if (CurrentPHINode->getNumIncomingValues() == 2) {
  1048. if ((CurrentPHINode->getIncomingBlock(0) == LatchBB &&
  1049. CurrentPHINode->getIncomingBlock(1) == PredBB) ||
  1050. (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
  1051. CurrentPHINode->getIncomingBlock(0) == PredBB)) {
  1052. if (PHIBasePtrIncSCEV == BasePtrIncSCEV) {
  1053. // The existing PHI (CurrentPHINode) has the same start and increment
  1054. // as the PHI that we wanted to create.
  1055. if ((Form == UpdateForm || Form == ChainCommoning ) &&
  1056. PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) {
  1057. ++PHINodeAlreadyExistsUpdate;
  1058. return true;
  1059. }
  1060. if (Form == DSForm || Form == DQForm) {
  1061. const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
  1062. SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV));
  1063. if (Diff && !Diff->getAPInt().urem(Form)) {
  1064. if (Form == DSForm)
  1065. ++PHINodeAlreadyExistsDS;
  1066. else
  1067. ++PHINodeAlreadyExistsDQ;
  1068. return true;
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. }
  1075. return false;
  1076. }
  1077. bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
  1078. bool MadeChange = false;
  1079. // Only prep. the inner-most loop
  1080. if (!L->isInnermost())
  1081. return MadeChange;
  1082. // Return if already done enough preparation.
  1083. if (SuccPrepCount >= MaxVarsPrep)
  1084. return MadeChange;
  1085. LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
  1086. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  1087. // If there is no loop predecessor, or the loop predecessor's terminator
  1088. // returns a value (which might contribute to determining the loop's
  1089. // iteration space), insert a new preheader for the loop.
  1090. if (!LoopPredecessor ||
  1091. !LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
  1092. LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
  1093. if (LoopPredecessor)
  1094. MadeChange = true;
  1095. }
  1096. if (!LoopPredecessor) {
  1097. LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n");
  1098. return MadeChange;
  1099. }
  1100. // Check if a load/store has update form. This lambda is used by function
  1101. // collectCandidates which can collect candidates for types defined by lambda.
  1102. auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue,
  1103. const Type *PointerElementType) {
  1104. assert((PtrValue && I) && "Invalid parameter!");
  1105. // There are no update forms for Altivec vector load/stores.
  1106. if (ST && ST->hasAltivec() && PointerElementType->isVectorTy())
  1107. return false;
  1108. // There are no update forms for P10 lxvp/stxvp intrinsic.
  1109. auto *II = dyn_cast<IntrinsicInst>(I);
  1110. if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) ||
  1111. II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp))
  1112. return false;
  1113. // See getPreIndexedAddressParts, the displacement for LDU/STDU has to
  1114. // be 4's multiple (DS-form). For i64 loads/stores when the displacement
  1115. // fits in a 16-bit signed field but isn't a multiple of 4, it will be
  1116. // useless and possible to break some original well-form addressing mode
  1117. // to make this pre-inc prep for it.
  1118. if (PointerElementType->isIntegerTy(64)) {
  1119. const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
  1120. const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
  1121. if (!LARSCEV || LARSCEV->getLoop() != L)
  1122. return false;
  1123. if (const SCEVConstant *StepConst =
  1124. dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
  1125. const APInt &ConstInt = StepConst->getValue()->getValue();
  1126. if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
  1127. return false;
  1128. }
  1129. }
  1130. return true;
  1131. };
  1132. // Check if a load/store has DS form.
  1133. auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue,
  1134. const Type *PointerElementType) {
  1135. assert((PtrValue && I) && "Invalid parameter!");
  1136. if (isa<IntrinsicInst>(I))
  1137. return false;
  1138. return (PointerElementType->isIntegerTy(64)) ||
  1139. (PointerElementType->isFloatTy()) ||
  1140. (PointerElementType->isDoubleTy()) ||
  1141. (PointerElementType->isIntegerTy(32) &&
  1142. llvm::any_of(I->users(),
  1143. [](const User *U) { return isa<SExtInst>(U); }));
  1144. };
  1145. // Check if a load/store has DQ form.
  1146. auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue,
  1147. const Type *PointerElementType) {
  1148. assert((PtrValue && I) && "Invalid parameter!");
  1149. // Check if it is a P10 lxvp/stxvp intrinsic.
  1150. auto *II = dyn_cast<IntrinsicInst>(I);
  1151. if (II)
  1152. return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp ||
  1153. II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp;
  1154. // Check if it is a P9 vector load/store.
  1155. return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
  1156. };
  1157. // Check if a load/store is candidate for chain commoning.
  1158. // If the SCEV is only with one ptr operand in its start, we can use that
  1159. // start as a chain separator. Mark this load/store as a candidate.
  1160. auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue,
  1161. const Type *PointerElementType) {
  1162. const SCEVAddRecExpr *ARSCEV =
  1163. cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L));
  1164. if (!ARSCEV)
  1165. return false;
  1166. if (!ARSCEV->isAffine())
  1167. return false;
  1168. const SCEV *Start = ARSCEV->getStart();
  1169. // A single pointer. We can treat it as offset 0.
  1170. if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy())
  1171. return true;
  1172. const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start);
  1173. // We need a SCEVAddExpr to include both base and offset.
  1174. if (!ASCEV)
  1175. return false;
  1176. // Make sure there is only one pointer operand(base) and all other operands
  1177. // are integer type.
  1178. bool SawPointer = false;
  1179. for (const SCEV *Op : ASCEV->operands()) {
  1180. if (Op->getType()->isPointerTy()) {
  1181. if (SawPointer)
  1182. return false;
  1183. SawPointer = true;
  1184. } else if (!Op->getType()->isIntegerTy())
  1185. return false;
  1186. }
  1187. return SawPointer;
  1188. };
  1189. // Check if the diff is a constant type. This is used for update/DS/DQ form
  1190. // preparation.
  1191. auto isValidConstantDiff = [](const SCEV *Diff) {
  1192. return dyn_cast<SCEVConstant>(Diff) != nullptr;
  1193. };
  1194. // Make sure the diff between the base and new candidate is required type.
  1195. // This is used for chain commoning preparation.
  1196. auto isValidChainCommoningDiff = [](const SCEV *Diff) {
  1197. assert(Diff && "Invalid Diff!\n");
  1198. // Don't mess up previous dform prepare.
  1199. if (isa<SCEVConstant>(Diff))
  1200. return false;
  1201. // A single integer type offset.
  1202. if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy())
  1203. return true;
  1204. const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff);
  1205. if (!ADiff)
  1206. return false;
  1207. for (const SCEV *Op : ADiff->operands())
  1208. if (!Op->getType()->isIntegerTy())
  1209. return false;
  1210. return true;
  1211. };
  1212. HasCandidateForPrepare = false;
  1213. LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n");
  1214. // Collect buckets of comparable addresses used by loads and stores for update
  1215. // form.
  1216. SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates(
  1217. L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm);
  1218. // Prepare for update form.
  1219. if (!UpdateFormBuckets.empty())
  1220. MadeChange |= updateFormPrep(L, UpdateFormBuckets);
  1221. else if (!HasCandidateForPrepare) {
  1222. LLVM_DEBUG(
  1223. dbgs()
  1224. << "No prepare candidates found, stop praparation for current loop!\n");
  1225. // If no candidate for preparing, return early.
  1226. return MadeChange;
  1227. }
  1228. LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n");
  1229. // Collect buckets of comparable addresses used by loads and stores for DS
  1230. // form.
  1231. SmallVector<Bucket, 16> DSFormBuckets = collectCandidates(
  1232. L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm);
  1233. // Prepare for DS form.
  1234. if (!DSFormBuckets.empty())
  1235. MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm);
  1236. LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n");
  1237. // Collect buckets of comparable addresses used by loads and stores for DQ
  1238. // form.
  1239. SmallVector<Bucket, 16> DQFormBuckets = collectCandidates(
  1240. L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm);
  1241. // Prepare for DQ form.
  1242. if (!DQFormBuckets.empty())
  1243. MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm);
  1244. // Collect buckets of comparable addresses used by loads and stores for chain
  1245. // commoning. With chain commoning, we reuse offsets between the chains, so
  1246. // the register pressure will be reduced.
  1247. if (!EnableChainCommoning) {
  1248. LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n");
  1249. return MadeChange;
  1250. }
  1251. LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n");
  1252. SmallVector<Bucket, 16> Buckets =
  1253. collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff,
  1254. MaxVarsChainCommon);
  1255. // Prepare for chain commoning.
  1256. if (!Buckets.empty())
  1257. MadeChange |= chainCommoning(L, Buckets);
  1258. return MadeChange;
  1259. }