PPCLoopInstrFormPrep.cpp 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491
  1. //===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements a pass to prepare loops for ppc preferred addressing
  10. // modes, leveraging different instruction form. (eg: DS/DQ form, D/DS form with
  11. // update)
  12. // Additional PHIs are created for loop induction variables used by load/store
  13. // instructions so that preferred addressing modes can be used.
  14. //
  15. // 1: DS/DQ form preparation, prepare the load/store instructions so that they
  16. // can satisfy the DS/DQ form displacement requirements.
  17. // Generically, this means transforming loops like this:
  18. // for (int i = 0; i < n; ++i) {
  19. // unsigned long x1 = *(unsigned long *)(p + i + 5);
  20. // unsigned long x2 = *(unsigned long *)(p + i + 9);
  21. // }
  22. //
  23. // to look like this:
  24. //
  25. // unsigned NewP = p + 5;
  26. // for (int i = 0; i < n; ++i) {
  27. // unsigned long x1 = *(unsigned long *)(i + NewP);
  28. // unsigned long x2 = *(unsigned long *)(i + NewP + 4);
  29. // }
  30. //
  31. // 2: D/DS form with update preparation, prepare the load/store instructions so
  32. // that we can use update form to do pre-increment.
  33. // Generically, this means transforming loops like this:
  34. // for (int i = 0; i < n; ++i)
  35. // array[i] = c;
  36. //
  37. // to look like this:
  38. //
  39. // T *p = array[-1];
  40. // for (int i = 0; i < n; ++i)
  41. // *++p = c;
  42. //
  43. // 3: common multiple chains for the load/stores with same offsets in the loop,
  44. // so that we can reuse the offsets and reduce the register pressure in the
  45. // loop. This transformation can also increase the loop ILP as now each chain
  46. // uses its own loop induction add/addi. But this will increase the number of
  47. // add/addi in the loop.
  48. //
  49. // Generically, this means transforming loops like this:
  50. //
  51. // char *p;
  52. // A1 = p + base1
  53. // A2 = p + base1 + offset
  54. // B1 = p + base2
  55. // B2 = p + base2 + offset
  56. //
  57. // for (int i = 0; i < n; i++)
  58. // unsigned long x1 = *(unsigned long *)(A1 + i);
  59. // unsigned long x2 = *(unsigned long *)(A2 + i)
  60. // unsigned long x3 = *(unsigned long *)(B1 + i);
  61. // unsigned long x4 = *(unsigned long *)(B2 + i);
  62. // }
  63. //
  64. // to look like this:
  65. //
  66. // A1_new = p + base1 // chain 1
  67. // B1_new = p + base2 // chain 2, now inside the loop, common offset is
  68. // // reused.
  69. //
  70. // for (long long i = 0; i < n; i+=count) {
  71. // unsigned long x1 = *(unsigned long *)(A1_new + i);
  72. // unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
  73. // unsigned long x3 = *(unsigned long *)(B1_new + i);
  74. // unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
  75. // }
  76. //===----------------------------------------------------------------------===//
  77. #include "PPC.h"
  78. #include "PPCSubtarget.h"
  79. #include "PPCTargetMachine.h"
  80. #include "llvm/ADT/DepthFirstIterator.h"
  81. #include "llvm/ADT/SmallPtrSet.h"
  82. #include "llvm/ADT/SmallSet.h"
  83. #include "llvm/ADT/SmallVector.h"
  84. #include "llvm/ADT/Statistic.h"
  85. #include "llvm/Analysis/LoopInfo.h"
  86. #include "llvm/Analysis/ScalarEvolution.h"
  87. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  88. #include "llvm/IR/BasicBlock.h"
  89. #include "llvm/IR/CFG.h"
  90. #include "llvm/IR/Dominators.h"
  91. #include "llvm/IR/Instruction.h"
  92. #include "llvm/IR/Instructions.h"
  93. #include "llvm/IR/IntrinsicInst.h"
  94. #include "llvm/IR/IntrinsicsPowerPC.h"
  95. #include "llvm/IR/Module.h"
  96. #include "llvm/IR/Type.h"
  97. #include "llvm/IR/Value.h"
  98. #include "llvm/InitializePasses.h"
  99. #include "llvm/Pass.h"
  100. #include "llvm/Support/Casting.h"
  101. #include "llvm/Support/CommandLine.h"
  102. #include "llvm/Support/Debug.h"
  103. #include "llvm/Transforms/Scalar.h"
  104. #include "llvm/Transforms/Utils.h"
  105. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  106. #include "llvm/Transforms/Utils/Local.h"
  107. #include "llvm/Transforms/Utils/LoopUtils.h"
  108. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  109. #include <cassert>
  110. #include <cmath>
  111. #include <iterator>
  112. #include <utility>
  113. #define DEBUG_TYPE "ppc-loop-instr-form-prep"
  114. using namespace llvm;
  115. static cl::opt<unsigned>
  116. MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24),
  117. cl::desc("Potential common base number threshold per function "
  118. "for PPC loop prep"));
  119. static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
  120. cl::init(true), cl::Hidden,
  121. cl::desc("prefer update form when ds form is also a update form"));
  122. static cl::opt<bool> EnableUpdateFormForNonConstInc(
  123. "ppc-formprep-update-nonconst-inc", cl::init(false), cl::Hidden,
  124. cl::desc("prepare update form when the load/store increment is a loop "
  125. "invariant non-const value."));
  126. static cl::opt<bool> EnableChainCommoning(
  127. "ppc-formprep-chain-commoning", cl::init(false), cl::Hidden,
  128. cl::desc("Enable chain commoning in PPC loop prepare pass."));
  129. // Sum of following 3 per loop thresholds for all loops can not be larger
  130. // than MaxVarsPrep.
  131. // now the thresholds for each kind prep are exterimental values on Power9.
  132. static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
  133. cl::Hidden, cl::init(3),
  134. cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
  135. "form"));
  136. static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
  137. cl::Hidden, cl::init(3),
  138. cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));
  139. static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
  140. cl::Hidden, cl::init(8),
  141. cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
  142. // Commoning chain will reduce the register pressure, so we don't consider about
  143. // the PHI nodes number.
  144. // But commoning chain will increase the addi/add number in the loop and also
  145. // increase loop ILP. Maximum chain number should be same with hardware
  146. // IssueWidth, because we won't benefit from ILP if the parallel chains number
  147. // is bigger than IssueWidth. We assume there are 2 chains in one bucket, so
  148. // there would be 4 buckets at most on P9(IssueWidth is 8).
  149. static cl::opt<unsigned> MaxVarsChainCommon(
  150. "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
  151. cl::desc("Bucket number per loop for PPC loop chain common"));
  152. // If would not be profitable if the common base has only one load/store, ISEL
  153. // should already be able to choose best load/store form based on offset for
  154. // single load/store. Set minimal profitable value default to 2 and make it as
  155. // an option.
  156. static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
  157. cl::Hidden, cl::init(2),
  158. cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
  159. "preparation"));
  160. static cl::opt<unsigned> ChainCommonPrepMinThreshold(
  161. "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
  162. cl::desc("Minimal common base load/store instructions triggering chain "
  163. "commoning preparation. Must be not smaller than 4"));
  164. STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
  165. STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
  166. STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
  167. STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
  168. STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
  169. STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
  170. STATISTIC(ChainCommoningRewritten, "Num of commoning chains");
  171. namespace {
  172. struct BucketElement {
  173. BucketElement(const SCEV *O, Instruction *I) : Offset(O), Instr(I) {}
  174. BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
  175. const SCEV *Offset;
  176. Instruction *Instr;
  177. };
  178. struct Bucket {
  179. Bucket(const SCEV *B, Instruction *I)
  180. : BaseSCEV(B), Elements(1, BucketElement(I)) {
  181. ChainSize = 0;
  182. }
  183. // The base of the whole bucket.
  184. const SCEV *BaseSCEV;
  185. // All elements in the bucket. In the bucket, the element with the BaseSCEV
  186. // has no offset and all other elements are stored as offsets to the
  187. // BaseSCEV.
  188. SmallVector<BucketElement, 16> Elements;
  189. // The potential chains size. This is used for chain commoning only.
  190. unsigned ChainSize;
  191. // The base for each potential chain. This is used for chain commoning only.
  192. SmallVector<BucketElement, 16> ChainBases;
  193. };
  194. // "UpdateForm" is not a real PPC instruction form, it stands for dform
  195. // load/store with update like ldu/stdu, or Prefetch intrinsic.
  196. // For DS form instructions, their displacements must be multiple of 4.
  197. // For DQ form instructions, their displacements must be multiple of 16.
  198. enum PrepForm { UpdateForm = 1, DSForm = 4, DQForm = 16, ChainCommoning };
  199. class PPCLoopInstrFormPrep : public FunctionPass {
  200. public:
  201. static char ID; // Pass ID, replacement for typeid
  202. PPCLoopInstrFormPrep() : FunctionPass(ID) {
  203. initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
  204. }
  205. PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
  206. initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
  207. }
  208. void getAnalysisUsage(AnalysisUsage &AU) const override {
  209. AU.addPreserved<DominatorTreeWrapperPass>();
  210. AU.addRequired<LoopInfoWrapperPass>();
  211. AU.addPreserved<LoopInfoWrapperPass>();
  212. AU.addRequired<ScalarEvolutionWrapperPass>();
  213. }
  214. bool runOnFunction(Function &F) override;
  215. private:
  216. PPCTargetMachine *TM = nullptr;
  217. const PPCSubtarget *ST;
  218. DominatorTree *DT;
  219. LoopInfo *LI;
  220. ScalarEvolution *SE;
  221. bool PreserveLCSSA;
  222. bool HasCandidateForPrepare;
  223. /// Successful preparation number for Update/DS/DQ form in all inner most
  224. /// loops. One successful preparation will put one common base out of loop,
  225. /// this may leads to register presure like LICM does.
  226. /// Make sure total preparation number can be controlled by option.
  227. unsigned SuccPrepCount;
  228. bool runOnLoop(Loop *L);
  229. /// Check if required PHI node is already exist in Loop \p L.
  230. bool alreadyPrepared(Loop *L, Instruction *MemI,
  231. const SCEV *BasePtrStartSCEV,
  232. const SCEV *BasePtrIncSCEV, PrepForm Form);
  233. /// Get the value which defines the increment SCEV \p BasePtrIncSCEV.
  234. Value *getNodeForInc(Loop *L, Instruction *MemI,
  235. const SCEV *BasePtrIncSCEV);
  236. /// Common chains to reuse offsets for a loop to reduce register pressure.
  237. bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets);
  238. /// Find out the potential commoning chains and their bases.
  239. bool prepareBasesForCommoningChains(Bucket &BucketChain);
  240. /// Rewrite load/store according to the common chains.
  241. bool
  242. rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
  243. SmallSet<BasicBlock *, 16> &BBChanged);
  244. /// Collect condition matched(\p isValidCandidate() returns true)
  245. /// candidates in Loop \p L.
  246. SmallVector<Bucket, 16> collectCandidates(
  247. Loop *L,
  248. std::function<bool(const Instruction *, Value *, const Type *)>
  249. isValidCandidate,
  250. std::function<bool(const SCEV *)> isValidDiff,
  251. unsigned MaxCandidateNum);
  252. /// Add a candidate to candidates \p Buckets if diff between candidate and
  253. /// one base in \p Buckets matches \p isValidDiff.
  254. void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
  255. SmallVector<Bucket, 16> &Buckets,
  256. std::function<bool(const SCEV *)> isValidDiff,
  257. unsigned MaxCandidateNum);
  258. /// Prepare all candidates in \p Buckets for update form.
  259. bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets);
  260. /// Prepare all candidates in \p Buckets for displacement form, now for
  261. /// ds/dq.
  262. bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets, PrepForm Form);
  263. /// Prepare for one chain \p BucketChain, find the best base element and
  264. /// update all other elements in \p BucketChain accordingly.
  265. /// \p Form is used to find the best base element.
  266. /// If success, best base element must be stored as the first element of
  267. /// \p BucketChain.
  268. /// Return false if no base element found, otherwise return true.
  269. bool prepareBaseForDispFormChain(Bucket &BucketChain, PrepForm Form);
  270. /// Prepare for one chain \p BucketChain, find the best base element and
  271. /// update all other elements in \p BucketChain accordingly.
  272. /// If success, best base element must be stored as the first element of
  273. /// \p BucketChain.
  274. /// Return false if no base element found, otherwise return true.
  275. bool prepareBaseForUpdateFormChain(Bucket &BucketChain);
  276. /// Rewrite load/store instructions in \p BucketChain according to
  277. /// preparation.
  278. bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
  279. SmallSet<BasicBlock *, 16> &BBChanged,
  280. PrepForm Form);
  281. /// Rewrite for the base load/store of a chain.
  282. std::pair<Instruction *, Instruction *>
  283. rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
  284. Instruction *BaseMemI, bool CanPreInc, PrepForm Form,
  285. SCEVExpander &SCEVE, SmallPtrSet<Value *, 16> &DeletedPtrs);
  286. /// Rewrite for the other load/stores of a chain according to the new \p
  287. /// Base.
  288. Instruction *
  289. rewriteForBucketElement(std::pair<Instruction *, Instruction *> Base,
  290. const BucketElement &Element, Value *OffToBase,
  291. SmallPtrSet<Value *, 16> &DeletedPtrs);
  292. };
  293. } // end anonymous namespace
  294. char PPCLoopInstrFormPrep::ID = 0;
  295. static const char *name = "Prepare loop for ppc preferred instruction forms";
  296. INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
  297. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  298. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  299. INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
  300. static constexpr StringRef PHINodeNameSuffix = ".phi";
  301. static constexpr StringRef CastNodeNameSuffix = ".cast";
  302. static constexpr StringRef GEPNodeIncNameSuffix = ".inc";
  303. static constexpr StringRef GEPNodeOffNameSuffix = ".off";
  304. FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) {
  305. return new PPCLoopInstrFormPrep(TM);
  306. }
  307. static bool IsPtrInBounds(Value *BasePtr) {
  308. Value *StrippedBasePtr = BasePtr;
  309. while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
  310. StrippedBasePtr = BC->getOperand(0);
  311. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr))
  312. return GEP->isInBounds();
  313. return false;
  314. }
  315. static std::string getInstrName(const Value *I, StringRef Suffix) {
  316. assert(I && "Invalid paramater!");
  317. if (I->hasName())
  318. return (I->getName() + Suffix).str();
  319. else
  320. return "";
  321. }
  322. static Value *getPointerOperandAndType(Value *MemI,
  323. Type **PtrElementType = nullptr) {
  324. Value *PtrValue = nullptr;
  325. Type *PointerElementType = nullptr;
  326. if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
  327. PtrValue = LMemI->getPointerOperand();
  328. PointerElementType = LMemI->getType();
  329. } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
  330. PtrValue = SMemI->getPointerOperand();
  331. PointerElementType = SMemI->getValueOperand()->getType();
  332. } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
  333. PointerElementType = Type::getInt8Ty(MemI->getContext());
  334. if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
  335. IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
  336. PtrValue = IMemI->getArgOperand(0);
  337. } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
  338. PtrValue = IMemI->getArgOperand(1);
  339. }
  340. }
  341. /*Get ElementType if PtrElementType is not null.*/
  342. if (PtrElementType)
  343. *PtrElementType = PointerElementType;
  344. return PtrValue;
  345. }
  346. bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
  347. if (skipFunction(F))
  348. return false;
  349. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  350. SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  351. auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  352. DT = DTWP ? &DTWP->getDomTree() : nullptr;
  353. PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  354. ST = TM ? TM->getSubtargetImpl(F) : nullptr;
  355. SuccPrepCount = 0;
  356. bool MadeChange = false;
  357. for (Loop *I : *LI)
  358. for (Loop *L : depth_first(I))
  359. MadeChange |= runOnLoop(L);
  360. return MadeChange;
  361. }
  362. // Finding the minimal(chain_number + reusable_offset_number) is a complicated
  363. // algorithmic problem.
  364. // For now, the algorithm used here is simply adjusted to handle the case for
  365. // manually unrolling cases.
  366. // FIXME: use a more powerful algorithm to find minimal sum of chain_number and
  367. // reusable_offset_number for one base with multiple offsets.
  368. bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {
  369. // The minimal size for profitable chain commoning:
  370. // A1 = base + offset1
  371. // A2 = base + offset2 (offset2 - offset1 = X)
  372. // A3 = base + offset3
  373. // A4 = base + offset4 (offset4 - offset3 = X)
  374. // ======>
  375. // base1 = base + offset1
  376. // base2 = base + offset3
  377. // A1 = base1
  378. // A2 = base1 + X
  379. // A3 = base2
  380. // A4 = base2 + X
  381. //
  382. // There is benefit because of reuse of offest 'X'.
  383. assert(ChainCommonPrepMinThreshold >= 4 &&
  384. "Thredhold can not be smaller than 4!\n");
  385. if (CBucket.Elements.size() < ChainCommonPrepMinThreshold)
  386. return false;
  387. // We simply select the FirstOffset as the first reusable offset between each
  388. // chain element 1 and element 0.
  389. const SCEV *FirstOffset = CBucket.Elements[1].Offset;
  390. // Figure out how many times above FirstOffset is used in the chain.
  391. // For a success commoning chain candidate, offset difference between each
  392. // chain element 1 and element 0 must be also FirstOffset.
  393. unsigned FirstOffsetReusedCount = 1;
  394. // Figure out how many times above FirstOffset is used in the first chain.
  395. // Chain number is FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain
  396. unsigned FirstOffsetReusedCountInFirstChain = 1;
  397. unsigned EleNum = CBucket.Elements.size();
  398. bool SawChainSeparater = false;
  399. for (unsigned j = 2; j != EleNum; ++j) {
  400. if (SE->getMinusSCEV(CBucket.Elements[j].Offset,
  401. CBucket.Elements[j - 1].Offset) == FirstOffset) {
  402. if (!SawChainSeparater)
  403. FirstOffsetReusedCountInFirstChain++;
  404. FirstOffsetReusedCount++;
  405. } else
  406. // For now, if we meet any offset which is not FirstOffset, we assume we
  407. // find a new Chain.
  408. // This makes us miss some opportunities.
  409. // For example, we can common:
  410. //
  411. // {OffsetA, Offset A, OffsetB, OffsetA, OffsetA, OffsetB}
  412. //
  413. // as two chains:
  414. // {{OffsetA, Offset A, OffsetB}, {OffsetA, OffsetA, OffsetB}}
  415. // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2
  416. //
  417. // But we fail to common:
  418. //
  419. // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA}
  420. // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1
  421. SawChainSeparater = true;
  422. }
  423. // FirstOffset is not reused, skip this bucket.
  424. if (FirstOffsetReusedCount == 1)
  425. return false;
  426. unsigned ChainNum =
  427. FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
  428. // All elements are increased by FirstOffset.
  429. // The number of chains should be sqrt(EleNum).
  430. if (!SawChainSeparater)
  431. ChainNum = (unsigned)sqrt((double)EleNum);
  432. CBucket.ChainSize = (unsigned)(EleNum / ChainNum);
  433. // If this is not a perfect chain(eg: not all elements can be put inside
  434. // commoning chains.), skip now.
  435. if (CBucket.ChainSize * ChainNum != EleNum)
  436. return false;
  437. if (SawChainSeparater) {
  438. // Check that the offset seqs are the same for all chains.
  439. for (unsigned i = 1; i < CBucket.ChainSize; i++)
  440. for (unsigned j = 1; j < ChainNum; j++)
  441. if (CBucket.Elements[i].Offset !=
  442. SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset,
  443. CBucket.Elements[j * CBucket.ChainSize].Offset))
  444. return false;
  445. }
  446. for (unsigned i = 0; i < ChainNum; i++)
  447. CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]);
  448. LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n");
  449. return true;
  450. }
  451. bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
  452. SmallVector<Bucket, 16> &Buckets) {
  453. bool MadeChange = false;
  454. if (Buckets.empty())
  455. return MadeChange;
  456. SmallSet<BasicBlock *, 16> BBChanged;
  457. for (auto &Bucket : Buckets) {
  458. if (prepareBasesForCommoningChains(Bucket))
  459. MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged);
  460. }
  461. if (MadeChange)
  462. for (auto *BB : BBChanged)
  463. DeleteDeadPHIs(BB);
  464. return MadeChange;
  465. }
  466. bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
  467. Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
  468. bool MadeChange = false;
  469. assert(Bucket.Elements.size() ==
  470. Bucket.ChainBases.size() * Bucket.ChainSize &&
  471. "invalid bucket for chain commoning!\n");
  472. SmallPtrSet<Value *, 16> DeletedPtrs;
  473. BasicBlock *Header = L->getHeader();
  474. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  475. SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
  476. "loopprepare-chaincommon");
  477. for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) {
  478. unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx;
  479. const SCEV *BaseSCEV =
  480. ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV,
  481. Bucket.Elements[BaseElemIdx].Offset)
  482. : Bucket.BaseSCEV;
  483. const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
  484. // Make sure the base is able to expand.
  485. if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
  486. return MadeChange;
  487. assert(BasePtrSCEV->isAffine() &&
  488. "Invalid SCEV type for the base ptr for a candidate chain!\n");
  489. std::pair<Instruction *, Instruction *> Base = rewriteForBase(
  490. L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr,
  491. false /* CanPreInc */, ChainCommoning, SCEVE, DeletedPtrs);
  492. if (!Base.first || !Base.second)
  493. return MadeChange;
  494. // Keep track of the replacement pointer values we've inserted so that we
  495. // don't generate more pointer values than necessary.
  496. SmallPtrSet<Value *, 16> NewPtrs;
  497. NewPtrs.insert(Base.first);
  498. for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize;
  499. ++Idx) {
  500. BucketElement &I = Bucket.Elements[Idx];
  501. Value *Ptr = getPointerOperandAndType(I.Instr);
  502. assert(Ptr && "No pointer operand");
  503. if (NewPtrs.count(Ptr))
  504. continue;
  505. const SCEV *OffsetSCEV =
  506. BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset,
  507. Bucket.Elements[BaseElemIdx].Offset)
  508. : Bucket.Elements[Idx].Offset;
  509. // Make sure offset is able to expand. Only need to check one time as the
  510. // offsets are reused between different chains.
  511. if (!BaseElemIdx)
  512. if (!SCEVE.isSafeToExpand(OffsetSCEV))
  513. return false;
  514. Value *OffsetValue = SCEVE.expandCodeFor(
  515. OffsetSCEV, OffsetSCEV->getType(), LoopPredecessor->getTerminator());
  516. Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx],
  517. OffsetValue, DeletedPtrs);
  518. assert(NewPtr && "Wrong rewrite!\n");
  519. NewPtrs.insert(NewPtr);
  520. }
  521. ++ChainCommoningRewritten;
  522. }
  523. // Clear the rewriter cache, because values that are in the rewriter's cache
  524. // can be deleted below, causing the AssertingVH in the cache to trigger.
  525. SCEVE.clear();
  526. for (auto *Ptr : DeletedPtrs) {
  527. if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
  528. BBChanged.insert(IDel->getParent());
  529. RecursivelyDeleteTriviallyDeadInstructions(Ptr);
  530. }
  531. MadeChange = true;
  532. return MadeChange;
  533. }
  534. // Rewrite the new base according to BasePtrSCEV.
  535. // bb.loop.preheader:
  536. // %newstart = ...
  537. // bb.loop.body:
  538. // %phinode = phi [ %newstart, %bb.loop.preheader ], [ %add, %bb.loop.body ]
  539. // ...
  540. // %add = getelementptr %phinode, %inc
  541. //
  542. // First returned instruciton is %phinode (or a type cast to %phinode), caller
  543. // needs this value to rewrite other load/stores in the same chain.
  544. // Second returned instruction is %add, caller needs this value to rewrite other
  545. // load/stores in the same chain.
  546. std::pair<Instruction *, Instruction *>
  547. PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
  548. Instruction *BaseMemI, bool CanPreInc,
  549. PrepForm Form, SCEVExpander &SCEVE,
  550. SmallPtrSet<Value *, 16> &DeletedPtrs) {
  551. LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
  552. assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
  553. Value *BasePtr = getPointerOperandAndType(BaseMemI);
  554. assert(BasePtr && "No pointer operand");
  555. Type *I8Ty = Type::getInt8Ty(BaseMemI->getParent()->getContext());
  556. Type *I8PtrTy =
  557. Type::getInt8PtrTy(BaseMemI->getParent()->getContext(),
  558. BasePtr->getType()->getPointerAddressSpace());
  559. bool IsConstantInc = false;
  560. const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE);
  561. Value *IncNode = getNodeForInc(L, BaseMemI, BasePtrIncSCEV);
  562. const SCEVConstant *BasePtrIncConstantSCEV =
  563. dyn_cast<SCEVConstant>(BasePtrIncSCEV);
  564. if (BasePtrIncConstantSCEV)
  565. IsConstantInc = true;
  566. // No valid representation for the increment.
  567. if (!IncNode) {
  568. LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n");
  569. return std::make_pair(nullptr, nullptr);
  570. }
  571. if (Form == UpdateForm && !IsConstantInc && !EnableUpdateFormForNonConstInc) {
  572. LLVM_DEBUG(
  573. dbgs()
  574. << "Update form prepare for non-const increment is not enabled!\n");
  575. return std::make_pair(nullptr, nullptr);
  576. }
  577. const SCEV *BasePtrStartSCEV = nullptr;
  578. if (CanPreInc) {
  579. assert(SE->isLoopInvariant(BasePtrIncSCEV, L) &&
  580. "Increment is not loop invariant!\n");
  581. BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(),
  582. IsConstantInc ? BasePtrIncConstantSCEV
  583. : BasePtrIncSCEV);
  584. } else
  585. BasePtrStartSCEV = BasePtrSCEV->getStart();
  586. if (alreadyPrepared(L, BaseMemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) {
  587. LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n");
  588. return std::make_pair(nullptr, nullptr);
  589. }
  590. LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
  591. BasicBlock *Header = L->getHeader();
  592. unsigned HeaderLoopPredCount = pred_size(Header);
  593. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  594. PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
  595. getInstrName(BaseMemI, PHINodeNameSuffix),
  596. Header->getFirstNonPHI());
  597. Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
  598. LoopPredecessor->getTerminator());
  599. // Note that LoopPredecessor might occur in the predecessor list multiple
  600. // times, and we need to add it the right number of times.
  601. for (auto *PI : predecessors(Header)) {
  602. if (PI != LoopPredecessor)
  603. continue;
  604. NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
  605. }
  606. Instruction *PtrInc = nullptr;
  607. Instruction *NewBasePtr = nullptr;
  608. if (CanPreInc) {
  609. Instruction *InsPoint = &*Header->getFirstInsertionPt();
  610. PtrInc = GetElementPtrInst::Create(
  611. I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
  612. InsPoint);
  613. cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
  614. for (auto *PI : predecessors(Header)) {
  615. if (PI == LoopPredecessor)
  616. continue;
  617. NewPHI->addIncoming(PtrInc, PI);
  618. }
  619. if (PtrInc->getType() != BasePtr->getType())
  620. NewBasePtr =
  621. new BitCastInst(PtrInc, BasePtr->getType(),
  622. getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
  623. else
  624. NewBasePtr = PtrInc;
  625. } else {
  626. // Note that LoopPredecessor might occur in the predecessor list multiple
  627. // times, and we need to make sure no more incoming value for them in PHI.
  628. for (auto *PI : predecessors(Header)) {
  629. if (PI == LoopPredecessor)
  630. continue;
  631. // For the latch predecessor, we need to insert a GEP just before the
  632. // terminator to increase the address.
  633. BasicBlock *BB = PI;
  634. Instruction *InsPoint = BB->getTerminator();
  635. PtrInc = GetElementPtrInst::Create(
  636. I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
  637. InsPoint);
  638. cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
  639. NewPHI->addIncoming(PtrInc, PI);
  640. }
  641. PtrInc = NewPHI;
  642. if (NewPHI->getType() != BasePtr->getType())
  643. NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(),
  644. getInstrName(NewPHI, CastNodeNameSuffix),
  645. &*Header->getFirstInsertionPt());
  646. else
  647. NewBasePtr = NewPHI;
  648. }
  649. BasePtr->replaceAllUsesWith(NewBasePtr);
  650. DeletedPtrs.insert(BasePtr);
  651. return std::make_pair(NewBasePtr, PtrInc);
  652. }
  653. Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
  654. std::pair<Instruction *, Instruction *> Base, const BucketElement &Element,
  655. Value *OffToBase, SmallPtrSet<Value *, 16> &DeletedPtrs) {
  656. Instruction *NewBasePtr = Base.first;
  657. Instruction *PtrInc = Base.second;
  658. assert((NewBasePtr && PtrInc) && "base does not exist!\n");
  659. Type *I8Ty = Type::getInt8Ty(PtrInc->getParent()->getContext());
  660. Value *Ptr = getPointerOperandAndType(Element.Instr);
  661. assert(Ptr && "No pointer operand");
  662. Instruction *RealNewPtr;
  663. if (!Element.Offset ||
  664. (isa<SCEVConstant>(Element.Offset) &&
  665. cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) {
  666. RealNewPtr = NewBasePtr;
  667. } else {
  668. Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
  669. if (PtrIP && isa<Instruction>(NewBasePtr) &&
  670. cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
  671. PtrIP = nullptr;
  672. else if (PtrIP && isa<PHINode>(PtrIP))
  673. PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
  674. else if (!PtrIP)
  675. PtrIP = Element.Instr;
  676. assert(OffToBase && "There should be an offset for non base element!\n");
  677. GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
  678. I8Ty, PtrInc, OffToBase,
  679. getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP);
  680. if (!PtrIP)
  681. NewPtr->insertAfter(cast<Instruction>(PtrInc));
  682. NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
  683. RealNewPtr = NewPtr;
  684. }
  685. Instruction *ReplNewPtr;
  686. if (Ptr->getType() != RealNewPtr->getType()) {
  687. ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
  688. getInstrName(Ptr, CastNodeNameSuffix));
  689. ReplNewPtr->insertAfter(RealNewPtr);
  690. } else
  691. ReplNewPtr = RealNewPtr;
  692. Ptr->replaceAllUsesWith(ReplNewPtr);
  693. DeletedPtrs.insert(Ptr);
  694. return ReplNewPtr;
  695. }
  696. void PPCLoopInstrFormPrep::addOneCandidate(
  697. Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets,
  698. std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
  699. assert((MemI && getPointerOperandAndType(MemI)) &&
  700. "Candidate should be a memory instruction.");
  701. assert(LSCEV && "Invalid SCEV for Ptr value.");
  702. bool FoundBucket = false;
  703. for (auto &B : Buckets) {
  704. if (cast<SCEVAddRecExpr>(B.BaseSCEV)->getStepRecurrence(*SE) !=
  705. cast<SCEVAddRecExpr>(LSCEV)->getStepRecurrence(*SE))
  706. continue;
  707. const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
  708. if (isValidDiff(Diff)) {
  709. B.Elements.push_back(BucketElement(Diff, MemI));
  710. FoundBucket = true;
  711. break;
  712. }
  713. }
  714. if (!FoundBucket) {
  715. if (Buckets.size() == MaxCandidateNum) {
  716. LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit "
  717. << MaxCandidateNum << "\n");
  718. return;
  719. }
  720. Buckets.push_back(Bucket(LSCEV, MemI));
  721. }
  722. }
  723. SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
  724. Loop *L,
  725. std::function<bool(const Instruction *, Value *, const Type *)>
  726. isValidCandidate,
  727. std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
  728. SmallVector<Bucket, 16> Buckets;
  729. for (const auto &BB : L->blocks())
  730. for (auto &J : *BB) {
  731. Value *PtrValue = nullptr;
  732. Type *PointerElementType = nullptr;
  733. PtrValue = getPointerOperandAndType(&J, &PointerElementType);
  734. if (!PtrValue)
  735. continue;
  736. if (PtrValue->getType()->getPointerAddressSpace())
  737. continue;
  738. if (L->isLoopInvariant(PtrValue))
  739. continue;
  740. const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
  741. const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
  742. if (!LARSCEV || LARSCEV->getLoop() != L)
  743. continue;
  744. // Mark that we have candidates for preparing.
  745. HasCandidateForPrepare = true;
  746. if (isValidCandidate(&J, PtrValue, PointerElementType))
  747. addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum);
  748. }
  749. return Buckets;
  750. }
  751. bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
  752. PrepForm Form) {
  753. // RemainderOffsetInfo details:
  754. // key: value of (Offset urem DispConstraint). For DSForm, it can
  755. // be [0, 4).
  756. // first of pair: the index of first BucketElement whose remainder is equal
  757. // to key. For key 0, this value must be 0.
  758. // second of pair: number of load/stores with the same remainder.
  759. DenseMap<unsigned, std::pair<unsigned, unsigned>> RemainderOffsetInfo;
  760. for (unsigned j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
  761. if (!BucketChain.Elements[j].Offset)
  762. RemainderOffsetInfo[0] = std::make_pair(0, 1);
  763. else {
  764. unsigned Remainder = cast<SCEVConstant>(BucketChain.Elements[j].Offset)
  765. ->getAPInt()
  766. .urem(Form);
  767. if (RemainderOffsetInfo.find(Remainder) == RemainderOffsetInfo.end())
  768. RemainderOffsetInfo[Remainder] = std::make_pair(j, 1);
  769. else
  770. RemainderOffsetInfo[Remainder].second++;
  771. }
  772. }
  773. // Currently we choose the most profitable base as the one which has the max
  774. // number of load/store with same remainder.
  775. // FIXME: adjust the base selection strategy according to load/store offset
  776. // distribution.
  777. // For example, if we have one candidate chain for DS form preparation, which
  778. // contains following load/stores with different remainders:
  779. // 1: 10 load/store whose remainder is 1;
  780. // 2: 9 load/store whose remainder is 2;
  781. // 3: 1 for remainder 3 and 0 for remainder 0;
  782. // Now we will choose the first load/store whose remainder is 1 as base and
  783. // adjust all other load/stores according to new base, so we will get 10 DS
  784. // form and 10 X form.
  785. // But we should be more clever, for this case we could use two bases, one for
  786. // remainder 1 and the other for remainder 2, thus we could get 19 DS form and
  787. // 1 X form.
  788. unsigned MaxCountRemainder = 0;
  789. for (unsigned j = 0; j < (unsigned)Form; j++)
  790. if ((RemainderOffsetInfo.find(j) != RemainderOffsetInfo.end()) &&
  791. RemainderOffsetInfo[j].second >
  792. RemainderOffsetInfo[MaxCountRemainder].second)
  793. MaxCountRemainder = j;
  794. // Abort when there are too few insts with common base.
  795. if (RemainderOffsetInfo[MaxCountRemainder].second < DispFormPrepMinThreshold)
  796. return false;
  797. // If the first value is most profitable, no needed to adjust BucketChain
  798. // elements as they are substracted the first value when collecting.
  799. if (MaxCountRemainder == 0)
  800. return true;
  801. // Adjust load/store to the new chosen base.
  802. const SCEV *Offset =
  803. BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first].Offset;
  804. BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
  805. for (auto &E : BucketChain.Elements) {
  806. if (E.Offset)
  807. E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
  808. else
  809. E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
  810. }
  811. std::swap(BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first],
  812. BucketChain.Elements[0]);
  813. return true;
  814. }
  815. // FIXME: implement a more clever base choosing policy.
  816. // Currently we always choose an exist load/store offset. This maybe lead to
  817. // suboptimal code sequences. For example, for one DS chain with offsets
  818. // {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp
  819. // for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a
  820. // multipler of 4, it cannot be represented by sint16.
  821. bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
  822. // We have a choice now of which instruction's memory operand we use as the
  823. // base for the generated PHI. Always picking the first instruction in each
  824. // bucket does not work well, specifically because that instruction might
  825. // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
  826. // the choice is somewhat arbitrary, because the backend will happily
  827. // generate direct offsets from both the pre-incremented and
  828. // post-incremented pointer values. Thus, we'll pick the first non-prefetch
  829. // instruction in each bucket, and adjust the recurrence and other offsets
  830. // accordingly.
  831. for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
  832. if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr))
  833. if (II->getIntrinsicID() == Intrinsic::prefetch)
  834. continue;
  835. // If we'd otherwise pick the first element anyway, there's nothing to do.
  836. if (j == 0)
  837. break;
  838. // If our chosen element has no offset from the base pointer, there's
  839. // nothing to do.
  840. if (!BucketChain.Elements[j].Offset ||
  841. cast<SCEVConstant>(BucketChain.Elements[j].Offset)->isZero())
  842. break;
  843. const SCEV *Offset = BucketChain.Elements[j].Offset;
  844. BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
  845. for (auto &E : BucketChain.Elements) {
  846. if (E.Offset)
  847. E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
  848. else
  849. E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
  850. }
  851. std::swap(BucketChain.Elements[j], BucketChain.Elements[0]);
  852. break;
  853. }
  854. return true;
  855. }
  856. bool PPCLoopInstrFormPrep::rewriteLoadStores(
  857. Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged,
  858. PrepForm Form) {
  859. bool MadeChange = false;
  860. const SCEVAddRecExpr *BasePtrSCEV =
  861. cast<SCEVAddRecExpr>(BucketChain.BaseSCEV);
  862. if (!BasePtrSCEV->isAffine())
  863. return MadeChange;
  864. BasicBlock *Header = L->getHeader();
  865. SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
  866. "loopprepare-formrewrite");
  867. if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
  868. return MadeChange;
  869. SmallPtrSet<Value *, 16> DeletedPtrs;
  870. // For some DS form load/store instructions, it can also be an update form,
  871. // if the stride is constant and is a multipler of 4. Use update form if
  872. // prefer it.
  873. bool CanPreInc = (Form == UpdateForm ||
  874. ((Form == DSForm) &&
  875. isa<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)) &&
  876. !cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE))
  877. ->getAPInt()
  878. .urem(4) &&
  879. PreferUpdateForm));
  880. std::pair<Instruction *, Instruction *> Base =
  881. rewriteForBase(L, BasePtrSCEV, BucketChain.Elements.begin()->Instr,
  882. CanPreInc, Form, SCEVE, DeletedPtrs);
  883. if (!Base.first || !Base.second)
  884. return MadeChange;
  885. // Keep track of the replacement pointer values we've inserted so that we
  886. // don't generate more pointer values than necessary.
  887. SmallPtrSet<Value *, 16> NewPtrs;
  888. NewPtrs.insert(Base.first);
  889. for (const BucketElement &BE : llvm::drop_begin(BucketChain.Elements)) {
  890. Value *Ptr = getPointerOperandAndType(BE.Instr);
  891. assert(Ptr && "No pointer operand");
  892. if (NewPtrs.count(Ptr))
  893. continue;
  894. Instruction *NewPtr = rewriteForBucketElement(
  895. Base, BE,
  896. BE.Offset ? cast<SCEVConstant>(BE.Offset)->getValue() : nullptr,
  897. DeletedPtrs);
  898. assert(NewPtr && "wrong rewrite!\n");
  899. NewPtrs.insert(NewPtr);
  900. }
  901. // Clear the rewriter cache, because values that are in the rewriter's cache
  902. // can be deleted below, causing the AssertingVH in the cache to trigger.
  903. SCEVE.clear();
  904. for (auto *Ptr : DeletedPtrs) {
  905. if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
  906. BBChanged.insert(IDel->getParent());
  907. RecursivelyDeleteTriviallyDeadInstructions(Ptr);
  908. }
  909. MadeChange = true;
  910. SuccPrepCount++;
  911. if (Form == DSForm && !CanPreInc)
  912. DSFormChainRewritten++;
  913. else if (Form == DQForm)
  914. DQFormChainRewritten++;
  915. else if (Form == UpdateForm || (Form == DSForm && CanPreInc))
  916. UpdFormChainRewritten++;
  917. return MadeChange;
  918. }
  919. bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
  920. SmallVector<Bucket, 16> &Buckets) {
  921. bool MadeChange = false;
  922. if (Buckets.empty())
  923. return MadeChange;
  924. SmallSet<BasicBlock *, 16> BBChanged;
  925. for (auto &Bucket : Buckets)
  926. // The base address of each bucket is transformed into a phi and the others
  927. // are rewritten based on new base.
  928. if (prepareBaseForUpdateFormChain(Bucket))
  929. MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, UpdateForm);
  930. if (MadeChange)
  931. for (auto *BB : BBChanged)
  932. DeleteDeadPHIs(BB);
  933. return MadeChange;
  934. }
  935. bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L,
  936. SmallVector<Bucket, 16> &Buckets,
  937. PrepForm Form) {
  938. bool MadeChange = false;
  939. if (Buckets.empty())
  940. return MadeChange;
  941. SmallSet<BasicBlock *, 16> BBChanged;
  942. for (auto &Bucket : Buckets) {
  943. if (Bucket.Elements.size() < DispFormPrepMinThreshold)
  944. continue;
  945. if (prepareBaseForDispFormChain(Bucket, Form))
  946. MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, Form);
  947. }
  948. if (MadeChange)
  949. for (auto *BB : BBChanged)
  950. DeleteDeadPHIs(BB);
  951. return MadeChange;
  952. }
  953. // Find the loop invariant increment node for SCEV BasePtrIncSCEV.
  954. // bb.loop.preheader:
  955. // %start = ...
  956. // bb.loop.body:
  957. // %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ]
  958. // ...
  959. // %add = add %phinode, %inc ; %inc is what we want to get.
  960. //
  961. Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI,
  962. const SCEV *BasePtrIncSCEV) {
  963. // If the increment is a constant, no definition is needed.
  964. // Return the value directly.
  965. if (isa<SCEVConstant>(BasePtrIncSCEV))
  966. return cast<SCEVConstant>(BasePtrIncSCEV)->getValue();
  967. if (!SE->isLoopInvariant(BasePtrIncSCEV, L))
  968. return nullptr;
  969. BasicBlock *BB = MemI->getParent();
  970. if (!BB)
  971. return nullptr;
  972. BasicBlock *LatchBB = L->getLoopLatch();
  973. if (!LatchBB)
  974. return nullptr;
  975. // Run through the PHIs and check their operands to find valid representation
  976. // for the increment SCEV.
  977. iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
  978. for (auto &CurrentPHI : PHIIter) {
  979. PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
  980. if (!CurrentPHINode)
  981. continue;
  982. if (!SE->isSCEVable(CurrentPHINode->getType()))
  983. continue;
  984. const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
  985. const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
  986. if (!PHIBasePtrSCEV)
  987. continue;
  988. const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE);
  989. if (!PHIBasePtrIncSCEV || (PHIBasePtrIncSCEV != BasePtrIncSCEV))
  990. continue;
  991. // Get the incoming value from the loop latch and check if the value has
  992. // the add form with the required increment.
  993. if (Instruction *I = dyn_cast<Instruction>(
  994. CurrentPHINode->getIncomingValueForBlock(LatchBB))) {
  995. Value *StrippedBaseI = I;
  996. while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBaseI))
  997. StrippedBaseI = BC->getOperand(0);
  998. Instruction *StrippedI = dyn_cast<Instruction>(StrippedBaseI);
  999. if (!StrippedI)
  1000. continue;
  1001. // LSR pass may add a getelementptr instruction to do the loop increment,
  1002. // also search in that getelementptr instruction.
  1003. if (StrippedI->getOpcode() == Instruction::Add ||
  1004. (StrippedI->getOpcode() == Instruction::GetElementPtr &&
  1005. StrippedI->getNumOperands() == 2)) {
  1006. if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV)
  1007. return StrippedI->getOperand(0);
  1008. if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV)
  1009. return StrippedI->getOperand(1);
  1010. }
  1011. }
  1012. }
  1013. return nullptr;
  1014. }
  1015. // In order to prepare for the preferred instruction form, a PHI is added.
  1016. // This function will check to see if that PHI already exists and will return
  1017. // true if it found an existing PHI with the matched start and increment as the
  1018. // one we wanted to create.
  1019. bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI,
  1020. const SCEV *BasePtrStartSCEV,
  1021. const SCEV *BasePtrIncSCEV,
  1022. PrepForm Form) {
  1023. BasicBlock *BB = MemI->getParent();
  1024. if (!BB)
  1025. return false;
  1026. BasicBlock *PredBB = L->getLoopPredecessor();
  1027. BasicBlock *LatchBB = L->getLoopLatch();
  1028. if (!PredBB || !LatchBB)
  1029. return false;
  1030. // Run through the PHIs and see if we have some that looks like a preparation
  1031. iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
  1032. for (auto & CurrentPHI : PHIIter) {
  1033. PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
  1034. if (!CurrentPHINode)
  1035. continue;
  1036. if (!SE->isSCEVable(CurrentPHINode->getType()))
  1037. continue;
  1038. const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
  1039. const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
  1040. if (!PHIBasePtrSCEV)
  1041. continue;
  1042. const SCEVConstant *PHIBasePtrIncSCEV =
  1043. dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE));
  1044. if (!PHIBasePtrIncSCEV)
  1045. continue;
  1046. if (CurrentPHINode->getNumIncomingValues() == 2) {
  1047. if ((CurrentPHINode->getIncomingBlock(0) == LatchBB &&
  1048. CurrentPHINode->getIncomingBlock(1) == PredBB) ||
  1049. (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
  1050. CurrentPHINode->getIncomingBlock(0) == PredBB)) {
  1051. if (PHIBasePtrIncSCEV == BasePtrIncSCEV) {
  1052. // The existing PHI (CurrentPHINode) has the same start and increment
  1053. // as the PHI that we wanted to create.
  1054. if ((Form == UpdateForm || Form == ChainCommoning ) &&
  1055. PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) {
  1056. ++PHINodeAlreadyExistsUpdate;
  1057. return true;
  1058. }
  1059. if (Form == DSForm || Form == DQForm) {
  1060. const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
  1061. SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV));
  1062. if (Diff && !Diff->getAPInt().urem(Form)) {
  1063. if (Form == DSForm)
  1064. ++PHINodeAlreadyExistsDS;
  1065. else
  1066. ++PHINodeAlreadyExistsDQ;
  1067. return true;
  1068. }
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. return false;
  1075. }
  1076. bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
  1077. bool MadeChange = false;
  1078. // Only prep. the inner-most loop
  1079. if (!L->isInnermost())
  1080. return MadeChange;
  1081. // Return if already done enough preparation.
  1082. if (SuccPrepCount >= MaxVarsPrep)
  1083. return MadeChange;
  1084. LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
  1085. BasicBlock *LoopPredecessor = L->getLoopPredecessor();
  1086. // If there is no loop predecessor, or the loop predecessor's terminator
  1087. // returns a value (which might contribute to determining the loop's
  1088. // iteration space), insert a new preheader for the loop.
  1089. if (!LoopPredecessor ||
  1090. !LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
  1091. LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
  1092. if (LoopPredecessor)
  1093. MadeChange = true;
  1094. }
  1095. if (!LoopPredecessor) {
  1096. LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n");
  1097. return MadeChange;
  1098. }
  1099. // Check if a load/store has update form. This lambda is used by function
  1100. // collectCandidates which can collect candidates for types defined by lambda.
  1101. auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue,
  1102. const Type *PointerElementType) {
  1103. assert((PtrValue && I) && "Invalid parameter!");
  1104. // There are no update forms for Altivec vector load/stores.
  1105. if (ST && ST->hasAltivec() && PointerElementType->isVectorTy())
  1106. return false;
  1107. // There are no update forms for P10 lxvp/stxvp intrinsic.
  1108. auto *II = dyn_cast<IntrinsicInst>(I);
  1109. if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) ||
  1110. II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp))
  1111. return false;
  1112. // See getPreIndexedAddressParts, the displacement for LDU/STDU has to
  1113. // be 4's multiple (DS-form). For i64 loads/stores when the displacement
  1114. // fits in a 16-bit signed field but isn't a multiple of 4, it will be
  1115. // useless and possible to break some original well-form addressing mode
  1116. // to make this pre-inc prep for it.
  1117. if (PointerElementType->isIntegerTy(64)) {
  1118. const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
  1119. const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
  1120. if (!LARSCEV || LARSCEV->getLoop() != L)
  1121. return false;
  1122. if (const SCEVConstant *StepConst =
  1123. dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
  1124. const APInt &ConstInt = StepConst->getValue()->getValue();
  1125. if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
  1126. return false;
  1127. }
  1128. }
  1129. return true;
  1130. };
  1131. // Check if a load/store has DS form.
  1132. auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue,
  1133. const Type *PointerElementType) {
  1134. assert((PtrValue && I) && "Invalid parameter!");
  1135. if (isa<IntrinsicInst>(I))
  1136. return false;
  1137. return (PointerElementType->isIntegerTy(64)) ||
  1138. (PointerElementType->isFloatTy()) ||
  1139. (PointerElementType->isDoubleTy()) ||
  1140. (PointerElementType->isIntegerTy(32) &&
  1141. llvm::any_of(I->users(),
  1142. [](const User *U) { return isa<SExtInst>(U); }));
  1143. };
  1144. // Check if a load/store has DQ form.
  1145. auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue,
  1146. const Type *PointerElementType) {
  1147. assert((PtrValue && I) && "Invalid parameter!");
  1148. // Check if it is a P10 lxvp/stxvp intrinsic.
  1149. auto *II = dyn_cast<IntrinsicInst>(I);
  1150. if (II)
  1151. return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp ||
  1152. II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp;
  1153. // Check if it is a P9 vector load/store.
  1154. return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
  1155. };
  1156. // Check if a load/store is candidate for chain commoning.
  1157. // If the SCEV is only with one ptr operand in its start, we can use that
  1158. // start as a chain separator. Mark this load/store as a candidate.
  1159. auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue,
  1160. const Type *PointerElementType) {
  1161. const SCEVAddRecExpr *ARSCEV =
  1162. cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L));
  1163. if (!ARSCEV)
  1164. return false;
  1165. if (!ARSCEV->isAffine())
  1166. return false;
  1167. const SCEV *Start = ARSCEV->getStart();
  1168. // A single pointer. We can treat it as offset 0.
  1169. if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy())
  1170. return true;
  1171. const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start);
  1172. // We need a SCEVAddExpr to include both base and offset.
  1173. if (!ASCEV)
  1174. return false;
  1175. // Make sure there is only one pointer operand(base) and all other operands
  1176. // are integer type.
  1177. bool SawPointer = false;
  1178. for (const SCEV *Op : ASCEV->operands()) {
  1179. if (Op->getType()->isPointerTy()) {
  1180. if (SawPointer)
  1181. return false;
  1182. SawPointer = true;
  1183. } else if (!Op->getType()->isIntegerTy())
  1184. return false;
  1185. }
  1186. return SawPointer;
  1187. };
  1188. // Check if the diff is a constant type. This is used for update/DS/DQ form
  1189. // preparation.
  1190. auto isValidConstantDiff = [](const SCEV *Diff) {
  1191. return dyn_cast<SCEVConstant>(Diff) != nullptr;
  1192. };
  1193. // Make sure the diff between the base and new candidate is required type.
  1194. // This is used for chain commoning preparation.
  1195. auto isValidChainCommoningDiff = [](const SCEV *Diff) {
  1196. assert(Diff && "Invalid Diff!\n");
  1197. // Don't mess up previous dform prepare.
  1198. if (isa<SCEVConstant>(Diff))
  1199. return false;
  1200. // A single integer type offset.
  1201. if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy())
  1202. return true;
  1203. const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff);
  1204. if (!ADiff)
  1205. return false;
  1206. for (const SCEV *Op : ADiff->operands())
  1207. if (!Op->getType()->isIntegerTy())
  1208. return false;
  1209. return true;
  1210. };
  1211. HasCandidateForPrepare = false;
  1212. LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n");
  1213. // Collect buckets of comparable addresses used by loads and stores for update
  1214. // form.
  1215. SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates(
  1216. L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm);
  1217. // Prepare for update form.
  1218. if (!UpdateFormBuckets.empty())
  1219. MadeChange |= updateFormPrep(L, UpdateFormBuckets);
  1220. else if (!HasCandidateForPrepare) {
  1221. LLVM_DEBUG(
  1222. dbgs()
  1223. << "No prepare candidates found, stop praparation for current loop!\n");
  1224. // If no candidate for preparing, return early.
  1225. return MadeChange;
  1226. }
  1227. LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n");
  1228. // Collect buckets of comparable addresses used by loads and stores for DS
  1229. // form.
  1230. SmallVector<Bucket, 16> DSFormBuckets = collectCandidates(
  1231. L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm);
  1232. // Prepare for DS form.
  1233. if (!DSFormBuckets.empty())
  1234. MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm);
  1235. LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n");
  1236. // Collect buckets of comparable addresses used by loads and stores for DQ
  1237. // form.
  1238. SmallVector<Bucket, 16> DQFormBuckets = collectCandidates(
  1239. L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm);
  1240. // Prepare for DQ form.
  1241. if (!DQFormBuckets.empty())
  1242. MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm);
  1243. // Collect buckets of comparable addresses used by loads and stores for chain
  1244. // commoning. With chain commoning, we reuse offsets between the chains, so
  1245. // the register pressure will be reduced.
  1246. if (!EnableChainCommoning) {
  1247. LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n");
  1248. return MadeChange;
  1249. }
  1250. LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n");
  1251. SmallVector<Bucket, 16> Buckets =
  1252. collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff,
  1253. MaxVarsChainCommon);
  1254. // Prepare for chain commoning.
  1255. if (!Buckets.empty())
  1256. MadeChange |= chainCommoning(L, Buckets);
  1257. return MadeChange;
  1258. }