  1. //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass implements an idiom recognizer that transforms simple loops into a
  10. // non-loop form. In cases where this kicks in, it can be a significant
  11. // performance win.
  12. //
  13. // If compiling for code size, we avoid idiom recognition if the resulting
  14. // code could be larger than the code for the original loop. One way this could
  15. // happen is if the loop is not removable after idiom recognition due to the
  16. // presence of non-idiom instructions. The initial implementation of the
  17. // heuristics applies to idioms in multi-block loops.
  18. //
  19. //===----------------------------------------------------------------------===//
  20. //
  21. // TODO List:
  22. //
  23. // Future loop memory idioms to recognize:
  24. // memcmp, strlen, etc.
  25. // Future floating point idioms to recognize in -ffast-math mode:
  26. // fpowi
  27. // Future integer operation idioms to recognize:
  28. // ctpop
  29. //
  30. // Beware that isel's default lowering for ctpop is highly inefficient for
  31. // i64 and larger types when i64 is legal and the value has few bits set. It
  32. // would be good to enhance isel to emit a loop for ctpop in this case.
  33. //
  34. // This could recognize common matrix multiplies and dot product idioms and
  35. // replace them with calls to BLAS (if linked in??).
  36. //
  37. //===----------------------------------------------------------------------===//
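// Illustrative example (editorial addition, not part of the original header):
// the two core idioms this pass already handles look like
//
//   for (i = 0; i != n; ++i)            for (i = 0; i != n; ++i)
//     A[i] = 0;                           A[i] = B[i];
//
// which are rewritten, respectively, as
//
//   memset(A, 0, n * sizeof(*A));       memcpy(A, B, n * sizeof(*A));
//
// emitted in the loop preheader, after which the original stores are deleted.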
  38. #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
  39. #include "llvm/ADT/APInt.h"
  40. #include "llvm/ADT/ArrayRef.h"
  41. #include "llvm/ADT/DenseMap.h"
  42. #include "llvm/ADT/MapVector.h"
  43. #include "llvm/ADT/SetVector.h"
  44. #include "llvm/ADT/SmallPtrSet.h"
  45. #include "llvm/ADT/SmallVector.h"
  46. #include "llvm/ADT/Statistic.h"
  47. #include "llvm/ADT/StringRef.h"
  48. #include "llvm/Analysis/AliasAnalysis.h"
  49. #include "llvm/Analysis/CmpInstAnalysis.h"
  50. #include "llvm/Analysis/LoopAccessAnalysis.h"
  51. #include "llvm/Analysis/LoopInfo.h"
  52. #include "llvm/Analysis/LoopPass.h"
  53. #include "llvm/Analysis/MemoryLocation.h"
  54. #include "llvm/Analysis/MemorySSA.h"
  55. #include "llvm/Analysis/MemorySSAUpdater.h"
  56. #include "llvm/Analysis/MustExecute.h"
  57. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  58. #include "llvm/Analysis/ScalarEvolution.h"
  59. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  60. #include "llvm/Analysis/TargetLibraryInfo.h"
  61. #include "llvm/Analysis/TargetTransformInfo.h"
  62. #include "llvm/Analysis/ValueTracking.h"
  63. #include "llvm/IR/Attributes.h"
  64. #include "llvm/IR/BasicBlock.h"
  65. #include "llvm/IR/Constant.h"
  66. #include "llvm/IR/Constants.h"
  67. #include "llvm/IR/DataLayout.h"
  68. #include "llvm/IR/DebugLoc.h"
  69. #include "llvm/IR/DerivedTypes.h"
  70. #include "llvm/IR/Dominators.h"
  71. #include "llvm/IR/GlobalValue.h"
  72. #include "llvm/IR/GlobalVariable.h"
  73. #include "llvm/IR/IRBuilder.h"
  74. #include "llvm/IR/InstrTypes.h"
  75. #include "llvm/IR/Instruction.h"
  76. #include "llvm/IR/Instructions.h"
  77. #include "llvm/IR/IntrinsicInst.h"
  78. #include "llvm/IR/Intrinsics.h"
  79. #include "llvm/IR/LLVMContext.h"
  80. #include "llvm/IR/Module.h"
  81. #include "llvm/IR/PassManager.h"
  82. #include "llvm/IR/PatternMatch.h"
  83. #include "llvm/IR/Type.h"
  84. #include "llvm/IR/User.h"
  85. #include "llvm/IR/Value.h"
  86. #include "llvm/IR/ValueHandle.h"
  87. #include "llvm/InitializePasses.h"
  88. #include "llvm/Pass.h"
  89. #include "llvm/Support/Casting.h"
  90. #include "llvm/Support/CommandLine.h"
  91. #include "llvm/Support/Debug.h"
  92. #include "llvm/Support/InstructionCost.h"
  93. #include "llvm/Support/raw_ostream.h"
  94. #include "llvm/Transforms/Scalar.h"
  95. #include "llvm/Transforms/Utils/BuildLibCalls.h"
  96. #include "llvm/Transforms/Utils/Local.h"
  97. #include "llvm/Transforms/Utils/LoopUtils.h"
  98. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  99. #include <algorithm>
  100. #include <cassert>
  101. #include <cstdint>
  102. #include <utility>
  103. #include <vector>
  104. using namespace llvm;
  105. #define DEBUG_TYPE "loop-idiom"
  106. STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
  107. STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
  108. STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
  109. STATISTIC(
  110. NumShiftUntilBitTest,
  111. "Number of uncountable loops recognized as 'shift until bitttest' idiom");
  112. STATISTIC(NumShiftUntilZero,
  113. "Number of uncountable loops recognized as 'shift until zero' idiom");
  114. bool DisableLIRP::All;
  115. static cl::opt<bool, true>
  116. DisableLIRPAll("disable-" DEBUG_TYPE "-all",
  117. cl::desc("Options to disable Loop Idiom Recognize Pass."),
  118. cl::location(DisableLIRP::All), cl::init(false),
  119. cl::ReallyHidden);
  120. bool DisableLIRP::Memset;
  121. static cl::opt<bool, true>
  122. DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
  123. cl::desc("Proceed with loop idiom recognize pass, but do "
  124. "not convert loop(s) to memset."),
  125. cl::location(DisableLIRP::Memset), cl::init(false),
  126. cl::ReallyHidden);
  127. bool DisableLIRP::Memcpy;
  128. static cl::opt<bool, true>
  129. DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
  130. cl::desc("Proceed with loop idiom recognize pass, but do "
  131. "not convert loop(s) to memcpy."),
  132. cl::location(DisableLIRP::Memcpy), cl::init(false),
  133. cl::ReallyHidden);
  134. static cl::opt<bool> UseLIRCodeSizeHeurs(
  135. "use-lir-code-size-heurs",
  136. cl::desc("Use loop idiom recognition code size heuristics when compiling"
  137. "with -Os/-Oz"),
  138. cl::init(true), cl::Hidden);
  139. namespace {
  140. class LoopIdiomRecognize {
  141. Loop *CurLoop = nullptr;
  142. AliasAnalysis *AA;
  143. DominatorTree *DT;
  144. LoopInfo *LI;
  145. ScalarEvolution *SE;
  146. TargetLibraryInfo *TLI;
  147. const TargetTransformInfo *TTI;
  148. const DataLayout *DL;
  149. OptimizationRemarkEmitter &ORE;
  150. bool ApplyCodeSizeHeuristics;
  151. std::unique_ptr<MemorySSAUpdater> MSSAU;
  152. public:
  153. explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
  154. LoopInfo *LI, ScalarEvolution *SE,
  155. TargetLibraryInfo *TLI,
  156. const TargetTransformInfo *TTI, MemorySSA *MSSA,
  157. const DataLayout *DL,
  158. OptimizationRemarkEmitter &ORE)
  159. : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
  160. if (MSSA)
  161. MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  162. }
  163. bool runOnLoop(Loop *L);
  164. private:
  165. using StoreList = SmallVector<StoreInst *, 8>;
  166. using StoreListMap = MapVector<Value *, StoreList>;
  167. StoreListMap StoreRefsForMemset;
  168. StoreListMap StoreRefsForMemsetPattern;
  169. StoreList StoreRefsForMemcpy;
  170. bool HasMemset;
  171. bool HasMemsetPattern;
  172. bool HasMemcpy;
  173. /// Return code for isLegalStore()
  174. enum LegalStoreKind {
  175. None = 0,
  176. Memset,
  177. MemsetPattern,
  178. Memcpy,
  179. UnorderedAtomicMemcpy,
  180. DontUse // Dummy retval never to be used. Allows catching errors in retval
  181. // handling.
  182. };
  183. /// \name Countable Loop Idiom Handling
  184. /// @{
  185. bool runOnCountableLoop();
  186. bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
  187. SmallVectorImpl<BasicBlock *> &ExitBlocks);
  188. void collectStores(BasicBlock *BB);
  189. LegalStoreKind isLegalStore(StoreInst *SI);
  190. enum class ForMemset { No, Yes };
  191. bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
  192. ForMemset For);
  193. template <typename MemInst>
  194. bool processLoopMemIntrinsic(
  195. BasicBlock *BB,
  196. bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
  197. const SCEV *BECount);
  198. bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  199. bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
  200. bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
  201. MaybeAlign StoreAlignment, Value *StoredVal,
  202. Instruction *TheStore,
  203. SmallPtrSetImpl<Instruction *> &Stores,
  204. const SCEVAddRecExpr *Ev, const SCEV *BECount,
  205. bool IsNegStride, bool IsLoopMemset = false);
  206. bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  207. bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
  208. const SCEV *StoreSize, MaybeAlign StoreAlign,
  209. MaybeAlign LoadAlign, Instruction *TheStore,
  210. Instruction *TheLoad,
  211. const SCEVAddRecExpr *StoreEv,
  212. const SCEVAddRecExpr *LoadEv,
  213. const SCEV *BECount);
  214. bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
  215. bool IsLoopMemset = false);
  216. /// @}
  217. /// \name Noncountable Loop Idiom Handling
  218. /// @{
  219. bool runOnNoncountableLoop();
  220. bool recognizePopcount();
  221. void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
  222. PHINode *CntPhi, Value *Var);
  223. bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  224. void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
  225. Instruction *CntInst, PHINode *CntPhi,
  226. Value *Var, Instruction *DefX,
  227. const DebugLoc &DL, bool ZeroCheck,
  228. bool IsCntPhiUsedOutsideLoop);
  229. bool recognizeShiftUntilBitTest();
  230. bool recognizeShiftUntilZero();
  231. /// @}
  232. };
  233. class LoopIdiomRecognizeLegacyPass : public LoopPass {
  234. public:
  235. static char ID;
  236. explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
  237. initializeLoopIdiomRecognizeLegacyPassPass(
  238. *PassRegistry::getPassRegistry());
  239. }
  240. bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  241. if (DisableLIRP::All)
  242. return false;
  243. if (skipLoop(L))
  244. return false;
  245. AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  246. DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  247. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  248. ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  249. TargetLibraryInfo *TLI =
  250. &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
  251. *L->getHeader()->getParent());
  252. const TargetTransformInfo *TTI =
  253. &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
  254. *L->getHeader()->getParent());
  255. const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
  256. auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
  257. MemorySSA *MSSA = nullptr;
  258. if (MSSAAnalysis)
  259. MSSA = &MSSAAnalysis->getMSSA();
  260. // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
  261. // pass. Function analyses need to be preserved across loop transformations
  262. // but ORE cannot be preserved (see comment before the pass definition).
  263. OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
  264. LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
  265. return LIR.runOnLoop(L);
  266. }
  267. /// This transformation requires natural loop information & requires that
  268. /// loop preheaders be inserted into the CFG.
  269. void getAnalysisUsage(AnalysisUsage &AU) const override {
  270. AU.addRequired<TargetLibraryInfoWrapperPass>();
  271. AU.addRequired<TargetTransformInfoWrapperPass>();
  272. AU.addPreserved<MemorySSAWrapperPass>();
  273. getLoopAnalysisUsage(AU);
  274. }
  275. };
  276. } // end anonymous namespace
  277. char LoopIdiomRecognizeLegacyPass::ID = 0;
  278. PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
  279. LoopStandardAnalysisResults &AR,
  280. LPMUpdater &) {
  281. if (DisableLIRP::All)
  282. return PreservedAnalyses::all();
  283. const auto *DL = &L.getHeader()->getModule()->getDataLayout();
  284. // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  285. // pass. Function analyses need to be preserved across loop transformations
  286. // but ORE cannot be preserved (see comment before the pass definition).
  287. OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
  288. LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
  289. AR.MSSA, DL, ORE);
  290. if (!LIR.runOnLoop(&L))
  291. return PreservedAnalyses::all();
  292. auto PA = getLoopPassPreservedAnalyses();
  293. if (AR.MSSA)
  294. PA.preserve<MemorySSAAnalysis>();
  295. return PA;
  296. }
  297. INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
  298. "Recognize loop idioms", false, false)
  299. INITIALIZE_PASS_DEPENDENCY(LoopPass)
  300. INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
  301. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  302. INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
  303. "Recognize loop idioms", false, false)
  304. Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
  305. static void deleteDeadInstruction(Instruction *I) {
  306. I->replaceAllUsesWith(UndefValue::get(I->getType()));
  307. I->eraseFromParent();
  308. }
  309. //===----------------------------------------------------------------------===//
  310. //
  311. // Implementation of LoopIdiomRecognize
  312. //
  313. //===----------------------------------------------------------------------===//
  314. bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  315. CurLoop = L;
  316. // If the loop could not be converted to canonical form, it must have an
  317. // indirectbr in it; just give up.
  318. if (!L->getLoopPreheader())
  319. return false;
  320. // Disable loop idiom recognition if the function's name is a common idiom, to avoid replacing the body of memset/memcpy with a call to itself.
  321. StringRef Name = L->getHeader()->getParent()->getName();
  322. if (Name == "memset" || Name == "memcpy")
  323. return false;
  324. // Determine if code size heuristics need to be applied.
  325. ApplyCodeSizeHeuristics =
  326. L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
  327. HasMemset = TLI->has(LibFunc_memset);
  328. HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  329. HasMemcpy = TLI->has(LibFunc_memcpy);
  330. if (HasMemset || HasMemsetPattern || HasMemcpy)
  331. if (SE->hasLoopInvariantBackedgeTakenCount(L))
  332. return runOnCountableLoop();
  333. return runOnNoncountableLoop();
  334. }
  335. bool LoopIdiomRecognize::runOnCountableLoop() {
  336. const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  337. assert(!isa<SCEVCouldNotCompute>(BECount) &&
  338. "runOnCountableLoop() called on a loop without a predictable"
  339. "backedge-taken count");
  340. // If this loop executes exactly one time, then it should be peeled, not
  341. // optimized by this pass.
  342. if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
  343. if (BECst->getAPInt() == 0)
  344. return false;
  345. SmallVector<BasicBlock *, 8> ExitBlocks;
  346. CurLoop->getUniqueExitBlocks(ExitBlocks);
  347. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
  348. << CurLoop->getHeader()->getParent()->getName()
  349. << "] Countable Loop %" << CurLoop->getHeader()->getName()
  350. << "\n");
  351. // The following transforms hoist stores/memsets into the loop pre-header.
  352. // Give up if the loop has instructions that may throw.
  353. SimpleLoopSafetyInfo SafetyInfo;
  354. SafetyInfo.computeLoopSafetyInfo(CurLoop);
  355. if (SafetyInfo.anyBlockMayThrow())
  356. return false;
  357. bool MadeChange = false;
  358. // Scan all the blocks in the loop that are not in subloops.
  359. for (auto *BB : CurLoop->getBlocks()) {
  360. // Ignore blocks in subloops.
  361. if (LI->getLoopFor(BB) != CurLoop)
  362. continue;
  363. MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  364. }
  365. return MadeChange;
  366. }
  367. static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  368. const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  369. return ConstStride->getAPInt();
  370. }
  371. /// getMemSetPatternValue - If a strided store of the specified value is safe to
  372. /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
  373. /// be passed in. Otherwise, return null.
  374. ///
  375. /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
  376. /// just replicate their input array and then pass on to memset_pattern16.
  377. static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  378. // FIXME: This could check for UndefValue because it can be merged into any
  379. // other valid pattern.
  380. // If the value isn't a constant, we can't promote it to being in a constant
  381. // array. We could theoretically do a store to an alloca or something, but
  382. // that doesn't seem worthwhile.
  383. Constant *C = dyn_cast<Constant>(V);
  384. if (!C)
  385. return nullptr;
  386. // Only handle simple values that are a power of two bytes in size.
  387. uint64_t Size = DL->getTypeSizeInBits(V->getType());
  388. if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
  389. return nullptr;
  390. // Don't care enough about darwin/ppc to implement this.
  391. if (DL->isBigEndian())
  392. return nullptr;
  393. // Convert to size in bytes.
  394. Size /= 8;
  395. // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
  396. // if the top and bottom are the same (e.g. for vectors and large integers).
  397. if (Size > 16)
  398. return nullptr;
  399. // If the constant is exactly 16 bytes, just use it.
  400. if (Size == 16)
  401. return C;
  402. // Otherwise, we'll use an array of the constants.
  403. unsigned ArraySize = 16 / Size;
  404. ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  405. return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
  406. }
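// Example (illustrative): a strided store of the i32 constant 0x01020304 is
// not a bytewise splat, so it cannot become a plain memset; here Size is 4,
// ArraySize is 16/4 = 4, and the function returns a [4 x i32] ConstantArray
// holding four copies of 0x01020304 -- a 16-byte pattern suitable for
// memset_pattern16.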
  407. LoopIdiomRecognize::LegalStoreKind
  408. LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  409. // Don't touch volatile stores.
  410. if (SI->isVolatile())
  411. return LegalStoreKind::None;
  412. // We only want simple or unordered-atomic stores.
  413. if (!SI->isUnordered())
  414. return LegalStoreKind::None;
  415. // Avoid merging nontemporal stores.
  416. if (SI->getMetadata(LLVMContext::MD_nontemporal))
  417. return LegalStoreKind::None;
  418. Value *StoredVal = SI->getValueOperand();
  419. Value *StorePtr = SI->getPointerOperand();
  420. // Don't convert stores of non-integral pointer types to memsets (which store
  421. // integers).
  422. if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
  423. return LegalStoreKind::None;
  424. // Reject stores that are so large that they overflow an unsigned.
  425. // When storing out scalable vectors we bail out for now, since the code
  426. // below currently only works for constant strides.
  427. TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  428. if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
  429. (SizeInBits.getFixedSize() >> 32) != 0)
  430. return LegalStoreKind::None;
  431. // See if the pointer expression is an AddRec like {base,+,1} on the current
  432. // loop, which indicates a strided store. If we have something else, it's a
  433. // random store we can't handle.
  434. const SCEVAddRecExpr *StoreEv =
  435. dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  436. if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
  437. return LegalStoreKind::None;
  438. // Check to see if we have a constant stride.
  439. if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
  440. return LegalStoreKind::None;
  441. // See if the store can be turned into a memset.
  442. // If the stored value is a byte-wise value (like i32 -1), then it may be
  443. // turned into a memset of i8 -1, assuming that all the consecutive bytes
  444. // are stored. A store of i32 0x01020304 can never be turned into a memset,
  445. // but it can be turned into memset_pattern if the target supports it.
  446. Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  447. // Note: memset and memset_pattern on unordered-atomic stores are not yet supported.
  448. bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
  449. // If we're allowed to form a memset, and the stored value would be
  450. // acceptable for memset, use it.
  451. if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
  452. // Verify that the stored value is loop invariant. If not, we can't
  453. // promote the memset.
  454. CurLoop->isLoopInvariant(SplatValue)) {
  455. // It looks like we can use SplatValue.
  456. return LegalStoreKind::Memset;
  457. }
  458. if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
  459. // Don't create memset_pattern16s for pointers in non-default address spaces.
  460. StorePtr->getType()->getPointerAddressSpace() == 0 &&
  461. getMemSetPatternValue(StoredVal, DL)) {
  462. // It looks like we can use PatternValue!
  463. return LegalStoreKind::MemsetPattern;
  464. }
  465. // Otherwise, see if the store can be turned into a memcpy.
  466. if (HasMemcpy && !DisableLIRP::Memcpy) {
  467. // Check to see if the stride matches the size of the store. If so, then we
  468. // know that every byte is touched in the loop.
  469. APInt Stride = getStoreStride(StoreEv);
  470. unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  471. if (StoreSize != Stride && StoreSize != -Stride)
  472. return LegalStoreKind::None;
  473. // The store must be feeding a non-volatile load.
  474. LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
  475. // Only allow non-volatile loads
  476. if (!LI || LI->isVolatile())
  477. return LegalStoreKind::None;
  478. // Only allow simple or unordered-atomic loads
  479. if (!LI->isUnordered())
  480. return LegalStoreKind::None;
  481. // See if the pointer expression is an AddRec like {base,+,1} on the current
  482. // loop, which indicates a strided load. If we have something else, it's a
  483. // random load we can't handle.
  484. const SCEVAddRecExpr *LoadEv =
  485. dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
  486. if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
  487. return LegalStoreKind::None;
  488. // The store and load must share the same stride.
  489. if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
  490. return LegalStoreKind::None;
  491. // Success. This store can be converted into a memcpy.
  492. UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
  493. return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
  494. : LegalStoreKind::Memcpy;
  495. }
  496. // This store can't be transformed into a memset/memcpy.
  497. return LegalStoreKind::None;
  498. }
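// Rough classification examples (illustrative), for a store inside a
// canonical `for (i = 0; i != n; ++i)` loop with unit stride:
//   A[i] = 0;            // bytewise splat                 -> LegalStoreKind::Memset
//   A[i] = 0x01020304;   // constant but not a splat       -> LegalStoreKind::MemsetPattern
//   A[i] = B[i];         // fed by an equally strided load -> LegalStoreKind::Memcpy
// assuming the corresponding libcalls are available and not disabled.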
  499. void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  500. StoreRefsForMemset.clear();
  501. StoreRefsForMemsetPattern.clear();
  502. StoreRefsForMemcpy.clear();
  503. for (Instruction &I : *BB) {
  504. StoreInst *SI = dyn_cast<StoreInst>(&I);
  505. if (!SI)
  506. continue;
  507. // Make sure this is a strided store with a constant stride.
  508. switch (isLegalStore(SI)) {
  509. case LegalStoreKind::None:
  510. // Nothing to do
  511. break;
  512. case LegalStoreKind::Memset: {
  513. // Find the base pointer.
  514. Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
  515. StoreRefsForMemset[Ptr].push_back(SI);
  516. } break;
  517. case LegalStoreKind::MemsetPattern: {
  518. // Find the base pointer.
  519. Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
  520. StoreRefsForMemsetPattern[Ptr].push_back(SI);
  521. } break;
  522. case LegalStoreKind::Memcpy:
  523. case LegalStoreKind::UnorderedAtomicMemcpy:
  524. StoreRefsForMemcpy.push_back(SI);
  525. break;
  526. default:
  527. assert(false && "unhandled return value");
  528. break;
  529. }
  530. }
  531. }
  532. /// runOnLoopBlock - Process the specified block, which lives in a counted loop
  533. /// with the specified backedge count. This block is known to be in the current
  534. /// loop and not in any subloops.
  535. bool LoopIdiomRecognize::runOnLoopBlock(
  536. BasicBlock *BB, const SCEV *BECount,
  537. SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  538. // We can only promote stores in this block if they are unconditionally
  539. // executed in the loop. For a block to be unconditionally executed, it has
  540. // to dominate all the exit blocks of the loop. Verify this now.
  541. for (BasicBlock *ExitBlock : ExitBlocks)
  542. if (!DT->dominates(BB, ExitBlock))
  543. return false;
  544. bool MadeChange = false;
  545. // Look for store instructions, which may be optimized to memset/memcpy.
  546. collectStores(BB);
  547. // Look for a single store or sets of stores with a common base, which can be
  548. // optimized into a memset (memset_pattern). The latter most commonly happens
  549. // with structs and hand-unrolled loops.
  550. for (auto &SL : StoreRefsForMemset)
  551. MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
  552. for (auto &SL : StoreRefsForMemsetPattern)
  553. MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
  554. // Optimize the store into a memcpy, if it is fed by a similarly strided load.
  555. for (auto &SI : StoreRefsForMemcpy)
  556. MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
  557. MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
  558. BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  559. MadeChange |= processLoopMemIntrinsic<MemSetInst>(
  560. BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
  561. return MadeChange;
  562. }
  563. /// See if these stores can be promoted to a memset.
  564. bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
  565. const SCEV *BECount, ForMemset For) {
  566. // Try to find consecutive stores that can be transformed into memsets.
  567. SetVector<StoreInst *> Heads, Tails;
  568. SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
  569. // Do a quadratic search on all of the given stores and find
  570. // all of the pairs of stores that follow each other.
  571. SmallVector<unsigned, 16> IndexQueue;
  572. for (unsigned i = 0, e = SL.size(); i < e; ++i) {
  573. assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
  574. Value *FirstStoredVal = SL[i]->getValueOperand();
  575. Value *FirstStorePtr = SL[i]->getPointerOperand();
  576. const SCEVAddRecExpr *FirstStoreEv =
  577. cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
  578. APInt FirstStride = getStoreStride(FirstStoreEv);
  579. unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
  580. // See if we can optimize just this store in isolation.
  581. if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
  582. Heads.insert(SL[i]);
  583. continue;
  584. }
  585. Value *FirstSplatValue = nullptr;
  586. Constant *FirstPatternValue = nullptr;
  587. if (For == ForMemset::Yes)
  588. FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
  589. else
  590. FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
  591. assert((FirstSplatValue || FirstPatternValue) &&
  592. "Expected either splat value or pattern value.");
  593. IndexQueue.clear();
  594. // If a store has multiple consecutive store candidates, search Stores
  595. // array according to the sequence: from i+1 to e, then from i-1 to 0.
  596. // This is because pairing with the immediately succeeding or preceding
  597. // candidate usually creates the best chance of finding a memset opportunity.
  598. unsigned j = 0;
  599. for (j = i + 1; j < e; ++j)
  600. IndexQueue.push_back(j);
  601. for (j = i; j > 0; --j)
  602. IndexQueue.push_back(j - 1);
  603. for (auto &k : IndexQueue) {
  604. assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
  605. Value *SecondStorePtr = SL[k]->getPointerOperand();
  606. const SCEVAddRecExpr *SecondStoreEv =
  607. cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
  608. APInt SecondStride = getStoreStride(SecondStoreEv);
  609. if (FirstStride != SecondStride)
  610. continue;
  611. Value *SecondStoredVal = SL[k]->getValueOperand();
  612. Value *SecondSplatValue = nullptr;
  613. Constant *SecondPatternValue = nullptr;
  614. if (For == ForMemset::Yes)
  615. SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
  616. else
  617. SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
  618. assert((SecondSplatValue || SecondPatternValue) &&
  619. "Expected either splat value or pattern value.");
  620. if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
  621. if (For == ForMemset::Yes) {
  622. if (isa<UndefValue>(FirstSplatValue))
  623. FirstSplatValue = SecondSplatValue;
  624. if (FirstSplatValue != SecondSplatValue)
  625. continue;
  626. } else {
  627. if (isa<UndefValue>(FirstPatternValue))
  628. FirstPatternValue = SecondPatternValue;
  629. if (FirstPatternValue != SecondPatternValue)
  630. continue;
  631. }
  632. Tails.insert(SL[k]);
  633. Heads.insert(SL[i]);
  634. ConsecutiveChain[SL[i]] = SL[k];
  635. break;
  636. }
  637. }
  638. }
  639. // We may run into multiple chains that merge into a single chain. We mark the
  640. // stores that we transformed so that we don't visit the same store twice.
  641. SmallPtrSet<Value *, 16> TransformedStores;
  642. bool Changed = false;
  643. // For stores that start but don't end a link in the chain:
  644. for (StoreInst *I : Heads) {
  645. if (Tails.count(I))
  646. continue;
  647. // We found a store instr that starts a chain. Now follow the chain and try
  648. // to transform it.
  649. SmallPtrSet<Instruction *, 8> AdjacentStores;
  650. StoreInst *HeadStore = I;
  651. unsigned StoreSize = 0;
  652. // Collect the chain into a list.
  653. while (Tails.count(I) || Heads.count(I)) {
  654. if (TransformedStores.count(I))
  655. break;
  656. AdjacentStores.insert(I);
  657. StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
  658. // Move to the next value in the chain.
  659. I = ConsecutiveChain[I];
  660. }
  661. Value *StoredVal = HeadStore->getValueOperand();
  662. Value *StorePtr = HeadStore->getPointerOperand();
  663. const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  664. APInt Stride = getStoreStride(StoreEv);
  665. // Check to see if the stride matches the size of the stores. If so, then
  666. // we know that every byte is touched in the loop.
  667. if (StoreSize != Stride && StoreSize != -Stride)
  668. continue;
  669. bool IsNegStride = StoreSize == -Stride;
  670. Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
  671. const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
  672. if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
  673. MaybeAlign(HeadStore->getAlign()), StoredVal,
  674. HeadStore, AdjacentStores, StoreEv, BECount,
  675. IsNegStride)) {
  676. TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
  677. Changed = true;
  678. }
  679. }
  680. return Changed;
  681. }
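// Illustrative case for the chaining above: a hand-unrolled or struct loop
// such as
//   for (i = 0; i != n; ++i) { P[i].a = 0; P[i].b = 0; }
// emits two adjacent stores per iteration; once chained, their combined size
// matches the stride, so the pair is handed to processLoopStridedStore as a
// single wider per-iteration memset candidate.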
  682. /// processLoopMemIntrinsic - Template function for calling different processor
  683. /// functions based on mem intrinsic type.
  684. template <typename MemInst>
  685. bool LoopIdiomRecognize::processLoopMemIntrinsic(
  686. BasicBlock *BB,
  687. bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
  688. const SCEV *BECount) {
  689. bool MadeChange = false;
  690. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
  691. Instruction *Inst = &*I++;
  692. // Look for memory intrinsics, which may be optimized into a larger one.
  693. if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
  694. WeakTrackingVH InstPtr(&*I);
  695. if (!(this->*Processor)(MI, BECount))
  696. continue;
  697. MadeChange = true;
  698. // If processing the instruction invalidated our iterator, start over from
  699. // the top of the block.
  700. if (!InstPtr)
  701. I = BB->begin();
  702. }
  703. }
  704. return MadeChange;
  705. }
  706. /// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
  707. bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
  708. const SCEV *BECount) {
  709. // We can only handle non-volatile memcpys with a constant size.
  710. if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
  711. return false;
  712. // If we're not allowed to hack on memcpy, we fail.
  713. if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
  714. return false;
  715. Value *Dest = MCI->getDest();
  716. Value *Source = MCI->getSource();
  717. if (!Dest || !Source)
  718. return false;
  719. // See if the load and store pointer expressions are AddRec like {base,+,1} on
  720. // the current loop, which indicates a strided load and store. If we have
  721. // something else, it's a random load or store we can't handle.
  722. const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  723. if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
  724. return false;
  725. const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  726. if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
  727. return false;
  728. // Reject memcpys that are so large that they overflow an unsigned.
  729. uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  730. if ((SizeInBytes >> 32) != 0)
  731. return false;
  732. // Check if the stride matches the size of the memcpy. If so, then we know
  733. // that every byte is touched in the loop.
  734. const SCEVConstant *ConstStoreStride =
  735. dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  736. const SCEVConstant *ConstLoadStride =
  737. dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  738. if (!ConstStoreStride || !ConstLoadStride)
  739. return false;
  740. APInt StoreStrideValue = ConstStoreStride->getAPInt();
  741. APInt LoadStrideValue = ConstLoadStride->getAPInt();
  742. // Huge stride value - give up
  743. if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
  744. return false;
  745. if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
  746. ORE.emit([&]() {
  747. return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
  748. << ore::NV("Inst", "memcpy") << " in "
  749. << ore::NV("Function", MCI->getFunction())
  750. << " function will not be hoisted: "
  751. << ore::NV("Reason", "memcpy size is not equal to stride");
  752. });
  753. return false;
  754. }
  755. int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  756. int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  757. // Check if the load stride matches the store stride.
  758. if (StoreStrideInt != LoadStrideInt)
  759. return false;
  760. return processLoopStoreOfLoopLoad(
  761. Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
  762. MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
  763. BECount);
  764. }
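// Illustrative example: a loop copying one element per iteration,
//   for (i = 0; i != n; ++i)
//     memcpy(&A[i], &B[i], sizeof(A[0]));
// has load and store strides equal to the per-iteration copy size, so it can
// be rewritten as a single memcpy of n * sizeof(A[0]) bytes in the preheader
// (subject to the aliasing checks in processLoopStoreOfLoopLoad).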
  765. /// processLoopMemSet - See if this memset can be promoted to a large memset.
  766. bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
  767. const SCEV *BECount) {
  768. // We can only handle non-volatile memsets.
  769. if (MSI->isVolatile())
  770. return false;
  771. // If we're not allowed to hack on memset, we fail.
  772. if (!HasMemset || DisableLIRP::Memset)
  773. return false;
  774. Value *Pointer = MSI->getDest();
  775. // See if the pointer expression is an AddRec like {base,+,1} on the current
  776. // loop, which indicates a strided store. If we have something else, it's a
  777. // random store we can't handle.
  778. const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  779. if (!Ev || Ev->getLoop() != CurLoop)
  780. return false;
  781. if (!Ev->isAffine()) {
  782. LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
  783. return false;
  784. }
  785. const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  786. const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  787. if (!PointerStrideSCEV || !MemsetSizeSCEV)
  788. return false;
  789. bool IsNegStride = false;
  790. const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
  791. if (IsConstantSize) {
  792. // Memset size is constant.
  793. // Check if the pointer stride matches the memset size. If so, then
  794. // we know that every byte is touched in the loop.
  795. LLVM_DEBUG(dbgs() << " memset size is constant\n");
  796. uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  797. const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  798. if (!ConstStride)
  799. return false;
  800. APInt Stride = ConstStride->getAPInt();
  801. if (SizeInBytes != Stride && SizeInBytes != -Stride)
  802. return false;
  803. IsNegStride = SizeInBytes == -Stride;
  804. } else {
  805. // Memset size is non-constant.
  806. // Check if the pointer stride matches the memset size.
  807. // To be conservative, the pass does not promote pointers that aren't in
  808. // address space zero. Also, the pass only handles memset length and stride
  809. // that are invariant for the top level loop.
  810. LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
  811. if (Pointer->getType()->getPointerAddressSpace() != 0) {
  812. LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
  813. << "abort\n");
  814. return false;
  815. }
  816. if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
  817. LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
  818. << "abort\n");
  819. return false;
  820. }
  821. // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
  822. IsNegStride = PointerStrideSCEV->isNonConstantNegative();
  823. const SCEV *PositiveStrideSCEV =
  824. IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
  825. : PointerStrideSCEV;
  826. LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
  827. << " PositiveStrideSCEV: " << *PositiveStrideSCEV
  828. << "\n");
  829. if (PositiveStrideSCEV != MemsetSizeSCEV) {
  830. // If an expression is covered by the loop guard, compare again and
  831. // proceed with optimization if equal.
  832. const SCEV *FoldedPositiveStride =
  833. SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
  834. const SCEV *FoldedMemsetSize =
  835. SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
  836. LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
  837. << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
  838. << " FoldedPositiveStride: " << *FoldedPositiveStride
  839. << "\n");
  840. if (FoldedPositiveStride != FoldedMemsetSize) {
  841. LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
  842. return false;
  843. }
  844. }
  845. }
  846. // Verify that the memset value is loop invariant. If not, we can't promote
  847. // the memset.
  848. Value *SplatValue = MSI->getValue();
  849. if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
  850. return false;
  851. SmallPtrSet<Instruction *, 1> MSIs;
  852. MSIs.insert(MSI);
  853. return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
  854. MaybeAlign(MSI->getDestAlignment()),
  855. SplatValue, MSI, MSIs, Ev, BECount,
  856. IsNegStride, /*IsLoopMemset=*/true);
  857. }
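// Illustrative example: the per-iteration memset
//   for (i = 0; i != n; ++i)
//     memset(&A[i], 0, sizeof(A[0]));
// has a stride equal to its length, so it is widened into a single
// memset(A, 0, n * sizeof(A[0])) via processLoopStridedStore.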
  858. /// mayLoopAccessLocation - Return true if the specified loop might access the
  859. /// specified pointer location, which is a loop-strided access. The 'Access'
  860. /// argument specifies what the verboten forms of access are (read or write).
  861. static bool
  862. mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
  863. const SCEV *BECount, const SCEV *StoreSizeSCEV,
  864. AliasAnalysis &AA,
  865. SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  866. // Get the location that may be stored across the loop. Since the access is
  867. // strided positively through memory, we say that the modified location starts
  868. // at the pointer and has infinite size.
  869. LocationSize AccessSize = LocationSize::afterPointer();
  870. // If the loop iterates a fixed number of times, we can refine the access size
  871. // to be exactly the size of the memset, which is (BECount+1)*StoreSize
  872. const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  873. const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  874. if (BECst && ConstSize)
  875. AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
  876. ConstSize->getValue()->getZExtValue());
  877. // TODO: For this to be really effective, we have to dive into the pointer
  878. // operand in the store. A store to &A[i] over 100 iterations will always be
  879. // reported as may-alias with a store to &A[100]; we need StoreLoc to be "A"
  880. // with size 100, which will then no-alias a store to &A[100].
  881. MemoryLocation StoreLoc(Ptr, AccessSize);
  882. for (BasicBlock *B : L->blocks())
  883. for (Instruction &I : *B)
  884. if (!IgnoredInsts.contains(&I) &&
  885. isModOrRefSet(
  886. intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
  887. return true;
  888. return false;
  889. }
  890. // If we have a negative stride, Start refers to the end of the memory location
  891. // we're trying to memset. Therefore, we need to recompute the base pointer,
  892. // which is just Start - BECount*Size.
  893. static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
  894. Type *IntPtr, const SCEV *StoreSizeSCEV,
  895. ScalarEvolution *SE) {
  896. const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  897. if (!StoreSizeSCEV->isOne()) {
  898. // index = back edge count * store size
  899. Index = SE->getMulExpr(Index,
  900. SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
  901. SCEV::FlagNUW);
  902. }
  903. // base pointer = start - index
  904. return SE->getMinusSCEV(Start, Index);
  905. }
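// Worked example (illustrative): for a loop writing A[n-1], A[n-2], ..., A[0]
// with 4-byte elements, Start is &A[n-1], BECount is n-1, and StoreSize is 4,
// so the recomputed base is &A[n-1] - (n-1)*4 bytes == &A[0], which is where
// the equivalent forward memset/memcpy must begin.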
  906. /// Compute trip count from the backedge taken count.
  907. static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
  908. Loop *CurLoop, const DataLayout *DL,
  909. ScalarEvolution *SE) {
  910. const SCEV *TripCountS = nullptr;
  911. // The trip count is (BECount + 1). Expand the trip count out to
  912. // pointer size if it isn't already.
  913. //
  914. // If we're going to need to zero extend the BE count, check if we can add
  915. // one to it prior to zero extending without overflow. Provided this is safe,
  916. // it allows better simplification of the +1.
  917. if (DL->getTypeSizeInBits(BECount->getType()) <
  918. DL->getTypeSizeInBits(IntPtr) &&
  919. SE->isLoopEntryGuardedByCond(
  920. CurLoop, ICmpInst::ICMP_NE, BECount,
  921. SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
  922. TripCountS = SE->getZeroExtendExpr(
  923. SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
  924. IntPtr);
  925. } else {
  926. TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
  927. SE->getOne(IntPtr), SCEV::FlagNUW);
  928. }
  929. return TripCountS;
  930. }
  931. /// Compute the number of bytes as a SCEV from the backedge taken count.
  932. ///
  933. /// This also maps the SCEV into the provided type and tries to handle the
  934. /// computation in a way that will fold cleanly.
  935. static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
  936. const SCEV *StoreSizeSCEV, Loop *CurLoop,
  937. const DataLayout *DL, ScalarEvolution *SE) {
  938. const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
  939. return SE->getMulExpr(TripCountSCEV,
  940. SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
  941. SCEV::FlagNUW);
  942. }
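// In other words (illustrative): a loop with backedge-taken count BECount that
// stores StoreSize bytes per iteration writes
//   NumBytes = (BECount + 1) * StoreSize
// bytes in total; this is the length expanded in the preheader and passed to
// the emitted memset/memcpy.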
  943. /// processLoopStridedStore - We see a strided store of some value. If we can
  944. /// transform this into a memset or memset_pattern in the loop preheader, do so.
  945. bool LoopIdiomRecognize::processLoopStridedStore(
  946. Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
  947. Value *StoredVal, Instruction *TheStore,
  948. SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
  949. const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  950. Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  951. Constant *PatternValue = nullptr;
  952. if (!SplatValue)
  953. PatternValue = getMemSetPatternValue(StoredVal, DL);
  954. assert((SplatValue || PatternValue) &&
  955. "Expected either splat value or pattern value.");
  956. // The trip count of the loop and the base pointer of the addrec SCEV are
  957. // guaranteed to be loop invariant, which means that they should dominate the
  958. // header. This allows us to insert code for them in the preheader.
  959. unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  960. BasicBlock *Preheader = CurLoop->getLoopPreheader();
  961. IRBuilder<> Builder(Preheader->getTerminator());
  962. SCEVExpander Expander(*SE, *DL, "loop-idiom");
  963. SCEVExpanderCleaner ExpCleaner(Expander);
  964. Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  965. Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
  966. bool Changed = false;
  967. const SCEV *Start = Ev->getStart();
  968. // Handle negative strided loops.
  969. if (IsNegStride)
  970. Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
  971. // TODO: ideally we should still be able to generate memset if SCEV expander
  972. // is taught to generate the dependencies at the latest point.
  973. if (!isSafeToExpand(Start, *SE))
  974. return Changed;
  975. // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  976. // this into a memset in the loop preheader now if we want. However, this
  977. // would be unsafe to do if there is anything else in the loop that may read
  978. // or write to the aliased location. Check for any overlap by generating the
  979. // base pointer and checking the region.
  980. Value *BasePtr =
  981. Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
  982. // From here on out, conservatively report to the pass manager that we've
  983. // changed the IR, even if we later clean up these added instructions. There
  984. // may be structural differences e.g. in the order of use lists not accounted
  985. // for in just a textual dump of the IR. This is written as a variable, even
  986. // though statically all the places this dominates could be replaced with
  987. // 'true', with the hope that anyone trying to be clever / "more precise" with
  988. // the return value will read this comment, and leave them alone.
  989. Changed = true;
  990. if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
  991. StoreSizeSCEV, *AA, Stores))
  992. return Changed;
  993. if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
  994. return Changed;
  995. // Okay, everything looks good, insert the memset.
  996. const SCEV *NumBytesS =
  997. getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
  998. // TODO: ideally we should still be able to generate memset if SCEV expander
  999. // is taught to generate the dependencies at the latest point.
  1000. if (!isSafeToExpand(NumBytesS, *SE))
  1001. return Changed;
  1002. Value *NumBytes =
  1003. Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  1004. CallInst *NewCall;
  1005. if (SplatValue) {
  1006. AAMDNodes AATags = TheStore->getAAMetadata();
  1007. for (Instruction *Store : Stores)
  1008. AATags = AATags.merge(Store->getAAMetadata());
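// If the memset length is a compile-time constant, extend the merged AA
// metadata to cover exactly that many bytes; otherwise mark the size as
// unknown.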
  1009. if (auto CI = dyn_cast<ConstantInt>(NumBytes))
  1010. AATags = AATags.extendTo(CI->getZExtValue());
  1011. else
  1012. AATags = AATags.extendTo(-1);
  1013. NewCall = Builder.CreateMemSet(
  1014. BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
  1015. /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  1016. } else {
1017. // Everything is emitted in the default address space
  1018. Type *Int8PtrTy = DestInt8PtrTy;
  1019. Module *M = TheStore->getModule();
  1020. StringRef FuncName = "memset_pattern16";
  1021. FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
  1022. Int8PtrTy, Int8PtrTy, IntIdxTy);
  1023. inferLibFuncAttributes(M, FuncName, *TLI);
1024. // Otherwise we should form a memset_pattern16. PatternValue is known to be
1025. // a constant array of 16 bytes. Plop the value into a mergeable global.
  1026. GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
  1027. GlobalValue::PrivateLinkage,
  1028. PatternValue, ".memset_pattern");
  1029. GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
  1030. GV->setAlignment(Align(16));
  1031. Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
  1032. NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  1033. }
  1034. NewCall->setDebugLoc(TheStore->getDebugLoc());
  1035. if (MSSAU) {
  1036. MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
  1037. NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
  1038. MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  1039. }
  1040. LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
  1041. << " from store to: " << *Ev << " at: " << *TheStore
  1042. << "\n");
  1043. ORE.emit([&]() {
  1044. OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
  1045. NewCall->getDebugLoc(), Preheader);
  1046. R << "Transformed loop-strided store in "
  1047. << ore::NV("Function", TheStore->getFunction())
  1048. << " function into a call to "
  1049. << ore::NV("NewFunction", NewCall->getCalledFunction())
  1050. << "() intrinsic";
  1051. if (!Stores.empty())
  1052. R << ore::setExtraArgs();
  1053. for (auto *I : Stores) {
  1054. R << ore::NV("FromBlock", I->getParent()->getName())
  1055. << ore::NV("ToBlock", Preheader->getName());
  1056. }
  1057. return R;
  1058. });
  1059. // Okay, the memset has been formed. Zap the original store and anything that
  1060. // feeds into it.
  1061. for (auto *I : Stores) {
  1062. if (MSSAU)
  1063. MSSAU->removeMemoryAccess(I, true);
  1064. deleteDeadInstruction(I);
  1065. }
  1066. if (MSSAU && VerifyMemorySSA)
  1067. MSSAU->getMemorySSA()->verifyMemorySSA();
  1068. ++NumMemSet;
  1069. ExpCleaner.markResultUsed();
  1070. return true;
  1071. }
  1072. /// If the stored value is a strided load in the same loop with the same stride
  1073. /// this may be transformable into a memcpy. This kicks in for stuff like
  1074. /// for (i) A[i] = B[i];
  1075. bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
  1076. const SCEV *BECount) {
  1077. assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
  1078. Value *StorePtr = SI->getPointerOperand();
  1079. const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  1080. unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  1081. // The store must be feeding a non-volatile load.
  1082. LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  1083. assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
  1084. // See if the pointer expression is an AddRec like {base,+,1} on the current
  1085. // loop, which indicates a strided load. If we have something else, it's a
  1086. // random load we can't handle.
  1087. Value *LoadPtr = LI->getPointerOperand();
  1088. const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
  1089. const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  1090. return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
  1091. SI->getAlign(), LI->getAlign(), SI, LI,
  1092. StoreEv, LoadEv, BECount);
  1093. }
  1094. class MemmoveVerifier {
  1095. public:
  1096. explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
  1097. const DataLayout &DL)
  1098. : DL(DL), LoadOff(0), StoreOff(0),
  1099. BP1(llvm::GetPointerBaseWithConstantOffset(
  1100. LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
  1101. BP2(llvm::GetPointerBaseWithConstantOffset(
  1102. StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
  1103. IsSameObject(BP1 == BP2) {}
  1104. bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
  1105. const Instruction &TheLoad,
  1106. bool IsMemCpy) const {
  1107. if (IsMemCpy) {
1108. // Ensure that LoadBasePtr is after StoreBasePtr, or before it when the
1109. // stride is negative.
  1110. if ((!IsNegStride && LoadOff <= StoreOff) ||
  1111. (IsNegStride && LoadOff >= StoreOff))
  1112. return false;
  1113. } else {
1114. // Ensure that LoadBasePtr is after StoreBasePtr, or before it when the
1115. // stride is negative, and that it doesn't overlap with StoreBasePtr.
  1116. int64_t LoadSize =
  1117. DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
  1118. if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
  1119. return false;
  1120. if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
  1121. (IsNegStride && LoadOff + LoadSize > StoreOff))
  1122. return false;
  1123. }
  1124. return true;
  1125. }
  1126. private:
  1127. const DataLayout &DL;
  1128. int64_t LoadOff;
  1129. int64_t StoreOff;
  1130. const Value *BP1;
  1131. const Value *BP2;
  1132. public:
  1133. const bool IsSameObject;
  1134. };
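// Illustrative example (an assumption, not from the original source): in a
// loop such as
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = p[i + 4];
// the load and store share the same underlying object, so only a memmove is a
// safe replacement; loadAndStoreMayFormMemmove() then checks that the copy
// direction can never overwrite a byte before it has been read.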
  1135. bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
  1136. Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
  1137. MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
  1138. Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
  1139. const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
  1140. // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  1141. // conservatively bail here, since otherwise we may have to transform
  1142. // llvm.memcpy.inline into llvm.memcpy which is illegal.
  1143. if (isa<MemCpyInlineInst>(TheStore))
  1144. return false;
1145. // The trip count of the loop and the base pointer of the addrec SCEV are
1146. // guaranteed to be loop invariant, which means that they should dominate the
1147. // header. This allows us to insert code for them in the preheader.
  1148. BasicBlock *Preheader = CurLoop->getLoopPreheader();
  1149. IRBuilder<> Builder(Preheader->getTerminator());
  1150. SCEVExpander Expander(*SE, *DL, "loop-idiom");
  1151. SCEVExpanderCleaner ExpCleaner(Expander);
  1152. bool Changed = false;
  1153. const SCEV *StrStart = StoreEv->getStart();
  1154. unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  1155. Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
  1156. APInt Stride = getStoreStride(StoreEv);
  1157. const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1158. // TODO: Deal with non-constant sizes; currently we expect a constant store size.
  1159. assert(ConstStoreSize && "store size is expected to be a constant");
  1160. int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  1161. bool IsNegStride = StoreSize == -Stride;
  1162. // Handle negative strided loops.
  1163. if (IsNegStride)
  1164. StrStart =
  1165. getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
  1166. // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  1167. // this into a memcpy in the loop preheader now if we want. However, this
  1168. // would be unsafe to do if there is anything else in the loop that may read
  1169. // or write the memory region we're storing to. This includes the load that
  1170. // feeds the stores. Check for an alias by generating the base address and
  1171. // checking everything.
  1172. Value *StoreBasePtr = Expander.expandCodeFor(
  1173. StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
  1174. // From here on out, conservatively report to the pass manager that we've
  1175. // changed the IR, even if we later clean up these added instructions. There
  1176. // may be structural differences e.g. in the order of use lists not accounted
  1177. // for in just a textual dump of the IR. This is written as a variable, even
  1178. // though statically all the places this dominates could be replaced with
  1179. // 'true', with the hope that anyone trying to be clever / "more precise" with
  1180. // the return value will read this comment, and leave them alone.
  1181. Changed = true;
  1182. SmallPtrSet<Instruction *, 2> IgnoredInsts;
  1183. IgnoredInsts.insert(TheStore);
  1184. bool IsMemCpy = isa<MemCpyInst>(TheStore);
  1185. const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
  1186. bool LoopAccessStore =
  1187. mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
  1188. StoreSizeSCEV, *AA, IgnoredInsts);
  1189. if (LoopAccessStore) {
1190. // For the memmove case it's not enough to guarantee that the loop doesn't
1191. // access TheStore and TheLoad. Additionally we need to make sure that
1192. // TheStore is the only user of TheLoad.
  1193. if (!TheLoad->hasOneUse())
  1194. return Changed;
  1195. IgnoredInsts.insert(TheLoad);
  1196. if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
  1197. BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
  1198. ORE.emit([&]() {
  1199. return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
  1200. TheStore)
  1201. << ore::NV("Inst", InstRemark) << " in "
  1202. << ore::NV("Function", TheStore->getFunction())
  1203. << " function will not be hoisted: "
  1204. << ore::NV("Reason", "The loop may access store location");
  1205. });
  1206. return Changed;
  1207. }
  1208. IgnoredInsts.erase(TheLoad);
  1209. }
  1210. const SCEV *LdStart = LoadEv->getStart();
  1211. unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
  1212. // Handle negative strided loops.
  1213. if (IsNegStride)
  1214. LdStart =
  1215. getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
  1216. // For a memcpy, we have to make sure that the input array is not being
  1217. // mutated by the loop.
  1218. Value *LoadBasePtr = Expander.expandCodeFor(
  1219. LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
  1220. // If the store is a memcpy instruction, we must check if it will write to
  1221. // the load memory locations. So remove it from the ignored stores.
  1222. MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  1223. if (IsMemCpy && !Verifier.IsSameObject)
  1224. IgnoredInsts.erase(TheStore);
  1225. if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
  1226. StoreSizeSCEV, *AA, IgnoredInsts)) {
  1227. ORE.emit([&]() {
  1228. return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
  1229. << ore::NV("Inst", InstRemark) << " in "
  1230. << ore::NV("Function", TheStore->getFunction())
  1231. << " function will not be hoisted: "
  1232. << ore::NV("Reason", "The loop may access load location");
  1233. });
  1234. return Changed;
  1235. }
  1236. bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  1237. if (UseMemMove)
  1238. if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
  1239. IsMemCpy))
  1240. return Changed;
  1241. if (avoidLIRForMultiBlockLoop())
  1242. return Changed;
  1243. // Okay, everything is safe, we can transform this!
  1244. const SCEV *NumBytesS =
  1245. getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
  1246. Value *NumBytes =
  1247. Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  1248. AAMDNodes AATags = TheLoad->getAAMetadata();
  1249. AAMDNodes StoreAATags = TheStore->getAAMetadata();
  1250. AATags = AATags.merge(StoreAATags);
  1251. if (auto CI = dyn_cast<ConstantInt>(NumBytes))
  1252. AATags = AATags.extendTo(CI->getZExtValue());
  1253. else
  1254. AATags = AATags.extendTo(-1);
  1255. CallInst *NewCall = nullptr;
  1256. // Check whether to generate an unordered atomic memcpy:
  1257. // If the load or store are atomic, then they must necessarily be unordered
  1258. // by previous checks.
  1259. if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
  1260. if (UseMemMove)
  1261. NewCall = Builder.CreateMemMove(
  1262. StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
  1263. /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  1264. else
  1265. NewCall =
  1266. Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
  1267. NumBytes, /*isVolatile=*/false, AATags.TBAA,
  1268. AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  1269. } else {
  1270. // For now don't support unordered atomic memmove.
  1271. if (UseMemMove)
  1272. return Changed;
  1273. // We cannot allow unaligned ops for unordered load/store, so reject
  1274. // anything where the alignment isn't at least the element size.
  1275. assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
  1276. "Expect unordered load/store to have align.");
  1277. if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
  1278. return Changed;
  1279. // If the element.atomic memcpy is not lowered into explicit
  1280. // loads/stores later, then it will be lowered into an element-size
  1281. // specific lib call. If the lib call doesn't exist for our store size, then
  1282. // we shouldn't generate the memcpy.
  1283. if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
  1284. return Changed;
  1285. // Create the call.
  1286. // Note that unordered atomic loads/stores are *required* by the spec to
  1287. // have an alignment but non-atomic loads/stores may not.
  1288. NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
  1289. StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
  1290. NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
  1291. AATags.NoAlias);
  1292. }
  1293. NewCall->setDebugLoc(TheStore->getDebugLoc());
  1294. if (MSSAU) {
  1295. MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
  1296. NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
  1297. MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  1298. }
  1299. LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
  1300. << " from load ptr=" << *LoadEv << " at: " << *TheLoad
  1301. << "\n"
  1302. << " from store ptr=" << *StoreEv << " at: " << *TheStore
  1303. << "\n");
  1304. ORE.emit([&]() {
  1305. return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
  1306. NewCall->getDebugLoc(), Preheader)
  1307. << "Formed a call to "
  1308. << ore::NV("NewFunction", NewCall->getCalledFunction())
  1309. << "() intrinsic from " << ore::NV("Inst", InstRemark)
  1310. << " instruction in " << ore::NV("Function", TheStore->getFunction())
  1311. << " function"
  1312. << ore::setExtraArgs()
  1313. << ore::NV("FromBlock", TheStore->getParent()->getName())
  1314. << ore::NV("ToBlock", Preheader->getName());
  1315. });
  1316. // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  1317. // and anything that feeds into it.
  1318. if (MSSAU)
  1319. MSSAU->removeMemoryAccess(TheStore, true);
  1320. deleteDeadInstruction(TheStore);
  1321. if (MSSAU && VerifyMemorySSA)
  1322. MSSAU->getMemorySSA()->verifyMemorySSA();
  1323. if (UseMemMove)
  1324. ++NumMemMove;
  1325. else
  1326. ++NumMemCpy;
  1327. ExpCleaner.markResultUsed();
  1328. return true;
  1329. }
  1330. // When compiling for codesize we avoid idiom recognition for a multi-block loop
  1331. // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
  1332. //
  1333. bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
  1334. bool IsLoopMemset) {
  1335. if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
  1336. if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
  1337. LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
  1338. << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
  1339. << " avoided: multi-block top-level loop\n");
  1340. return true;
  1341. }
  1342. }
  1343. return false;
  1344. }
  1345. bool LoopIdiomRecognize::runOnNoncountableLoop() {
  1346. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
  1347. << CurLoop->getHeader()->getParent()->getName()
  1348. << "] Noncountable Loop %"
  1349. << CurLoop->getHeader()->getName() << "\n");
  1350. return recognizePopcount() || recognizeAndInsertFFS() ||
  1351. recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
  1352. }
1353. /// Check if the given conditional branch is based on a comparison between a
1354. /// variable and zero, and if the variable is non-zero (or zero, when JmpOnZero
1355. /// is true) the control yields to the loop entry. If the branch matches this
1356. /// behavior, the variable involved in the comparison is returned. This function
1357. /// is called to see if the precondition and postcondition of the loop are in
1358. /// the desirable form.
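/// For example (illustrative IR, an assumption rather than a quoted test):
/// \code
///   %cmp = icmp ne i32 %x, 0
///   br i1 %cmp, label %loop.entry, label %exit
/// \endcode
/// returns %x when LoopEntry is %loop.entry and JmpOnZero is false.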
  1359. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
  1360. bool JmpOnZero = false) {
  1361. if (!BI || !BI->isConditional())
  1362. return nullptr;
  1363. ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  1364. if (!Cond)
  1365. return nullptr;
  1366. ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  1367. if (!CmpZero || !CmpZero->isZero())
  1368. return nullptr;
  1369. BasicBlock *TrueSucc = BI->getSuccessor(0);
  1370. BasicBlock *FalseSucc = BI->getSuccessor(1);
  1371. if (JmpOnZero)
  1372. std::swap(TrueSucc, FalseSucc);
  1373. ICmpInst::Predicate Pred = Cond->getPredicate();
  1374. if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
  1375. (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
  1376. return Cond->getOperand(0);
  1377. return nullptr;
  1378. }
  1379. // Check if the recurrence variable `VarX` is in the right form to create
  1380. // the idiom. Returns the value coerced to a PHINode if so.
  1381. static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
  1382. BasicBlock *LoopEntry) {
  1383. auto *PhiX = dyn_cast<PHINode>(VarX);
  1384. if (PhiX && PhiX->getParent() == LoopEntry &&
  1385. (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
  1386. return PhiX;
  1387. return nullptr;
  1388. }
  1389. /// Return true iff the idiom is detected in the loop.
  1390. ///
  1391. /// Additionally:
  1392. /// 1) \p CntInst is set to the instruction counting the population bit.
  1393. /// 2) \p CntPhi is set to the corresponding phi node.
  1394. /// 3) \p Var is set to the value whose population bits are being counted.
  1395. ///
  1396. /// The core idiom we are trying to detect is:
  1397. /// \code
  1398. /// if (x0 != 0)
  1399. /// goto loop-exit // the precondition of the loop
  1400. /// cnt0 = init-val;
  1401. /// do {
  1402. /// x1 = phi (x0, x2);
  1403. /// cnt1 = phi(cnt0, cnt2);
  1404. ///
  1405. /// cnt2 = cnt1 + 1;
  1406. /// ...
  1407. /// x2 = x1 & (x1 - 1);
  1408. /// ...
  1409. /// } while(x != 0);
  1410. ///
  1411. /// loop-exit:
  1412. /// \endcode
  1413. static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
  1414. Instruction *&CntInst, PHINode *&CntPhi,
  1415. Value *&Var) {
1416. // step 1: Check to see if the loop-back branch matches this pattern:
1417. // "if (a != 0) goto loop-entry".
  1418. BasicBlock *LoopEntry;
  1419. Instruction *DefX2, *CountInst;
  1420. Value *VarX1, *VarX0;
  1421. PHINode *PhiX, *CountPhi;
  1422. DefX2 = CountInst = nullptr;
  1423. VarX1 = VarX0 = nullptr;
  1424. PhiX = CountPhi = nullptr;
  1425. LoopEntry = *(CurLoop->block_begin());
  1426. // step 1: Check if the loop-back branch is in desirable form.
  1427. {
  1428. if (Value *T = matchCondition(
  1429. dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
  1430. DefX2 = dyn_cast<Instruction>(T);
  1431. else
  1432. return false;
  1433. }
  1434. // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  1435. {
  1436. if (!DefX2 || DefX2->getOpcode() != Instruction::And)
  1437. return false;
  1438. BinaryOperator *SubOneOp;
  1439. if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
  1440. VarX1 = DefX2->getOperand(1);
  1441. else {
  1442. VarX1 = DefX2->getOperand(0);
  1443. SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
  1444. }
  1445. if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
  1446. return false;
  1447. ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
  1448. if (!Dec ||
  1449. !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
  1450. (SubOneOp->getOpcode() == Instruction::Add &&
  1451. Dec->isMinusOne()))) {
  1452. return false;
  1453. }
  1454. }
  1455. // step 3: Check the recurrence of variable X
  1456. PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  1457. if (!PhiX)
  1458. return false;
1459. // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
  1460. {
  1461. CountInst = nullptr;
  1462. for (Instruction &Inst : llvm::make_range(
  1463. LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
  1464. if (Inst.getOpcode() != Instruction::Add)
  1465. continue;
  1466. ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
  1467. if (!Inc || !Inc->isOne())
  1468. continue;
  1469. PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
  1470. if (!Phi)
  1471. continue;
1472. // Check if the result of the instruction is live outside the loop.
  1473. bool LiveOutLoop = false;
  1474. for (User *U : Inst.users()) {
  1475. if ((cast<Instruction>(U))->getParent() != LoopEntry) {
  1476. LiveOutLoop = true;
  1477. break;
  1478. }
  1479. }
  1480. if (LiveOutLoop) {
  1481. CountInst = &Inst;
  1482. CountPhi = Phi;
  1483. break;
  1484. }
  1485. }
  1486. if (!CountInst)
  1487. return false;
  1488. }
  1489. // step 5: check if the precondition is in this form:
  1490. // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  1491. {
  1492. auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1493. Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
  1494. if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
  1495. return false;
  1496. CntInst = CountInst;
  1497. CntPhi = CountPhi;
  1498. Var = T;
  1499. }
  1500. return true;
  1501. }
  1502. /// Return true if the idiom is detected in the loop.
  1503. ///
  1504. /// Additionally:
1505. /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1506. /// or nullptr if there is no such instruction.
1507. /// 2) \p CntPhi is set to the corresponding phi node
1508. /// or nullptr if there is no such phi node.
  1509. /// 3) \p Var is set to the value whose CTLZ could be used.
  1510. /// 4) \p DefX is set to the instruction calculating Loop exit condition.
  1511. ///
  1512. /// The core idiom we are trying to detect is:
  1513. /// \code
  1514. /// if (x0 == 0)
  1515. /// goto loop-exit // the precondition of the loop
  1516. /// cnt0 = init-val;
  1517. /// do {
  1518. /// x = phi (x0, x.next); //PhiX
  1519. /// cnt = phi(cnt0, cnt.next);
  1520. ///
  1521. /// cnt.next = cnt + 1;
  1522. /// ...
  1523. /// x.next = x >> 1; // DefX
  1524. /// ...
  1525. /// } while(x.next != 0);
  1526. ///
  1527. /// loop-exit:
  1528. /// \endcode
  1529. static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
  1530. Intrinsic::ID &IntrinID, Value *&InitX,
  1531. Instruction *&CntInst, PHINode *&CntPhi,
  1532. Instruction *&DefX) {
  1533. BasicBlock *LoopEntry;
  1534. Value *VarX = nullptr;
  1535. DefX = nullptr;
  1536. CntInst = nullptr;
  1537. CntPhi = nullptr;
  1538. LoopEntry = *(CurLoop->block_begin());
  1539. // step 1: Check if the loop-back branch is in desirable form.
  1540. if (Value *T = matchCondition(
  1541. dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
  1542. DefX = dyn_cast<Instruction>(T);
  1543. else
  1544. return false;
  1545. // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  1546. if (!DefX || !DefX->isShift())
  1547. return false;
  1548. IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
  1549. Intrinsic::ctlz;
  1550. ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  1551. if (!Shft || !Shft->isOne())
  1552. return false;
  1553. VarX = DefX->getOperand(0);
  1554. // step 3: Check the recurrence of variable X
  1555. PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  1556. if (!PhiX)
  1557. return false;
  1558. InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
  1559. // Make sure the initial value can't be negative otherwise the ashr in the
  1560. // loop might never reach zero which would make the loop infinite.
  1561. if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
  1562. return false;
1563. // step 4: Find the instruction that counts the CTLZ: cnt.next = cnt + 1
1564. // or cnt.next = cnt + -1.
  1565. // TODO: We can skip the step. If loop trip count is known (CTLZ),
  1566. // then all uses of "cnt.next" could be optimized to the trip count
  1567. // plus "cnt0". Currently it is not optimized.
  1568. // This step could be used to detect POPCNT instruction:
  1569. // cnt.next = cnt + (x.next & 1)
  1570. for (Instruction &Inst : llvm::make_range(
  1571. LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
  1572. if (Inst.getOpcode() != Instruction::Add)
  1573. continue;
  1574. ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
  1575. if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
  1576. continue;
  1577. PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
  1578. if (!Phi)
  1579. continue;
  1580. CntInst = &Inst;
  1581. CntPhi = Phi;
  1582. break;
  1583. }
  1584. if (!CntInst)
  1585. return false;
  1586. return true;
  1587. }
1588. /// Recognize a CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1589. /// to a countable one (with a CTLZ / CTTZ based trip count). Returns true if a
1590. /// CTLZ / CTTZ intrinsic was inserted as the new trip count, false otherwise.
  1591. bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  1592. // Give up if the loop has multiple blocks or multiple backedges.
  1593. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
  1594. return false;
  1595. Intrinsic::ID IntrinID;
  1596. Value *InitX;
  1597. Instruction *DefX = nullptr;
  1598. PHINode *CntPhi = nullptr;
  1599. Instruction *CntInst = nullptr;
1600. // Helps decide if the transformation is profitable. For the ShiftUntilZero
1601. // idiom, this is always 6.
  1602. size_t IdiomCanonicalSize = 6;
  1603. if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
  1604. CntInst, CntPhi, DefX))
  1605. return false;
  1606. bool IsCntPhiUsedOutsideLoop = false;
  1607. for (User *U : CntPhi->users())
  1608. if (!CurLoop->contains(cast<Instruction>(U))) {
  1609. IsCntPhiUsedOutsideLoop = true;
  1610. break;
  1611. }
  1612. bool IsCntInstUsedOutsideLoop = false;
  1613. for (User *U : CntInst->users())
  1614. if (!CurLoop->contains(cast<Instruction>(U))) {
  1615. IsCntInstUsedOutsideLoop = true;
  1616. break;
  1617. }
  1618. // If both CntInst and CntPhi are used outside the loop the profitability
  1619. // is questionable.
  1620. if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
  1621. return false;
1622. // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1623. // when X is 0. If we cannot guarantee X != 0, we need to check this
1624. // when expanding the intrinsic.
  1625. bool ZeroCheck = false;
1626. // It is safe to assume the Preheader exists, as it was checked in the
1627. // parent function runOnLoop.
  1628. BasicBlock *PH = CurLoop->getLoopPreheader();
1629. // If we are using the count instruction outside the loop, make sure we
1630. // have a zero check as a precondition. Without the check the loop would run
1631. // one iteration before any check of the input value, so 0 and 1 would behave
1632. // identically in the original loop but not in the CTLZ/CTTZ-based rewrite.
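// For example (illustrative): with the ashr loop shown above, inputs 0 and 1
// both execute exactly one iteration and produce the same count, whereas
// BitWidth - CTLZ(x) gives 0 and 1 respectively, so an unguarded input of 0
// would change observable behavior.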
  1633. if (!IsCntPhiUsedOutsideLoop) {
  1634. auto *PreCondBB = PH->getSinglePredecessor();
  1635. if (!PreCondBB)
  1636. return false;
  1637. auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1638. if (!PreCondBI)
  1639. return false;
  1640. if (matchCondition(PreCondBI, PH) != InitX)
  1641. return false;
  1642. ZeroCheck = true;
  1643. }
1644. // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
1645. // profitable if we delete the loop.
1646. // In canonical form the loop has only 6 instructions:
  1647. // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  1648. // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  1649. // %shr = ashr %n.addr.0, 1
  1650. // %tobool = icmp eq %shr, 0
  1651. // %inc = add nsw %i.0, 1
  1652. // br i1 %tobool
  1653. const Value *Args[] = {InitX,
  1654. ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
  1655. // @llvm.dbg doesn't count as they have no semantic effect.
  1656. auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  1657. uint32_t HeaderSize =
  1658. std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
  1659. IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
  1660. InstructionCost Cost =
  1661. TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  1662. if (HeaderSize != IdiomCanonicalSize &&
  1663. Cost > TargetTransformInfo::TCC_Basic)
  1664. return false;
  1665. transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
  1666. DefX->getDebugLoc(), ZeroCheck,
  1667. IsCntPhiUsedOutsideLoop);
  1668. return true;
  1669. }
  1670. /// Recognizes a population count idiom in a non-countable loop.
  1671. ///
  1672. /// If detected, transforms the relevant code to issue the popcount intrinsic
  1673. /// function call, and returns true; otherwise, returns false.
  1674. bool LoopIdiomRecognize::recognizePopcount() {
  1675. if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
  1676. return false;
1677. // Counting the population is usually done with only a few arithmetic
1678. // instructions. Such instructions can be easily "absorbed" by vacant slots in
1679. // a non-compact loop. Therefore, recognizing the popcount idiom only makes
1680. // sense in a compact loop.
  1681. // Give up if the loop has multiple blocks or multiple backedges.
  1682. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
  1683. return false;
  1684. BasicBlock *LoopBody = *(CurLoop->block_begin());
  1685. if (LoopBody->size() >= 20) {
  1686. // The loop is too big, bail out.
  1687. return false;
  1688. }
  1689. // It should have a preheader containing nothing but an unconditional branch.
  1690. BasicBlock *PH = CurLoop->getLoopPreheader();
  1691. if (!PH || &PH->front() != PH->getTerminator())
  1692. return false;
  1693. auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  1694. if (!EntryBI || EntryBI->isConditional())
  1695. return false;
  1696. // It should have a precondition block where the generated popcount intrinsic
  1697. // function can be inserted.
  1698. auto *PreCondBB = PH->getSinglePredecessor();
  1699. if (!PreCondBB)
  1700. return false;
  1701. auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1702. if (!PreCondBI || PreCondBI->isUnconditional())
  1703. return false;
  1704. Instruction *CntInst;
  1705. PHINode *CntPhi;
  1706. Value *Val;
  1707. if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
  1708. return false;
  1709. transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  1710. return true;
  1711. }
  1712. static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  1713. const DebugLoc &DL) {
  1714. Value *Ops[] = {Val};
  1715. Type *Tys[] = {Val->getType()};
  1716. Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  1717. Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  1718. CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  1719. CI->setDebugLoc(DL);
  1720. return CI;
  1721. }
  1722. static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  1723. const DebugLoc &DL, bool ZeroCheck,
  1724. Intrinsic::ID IID) {
  1725. Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
  1726. Type *Tys[] = {Val->getType()};
  1727. Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  1728. Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
  1729. CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  1730. CI->setDebugLoc(DL);
  1731. return CI;
  1732. }
  1733. /// Transform the following loop (Using CTLZ, CTTZ is similar):
  1734. /// loop:
  1735. /// CntPhi = PHI [Cnt0, CntInst]
  1736. /// PhiX = PHI [InitX, DefX]
  1737. /// CntInst = CntPhi + 1
  1738. /// DefX = PhiX >> 1
  1739. /// LOOP_BODY
  1740. /// Br: loop if (DefX != 0)
  1741. /// Use(CntPhi) or Use(CntInst)
  1742. ///
  1743. /// Into:
  1744. /// If CntPhi used outside the loop:
  1745. /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
  1746. /// Count = CountPrev + 1
  1747. /// else
  1748. /// Count = BitWidth(InitX) - CTLZ(InitX)
  1749. /// loop:
  1750. /// CntPhi = PHI [Cnt0, CntInst]
  1751. /// PhiX = PHI [InitX, DefX]
  1752. /// PhiCount = PHI [Count, Dec]
  1753. /// CntInst = CntPhi + 1
  1754. /// DefX = PhiX >> 1
  1755. /// Dec = PhiCount - 1
  1756. /// LOOP_BODY
  1757. /// Br: loop if (Dec != 0)
  1758. /// Use(CountPrev + Cnt0) // Use(CntPhi)
  1759. /// or
  1760. /// Use(Count + Cnt0) // Use(CntInst)
  1761. ///
  1762. /// If LOOP_BODY is empty the loop will be deleted.
  1763. /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
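/// A worked example (illustrative, assuming an 8-bit type and a right shift):
/// for InitX = 0b00010110, CTLZ(InitX) = 3, so Count = 8 - 3 = 5, which matches
/// the five iterations the original loop needs to shift InitX down to zero.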
  1764. void LoopIdiomRecognize::transformLoopToCountable(
  1765. Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
  1766. PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
  1767. bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  1768. BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
  1769. // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
  1770. IRBuilder<> Builder(PreheaderBr);
  1771. Builder.SetCurrentDebugLocation(DL);
1772. // If there are no uses of CntPhi create:
  1773. // Count = BitWidth - CTLZ(InitX);
  1774. // NewCount = Count;
  1775. // If there are uses of CntPhi create:
  1776. // NewCount = BitWidth - CTLZ(InitX >> 1);
  1777. // Count = NewCount + 1;
  1778. Value *InitXNext;
  1779. if (IsCntPhiUsedOutsideLoop) {
  1780. if (DefX->getOpcode() == Instruction::AShr)
  1781. InitXNext = Builder.CreateAShr(InitX, 1);
  1782. else if (DefX->getOpcode() == Instruction::LShr)
  1783. InitXNext = Builder.CreateLShr(InitX, 1);
  1784. else if (DefX->getOpcode() == Instruction::Shl) // cttz
  1785. InitXNext = Builder.CreateShl(InitX, 1);
  1786. else
  1787. llvm_unreachable("Unexpected opcode!");
  1788. } else
  1789. InitXNext = InitX;
  1790. Value *Count =
  1791. createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  1792. Type *CountTy = Count->getType();
  1793. Count = Builder.CreateSub(
  1794. ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  1795. Value *NewCount = Count;
  1796. if (IsCntPhiUsedOutsideLoop)
  1797. Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
  1798. NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
  1799. Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  1800. if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
  1801. // If the counter was being incremented in the loop, add NewCount to the
  1802. // counter's initial value, but only if the initial value is not zero.
  1803. ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  1804. if (!InitConst || !InitConst->isZero())
  1805. NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  1806. } else {
  1807. // If the count was being decremented in the loop, subtract NewCount from
  1808. // the counter's initial value.
  1809. NewCount = Builder.CreateSub(CntInitVal, NewCount);
  1810. }
  1811. // Step 2: Insert new IV and loop condition:
  1812. // loop:
  1813. // ...
  1814. // PhiCount = PHI [Count, Dec]
  1815. // ...
  1816. // Dec = PhiCount - 1
  1817. // ...
  1818. // Br: loop if (Dec != 0)
  1819. BasicBlock *Body = *(CurLoop->block_begin());
  1820. auto *LbBr = cast<BranchInst>(Body->getTerminator());
  1821. ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  1822. PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
  1823. Builder.SetInsertPoint(LbCond);
  1824. Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
  1825. TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
  1826. TcPhi->addIncoming(Count, Preheader);
  1827. TcPhi->addIncoming(TcDec, Body);
  1828. CmpInst::Predicate Pred =
  1829. (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  1830. LbCond->setPredicate(Pred);
  1831. LbCond->setOperand(0, TcDec);
  1832. LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
  1833. // Step 3: All the references to the original counter outside
  1834. // the loop are replaced with the NewCount
  1835. if (IsCntPhiUsedOutsideLoop)
  1836. CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  1837. else
  1838. CntInst->replaceUsesOutsideBlock(NewCount, Body);
  1839. // step 4: Forget the "non-computable" trip-count SCEV associated with the
  1840. // loop. The loop would otherwise not be deleted even if it becomes empty.
  1841. SE->forgetLoop(CurLoop);
  1842. }
  1843. void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
  1844. Instruction *CntInst,
  1845. PHINode *CntPhi, Value *Var) {
  1846. BasicBlock *PreHead = CurLoop->getLoopPreheader();
  1847. auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  1848. const DebugLoc &DL = CntInst->getDebugLoc();
  1849. // Assuming before transformation, the loop is following:
  1850. // if (x) // the precondition
  1851. // do { cnt++; x &= x - 1; } while(x);
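// For example (illustrative): x = 0b01101000 loses one set bit per iteration,
// so the loop executes popcount(x) = 3 times and cnt ends at its initial
// value plus 3.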
  1852. // Step 1: Insert the ctpop instruction at the end of the precondition block
  1853. IRBuilder<> Builder(PreCondBr);
  1854. Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  1855. {
  1856. PopCnt = createPopcntIntrinsic(Builder, Var, DL);
  1857. NewCount = PopCntZext =
  1858. Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
  1859. if (NewCount != PopCnt)
  1860. (cast<Instruction>(NewCount))->setDebugLoc(DL);
  1861. // TripCnt is exactly the number of iterations the loop has
  1862. TripCnt = NewCount;
  1863. // If the population counter's initial value is not zero, insert Add Inst.
  1864. Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
  1865. ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  1866. if (!InitConst || !InitConst->isZero()) {
  1867. NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  1868. (cast<Instruction>(NewCount))->setDebugLoc(DL);
  1869. }
  1870. }
1871. // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1872. // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1873. // function would be partially dead code, and downstream passes would drag
1874. // it back from the precondition block to the preheader.
  1875. {
  1876. ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
  1877. Value *Opnd0 = PopCntZext;
  1878. Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
  1879. if (PreCond->getOperand(0) != Var)
  1880. std::swap(Opnd0, Opnd1);
  1881. ICmpInst *NewPreCond = cast<ICmpInst>(
  1882. Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
  1883. PreCondBr->setCondition(NewPreCond);
  1884. RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
  1885. }
1886. // Step 3: Note that the population count is exactly the trip count of the
1887. // loop in question, which enables us to convert the loop from a noncountable
1888. // loop into a countable one. The benefit is twofold:
  1889. //
  1890. // - If the loop only counts population, the entire loop becomes dead after
  1891. // the transformation. It is a lot easier to prove a countable loop dead
  1892. // than to prove a noncountable one. (In some C dialects, an infinite loop
  1893. // isn't dead even if it computes nothing useful. In general, DCE needs
1894. // to prove a noncountable loop finite before it can safely delete it.)
  1895. //
  1896. // - If the loop also performs something else, it remains alive.
  1897. // Since it is transformed to countable form, it can be aggressively
  1898. // optimized by some optimizations which are in general not applicable
  1899. // to a noncountable loop.
  1900. //
  1901. // After this step, this loop (conceptually) would look like following:
  1902. // newcnt = __builtin_ctpop(x);
  1903. // t = newcnt;
  1904. // if (x)
1905. // do { cnt++; x &= x-1; t--; } while (t > 0);
  1906. BasicBlock *Body = *(CurLoop->block_begin());
  1907. {
  1908. auto *LbBr = cast<BranchInst>(Body->getTerminator());
  1909. ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  1910. Type *Ty = TripCnt->getType();
  1911. PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
  1912. Builder.SetInsertPoint(LbCond);
  1913. Instruction *TcDec = cast<Instruction>(
  1914. Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
  1915. "tcdec", false, true));
  1916. TcPhi->addIncoming(TripCnt, PreHead);
  1917. TcPhi->addIncoming(TcDec, Body);
  1918. CmpInst::Predicate Pred =
  1919. (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
  1920. LbCond->setPredicate(Pred);
  1921. LbCond->setOperand(0, TcDec);
  1922. LbCond->setOperand(1, ConstantInt::get(Ty, 0));
  1923. }
  1924. // Step 4: All the references to the original population counter outside
  1925. // the loop are replaced with the NewCount -- the value returned from
  1926. // __builtin_ctpop().
  1927. CntInst->replaceUsesOutsideBlock(NewCount, Body);
  1928. // step 5: Forget the "non-computable" trip-count SCEV associated with the
  1929. // loop. The loop would otherwise not be deleted even if it becomes empty.
  1930. SE->forgetLoop(CurLoop);
  1931. }
  1932. /// Match loop-invariant value.
  1933. template <typename SubPattern_t> struct match_LoopInvariant {
  1934. SubPattern_t SubPattern;
  1935. const Loop *L;
  1936. match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
  1937. : SubPattern(SP), L(L) {}
  1938. template <typename ITy> bool match(ITy *V) {
  1939. return L->isLoopInvariant(V) && SubPattern.match(V);
  1940. }
  1941. };
  1942. /// Matches if the value is loop-invariant.
  1943. template <typename Ty>
  1944. inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
  1945. return match_LoopInvariant<Ty>(M, L);
  1946. }
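// Illustrative usage (mirroring how the matcher is applied further below):
//   Value *BitPos;
//   bool Matched = match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
//                                           CurLoop));
// This matches a `1 << %bitpos` expression only when the whole shift is
// invariant in CurLoop.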
  1947. /// Return true if the idiom is detected in the loop.
  1948. ///
  1949. /// The core idiom we are trying to detect is:
  1950. /// \code
  1951. /// entry:
  1952. /// <...>
  1953. /// %bitmask = shl i32 1, %bitpos
  1954. /// br label %loop
  1955. ///
  1956. /// loop:
  1957. /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
  1958. /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
  1959. /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
  1960. /// %x.next = shl i32 %x.curr, 1
  1961. /// <...>
  1962. /// br i1 %x.curr.isbitunset, label %loop, label %end
  1963. ///
  1964. /// end:
  1965. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  1966. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  1967. /// <...>
  1968. /// \endcode
  1969. static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
  1970. Value *&BitMask, Value *&BitPos,
  1971. Value *&CurrX, Instruction *&NextX) {
  1972. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  1973. " Performing shift-until-bittest idiom detection.\n");
  1974. // Give up if the loop has multiple blocks or multiple backedges.
  1975. if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
  1976. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
  1977. return false;
  1978. }
  1979. BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  1980. BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  1981. assert(LoopPreheaderBB && "There is always a loop preheader.");
  1982. using namespace PatternMatch;
  1983. // Step 1: Check if the loop backedge is in desirable form.
  1984. ICmpInst::Predicate Pred;
  1985. Value *CmpLHS, *CmpRHS;
  1986. BasicBlock *TrueBB, *FalseBB;
  1987. if (!match(LoopHeaderBB->getTerminator(),
  1988. m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
  1989. m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
  1990. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
  1991. return false;
  1992. }
  1993. // Step 2: Check if the backedge's condition is in desirable form.
  1994. auto MatchVariableBitMask = [&]() {
  1995. return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
  1996. match(CmpLHS,
  1997. m_c_And(m_Value(CurrX),
  1998. m_CombineAnd(
  1999. m_Value(BitMask),
  2000. m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
  2001. CurLoop))));
  2002. };
  2003. auto MatchConstantBitMask = [&]() {
  2004. return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
  2005. match(CmpLHS, m_And(m_Value(CurrX),
  2006. m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
  2007. (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  2008. };
  2009. auto MatchDecomposableConstantBitMask = [&]() {
  2010. APInt Mask;
  2011. return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
  2012. ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
  2013. (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
  2014. (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
  2015. };
  2016. if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
  2017. !MatchDecomposableConstantBitMask()) {
  2018. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
  2019. return false;
  2020. }
  2021. // Step 3: Check if the recurrence is in desirable form.
  2022. auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  2023. if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
  2024. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
  2025. return false;
  2026. }
  2027. BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  2028. NextX =
  2029. dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
  2030. assert(CurLoop->isLoopInvariant(BaseX) &&
  2031. "Expected BaseX to be avaliable in the preheader!");
  2032. if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
  2033. // FIXME: support right-shift?
  2034. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
  2035. return false;
  2036. }
  2037. // Step 4: Check if the backedge's destinations are in desirable form.
  2038. assert(ICmpInst::isEquality(Pred) &&
  2039. "Should only get equality predicates here.");
  2040. // cmp-br is commutative, so canonicalize to a single variant.
  2041. if (Pred != ICmpInst::Predicate::ICMP_EQ) {
  2042. Pred = ICmpInst::getInversePredicate(Pred);
  2043. std::swap(TrueBB, FalseBB);
  2044. }
2045. // We expect to exit the loop when the comparison yields false,
2046. // so when it yields true we should branch back to the loop header.
  2047. if (TrueBB != LoopHeaderBB) {
  2048. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
  2049. return false;
  2050. }
  2051. // Okay, idiom checks out.
  2052. return true;
  2053. }
  2054. /// Look for the following loop:
  2055. /// \code
  2056. /// entry:
  2057. /// <...>
  2058. /// %bitmask = shl i32 1, %bitpos
  2059. /// br label %loop
  2060. ///
  2061. /// loop:
  2062. /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
  2063. /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
  2064. /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
  2065. /// %x.next = shl i32 %x.curr, 1
  2066. /// <...>
  2067. /// br i1 %x.curr.isbitunset, label %loop, label %end
  2068. ///
  2069. /// end:
  2070. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  2071. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  2072. /// <...>
  2073. /// \endcode
  2074. ///
  2075. /// And transform it into:
  2076. /// \code
  2077. /// entry:
  2078. /// %bitmask = shl i32 1, %bitpos
  2079. /// %lowbitmask = add i32 %bitmask, -1
  2080. /// %mask = or i32 %lowbitmask, %bitmask
  2081. /// %x.masked = and i32 %x, %mask
  2082. /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
  2083. /// i1 true)
  2084. /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
  2085. /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
  2086. /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
  2087. /// %tripcount = add i32 %backedgetakencount, 1
  2088. /// %x.curr = shl i32 %x, %backedgetakencount
  2089. /// %x.next = shl i32 %x, %tripcount
  2090. /// br label %loop
  2091. ///
  2092. /// loop:
  2093. /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
  2094. /// %loop.iv.next = add nuw i32 %loop.iv, 1
  2095. /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
  2096. /// <...>
  2097. /// br i1 %loop.ivcheck, label %end, label %loop
  2098. ///
  2099. /// end:
  2100. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  2101. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  2102. /// <...>
  2103. /// \endcode
  2104. bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  2105. bool MadeChange = false;
  2106. Value *X, *BitMask, *BitPos, *XCurr;
  2107. Instruction *XNext;
  2108. if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
  2109. XNext)) {
  2110. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  2111. " shift-until-bittest idiom detection failed.\n");
  2112. return MadeChange;
  2113. }
  2114. LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
  2115. // Ok, it is the idiom we were looking for, we *could* transform this loop,
  2116. // but is it profitable to transform?
  2117. BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  2118. BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  2119. assert(LoopPreheaderBB && "There is always a loop preheader.");
  2120. BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  2121. assert(SuccessorBB && "There is only a single successor.");
  2122. IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  2123. Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
  2124. Intrinsic::ID IntrID = Intrinsic::ctlz;
  2125. Type *Ty = X->getType();
  2126. unsigned Bitwidth = Ty->getScalarSizeInBits();
  2127. TargetTransformInfo::TargetCostKind CostKind =
  2128. TargetTransformInfo::TCK_SizeAndLatency;
2129. // The rewrite is considered to be unprofitable if and only if the
2130. // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
  2131. // making the loop countable, even if nothing else changes.
  2132. IntrinsicCostAttributes Attrs(
  2133. IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
  2134. InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  2135. if (Cost > TargetTransformInfo::TCC_Basic) {
  2136. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  2137. " Intrinsic is too costly, not beneficial\n");
  2138. return MadeChange;
  2139. }
  2140. if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
  2141. TargetTransformInfo::TCC_Basic) {
  2142. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
  2143. return MadeChange;
  2144. }
  2145. // Ok, transform appears worthwhile.
  2146. MadeChange = true;
  2147. // Step 1: Compute the loop trip count.
  2148. Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
  2149. BitPos->getName() + ".lowbitmask");
  2150. Value *Mask =
  2151. Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  2152. Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  2153. CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
  2154. IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
  2155. /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
  2156. Value *XMaskedNumActiveBits = Builder.CreateSub(
  2157. ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
  2158. XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
  2159. /*HasNSW=*/Bitwidth != 2);
  2160. Value *XMaskedLeadingOnePos =
  2161. Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
  2162. XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
  2163. /*HasNSW=*/Bitwidth > 2);
  2164. Value *LoopBackedgeTakenCount = Builder.CreateSub(
  2165. BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
  2166. /*HasNUW=*/true, /*HasNSW=*/true);
  2167. // We know loop's backedge-taken count, but what's loop's trip count?
2168. // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  2169. Value *LoopTripCount =
  2170. Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
  2171. CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
  2172. /*HasNSW=*/Bitwidth != 2);
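// Worked example (illustrative, i8): for %x = 0b00000101 and %bitpos = 6,
// x.masked = 0b00000101, numactivebits = 3 and leadingonepos = 2, so the
// backedge-taken count is 6 - 2 = 4 and the trip count is 5; the loop shifts
// %x left four times before bit 6 becomes set.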
  // Step 2: Compute the recurrence's final value without a loop.

  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  Value *NewXNext;
  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
  // that isn't the case, we'll need to emit an alternative, safe IR.
  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else {
    // Otherwise, just additionally shift by one. It's the smallest solution;
    // alternatively, we could check that NewX is INT_MIN (or that BitPos is
    // `bitwidth(x) - 1`) and select 0 instead.
    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
  }
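  // E.g. for i8 %x = 1 with BitPos = 7 the trip count is 8, so
  // `shl i8 %x, 8` would be poison; shifting the (well-defined) NewX, which is
  // 0b10000000, left by one more instead yields 0, which is exactly what the
  // flagless shift in the original loop produces on its final iteration.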
  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  // Step 3: Adjust the successor basic block to receive the computed
  //         recurrence's final value instead of the recurrence itself.

  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);

  // Step 4: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.

  SE->forgetLoop(CurLoop);

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");

  ++NumShiftUntilBitTest;
  return MadeChange;
}
/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %loop
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
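///
/// Roughly, for the lshr flavour, this is the C idiom (illustrative sketch,
/// eliding the out-of-range shift-amount caveats):
/// \code
///   uint8_t iv = start;
///   while ((uint8_t)(val >> (uint8_t)(iv + extraoffset)) != 0)
///     ++iv;
///   // here iv == %iv.res, iv + extraoffset == %nbits.res, etc.
/// \endcode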
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
                                      Instruction *&ValShiftedIsZero,
                                      Intrinsic::ID &IntrinID, Instruction *&IV,
                                      Value *&Start, Value *&Val,
                                      const SCEV *&ExtraOffsetExpr,
                                      bool &InvertedCond) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-zero idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  Instruction *ValShifted, *NBits, *IVNext;
  Value *ExtraOffset;

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge's condition is in desirable form.

  ICmpInst::Predicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
                  m_BasicBlock(FalseBB))) ||
      !match(ValShiftedIsZero,
             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
      !ICmpInst::isEquality(Pred)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the comparison's operand is in desirable form.
  // FIXME: Val could be a one-input PHI node, which we should look past.
  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
                                 m_Instruction(NBits)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
    return false;
  }
  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
                                                         : Intrinsic::ctlz;
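  // The intrinsic choice mirrors the shift direction: `shl %val, %nbits`
  // becomes zero once %nbits >= bitwidth - cttz(%val) (all set bits are
  // shifted out the top), while `lshr` (and `ashr` of a non-negative %val)
  // becomes zero once %nbits >= bitwidth - ctlz(%val).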
  // Step 3: Check if the shift amount is in desirable form.

  if (match(NBits, m_c_Add(m_Instruction(IV),
                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
  else if (match(NBits,
                 m_Sub(m_Instruction(IV),
                       m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
           NBits->hasNoSignedWrap())
    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
  else {
    IV = NBits;
    ExtraOffsetExpr = SE->getZero(NBits->getType());
  }
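  // I.e. the three accepted shapes, and the offset later folded into the
  // final-IV computation, are:
  //   %nbits = add nsw/nuw i8 %iv, %extraoffset  -->  -%extraoffset
  //   %nbits = sub nsw i8 %iv, %extraoffset      -->   %extraoffset
  //   anything else (%nbits itself is the IV)    -->   0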
  // Step 4: Check if the recurrence is in desirable form.
  auto *IVPN = dyn_cast<PHINode>(IV);
  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));

  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 5: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
  if (InvertedCond) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit the loop when the comparison yields true,
  // so when it yields false we should branch back to the loop header.
  if (FalseBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // The new, countable loop will certainly only run a known number of
  // iterations; it won't be infinite. But the old loop might be infinite
  // under certain conditions. For logical shifts, the value will become zero
  // after at most bitwidth(%Val) loop iterations. However, for arithmetic
  // right-shift, iff the sign bit was set, the value will never become zero,
  // and the loop may never finish.
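  // For instance, for i8 %val = -1, `ashr i8 -1, %nbits` stays -1 for every
  // in-range %nbits, so such a loop would never exit; we can only look past
  // that if the loop is mustprogress or %val is known to be non-negative.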
  if (ValShifted->getOpcode() == Instruction::AShr &&
      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Cannot prove the loop is finite.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}
/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %loop
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
///     %val.numactivebits = sub i8 8, %val.numleadingzeros
///     %extraoffset.neg = sub i8 0, %extraoffset
///     %tmp = add i8 %val.numactivebits, %extraoffset.neg
///     %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
///     %loop.tripcount = sub i8 %iv.final, %start
///     br label %loop
///
///   loop:
///     %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add i8 %loop.iv, 1
///     %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
///     %iv = add i8 %loop.iv, %start
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv.final, %loop ] <...>
///     <...>
/// \endcode
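///
/// Informally, the rewrite computes the exit value of %iv in closed form as
/// \code
///   %iv.final = smax(activebits(%val) - %extraoffset, %start)
/// \endcode
/// where activebits(%val) is the bitwidth minus ct{l,t}z(%val) with the
/// zero-is-defined variant of the intrinsic, and then re-expresses the loop
/// around a canonical induction variable counting from 0 up to the trip count.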
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  bool MadeChange = false;

  Instruction *ValShiftedIsZero;
  Intrinsic::ID IntrID;
  Instruction *IV;
  Value *Start, *Val;
  const SCEV *ExtraOffsetExpr;
  bool InvertedCond;
  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-zero idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");

  // Ok, it is the idiom we were looking for; we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(IV->getDebugLoc());

  Type *Ty = Val->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic we'll use is not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  bool OffsetIsZero = false;
  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
    OffsetIsZero = ExtraOffsetExprC->isZero();

  // Step 1: Compute the loop's final IV value / trip count.
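  // Note that unlike the bit-test rewrite above, %val may legitimately be
  // zero here, so the intrinsic is costed and emitted with
  // is_zero_undef=false: ct{l,t}z(0) is then well defined (it returns the
  // bitwidth) rather than producing poison.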
  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
      /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
  Value *ValNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
      Val->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);

  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  Expander.setInsertPoint(&*Builder.GetInsertPoint());
  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
                                           {ValNumActiveBitsOffset, Start},
                                           /*FMFSource=*/nullptr, "iv.final");

  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
  // FIXME: or when the offset was `add nuw`

  // We know loop's backedge-taken count, but what's loop's trip count?
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
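  // As a worked illustration for the lshr flavour: for i8 %val = 0b00011010,
  // %extraoffset = 1 and %start = 0, we get numleadingzeros = 3,
  // numactivebits = 5, iv.final = smax(5 - 1, 0) = 4, backedgetakencount = 4
  // and tripcount = 5, matching the five iterations (%iv = 0..4) the original
  // loop would execute.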
  // Step 2: Adjust the successor basic block to receive the original
  //         induction variable's final value instead of the orig. IV itself.

  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);

  // Step 3: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
  auto *CIVNext =
      Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
                                        CurLoop->getName() + ".ivcheck");
  auto *NewIVCheck = CIVCheck;
  if (InvertedCond) {
    NewIVCheck = Builder.CreateNot(CIVCheck);
    NewIVCheck->takeName(ValShiftedIsZero);
  }

  // The original IV, but rebased to be an offset to the CIV.
  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
                                     /*HasNSW=*/true); // FIXME: what about NUW?
  IVDePHId->takeName(IV);

  // The loop terminator.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  CIV->addIncoming(CIVNext, LoopHeaderBB);

  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.

  SE->forgetLoop(CurLoop);

  // Step 5: Try to clean up the loop's body somewhat.
  IV->replaceAllUsesWith(IVDePHId);
  IV->eraseFromParent();
  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
  ValShiftedIsZero->eraseFromParent();

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");

  ++NumShiftUntilZero;
  return MadeChange;
}