  1. //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass implements an idiom recognizer that transforms simple loops into a
  10. // non-loop form. In cases where this kicks in, it can be a significant
  11. // performance win.
  12. //
  13. // If compiling for code size we avoid idiom recognition if the resulting
  14. // code could be larger than the code for the original loop. One way this could
  15. // happen is if the loop is not removable after idiom recognition due to the
  16. // presence of non-idiom instructions. The initial implementation of the
  17. // heuristics applies to idioms in multi-block loops.
  18. //
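// A canonical example (illustrative only; the function and variable names are
// purely expository) of the kind of loop this pass rewrites:
//
//   void zero(char *A, unsigned N) {
//     for (unsigned i = 0; i != N; ++i)
//       A[i] = 0;
//   }
//
// The store is replaced by a single memset(A, 0, N) emitted in the loop
// preheader; the now-dead loop is cleaned up by later passes.
//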
  19. //===----------------------------------------------------------------------===//
  20. //
  21. // TODO List:
  22. //
  23. // Future loop memory idioms to recognize:
  24. // memcmp, strlen, etc.
  25. // Future floating point idioms to recognize in -ffast-math mode:
  26. // fpowi
  27. // Future integer operation idioms to recognize:
  28. // ctpop
  29. //
  30. // Beware that isel's default lowering for ctpop is highly inefficient for
  31. // i64 and larger types when i64 is legal and the value has few bits set. It
  32. // would be good to enhance isel to emit a loop for ctpop in this case.
  33. //
  34. // This could recognize common matrix multiplies and dot product idioms and
  35. // replace them with calls to BLAS (if linked in??).
  36. //
  37. //===----------------------------------------------------------------------===//
  38. #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
  39. #include "llvm/ADT/APInt.h"
  40. #include "llvm/ADT/ArrayRef.h"
  41. #include "llvm/ADT/DenseMap.h"
  42. #include "llvm/ADT/MapVector.h"
  43. #include "llvm/ADT/SetVector.h"
  44. #include "llvm/ADT/SmallPtrSet.h"
  45. #include "llvm/ADT/SmallVector.h"
  46. #include "llvm/ADT/Statistic.h"
  47. #include "llvm/ADT/StringRef.h"
  48. #include "llvm/Analysis/AliasAnalysis.h"
  49. #include "llvm/Analysis/CmpInstAnalysis.h"
  50. #include "llvm/Analysis/LoopAccessAnalysis.h"
  51. #include "llvm/Analysis/LoopInfo.h"
  52. #include "llvm/Analysis/LoopPass.h"
  53. #include "llvm/Analysis/MemoryLocation.h"
  54. #include "llvm/Analysis/MemorySSA.h"
  55. #include "llvm/Analysis/MemorySSAUpdater.h"
  56. #include "llvm/Analysis/MustExecute.h"
  57. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  58. #include "llvm/Analysis/ScalarEvolution.h"
  59. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  60. #include "llvm/Analysis/TargetLibraryInfo.h"
  61. #include "llvm/Analysis/TargetTransformInfo.h"
  62. #include "llvm/Analysis/ValueTracking.h"
  63. #include "llvm/IR/BasicBlock.h"
  64. #include "llvm/IR/Constant.h"
  65. #include "llvm/IR/Constants.h"
  66. #include "llvm/IR/DataLayout.h"
  67. #include "llvm/IR/DebugLoc.h"
  68. #include "llvm/IR/DerivedTypes.h"
  69. #include "llvm/IR/Dominators.h"
  70. #include "llvm/IR/GlobalValue.h"
  71. #include "llvm/IR/GlobalVariable.h"
  72. #include "llvm/IR/IRBuilder.h"
  73. #include "llvm/IR/InstrTypes.h"
  74. #include "llvm/IR/Instruction.h"
  75. #include "llvm/IR/Instructions.h"
  76. #include "llvm/IR/IntrinsicInst.h"
  77. #include "llvm/IR/Intrinsics.h"
  78. #include "llvm/IR/LLVMContext.h"
  79. #include "llvm/IR/Module.h"
  80. #include "llvm/IR/PassManager.h"
  81. #include "llvm/IR/PatternMatch.h"
  82. #include "llvm/IR/Type.h"
  83. #include "llvm/IR/User.h"
  84. #include "llvm/IR/Value.h"
  85. #include "llvm/IR/ValueHandle.h"
  86. #include "llvm/InitializePasses.h"
  87. #include "llvm/Pass.h"
  88. #include "llvm/Support/Casting.h"
  89. #include "llvm/Support/CommandLine.h"
  90. #include "llvm/Support/Debug.h"
  91. #include "llvm/Support/InstructionCost.h"
  92. #include "llvm/Support/raw_ostream.h"
  93. #include "llvm/Transforms/Scalar.h"
  94. #include "llvm/Transforms/Utils/BuildLibCalls.h"
  95. #include "llvm/Transforms/Utils/Local.h"
  96. #include "llvm/Transforms/Utils/LoopUtils.h"
  97. #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  98. #include <algorithm>
  99. #include <cassert>
  100. #include <cstdint>
  101. #include <utility>
  102. #include <vector>
  103. using namespace llvm;
  104. #define DEBUG_TYPE "loop-idiom"
  105. STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
  106. STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
  107. STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
  108. STATISTIC(
  109. NumShiftUntilBitTest,
  110. "Number of uncountable loops recognized as 'shift until bitttest' idiom");
  111. STATISTIC(NumShiftUntilZero,
  112. "Number of uncountable loops recognized as 'shift until zero' idiom");
  113. bool DisableLIRP::All;
  114. static cl::opt<bool, true>
  115. DisableLIRPAll("disable-" DEBUG_TYPE "-all",
  116. cl::desc("Options to disable Loop Idiom Recognize Pass."),
  117. cl::location(DisableLIRP::All), cl::init(false),
  118. cl::ReallyHidden);
  119. bool DisableLIRP::Memset;
  120. static cl::opt<bool, true>
  121. DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
  122. cl::desc("Proceed with loop idiom recognize pass, but do "
  123. "not convert loop(s) to memset."),
  124. cl::location(DisableLIRP::Memset), cl::init(false),
  125. cl::ReallyHidden);
  126. bool DisableLIRP::Memcpy;
  127. static cl::opt<bool, true>
  128. DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
  129. cl::desc("Proceed with loop idiom recognize pass, but do "
  130. "not convert loop(s) to memcpy."),
  131. cl::location(DisableLIRP::Memcpy), cl::init(false),
  132. cl::ReallyHidden);
  133. static cl::opt<bool> UseLIRCodeSizeHeurs(
  134. "use-lir-code-size-heurs",
  135. cl::desc("Use loop idiom recognition code size heuristics when compiling"
  136. "with -Os/-Oz"),
  137. cl::init(true), cl::Hidden);
  138. namespace {
  139. class LoopIdiomRecognize {
  140. Loop *CurLoop = nullptr;
  141. AliasAnalysis *AA;
  142. DominatorTree *DT;
  143. LoopInfo *LI;
  144. ScalarEvolution *SE;
  145. TargetLibraryInfo *TLI;
  146. const TargetTransformInfo *TTI;
  147. const DataLayout *DL;
  148. OptimizationRemarkEmitter &ORE;
  149. bool ApplyCodeSizeHeuristics;
  150. std::unique_ptr<MemorySSAUpdater> MSSAU;
  151. public:
  152. explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
  153. LoopInfo *LI, ScalarEvolution *SE,
  154. TargetLibraryInfo *TLI,
  155. const TargetTransformInfo *TTI, MemorySSA *MSSA,
  156. const DataLayout *DL,
  157. OptimizationRemarkEmitter &ORE)
  158. : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
  159. if (MSSA)
  160. MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  161. }
  162. bool runOnLoop(Loop *L);
  163. private:
  164. using StoreList = SmallVector<StoreInst *, 8>;
  165. using StoreListMap = MapVector<Value *, StoreList>;
  166. StoreListMap StoreRefsForMemset;
  167. StoreListMap StoreRefsForMemsetPattern;
  168. StoreList StoreRefsForMemcpy;
  169. bool HasMemset;
  170. bool HasMemsetPattern;
  171. bool HasMemcpy;
  172. /// Return code for isLegalStore()
  173. enum LegalStoreKind {
  174. None = 0,
  175. Memset,
  176. MemsetPattern,
  177. Memcpy,
  178. UnorderedAtomicMemcpy,
  179. DontUse // Dummy retval never to be used. Allows catching errors in retval
  180. // handling.
  181. };
  182. /// \name Countable Loop Idiom Handling
  183. /// @{
  184. bool runOnCountableLoop();
  185. bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
  186. SmallVectorImpl<BasicBlock *> &ExitBlocks);
  187. void collectStores(BasicBlock *BB);
  188. LegalStoreKind isLegalStore(StoreInst *SI);
  189. enum class ForMemset { No, Yes };
  190. bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
  191. ForMemset For);
  192. template <typename MemInst>
  193. bool processLoopMemIntrinsic(
  194. BasicBlock *BB,
  195. bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
  196. const SCEV *BECount);
  197. bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  198. bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
  199. bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
  200. MaybeAlign StoreAlignment, Value *StoredVal,
  201. Instruction *TheStore,
  202. SmallPtrSetImpl<Instruction *> &Stores,
  203. const SCEVAddRecExpr *Ev, const SCEV *BECount,
  204. bool IsNegStride, bool IsLoopMemset = false);
  205. bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  206. bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
  207. const SCEV *StoreSize, MaybeAlign StoreAlign,
  208. MaybeAlign LoadAlign, Instruction *TheStore,
  209. Instruction *TheLoad,
  210. const SCEVAddRecExpr *StoreEv,
  211. const SCEVAddRecExpr *LoadEv,
  212. const SCEV *BECount);
  213. bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
  214. bool IsLoopMemset = false);
  215. /// @}
  216. /// \name Noncountable Loop Idiom Handling
  217. /// @{
  218. bool runOnNoncountableLoop();
  219. bool recognizePopcount();
  220. void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
  221. PHINode *CntPhi, Value *Var);
  222. bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  223. void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
  224. Instruction *CntInst, PHINode *CntPhi,
  225. Value *Var, Instruction *DefX,
  226. const DebugLoc &DL, bool ZeroCheck,
  227. bool IsCntPhiUsedOutsideLoop);
  228. bool recognizeShiftUntilBitTest();
  229. bool recognizeShiftUntilZero();
  230. /// @}
  231. };
  232. class LoopIdiomRecognizeLegacyPass : public LoopPass {
  233. public:
  234. static char ID;
  235. explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
  236. initializeLoopIdiomRecognizeLegacyPassPass(
  237. *PassRegistry::getPassRegistry());
  238. }
  239. bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  240. if (DisableLIRP::All)
  241. return false;
  242. if (skipLoop(L))
  243. return false;
  244. AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  245. DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  246. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  247. ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  248. TargetLibraryInfo *TLI =
  249. &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
  250. *L->getHeader()->getParent());
  251. const TargetTransformInfo *TTI =
  252. &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
  253. *L->getHeader()->getParent());
  254. const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
  255. auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
  256. MemorySSA *MSSA = nullptr;
  257. if (MSSAAnalysis)
  258. MSSA = &MSSAAnalysis->getMSSA();
  259. // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
  260. // pass. Function analyses need to be preserved across loop transformations
  261. // but ORE cannot be preserved (see comment before the pass definition).
  262. OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
  263. LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
  264. return LIR.runOnLoop(L);
  265. }
  266. /// This transformation requires natural loop information & requires that
  267. /// loop preheaders be inserted into the CFG.
  268. void getAnalysisUsage(AnalysisUsage &AU) const override {
  269. AU.addRequired<TargetLibraryInfoWrapperPass>();
  270. AU.addRequired<TargetTransformInfoWrapperPass>();
  271. AU.addPreserved<MemorySSAWrapperPass>();
  272. getLoopAnalysisUsage(AU);
  273. }
  274. };
  275. } // end anonymous namespace
  276. char LoopIdiomRecognizeLegacyPass::ID = 0;
  277. PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
  278. LoopStandardAnalysisResults &AR,
  279. LPMUpdater &) {
  280. if (DisableLIRP::All)
  281. return PreservedAnalyses::all();
  282. const auto *DL = &L.getHeader()->getModule()->getDataLayout();
  283. // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  284. // pass. Function analyses need to be preserved across loop transformations
  285. // but ORE cannot be preserved (see comment before the pass definition).
  286. OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
  287. LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
  288. AR.MSSA, DL, ORE);
  289. if (!LIR.runOnLoop(&L))
  290. return PreservedAnalyses::all();
  291. auto PA = getLoopPassPreservedAnalyses();
  292. if (AR.MSSA)
  293. PA.preserve<MemorySSAAnalysis>();
  294. return PA;
  295. }
  296. INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
  297. "Recognize loop idioms", false, false)
  298. INITIALIZE_PASS_DEPENDENCY(LoopPass)
  299. INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
  300. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  301. INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
  302. "Recognize loop idioms", false, false)
  303. Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
  304. static void deleteDeadInstruction(Instruction *I) {
  305. I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  306. I->eraseFromParent();
  307. }
  308. //===----------------------------------------------------------------------===//
  309. //
  310. // Implementation of LoopIdiomRecognize
  311. //
  312. //===----------------------------------------------------------------------===//
  313. bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  314. CurLoop = L;
  316. // If the loop could not be converted to canonical form, it must have an
  317. // indirectbr in it; just give up.
  317. if (!L->getLoopPreheader())
  318. return false;
  319. // Disable loop idiom recognition if the function's name is a common idiom.
  320. StringRef Name = L->getHeader()->getParent()->getName();
  321. if (Name == "memset" || Name == "memcpy")
  322. return false;
  323. // Determine if code size heuristics need to be applied.
  324. ApplyCodeSizeHeuristics =
  325. L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
  326. HasMemset = TLI->has(LibFunc_memset);
  327. HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  328. HasMemcpy = TLI->has(LibFunc_memcpy);
  329. if (HasMemset || HasMemsetPattern || HasMemcpy)
  330. if (SE->hasLoopInvariantBackedgeTakenCount(L))
  331. return runOnCountableLoop();
  332. return runOnNoncountableLoop();
  333. }
  334. bool LoopIdiomRecognize::runOnCountableLoop() {
  335. const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  336. assert(!isa<SCEVCouldNotCompute>(BECount) &&
  337. "runOnCountableLoop() called on a loop without a predictable"
  338. "backedge-taken count");
  339. // If this loop executes exactly one time, then it should be peeled, not
  340. // optimized by this pass.
  341. if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
  342. if (BECst->getAPInt() == 0)
  343. return false;
  344. SmallVector<BasicBlock *, 8> ExitBlocks;
  345. CurLoop->getUniqueExitBlocks(ExitBlocks);
  346. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
  347. << CurLoop->getHeader()->getParent()->getName()
  348. << "] Countable Loop %" << CurLoop->getHeader()->getName()
  349. << "\n");
  350. // The following transforms hoist stores/memsets into the loop pre-header.
  351. // Give up if the loop has instructions that may throw.
  352. SimpleLoopSafetyInfo SafetyInfo;
  353. SafetyInfo.computeLoopSafetyInfo(CurLoop);
  354. if (SafetyInfo.anyBlockMayThrow())
  355. return false;
  356. bool MadeChange = false;
  357. // Scan all the blocks in the loop that are not in subloops.
  358. for (auto *BB : CurLoop->getBlocks()) {
  359. // Ignore blocks in subloops.
  360. if (LI->getLoopFor(BB) != CurLoop)
  361. continue;
  362. MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  363. }
  364. return MadeChange;
  365. }
  366. static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  367. const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  368. return ConstStride->getAPInt();
  369. }
  370. /// getMemSetPatternValue - If a strided store of the specified value is safe to
  371. /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
  372. /// be passed in. Otherwise, return null.
  373. ///
  374. /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
  375. /// just replicate their input array and then pass on to memset_pattern16.
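///
/// For example (illustrative only): a strided store of the i32 constant
/// 0x01020304 is 4 bytes wide, so this returns a [4 x i32] ConstantArray
/// holding four copies of the constant, i.e. a 16-byte pattern suitable for
/// memset_pattern16.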
  376. static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  377. // FIXME: This could check for UndefValue because it can be merged into any
  378. // other valid pattern.
  379. // If the value isn't a constant, we can't promote it to being in a constant
  380. // array. We could theoretically do a store to an alloca or something, but
  381. // that doesn't seem worthwhile.
  382. Constant *C = dyn_cast<Constant>(V);
  383. if (!C || isa<ConstantExpr>(C))
  384. return nullptr;
  385. // Only handle simple values that are a power of two bytes in size.
  386. uint64_t Size = DL->getTypeSizeInBits(V->getType());
  387. if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
  388. return nullptr;
  389. // Don't care enough about darwin/ppc to implement this.
  390. if (DL->isBigEndian())
  391. return nullptr;
  392. // Convert to size in bytes.
  393. Size /= 8;
  394. // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
  395. // if the top and bottom are the same (e.g. for vectors and large integers).
  396. if (Size > 16)
  397. return nullptr;
  398. // If the constant is exactly 16 bytes, just use it.
  399. if (Size == 16)
  400. return C;
  401. // Otherwise, we'll use an array of the constants.
  402. unsigned ArraySize = 16 / Size;
  403. ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  404. return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
  405. }
  406. LoopIdiomRecognize::LegalStoreKind
  407. LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  408. // Don't touch volatile stores.
  409. if (SI->isVolatile())
  410. return LegalStoreKind::None;
  411. // We only want simple or unordered-atomic stores.
  412. if (!SI->isUnordered())
  413. return LegalStoreKind::None;
  414. // Avoid merging nontemporal stores.
  415. if (SI->getMetadata(LLVMContext::MD_nontemporal))
  416. return LegalStoreKind::None;
  417. Value *StoredVal = SI->getValueOperand();
  418. Value *StorePtr = SI->getPointerOperand();
  419. // Don't convert stores of non-integral pointer types to memsets (which store
  420. // integers).
  421. if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
  422. return LegalStoreKind::None;
  423. // Reject stores that are so large that they overflow an unsigned.
  424. // When storing out scalable vectors we bail out for now, since the code
  425. // below currently only works for constant strides.
  426. TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  427. if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
  428. (SizeInBits.getFixedValue() >> 32) != 0)
  429. return LegalStoreKind::None;
  430. // See if the pointer expression is an AddRec like {base,+,1} on the current
  431. // loop, which indicates a strided store. If we have something else, it's a
  432. // random store we can't handle.
  433. const SCEVAddRecExpr *StoreEv =
  434. dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  435. if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
  436. return LegalStoreKind::None;
  437. // Check to see if we have a constant stride.
  438. if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
  439. return LegalStoreKind::None;
  440. // See if the store can be turned into a memset.
  441. // If the stored value is a byte-wise value (like i32 -1), then it may be
  442. // turned into a memset of i8 -1, assuming that all the consecutive bytes
  443. // are stored. A store of i32 0x01020304 can never be turned into a memset,
  444. // but it can be turned into memset_pattern if the target supports it.
  445. Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  446. // Note: memset and memset_pattern on unordered-atomic stores are not yet supported.
  447. bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
  448. // If we're allowed to form a memset, and the stored value would be
  449. // acceptable for memset, use it.
  450. if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
  451. // Verify that the stored value is loop invariant. If not, we can't
  452. // promote the memset.
  453. CurLoop->isLoopInvariant(SplatValue)) {
  454. // It looks like we can use SplatValue.
  455. return LegalStoreKind::Memset;
  456. }
  457. if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
  458. // Don't create memset_pattern16s for pointers in non-default address spaces.
  459. StorePtr->getType()->getPointerAddressSpace() == 0 &&
  460. getMemSetPatternValue(StoredVal, DL)) {
  461. // It looks like we can use PatternValue!
  462. return LegalStoreKind::MemsetPattern;
  463. }
  464. // Otherwise, see if the store can be turned into a memcpy.
  465. if (HasMemcpy && !DisableLIRP::Memcpy) {
  466. // Check to see if the stride matches the size of the store. If so, then we
  467. // know that every byte is touched in the loop.
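// (For example, an i32 store whose address advances by 4 bytes per iteration
// writes bytes 0..3, 4..7, ... with no gaps.)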
  468. APInt Stride = getStoreStride(StoreEv);
  469. unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  470. if (StoreSize != Stride && StoreSize != -Stride)
  471. return LegalStoreKind::None;
  472. // The store must be feeding a non-volatile load.
  473. LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
  474. // Only allow non-volatile loads
  475. if (!LI || LI->isVolatile())
  476. return LegalStoreKind::None;
  477. // Only allow simple or unordered-atomic loads
  478. if (!LI->isUnordered())
  479. return LegalStoreKind::None;
  480. // See if the pointer expression is an AddRec like {base,+,1} on the current
  481. // loop, which indicates a strided load. If we have something else, it's a
  482. // random load we can't handle.
  483. const SCEVAddRecExpr *LoadEv =
  484. dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
  485. if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
  486. return LegalStoreKind::None;
  487. // The store and load must share the same stride.
  488. if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
  489. return LegalStoreKind::None;
  490. // Success. This store can be converted into a memcpy.
  491. UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
  492. return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
  493. : LegalStoreKind::Memcpy;
  494. }
  495. // This store can't be transformed into a memset/memcpy.
  496. return LegalStoreKind::None;
  497. }
  498. void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  499. StoreRefsForMemset.clear();
  500. StoreRefsForMemsetPattern.clear();
  501. StoreRefsForMemcpy.clear();
  502. for (Instruction &I : *BB) {
  503. StoreInst *SI = dyn_cast<StoreInst>(&I);
  504. if (!SI)
  505. continue;
  506. // Make sure this is a strided store with a constant stride.
  507. switch (isLegalStore(SI)) {
  508. case LegalStoreKind::None:
  509. // Nothing to do
  510. break;
  511. case LegalStoreKind::Memset: {
  512. // Find the base pointer.
  513. Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
  514. StoreRefsForMemset[Ptr].push_back(SI);
  515. } break;
  516. case LegalStoreKind::MemsetPattern: {
  517. // Find the base pointer.
  518. Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
  519. StoreRefsForMemsetPattern[Ptr].push_back(SI);
  520. } break;
  521. case LegalStoreKind::Memcpy:
  522. case LegalStoreKind::UnorderedAtomicMemcpy:
  523. StoreRefsForMemcpy.push_back(SI);
  524. break;
  525. default:
  526. assert(false && "unhandled return value");
  527. break;
  528. }
  529. }
  530. }
  531. /// runOnLoopBlock - Process the specified block, which lives in a counted loop
  532. /// with the specified backedge count. This block is known to be in the current
  533. /// loop and not in any subloops.
  534. bool LoopIdiomRecognize::runOnLoopBlock(
  535. BasicBlock *BB, const SCEV *BECount,
  536. SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  537. // We can only promote stores in this block if they are unconditionally
  538. // executed in the loop. For a block to be unconditionally executed, it has
  539. // to dominate all the exit blocks of the loop. Verify this now.
  540. for (BasicBlock *ExitBlock : ExitBlocks)
  541. if (!DT->dominates(BB, ExitBlock))
  542. return false;
  543. bool MadeChange = false;
  544. // Look for store instructions, which may be optimized to memset/memcpy.
  545. collectStores(BB);
  546. // Look for a single store or sets of stores with a common base, which can be
  547. // optimized into a memset (memset_pattern). The latter most commonly happens
  548. // with structs and hand-unrolled loops.
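// An illustrative sketch of the hand-unrolled case (names are expository):
//
//   struct S { int a, b; } *P;
//   for (unsigned i = 0; i != N; ++i) { P[i].a = 0; P[i].b = 0; }
//
// Each iteration stores two adjacent 4-byte fields, so the 8-byte stride
// matches the combined store size and the chained stores form one memset.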
  549. for (auto &SL : StoreRefsForMemset)
  550. MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
  551. for (auto &SL : StoreRefsForMemsetPattern)
  552. MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
  553. // Optimize the store into a memcpy, if it feeds a similarly strided load.
  554. for (auto &SI : StoreRefsForMemcpy)
  555. MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
  556. MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
  557. BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  558. MadeChange |= processLoopMemIntrinsic<MemSetInst>(
  559. BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
  560. return MadeChange;
  561. }
  562. /// See if this store(s) can be promoted to a memset.
  563. bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
  564. const SCEV *BECount, ForMemset For) {
  565. // Try to find consecutive stores that can be transformed into memsets.
  566. SetVector<StoreInst *> Heads, Tails;
  567. SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
  568. // Do a quadratic search on all of the given stores and find
  569. // all of the pairs of stores that follow each other.
  570. SmallVector<unsigned, 16> IndexQueue;
  571. for (unsigned i = 0, e = SL.size(); i < e; ++i) {
  572. assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
  573. Value *FirstStoredVal = SL[i]->getValueOperand();
  574. Value *FirstStorePtr = SL[i]->getPointerOperand();
  575. const SCEVAddRecExpr *FirstStoreEv =
  576. cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
  577. APInt FirstStride = getStoreStride(FirstStoreEv);
  578. unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
  579. // See if we can optimize just this store in isolation.
  580. if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
  581. Heads.insert(SL[i]);
  582. continue;
  583. }
  584. Value *FirstSplatValue = nullptr;
  585. Constant *FirstPatternValue = nullptr;
  586. if (For == ForMemset::Yes)
  587. FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
  588. else
  589. FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
  590. assert((FirstSplatValue || FirstPatternValue) &&
  591. "Expected either splat value or pattern value.");
  592. IndexQueue.clear();
  593. // If a store has multiple consecutive store candidates, search the Stores
  594. // array according to the sequence: from i+1 to e, then from i-1 down to 0.
  595. // This is because pairing with the immediately succeeding or preceding
  596. // candidate usually creates the best chance of finding a memset opportunity.
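// (For example, with i == 2 and e == 6 the candidate visit order is
// 3, 4, 5, 1, 0.)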
  597. unsigned j = 0;
  598. for (j = i + 1; j < e; ++j)
  599. IndexQueue.push_back(j);
  600. for (j = i; j > 0; --j)
  601. IndexQueue.push_back(j - 1);
  602. for (auto &k : IndexQueue) {
  603. assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
  604. Value *SecondStorePtr = SL[k]->getPointerOperand();
  605. const SCEVAddRecExpr *SecondStoreEv =
  606. cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
  607. APInt SecondStride = getStoreStride(SecondStoreEv);
  608. if (FirstStride != SecondStride)
  609. continue;
  610. Value *SecondStoredVal = SL[k]->getValueOperand();
  611. Value *SecondSplatValue = nullptr;
  612. Constant *SecondPatternValue = nullptr;
  613. if (For == ForMemset::Yes)
  614. SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
  615. else
  616. SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
  617. assert((SecondSplatValue || SecondPatternValue) &&
  618. "Expected either splat value or pattern value.");
  619. if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
  620. if (For == ForMemset::Yes) {
  621. if (isa<UndefValue>(FirstSplatValue))
  622. FirstSplatValue = SecondSplatValue;
  623. if (FirstSplatValue != SecondSplatValue)
  624. continue;
  625. } else {
  626. if (isa<UndefValue>(FirstPatternValue))
  627. FirstPatternValue = SecondPatternValue;
  628. if (FirstPatternValue != SecondPatternValue)
  629. continue;
  630. }
  631. Tails.insert(SL[k]);
  632. Heads.insert(SL[i]);
  633. ConsecutiveChain[SL[i]] = SL[k];
  634. break;
  635. }
  636. }
  637. }
  638. // We may run into multiple chains that merge into a single chain. We mark the
  639. // stores that we transformed so that we don't visit the same store twice.
  640. SmallPtrSet<Value *, 16> TransformedStores;
  641. bool Changed = false;
  642. // For stores that start but don't end a link in the chain:
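// (A store that appears in both Heads and Tails sits in the middle of a
// chain; it is skipped here and handled when the chain is walked from its
// head below.)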
  643. for (StoreInst *I : Heads) {
  644. if (Tails.count(I))
  645. continue;
  646. // We found a store instr that starts a chain. Now follow the chain and try
  647. // to transform it.
  648. SmallPtrSet<Instruction *, 8> AdjacentStores;
  649. StoreInst *HeadStore = I;
  650. unsigned StoreSize = 0;
  651. // Collect the chain into a list.
  652. while (Tails.count(I) || Heads.count(I)) {
  653. if (TransformedStores.count(I))
  654. break;
  655. AdjacentStores.insert(I);
  656. StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
  657. // Move to the next value in the chain.
  658. I = ConsecutiveChain[I];
  659. }
  660. Value *StoredVal = HeadStore->getValueOperand();
  661. Value *StorePtr = HeadStore->getPointerOperand();
  662. const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  663. APInt Stride = getStoreStride(StoreEv);
  664. // Check to see if the stride matches the size of the stores. If so, then
  665. // we know that every byte is touched in the loop.
  666. if (StoreSize != Stride && StoreSize != -Stride)
  667. continue;
  668. bool IsNegStride = StoreSize == -Stride;
  669. Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
  670. const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
  671. if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
  672. MaybeAlign(HeadStore->getAlign()), StoredVal,
  673. HeadStore, AdjacentStores, StoreEv, BECount,
  674. IsNegStride)) {
  675. TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
  676. Changed = true;
  677. }
  678. }
  679. return Changed;
  680. }
  681. /// processLoopMemIntrinsic - Template function for calling different processor
  682. /// functions based on mem intrinsic type.
  683. template <typename MemInst>
  684. bool LoopIdiomRecognize::processLoopMemIntrinsic(
  685. BasicBlock *BB,
  686. bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
  687. const SCEV *BECount) {
  688. bool MadeChange = false;
  689. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
  690. Instruction *Inst = &*I++;
  691. // Look for a memory instruction that may be optimized into a larger one.
  692. if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
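// Track the instruction the iterator currently points at (the one after
// MI) so we can detect whether the processor erased it.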
  693. WeakTrackingVH InstPtr(&*I);
  694. if (!(this->*Processor)(MI, BECount))
  695. continue;
  696. MadeChange = true;
  697. // If processing the instruction invalidated our iterator, start over from
  698. // the top of the block.
  699. if (!InstPtr)
  700. I = BB->begin();
  701. }
  702. }
  703. return MadeChange;
  704. }
  705. /// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
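///
/// Illustrative sketch (names are expository): a loop that executes
///   memcpy(&Dst[i], &Src[i], sizeof(Dst[i]))
/// each iteration, where the element size equals the pointer stride, copies a
/// contiguous range and can be promoted to a single large memcpy.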
  706. bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
  707. const SCEV *BECount) {
  708. // We can only handle non-volatile memcpys with a constant size.
  709. if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
  710. return false;
  711. // If we're not allowed to hack on memcpy, we fail.
  712. if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
  713. return false;
  714. Value *Dest = MCI->getDest();
  715. Value *Source = MCI->getSource();
  716. if (!Dest || !Source)
  717. return false;
  718. // See if the load and store pointer expressions are AddRec like {base,+,1} on
  719. // the current loop, which indicates a strided load and store. If we have
  720. // something else, it's a random load or store we can't handle.
  721. const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  722. if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
  723. return false;
  724. const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  725. if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
  726. return false;
  727. // Reject memcpys that are so large that they overflow an unsigned.
  728. uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  729. if ((SizeInBytes >> 32) != 0)
  730. return false;
  731. // Check if the stride matches the size of the memcpy. If so, then we know
  732. // that every byte is touched in the loop.
  733. const SCEVConstant *ConstStoreStride =
  734. dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  735. const SCEVConstant *ConstLoadStride =
  736. dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  737. if (!ConstStoreStride || !ConstLoadStride)
  738. return false;
  739. APInt StoreStrideValue = ConstStoreStride->getAPInt();
  740. APInt LoadStrideValue = ConstLoadStride->getAPInt();
  741. // Huge stride value - give up
  742. if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
  743. return false;
  744. if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
  745. ORE.emit([&]() {
  746. return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
  747. << ore::NV("Inst", "memcpy") << " in "
  748. << ore::NV("Function", MCI->getFunction())
  749. << " function will not be hoisted: "
  750. << ore::NV("Reason", "memcpy size is not equal to stride");
  751. });
  752. return false;
  753. }
  754. int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  755. int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  756. // Check if the load stride matches the store stride.
  757. if (StoreStrideInt != LoadStrideInt)
  758. return false;
  759. return processLoopStoreOfLoopLoad(
  760. Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
  761. MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
  762. BECount);
  763. }
  764. /// processLoopMemSet - See if this memset can be promoted to a large memset.
  765. bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
  766. const SCEV *BECount) {
  767. // We can only handle non-volatile memsets.
  768. if (MSI->isVolatile())
  769. return false;
  770. // If we're not allowed to hack on memset, we fail.
  771. if (!HasMemset || DisableLIRP::Memset)
  772. return false;
  773. Value *Pointer = MSI->getDest();
  774. // See if the pointer expression is an AddRec like {base,+,1} on the current
  775. // loop, which indicates a strided store. If we have something else, it's a
  776. // random store we can't handle.
  777. const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  778. if (!Ev || Ev->getLoop() != CurLoop)
  779. return false;
  780. if (!Ev->isAffine()) {
  781. LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
  782. return false;
  783. }
  784. const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  785. const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  786. if (!PointerStrideSCEV || !MemsetSizeSCEV)
  787. return false;
  788. bool IsNegStride = false;
  789. const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
  790. if (IsConstantSize) {
  791. // Memset size is constant.
  792. // Check if the pointer stride matches the memset size. If so, then
  793. // we know that every byte is touched in the loop.
  794. LLVM_DEBUG(dbgs() << " memset size is constant\n");
  795. uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  796. const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  797. if (!ConstStride)
  798. return false;
  799. APInt Stride = ConstStride->getAPInt();
  800. if (SizeInBytes != Stride && SizeInBytes != -Stride)
  801. return false;
  802. IsNegStride = SizeInBytes == -Stride;
  803. } else {
  804. // Memset size is non-constant.
  805. // Check if the pointer stride matches the memset size.
  806. // To be conservative, the pass does not promote pointers that aren't in
  807. // address space zero. Also, the pass only handles memset lengths and strides
  808. // that are invariant for the top-level loop.
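// Illustrative sketch (names are expository): in
//   for (unsigned i = 0; i != n; ++i)
//     memset(&A[i * m], 0, m);
// the pointer stride and the memset length are both m, so when m is
// loop-invariant the per-iteration memsets can be merged into one memset of
// n * m bytes.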
  809. LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
  810. if (Pointer->getType()->getPointerAddressSpace() != 0) {
  811. LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
  812. << "abort\n");
  813. return false;
  814. }
  815. if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
  816. LLVM_DEBUG(dbgs() << " memset size is not loop-invariant, "
  817. << "abort\n");
  818. return false;
  819. }
  820. // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
  821. IsNegStride = PointerStrideSCEV->isNonConstantNegative();
  822. const SCEV *PositiveStrideSCEV =
  823. IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
  824. : PointerStrideSCEV;
  825. LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
  826. << " PositiveStrideSCEV: " << *PositiveStrideSCEV
  827. << "\n");
  828. if (PositiveStrideSCEV != MemsetSizeSCEV) {
  829. // If an expression is covered by the loop guard, compare again and
  830. // proceed with optimization if equal.
  831. const SCEV *FoldedPositiveStride =
  832. SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
  833. const SCEV *FoldedMemsetSize =
  834. SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
  835. LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
  836. << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
  837. << " FoldedPositiveStride: " << *FoldedPositiveStride
  838. << "\n");
  839. if (FoldedPositiveStride != FoldedMemsetSize) {
  840. LLVM_DEBUG(dbgs() << " SCEVs don't match, abort\n");
  841. return false;
  842. }
  843. }
  844. }
  845. // Verify that the memset value is loop invariant. If not, we can't promote
  846. // the memset.
  847. Value *SplatValue = MSI->getValue();
  848. if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
  849. return false;
  850. SmallPtrSet<Instruction *, 1> MSIs;
  851. MSIs.insert(MSI);
  852. return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
  853. MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
  854. BECount, IsNegStride, /*IsLoopMemset=*/true);
  855. }
  856. /// mayLoopAccessLocation - Return true if the specified loop might access the
  857. /// specified pointer location, which is a loop-strided access. The 'Access'
  858. /// argument specifies what the verboten forms of access are (read or write).
  859. static bool
  860. mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
  861. const SCEV *BECount, const SCEV *StoreSizeSCEV,
  862. AliasAnalysis &AA,
  863. SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  864. // Get the location that may be stored across the loop. Since the access is
  865. // strided positively through memory, we say that the modified location starts
  866. // at the pointer and has infinite size.
  867. LocationSize AccessSize = LocationSize::afterPointer();
  868. // If the loop iterates a fixed number of times, we can refine the access size
  869. // to be exactly the size of the memset, which is (BECount+1)*StoreSize
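// (For example, a backedge-taken count of 99 with a 4-byte store size gives
// a precise access size of (99 + 1) * 4 = 400 bytes.)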
  870. const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  871. const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  872. if (BECst && ConstSize)
  873. AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
  874. ConstSize->getValue()->getZExtValue());
  875. // TODO: For this to be really effective, we have to dive into the pointer
  876. // operand in the store. A store to &A[i] over 100 iterations will always
  877. // return may-alias with a store to &A[100]; we need StoreLoc to be "A" with
  878. // a size of 100, which will then no-alias a store to &A[100].
  879. MemoryLocation StoreLoc(Ptr, AccessSize);
  880. for (BasicBlock *B : L->blocks())
  881. for (Instruction &I : *B)
  882. if (!IgnoredInsts.contains(&I) &&
  883. isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
  884. return true;
  885. return false;
  886. }
  887. // If we have a negative stride, Start refers to the end of the memory location
  888. // we're trying to memset. Therefore, we need to recompute the base pointer,
  889. // which is just Start - BECount*Size.
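// (For example, in a loop that stores 4 bytes per iteration walking downward,
// Start is the address written by the first iteration, i.e. the highest
// address; with BECount == n-1 the equivalent memset must begin at
// Start - (n-1)*4.)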
  890. static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
  891. Type *IntPtr, const SCEV *StoreSizeSCEV,
  892. ScalarEvolution *SE) {
  893. const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  894. if (!StoreSizeSCEV->isOne()) {
  895. // index = back edge count * store size
  896. Index = SE->getMulExpr(Index,
  897. SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
  898. SCEV::FlagNUW);
  899. }
  900. // base pointer = start - index  (Index already includes the store size)
  901. return SE->getMinusSCEV(Start, Index);
  902. }
  903. /// Compute trip count from the backedge taken count.
  904. static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
  905. Loop *CurLoop, const DataLayout *DL,
  906. ScalarEvolution *SE) {
  907. const SCEV *TripCountS = nullptr;
  908. // The trip count is (BECount+1). Expand the trip count out to
  909. // pointer size if it isn't already.
  910. //
  911. // If we're going to need to zero extend the BE count, check if we can add
  912. // one to it prior to zero extending without overflow. Provided this is safe,
  913. // it allows better simplification of the +1.
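// (For example, if BECount is i32 and the index type is i64, a loop guard
// proving BECount != UINT32_MAX lets us form zext(BECount + 1) rather than
// zext(BECount) + 1, which typically folds more cleanly.)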
  914. if (DL->getTypeSizeInBits(BECount->getType()) <
  915. DL->getTypeSizeInBits(IntPtr) &&
  916. SE->isLoopEntryGuardedByCond(
  917. CurLoop, ICmpInst::ICMP_NE, BECount,
  918. SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
  919. TripCountS = SE->getZeroExtendExpr(
  920. SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
  921. IntPtr);
  922. } else {
  923. TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
  924. SE->getOne(IntPtr), SCEV::FlagNUW);
  925. }
  926. return TripCountS;
  927. }
  928. /// Compute the number of bytes as a SCEV from the backedge taken count.
  929. ///
  930. /// This also maps the SCEV into the provided type and tries to handle the
  931. /// computation in a way that will fold cleanly.
  932. static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
  933. const SCEV *StoreSizeSCEV, Loop *CurLoop,
  934. const DataLayout *DL, ScalarEvolution *SE) {
  935. const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
  936. return SE->getMulExpr(TripCountSCEV,
  937. SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
  938. SCEV::FlagNUW);
  939. }
  940. /// processLoopStridedStore - We see a strided store of some value. If we can
  941. /// transform this into a memset or memset_pattern in the loop preheader, do so.
  942. bool LoopIdiomRecognize::processLoopStridedStore(
  943. Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
  944. Value *StoredVal, Instruction *TheStore,
  945. SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
  946. const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  947. Module *M = TheStore->getModule();
  948. Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  949. Constant *PatternValue = nullptr;
  950. if (!SplatValue)
  951. PatternValue = getMemSetPatternValue(StoredVal, DL);
  952. assert((SplatValue || PatternValue) &&
  953. "Expected either splat value or pattern value.");
  954. // The trip count of the loop and the base pointer of the addrec SCEV are
  955. // guaranteed to be loop invariant, which means that they should dominate the
  956. // header. This allows us to insert code for them in the preheader.
  957. unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  958. BasicBlock *Preheader = CurLoop->getLoopPreheader();
  959. IRBuilder<> Builder(Preheader->getTerminator());
  960. SCEVExpander Expander(*SE, *DL, "loop-idiom");
  961. SCEVExpanderCleaner ExpCleaner(Expander);
  962. Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  963. Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
  964. bool Changed = false;
  965. const SCEV *Start = Ev->getStart();
  966. // Handle negative strided loops.
  967. if (IsNegStride)
  968. Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
  969. // TODO: ideally we should still be able to generate memset if SCEV expander
  970. // is taught to generate the dependencies at the latest point.
  971. if (!Expander.isSafeToExpand(Start))
  972. return Changed;
  973. // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  974. // this into a memset in the loop preheader now if we want. However, this
  975. // would be unsafe to do if there is anything else in the loop that may read
  976. // or write to the aliased location. Check for any overlap by generating the
  977. // base pointer and checking the region.
  978. Value *BasePtr =
  979. Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
  980. // From here on out, conservatively report to the pass manager that we've
  981. // changed the IR, even if we later clean up these added instructions. There
  982. // may be structural differences e.g. in the order of use lists not accounted
  983. // for in just a textual dump of the IR. This is written as a variable, even
  984. // though statically all the places this dominates could be replaced with
  985. // 'true', with the hope that anyone trying to be clever / "more precise" with
  986. // the return value will read this comment, and leave them alone.
  987. Changed = true;
  988. if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
  989. StoreSizeSCEV, *AA, Stores))
  990. return Changed;
  991. if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
  992. return Changed;
  993. // Okay, everything looks good, insert the memset.
  994. const SCEV *NumBytesS =
  995. getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
  996. // TODO: ideally we should still be able to generate memset if SCEV expander
  997. // is taught to generate the dependencies at the latest point.
  998. if (!Expander.isSafeToExpand(NumBytesS))
  999. return Changed;
  1000. Value *NumBytes =
  1001. Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  1002. CallInst *NewCall;
  1003. if (SplatValue) {
  1004. AAMDNodes AATags = TheStore->getAAMetadata();
  1005. for (Instruction *Store : Stores)
  1006. AATags = AATags.merge(Store->getAAMetadata());
  1007. if (auto CI = dyn_cast<ConstantInt>(NumBytes))
  1008. AATags = AATags.extendTo(CI->getZExtValue());
  1009. else
  1010. AATags = AATags.extendTo(-1);
  1011. NewCall = Builder.CreateMemSet(
  1012. BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
  1013. /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  1014. } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
  1015. // Everything is emitted in default address space
  1016. Type *Int8PtrTy = DestInt8PtrTy;
  1017. StringRef FuncName = "memset_pattern16";
  1018. FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
  1019. Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
  1020. inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
  1021. // Otherwise we should form a memset_pattern16. PatternValue is known to be
  1022. // an constant array of 16-bytes. Plop the value into a mergable global.
  1023. GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
  1024. GlobalValue::PrivateLinkage,
  1025. PatternValue, ".memset_pattern");
  1026. GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
  1027. GV->setAlignment(Align(16));
  1028. Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
  1029. NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  1030. } else
  1031. return Changed;
  1032. NewCall->setDebugLoc(TheStore->getDebugLoc());
  1033. if (MSSAU) {
  1034. MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
  1035. NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
  1036. MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  1037. }
  1038. LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
  1039. << " from store to: " << *Ev << " at: " << *TheStore
  1040. << "\n");
  1041. ORE.emit([&]() {
  1042. OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
  1043. NewCall->getDebugLoc(), Preheader);
  1044. R << "Transformed loop-strided store in "
  1045. << ore::NV("Function", TheStore->getFunction())
  1046. << " function into a call to "
  1047. << ore::NV("NewFunction", NewCall->getCalledFunction())
  1048. << "() intrinsic";
  1049. if (!Stores.empty())
  1050. R << ore::setExtraArgs();
  1051. for (auto *I : Stores) {
  1052. R << ore::NV("FromBlock", I->getParent()->getName())
  1053. << ore::NV("ToBlock", Preheader->getName());
  1054. }
  1055. return R;
  1056. });
  1057. // Okay, the memset has been formed. Zap the original store and anything that
  1058. // feeds into it.
  1059. for (auto *I : Stores) {
  1060. if (MSSAU)
  1061. MSSAU->removeMemoryAccess(I, true);
  1062. deleteDeadInstruction(I);
  1063. }
  1064. if (MSSAU && VerifyMemorySSA)
  1065. MSSAU->getMemorySSA()->verifyMemorySSA();
  1066. ++NumMemSet;
  1067. ExpCleaner.markResultUsed();
  1068. return true;
  1069. }
  1070. /// If the stored value is a strided load in the same loop with the same stride
  1071. /// this may be transformable into a memcpy. This kicks in for stuff like
  1072. /// for (i) A[i] = B[i];
  1073. bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
  1074. const SCEV *BECount) {
  1075. assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
  1076. Value *StorePtr = SI->getPointerOperand();
  1077. const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  1078. unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  1079. // The store must be feeding a non-volatile load.
  1080. LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  1081. assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
  1082. // See if the pointer expression is an AddRec like {base,+,1} on the current
  1083. // loop, which indicates a strided load. If we have something else, it's a
  1084. // random load we can't handle.
  1085. Value *LoadPtr = LI->getPointerOperand();
  1086. const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
  1087. const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  1088. return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
  1089. SI->getAlign(), LI->getAlign(), SI, LI,
  1090. StoreEv, LoadEv, BECount);
  1091. }
  1092. namespace {
  1093. class MemmoveVerifier {
  1094. public:
  1095. explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
  1096. const DataLayout &DL)
  1097. : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
  1098. LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
  1099. BP2(llvm::GetPointerBaseWithConstantOffset(
  1100. StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
  1101. IsSameObject(BP1 == BP2) {}
  1102. bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
  1103. const Instruction &TheLoad,
  1104. bool IsMemCpy) const {
  1105. if (IsMemCpy) {
  1106. // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
  1107. // for negative stride.
  1108. if ((!IsNegStride && LoadOff <= StoreOff) ||
  1109. (IsNegStride && LoadOff >= StoreOff))
  1110. return false;
  1111. } else {
  1112. // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
  1113. // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
  1114. int64_t LoadSize =
  1115. DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
  1116. if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
  1117. return false;
  1118. if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
  1119. (IsNegStride && LoadOff + LoadSize > StoreOff))
  1120. return false;
  1121. }
  1122. return true;
  1123. }
  1124. private:
  1125. const DataLayout &DL;
  1126. int64_t LoadOff = 0;
  1127. int64_t StoreOff = 0;
  1128. const Value *BP1;
  1129. const Value *BP2;
  1130. public:
  1131. const bool IsSameObject;
  1132. };
  1133. } // namespace
  1134. bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
  1135. Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
  1136. MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
  1137. Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
  1138. const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
  1139. // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  1140. // conservatively bail here, since otherwise we may have to transform
  1141. // llvm.memcpy.inline into llvm.memcpy which is illegal.
  1142. if (isa<MemCpyInlineInst>(TheStore))
  1143. return false;
  1144. // The trip count of the loop and the base pointer of the addrec SCEV is
  1145. // guaranteed to be loop invariant, which means that it should dominate the
  1146. // header. This allows us to insert code for it in the preheader.
  1147. BasicBlock *Preheader = CurLoop->getLoopPreheader();
  1148. IRBuilder<> Builder(Preheader->getTerminator());
  1149. SCEVExpander Expander(*SE, *DL, "loop-idiom");
  1150. SCEVExpanderCleaner ExpCleaner(Expander);
  1151. bool Changed = false;
  1152. const SCEV *StrStart = StoreEv->getStart();
  1153. unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  1154. Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
  1155. APInt Stride = getStoreStride(StoreEv);
  1156. const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  1157. // TODO: Deal with non-constant size; Currently expect constant store size
  1158. assert(ConstStoreSize && "store size is expected to be a constant");
  1159. int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  1160. bool IsNegStride = StoreSize == -Stride;
  1161. // Handle negative strided loops.
  1162. if (IsNegStride)
  1163. StrStart =
  1164. getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
  1165. // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  1166. // this into a memcpy in the loop preheader now if we want. However, this
  1167. // would be unsafe to do if there is anything else in the loop that may read
  1168. // or write the memory region we're storing to. This includes the load that
  1169. // feeds the stores. Check for an alias by generating the base address and
  1170. // checking everything.
  1171. Value *StoreBasePtr = Expander.expandCodeFor(
  1172. StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
  1173. // From here on out, conservatively report to the pass manager that we've
  1174. // changed the IR, even if we later clean up these added instructions. There
  1175. // may be structural differences e.g. in the order of use lists not accounted
  1176. // for in just a textual dump of the IR. This is written as a variable, even
  1177. // though statically all the places this dominates could be replaced with
  1178. // 'true', with the hope that anyone trying to be clever / "more precise" with
  1179. // the return value will read this comment, and leave them alone.
  1180. Changed = true;
  1181. SmallPtrSet<Instruction *, 2> IgnoredInsts;
  1182. IgnoredInsts.insert(TheStore);
  1183. bool IsMemCpy = isa<MemCpyInst>(TheStore);
  1184. const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
  1185. bool LoopAccessStore =
  1186. mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
  1187. StoreSizeSCEV, *AA, IgnoredInsts);
  1188. if (LoopAccessStore) {
  1189. // For memmove case it's not enough to guarantee that loop doesn't access
  1190. // TheStore and TheLoad. Additionally we need to make sure that TheStore is
  1191. // the only user of TheLoad.
  1192. if (!TheLoad->hasOneUse())
  1193. return Changed;
  1194. IgnoredInsts.insert(TheLoad);
  1195. if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
  1196. BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
  1197. ORE.emit([&]() {
  1198. return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
  1199. TheStore)
  1200. << ore::NV("Inst", InstRemark) << " in "
  1201. << ore::NV("Function", TheStore->getFunction())
  1202. << " function will not be hoisted: "
  1203. << ore::NV("Reason", "The loop may access store location");
  1204. });
  1205. return Changed;
  1206. }
  1207. IgnoredInsts.erase(TheLoad);
  1208. }
  1209. const SCEV *LdStart = LoadEv->getStart();
  1210. unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
  1211. // Handle negative strided loops.
  1212. if (IsNegStride)
  1213. LdStart =
  1214. getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
  1215. // For a memcpy, we have to make sure that the input array is not being
  1216. // mutated by the loop.
  1217. Value *LoadBasePtr = Expander.expandCodeFor(
  1218. LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
  1219. // If the store is a memcpy instruction, we must check if it will write to
  1220. // the load memory locations. So remove it from the ignored stores.
  1221. MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  1222. if (IsMemCpy && !Verifier.IsSameObject)
  1223. IgnoredInsts.erase(TheStore);
  1224. if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
  1225. StoreSizeSCEV, *AA, IgnoredInsts)) {
  1226. ORE.emit([&]() {
  1227. return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
  1228. << ore::NV("Inst", InstRemark) << " in "
  1229. << ore::NV("Function", TheStore->getFunction())
  1230. << " function will not be hoisted: "
  1231. << ore::NV("Reason", "The loop may access load location");
  1232. });
  1233. return Changed;
  1234. }
  1235. bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  1236. if (UseMemMove)
  1237. if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
  1238. IsMemCpy))
  1239. return Changed;
  1240. if (avoidLIRForMultiBlockLoop())
  1241. return Changed;
  1242. // Okay, everything is safe, we can transform this!
  1243. const SCEV *NumBytesS =
  1244. getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
  1245. Value *NumBytes =
  1246. Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
  1247. AAMDNodes AATags = TheLoad->getAAMetadata();
  1248. AAMDNodes StoreAATags = TheStore->getAAMetadata();
  1249. AATags = AATags.merge(StoreAATags);
  1250. if (auto CI = dyn_cast<ConstantInt>(NumBytes))
  1251. AATags = AATags.extendTo(CI->getZExtValue());
  1252. else
  1253. AATags = AATags.extendTo(-1);
  1254. CallInst *NewCall = nullptr;
  1255. // Check whether to generate an unordered atomic memcpy:
  1256. // If the load or store are atomic, then they must necessarily be unordered
  1257. // by previous checks.
  1258. if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
  1259. if (UseMemMove)
  1260. NewCall = Builder.CreateMemMove(
  1261. StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
  1262. /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  1263. else
  1264. NewCall =
  1265. Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
  1266. NumBytes, /*isVolatile=*/false, AATags.TBAA,
  1267. AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  1268. } else {
  1269. // For now don't support unordered atomic memmove.
  1270. if (UseMemMove)
  1271. return Changed;
  1272. // We cannot allow unaligned ops for unordered load/store, so reject
  1273. // anything where the alignment isn't at least the element size.
  1274. assert((StoreAlign && LoadAlign) &&
  1275. "Expect unordered load/store to have align.");
  1276. if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
  1277. return Changed;
  1278. // If the element.atomic memcpy is not lowered into explicit
  1279. // loads/stores later, then it will be lowered into an element-size
  1280. // specific lib call. If the lib call doesn't exist for our store size, then
  1281. // we shouldn't generate the memcpy.
  1282. if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
  1283. return Changed;
  1284. // Create the call.
  1285. // Note that unordered atomic loads/stores are *required* by the spec to
  1286. // have an alignment but non-atomic loads/stores may not.
  1287. NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
  1288. StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
  1289. AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  1290. }
  1291. NewCall->setDebugLoc(TheStore->getDebugLoc());
  1292. if (MSSAU) {
  1293. MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
  1294. NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
  1295. MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  1296. }
  1297. LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
  1298. << " from load ptr=" << *LoadEv << " at: " << *TheLoad
  1299. << "\n"
  1300. << " from store ptr=" << *StoreEv << " at: " << *TheStore
  1301. << "\n");
  1302. ORE.emit([&]() {
  1303. return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
  1304. NewCall->getDebugLoc(), Preheader)
  1305. << "Formed a call to "
  1306. << ore::NV("NewFunction", NewCall->getCalledFunction())
  1307. << "() intrinsic from " << ore::NV("Inst", InstRemark)
  1308. << " instruction in " << ore::NV("Function", TheStore->getFunction())
  1309. << " function"
  1310. << ore::setExtraArgs()
  1311. << ore::NV("FromBlock", TheStore->getParent()->getName())
  1312. << ore::NV("ToBlock", Preheader->getName());
  1313. });
  1314. // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  1315. // and anything that feeds into it.
  1316. if (MSSAU)
  1317. MSSAU->removeMemoryAccess(TheStore, true);
  1318. deleteDeadInstruction(TheStore);
  1319. if (MSSAU && VerifyMemorySSA)
  1320. MSSAU->getMemorySSA()->verifyMemorySSA();
  1321. if (UseMemMove)
  1322. ++NumMemMove;
  1323. else
  1324. ++NumMemCpy;
  1325. ExpCleaner.markResultUsed();
  1326. return true;
  1327. }
  1328. // When compiling for codesize we avoid idiom recognition for a multi-block loop
  1329. // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
  1330. //
  1331. bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
  1332. bool IsLoopMemset) {
  1333. if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
  1334. if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
  1335. LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
  1336. << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
  1337. << " avoided: multi-block top-level loop\n");
  1338. return true;
  1339. }
  1340. }
  1341. return false;
  1342. }
  1343. bool LoopIdiomRecognize::runOnNoncountableLoop() {
  1344. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
  1345. << CurLoop->getHeader()->getParent()->getName()
  1346. << "] Noncountable Loop %"
  1347. << CurLoop->getHeader()->getName() << "\n");
  1348. return recognizePopcount() || recognizeAndInsertFFS() ||
  1349. recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
  1350. }
  1351. /// Check if the given conditional branch is based on the comparison between
  1352. /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
  1353. /// true), the control yields to the loop entry. If the branch matches the
  1354. /// behavior, the variable involved in the comparison is returned. This function
  1355. /// will be called to see if the precondition and postcondition of the loop are
  1356. /// in desirable form.
  1357. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
  1358. bool JmpOnZero = false) {
  1359. if (!BI || !BI->isConditional())
  1360. return nullptr;
  1361. ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  1362. if (!Cond)
  1363. return nullptr;
  1364. ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  1365. if (!CmpZero || !CmpZero->isZero())
  1366. return nullptr;
  1367. BasicBlock *TrueSucc = BI->getSuccessor(0);
  1368. BasicBlock *FalseSucc = BI->getSuccessor(1);
  1369. if (JmpOnZero)
  1370. std::swap(TrueSucc, FalseSucc);
  1371. ICmpInst::Predicate Pred = Cond->getPredicate();
  1372. if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
  1373. (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
  1374. return Cond->getOperand(0);
  1375. return nullptr;
  1376. }
  1377. // Check if the recurrence variable `VarX` is in the right form to create
  1378. // the idiom. Returns the value coerced to a PHINode if so.
  1379. static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
  1380. BasicBlock *LoopEntry) {
  1381. auto *PhiX = dyn_cast<PHINode>(VarX);
  1382. if (PhiX && PhiX->getParent() == LoopEntry &&
  1383. (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
  1384. return PhiX;
  1385. return nullptr;
  1386. }
  1387. /// Return true iff the idiom is detected in the loop.
  1388. ///
  1389. /// Additionally:
  1390. /// 1) \p CntInst is set to the instruction counting the population bit.
  1391. /// 2) \p CntPhi is set to the corresponding phi node.
  1392. /// 3) \p Var is set to the value whose population bits are being counted.
  1393. ///
  1394. /// The core idiom we are trying to detect is:
  1395. /// \code
  1396. /// if (x0 != 0)
  1397. /// goto loop-exit // the precondition of the loop
  1398. /// cnt0 = init-val;
  1399. /// do {
  1400. /// x1 = phi (x0, x2);
  1401. /// cnt1 = phi(cnt0, cnt2);
  1402. ///
  1403. /// cnt2 = cnt1 + 1;
  1404. /// ...
  1405. /// x2 = x1 & (x1 - 1);
  1406. /// ...
  1407. /// } while(x != 0);
  1408. ///
  1409. /// loop-exit:
  1410. /// \endcode
  1411. static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
  1412. Instruction *&CntInst, PHINode *&CntPhi,
  1413. Value *&Var) {
  1414. // step 1: Check to see if the look-back branch match this pattern:
  1415. // "if (a!=0) goto loop-entry".
  1416. BasicBlock *LoopEntry;
  1417. Instruction *DefX2, *CountInst;
  1418. Value *VarX1, *VarX0;
  1419. PHINode *PhiX, *CountPhi;
  1420. DefX2 = CountInst = nullptr;
  1421. VarX1 = VarX0 = nullptr;
  1422. PhiX = CountPhi = nullptr;
  1423. LoopEntry = *(CurLoop->block_begin());
  1424. // step 1: Check if the loop-back branch is in desirable form.
  1425. {
  1426. if (Value *T = matchCondition(
  1427. dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
  1428. DefX2 = dyn_cast<Instruction>(T);
  1429. else
  1430. return false;
  1431. }
  1432. // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  1433. {
  1434. if (!DefX2 || DefX2->getOpcode() != Instruction::And)
  1435. return false;
  1436. BinaryOperator *SubOneOp;
  1437. if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
  1438. VarX1 = DefX2->getOperand(1);
  1439. else {
  1440. VarX1 = DefX2->getOperand(0);
  1441. SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
  1442. }
  1443. if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
  1444. return false;
  1445. ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
  1446. if (!Dec ||
  1447. !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
  1448. (SubOneOp->getOpcode() == Instruction::Add &&
  1449. Dec->isMinusOne()))) {
  1450. return false;
  1451. }
  1452. }
  1453. // step 3: Check the recurrence of variable X
  1454. PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  1455. if (!PhiX)
  1456. return false;
  1457. // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
  1458. {
  1459. CountInst = nullptr;
  1460. for (Instruction &Inst : llvm::make_range(
  1461. LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
  1462. if (Inst.getOpcode() != Instruction::Add)
  1463. continue;
  1464. ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
  1465. if (!Inc || !Inc->isOne())
  1466. continue;
  1467. PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
  1468. if (!Phi)
  1469. continue;
  1470. // Check if the result of the instruction is live of the loop.
  1471. bool LiveOutLoop = false;
  1472. for (User *U : Inst.users()) {
  1473. if ((cast<Instruction>(U))->getParent() != LoopEntry) {
  1474. LiveOutLoop = true;
  1475. break;
  1476. }
  1477. }
  1478. if (LiveOutLoop) {
  1479. CountInst = &Inst;
  1480. CountPhi = Phi;
  1481. break;
  1482. }
  1483. }
  1484. if (!CountInst)
  1485. return false;
  1486. }
  1487. // step 5: check if the precondition is in this form:
  1488. // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  1489. {
  1490. auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1491. Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
  1492. if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
  1493. return false;
  1494. CntInst = CountInst;
  1495. CntPhi = CountPhi;
  1496. Var = T;
  1497. }
  1498. return true;
  1499. }
  1500. /// Return true if the idiom is detected in the loop.
  1501. ///
  1502. /// Additionally:
  1503. /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
  1504. /// or nullptr if there is no such.
  1505. /// 2) \p CntPhi is set to the corresponding phi node
  1506. /// or nullptr if there is no such.
  1507. /// 3) \p Var is set to the value whose CTLZ could be used.
  1508. /// 4) \p DefX is set to the instruction calculating Loop exit condition.
  1509. ///
  1510. /// The core idiom we are trying to detect is:
  1511. /// \code
  1512. /// if (x0 == 0)
  1513. /// goto loop-exit // the precondition of the loop
  1514. /// cnt0 = init-val;
  1515. /// do {
  1516. /// x = phi (x0, x.next); //PhiX
  1517. /// cnt = phi(cnt0, cnt.next);
  1518. ///
  1519. /// cnt.next = cnt + 1;
  1520. /// ...
  1521. /// x.next = x >> 1; // DefX
  1522. /// ...
  1523. /// } while(x.next != 0);
  1524. ///
  1525. /// loop-exit:
  1526. /// \endcode
  1527. static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
  1528. Intrinsic::ID &IntrinID, Value *&InitX,
  1529. Instruction *&CntInst, PHINode *&CntPhi,
  1530. Instruction *&DefX) {
  1531. BasicBlock *LoopEntry;
  1532. Value *VarX = nullptr;
  1533. DefX = nullptr;
  1534. CntInst = nullptr;
  1535. CntPhi = nullptr;
  1536. LoopEntry = *(CurLoop->block_begin());
  1537. // step 1: Check if the loop-back branch is in desirable form.
  1538. if (Value *T = matchCondition(
  1539. dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
  1540. DefX = dyn_cast<Instruction>(T);
  1541. else
  1542. return false;
  1543. // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  1544. if (!DefX || !DefX->isShift())
  1545. return false;
  1546. IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
  1547. Intrinsic::ctlz;
  1548. ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  1549. if (!Shft || !Shft->isOne())
  1550. return false;
  1551. VarX = DefX->getOperand(0);
  1552. // step 3: Check the recurrence of variable X
  1553. PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  1554. if (!PhiX)
  1555. return false;
  1556. InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
  1557. // Make sure the initial value can't be negative otherwise the ashr in the
  1558. // loop might never reach zero which would make the loop infinite.
  1559. if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
  1560. return false;
  1561. // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
  1562. // or cnt.next = cnt + -1.
  1563. // TODO: We can skip the step. If loop trip count is known (CTLZ),
  1564. // then all uses of "cnt.next" could be optimized to the trip count
  1565. // plus "cnt0". Currently it is not optimized.
  1566. // This step could be used to detect POPCNT instruction:
  1567. // cnt.next = cnt + (x.next & 1)
  1568. for (Instruction &Inst : llvm::make_range(
  1569. LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
  1570. if (Inst.getOpcode() != Instruction::Add)
  1571. continue;
  1572. ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
  1573. if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
  1574. continue;
  1575. PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
  1576. if (!Phi)
  1577. continue;
  1578. CntInst = &Inst;
  1579. CntPhi = Phi;
  1580. break;
  1581. }
  1582. if (!CntInst)
  1583. return false;
  1584. return true;
  1585. }
  1586. /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
  1587. /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
  1588. /// trip count returns true; otherwise, returns false.
  1589. bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  1590. // Give up if the loop has multiple blocks or multiple backedges.
  1591. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
  1592. return false;
  1593. Intrinsic::ID IntrinID;
  1594. Value *InitX;
  1595. Instruction *DefX = nullptr;
  1596. PHINode *CntPhi = nullptr;
  1597. Instruction *CntInst = nullptr;
  1598. // Help decide if transformation is profitable. For ShiftUntilZero idiom,
  1599. // this is always 6.
  1600. size_t IdiomCanonicalSize = 6;
  1601. if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
  1602. CntInst, CntPhi, DefX))
  1603. return false;
  1604. bool IsCntPhiUsedOutsideLoop = false;
  1605. for (User *U : CntPhi->users())
  1606. if (!CurLoop->contains(cast<Instruction>(U))) {
  1607. IsCntPhiUsedOutsideLoop = true;
  1608. break;
  1609. }
  1610. bool IsCntInstUsedOutsideLoop = false;
  1611. for (User *U : CntInst->users())
  1612. if (!CurLoop->contains(cast<Instruction>(U))) {
  1613. IsCntInstUsedOutsideLoop = true;
  1614. break;
  1615. }
  1616. // If both CntInst and CntPhi are used outside the loop the profitability
  1617. // is questionable.
  1618. if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
  1619. return false;
  1620. // For some CPUs result of CTLZ(X) intrinsic is undefined
  1621. // when X is 0. If we can not guarantee X != 0, we need to check this
  1622. // when expand.
  1623. bool ZeroCheck = false;
  1624. // It is safe to assume Preheader exist as it was checked in
  1625. // parent function RunOnLoop.
  1626. BasicBlock *PH = CurLoop->getLoopPreheader();
  1627. // If we are using the count instruction outside the loop, make sure we
  1628. // have a zero check as a precondition. Without the check the loop would run
  1629. // one iteration for before any check of the input value. This means 0 and 1
  1630. // would have identical behavior in the original loop and thus
  1631. if (!IsCntPhiUsedOutsideLoop) {
  1632. auto *PreCondBB = PH->getSinglePredecessor();
  1633. if (!PreCondBB)
  1634. return false;
  1635. auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1636. if (!PreCondBI)
  1637. return false;
  1638. if (matchCondition(PreCondBI, PH) != InitX)
  1639. return false;
  1640. ZeroCheck = true;
  1641. }
  1642. // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
  1643. // profitable if we delete the loop.
  1644. // the loop has only 6 instructions:
  1645. // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  1646. // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  1647. // %shr = ashr %n.addr.0, 1
  1648. // %tobool = icmp eq %shr, 0
  1649. // %inc = add nsw %i.0, 1
  1650. // br i1 %tobool
  1651. const Value *Args[] = {InitX,
  1652. ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
  1653. // @llvm.dbg doesn't count as they have no semantic effect.
  1654. auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  1655. uint32_t HeaderSize =
  1656. std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
  1657. IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
  1658. InstructionCost Cost =
  1659. TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  1660. if (HeaderSize != IdiomCanonicalSize &&
  1661. Cost > TargetTransformInfo::TCC_Basic)
  1662. return false;
  1663. transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
  1664. DefX->getDebugLoc(), ZeroCheck,
  1665. IsCntPhiUsedOutsideLoop);
  1666. return true;
  1667. }
  1668. /// Recognizes a population count idiom in a non-countable loop.
  1669. ///
  1670. /// If detected, transforms the relevant code to issue the popcount intrinsic
  1671. /// function call, and returns true; otherwise, returns false.
  1672. bool LoopIdiomRecognize::recognizePopcount() {
  1673. if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
  1674. return false;
  1675. // Counting population are usually conducted by few arithmetic instructions.
  1676. // Such instructions can be easily "absorbed" by vacant slots in a
  1677. // non-compact loop. Therefore, recognizing popcount idiom only makes sense
  1678. // in a compact loop.
  1679. // Give up if the loop has multiple blocks or multiple backedges.
  1680. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
  1681. return false;
  1682. BasicBlock *LoopBody = *(CurLoop->block_begin());
  1683. if (LoopBody->size() >= 20) {
  1684. // The loop is too big, bail out.
  1685. return false;
  1686. }
  1687. // It should have a preheader containing nothing but an unconditional branch.
  1688. BasicBlock *PH = CurLoop->getLoopPreheader();
  1689. if (!PH || &PH->front() != PH->getTerminator())
  1690. return false;
  1691. auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  1692. if (!EntryBI || EntryBI->isConditional())
  1693. return false;
  1694. // It should have a precondition block where the generated popcount intrinsic
  1695. // function can be inserted.
  1696. auto *PreCondBB = PH->getSinglePredecessor();
  1697. if (!PreCondBB)
  1698. return false;
  1699. auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  1700. if (!PreCondBI || PreCondBI->isUnconditional())
  1701. return false;
  1702. Instruction *CntInst;
  1703. PHINode *CntPhi;
  1704. Value *Val;
  1705. if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
  1706. return false;
  1707. transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  1708. return true;
  1709. }
  1710. static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  1711. const DebugLoc &DL) {
  1712. Value *Ops[] = {Val};
  1713. Type *Tys[] = {Val->getType()};
  1714. Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  1715. Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  1716. CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  1717. CI->setDebugLoc(DL);
  1718. return CI;
  1719. }
  1720. static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  1721. const DebugLoc &DL, bool ZeroCheck,
  1722. Intrinsic::ID IID) {
  1723. Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
  1724. Type *Tys[] = {Val->getType()};
  1725. Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  1726. Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
  1727. CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  1728. CI->setDebugLoc(DL);
  1729. return CI;
  1730. }
  1731. /// Transform the following loop (Using CTLZ, CTTZ is similar):
  1732. /// loop:
  1733. /// CntPhi = PHI [Cnt0, CntInst]
  1734. /// PhiX = PHI [InitX, DefX]
  1735. /// CntInst = CntPhi + 1
  1736. /// DefX = PhiX >> 1
  1737. /// LOOP_BODY
  1738. /// Br: loop if (DefX != 0)
  1739. /// Use(CntPhi) or Use(CntInst)
  1740. ///
  1741. /// Into:
  1742. /// If CntPhi used outside the loop:
  1743. /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
  1744. /// Count = CountPrev + 1
  1745. /// else
  1746. /// Count = BitWidth(InitX) - CTLZ(InitX)
  1747. /// loop:
  1748. /// CntPhi = PHI [Cnt0, CntInst]
  1749. /// PhiX = PHI [InitX, DefX]
  1750. /// PhiCount = PHI [Count, Dec]
  1751. /// CntInst = CntPhi + 1
  1752. /// DefX = PhiX >> 1
  1753. /// Dec = PhiCount - 1
  1754. /// LOOP_BODY
  1755. /// Br: loop if (Dec != 0)
  1756. /// Use(CountPrev + Cnt0) // Use(CntPhi)
  1757. /// or
  1758. /// Use(Count + Cnt0) // Use(CntInst)
  1759. ///
  1760. /// If LOOP_BODY is empty the loop will be deleted.
  1761. /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
  1762. void LoopIdiomRecognize::transformLoopToCountable(
  1763. Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
  1764. PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
  1765. bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  1766. BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
  1767. // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
  1768. IRBuilder<> Builder(PreheaderBr);
  1769. Builder.SetCurrentDebugLocation(DL);
  1770. // If there are no uses of CntPhi crate:
  1771. // Count = BitWidth - CTLZ(InitX);
  1772. // NewCount = Count;
  1773. // If there are uses of CntPhi create:
  1774. // NewCount = BitWidth - CTLZ(InitX >> 1);
  1775. // Count = NewCount + 1;
  1776. Value *InitXNext;
  1777. if (IsCntPhiUsedOutsideLoop) {
  1778. if (DefX->getOpcode() == Instruction::AShr)
  1779. InitXNext = Builder.CreateAShr(InitX, 1);
  1780. else if (DefX->getOpcode() == Instruction::LShr)
  1781. InitXNext = Builder.CreateLShr(InitX, 1);
  1782. else if (DefX->getOpcode() == Instruction::Shl) // cttz
  1783. InitXNext = Builder.CreateShl(InitX, 1);
  1784. else
  1785. llvm_unreachable("Unexpected opcode!");
  1786. } else
  1787. InitXNext = InitX;
  1788. Value *Count =
  1789. createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  1790. Type *CountTy = Count->getType();
  1791. Count = Builder.CreateSub(
  1792. ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  1793. Value *NewCount = Count;
  1794. if (IsCntPhiUsedOutsideLoop)
  1795. Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
  1796. NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
  1797. Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  1798. if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
  1799. // If the counter was being incremented in the loop, add NewCount to the
  1800. // counter's initial value, but only if the initial value is not zero.
  1801. ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  1802. if (!InitConst || !InitConst->isZero())
  1803. NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  1804. } else {
  1805. // If the count was being decremented in the loop, subtract NewCount from
  1806. // the counter's initial value.
  1807. NewCount = Builder.CreateSub(CntInitVal, NewCount);
  1808. }
  1809. // Step 2: Insert new IV and loop condition:
  1810. // loop:
  1811. // ...
  1812. // PhiCount = PHI [Count, Dec]
  1813. // ...
  1814. // Dec = PhiCount - 1
  1815. // ...
  1816. // Br: loop if (Dec != 0)
  1817. BasicBlock *Body = *(CurLoop->block_begin());
  1818. auto *LbBr = cast<BranchInst>(Body->getTerminator());
  1819. ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  1820. PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
  1821. Builder.SetInsertPoint(LbCond);
  1822. Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
  1823. TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
  1824. TcPhi->addIncoming(Count, Preheader);
  1825. TcPhi->addIncoming(TcDec, Body);
  1826. CmpInst::Predicate Pred =
  1827. (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  1828. LbCond->setPredicate(Pred);
  1829. LbCond->setOperand(0, TcDec);
  1830. LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
  1831. // Step 3: All the references to the original counter outside
  1832. // the loop are replaced with the NewCount
  1833. if (IsCntPhiUsedOutsideLoop)
  1834. CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  1835. else
  1836. CntInst->replaceUsesOutsideBlock(NewCount, Body);
  1837. // step 4: Forget the "non-computable" trip-count SCEV associated with the
  1838. // loop. The loop would otherwise not be deleted even if it becomes empty.
  1839. SE->forgetLoop(CurLoop);
  1840. }
  1841. void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
  1842. Instruction *CntInst,
  1843. PHINode *CntPhi, Value *Var) {
  1844. BasicBlock *PreHead = CurLoop->getLoopPreheader();
  1845. auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  1846. const DebugLoc &DL = CntInst->getDebugLoc();
  1847. // Assuming before transformation, the loop is following:
  1848. // if (x) // the precondition
  1849. // do { cnt++; x &= x - 1; } while(x);
  1850. // Step 1: Insert the ctpop instruction at the end of the precondition block
  1851. IRBuilder<> Builder(PreCondBr);
  1852. Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  1853. {
  1854. PopCnt = createPopcntIntrinsic(Builder, Var, DL);
  1855. NewCount = PopCntZext =
  1856. Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
  1857. if (NewCount != PopCnt)
  1858. (cast<Instruction>(NewCount))->setDebugLoc(DL);
  1859. // TripCnt is exactly the number of iterations the loop has
  1860. TripCnt = NewCount;
  1861. // If the population counter's initial value is not zero, insert Add Inst.
  1862. Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
  1863. ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  1864. if (!InitConst || !InitConst->isZero()) {
  1865. NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  1866. (cast<Instruction>(NewCount))->setDebugLoc(DL);
  1867. }
  1868. }
  1869. // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
  1870. // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
  1871. // function would be partial dead code, and downstream passes will drag
  1872. // it back from the precondition block to the preheader.
  1873. {
  1874. ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
  1875. Value *Opnd0 = PopCntZext;
  1876. Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
  1877. if (PreCond->getOperand(0) != Var)
  1878. std::swap(Opnd0, Opnd1);
  1879. ICmpInst *NewPreCond = cast<ICmpInst>(
  1880. Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
  1881. PreCondBr->setCondition(NewPreCond);
  1882. RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
  1883. }
  1884. // Step 3: Note that the population count is exactly the trip count of the
  1885. // loop in question, which enable us to convert the loop from noncountable
  1886. // loop into a countable one. The benefit is twofold:
  1887. //
  1888. // - If the loop only counts population, the entire loop becomes dead after
  1889. // the transformation. It is a lot easier to prove a countable loop dead
  1890. // than to prove a noncountable one. (In some C dialects, an infinite loop
  1891. // isn't dead even if it computes nothing useful. In general, DCE needs
  1892. // to prove a noncountable loop finite before safely delete it.)
  1893. //
  1894. // - If the loop also performs something else, it remains alive.
  1895. // Since it is transformed to countable form, it can be aggressively
  1896. // optimized by some optimizations which are in general not applicable
  1897. // to a noncountable loop.
  1898. //
  1899. // After this step, this loop (conceptually) would look like following:
  1900. // newcnt = __builtin_ctpop(x);
  1901. // t = newcnt;
  1902. // if (x)
  1903. // do { cnt++; x &= x-1; t--) } while (t > 0);
  1904. BasicBlock *Body = *(CurLoop->block_begin());
  1905. {
  1906. auto *LbBr = cast<BranchInst>(Body->getTerminator());
  1907. ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  1908. Type *Ty = TripCnt->getType();
  1909. PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
  1910. Builder.SetInsertPoint(LbCond);
  1911. Instruction *TcDec = cast<Instruction>(
  1912. Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
  1913. "tcdec", false, true));
  1914. TcPhi->addIncoming(TripCnt, PreHead);
  1915. TcPhi->addIncoming(TcDec, Body);
  1916. CmpInst::Predicate Pred =
  1917. (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
  1918. LbCond->setPredicate(Pred);
  1919. LbCond->setOperand(0, TcDec);
  1920. LbCond->setOperand(1, ConstantInt::get(Ty, 0));
  1921. }
  1922. // Step 4: All the references to the original population counter outside
  1923. // the loop are replaced with the NewCount -- the value returned from
  1924. // __builtin_ctpop().
  1925. CntInst->replaceUsesOutsideBlock(NewCount, Body);
  1926. // step 5: Forget the "non-computable" trip-count SCEV associated with the
  1927. // loop. The loop would otherwise not be deleted even if it becomes empty.
  1928. SE->forgetLoop(CurLoop);
  1929. }
  1930. /// Match loop-invariant value.
  1931. template <typename SubPattern_t> struct match_LoopInvariant {
  1932. SubPattern_t SubPattern;
  1933. const Loop *L;
  1934. match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
  1935. : SubPattern(SP), L(L) {}
  1936. template <typename ITy> bool match(ITy *V) {
  1937. return L->isLoopInvariant(V) && SubPattern.match(V);
  1938. }
  1939. };
  1940. /// Matches if the value is loop-invariant.
  1941. template <typename Ty>
  1942. inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
  1943. return match_LoopInvariant<Ty>(M, L);
  1944. }
  1945. /// Return true if the idiom is detected in the loop.
  1946. ///
  1947. /// The core idiom we are trying to detect is:
  1948. /// \code
  1949. /// entry:
  1950. /// <...>
  1951. /// %bitmask = shl i32 1, %bitpos
  1952. /// br label %loop
  1953. ///
  1954. /// loop:
  1955. /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
  1956. /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
  1957. /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
  1958. /// %x.next = shl i32 %x.curr, 1
  1959. /// <...>
  1960. /// br i1 %x.curr.isbitunset, label %loop, label %end
  1961. ///
  1962. /// end:
  1963. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  1964. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  1965. /// <...>
  1966. /// \endcode
  1967. static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
  1968. Value *&BitMask, Value *&BitPos,
  1969. Value *&CurrX, Instruction *&NextX) {
  1970. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  1971. " Performing shift-until-bittest idiom detection.\n");
  1972. // Give up if the loop has multiple blocks or multiple backedges.
  1973. if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
  1974. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
  1975. return false;
  1976. }
  1977. BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  1978. BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  1979. assert(LoopPreheaderBB && "There is always a loop preheader.");
  1980. using namespace PatternMatch;
  1981. // Step 1: Check if the loop backedge is in desirable form.
  1982. ICmpInst::Predicate Pred;
  1983. Value *CmpLHS, *CmpRHS;
  1984. BasicBlock *TrueBB, *FalseBB;
  1985. if (!match(LoopHeaderBB->getTerminator(),
  1986. m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
  1987. m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
  1988. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
  1989. return false;
  1990. }
  1991. // Step 2: Check if the backedge's condition is in desirable form.
  1992. auto MatchVariableBitMask = [&]() {
  1993. return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
  1994. match(CmpLHS,
  1995. m_c_And(m_Value(CurrX),
  1996. m_CombineAnd(
  1997. m_Value(BitMask),
  1998. m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
  1999. CurLoop))));
  2000. };
  2001. auto MatchConstantBitMask = [&]() {
  2002. return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
  2003. match(CmpLHS, m_And(m_Value(CurrX),
  2004. m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
  2005. (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  2006. };
  2007. auto MatchDecomposableConstantBitMask = [&]() {
  2008. APInt Mask;
  2009. return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
  2010. ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
  2011. (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
  2012. (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
  2013. };
  2014. if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
  2015. !MatchDecomposableConstantBitMask()) {
  2016. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
  2017. return false;
  2018. }
  2019. // Step 3: Check if the recurrence is in desirable form.
  2020. auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  2021. if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
  2022. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
  2023. return false;
  2024. }
  2025. BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  2026. NextX =
  2027. dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
  2028. assert(CurLoop->isLoopInvariant(BaseX) &&
  2029. "Expected BaseX to be avaliable in the preheader!");
  2030. if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
  2031. // FIXME: support right-shift?
  2032. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
  2033. return false;
  2034. }
  2035. // Step 4: Check if the backedge's destinations are in desirable form.
  2036. assert(ICmpInst::isEquality(Pred) &&
  2037. "Should only get equality predicates here.");
  2038. // cmp-br is commutative, so canonicalize to a single variant.
  2039. if (Pred != ICmpInst::Predicate::ICMP_EQ) {
  2040. Pred = ICmpInst::getInversePredicate(Pred);
  2041. std::swap(TrueBB, FalseBB);
  2042. }
  2043. // We expect to exit loop when comparison yields false,
  2044. // so when it yields true we should branch back to loop header.
  2045. if (TrueBB != LoopHeaderBB) {
  2046. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
  2047. return false;
  2048. }
  2049. // Okay, idiom checks out.
  2050. return true;
  2051. }
  2052. /// Look for the following loop:
  2053. /// \code
  2054. /// entry:
  2055. /// <...>
  2056. /// %bitmask = shl i32 1, %bitpos
  2057. /// br label %loop
  2058. ///
  2059. /// loop:
  2060. /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
  2061. /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
  2062. /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
  2063. /// %x.next = shl i32 %x.curr, 1
  2064. /// <...>
  2065. /// br i1 %x.curr.isbitunset, label %loop, label %end
  2066. ///
  2067. /// end:
  2068. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  2069. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  2070. /// <...>
  2071. /// \endcode
  2072. ///
  2073. /// And transform it into:
  2074. /// \code
  2075. /// entry:
  2076. /// %bitmask = shl i32 1, %bitpos
  2077. /// %lowbitmask = add i32 %bitmask, -1
  2078. /// %mask = or i32 %lowbitmask, %bitmask
  2079. /// %x.masked = and i32 %x, %mask
  2080. /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
  2081. /// i1 true)
  2082. /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
  2083. /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
  2084. /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
  2085. /// %tripcount = add i32 %backedgetakencount, 1
  2086. /// %x.curr = shl i32 %x, %backedgetakencount
  2087. /// %x.next = shl i32 %x, %tripcount
  2088. /// br label %loop
  2089. ///
  2090. /// loop:
  2091. /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
  2092. /// %loop.iv.next = add nuw i32 %loop.iv, 1
  2093. /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
  2094. /// <...>
  2095. /// br i1 %loop.ivcheck, label %end, label %loop
  2096. ///
  2097. /// end:
  2098. /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
  2099. /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
  2100. /// <...>
  2101. /// \endcode
  2102. bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  2103. bool MadeChange = false;
  2104. Value *X, *BitMask, *BitPos, *XCurr;
  2105. Instruction *XNext;
  2106. if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
  2107. XNext)) {
  2108. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  2109. " shift-until-bittest idiom detection failed.\n");
  2110. return MadeChange;
  2111. }
  2112. LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
  2113. // Ok, it is the idiom we were looking for, we *could* transform this loop,
  2114. // but is it profitable to transform?
  2115. BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  2116. BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  2117. assert(LoopPreheaderBB && "There is always a loop preheader.");
  2118. BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  2119. assert(SuccessorBB && "There is only a single successor.");
  2120. IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  2121. Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
  2122. Intrinsic::ID IntrID = Intrinsic::ctlz;
  2123. Type *Ty = X->getType();
  2124. unsigned Bitwidth = Ty->getScalarSizeInBits();
  2125. TargetTransformInfo::TargetCostKind CostKind =
  2126. TargetTransformInfo::TCK_SizeAndLatency;
  2127. // The rewrite is considered to be unprofitable iff and only iff the
  2128. // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
  2129. // making the loop countable, even if nothing else changes.
  2130. IntrinsicCostAttributes Attrs(
  2131. IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
  2132. InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  2133. if (Cost > TargetTransformInfo::TCC_Basic) {
  2134. LLVM_DEBUG(dbgs() << DEBUG_TYPE
  2135. " Intrinsic is too costly, not beneficial\n");
  2136. return MadeChange;
  2137. }
  2138. if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
  2139. TargetTransformInfo::TCC_Basic) {
  2140. LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
  2141. return MadeChange;
  2142. }
  2143. // Ok, transform appears worthwhile.
  2144. MadeChange = true;
  2145. // Step 1: Compute the loop trip count.
  2146. Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
  2147. BitPos->getName() + ".lowbitmask");
  2148. Value *Mask =
  2149. Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  2150. Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  2151. CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
  2152. IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
  2153. /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
  2154. Value *XMaskedNumActiveBits = Builder.CreateSub(
  2155. ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
  2156. XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
  2157. /*HasNSW=*/Bitwidth != 2);
  2158. Value *XMaskedLeadingOnePos =
  2159. Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
  2160. XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
  2161. /*HasNSW=*/Bitwidth > 2);
  2162. Value *LoopBackedgeTakenCount = Builder.CreateSub(
  2163. BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
  2164. /*HasNUW=*/true, /*HasNSW=*/true);
  2165. // We know loop's backedge-taken count, but what's loop's trip count?
  2166. // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
  2167. Value *LoopTripCount =
  2168. Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
  2169. CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
  2170. /*HasNSW=*/Bitwidth != 2);

  // Step 2: Compute the recurrence's final value without a loop.

  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  Value *NewXNext;
  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
  // that isn't the case, we'll need to emit an alternative, safe IR.
  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else {
    // Otherwise, just additionally shift by one. It's the smallest solution;
    // alternatively, we could check that NewX is INT_MIN (or BitPos is )
    // and select 0 instead.
    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
  }
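
  // Concrete instance of the problematic case (illustrative): for i8 with
  // BitPos = 7 and %x = 1, LoopTripCount computes to 8 == bitwidth(%x)
  // (backedgetakencount = 7), so `shl i8 %x, 8` would be poison. Shifting
  // NewX (= 0b10000000) by one more instead stays well-defined: it wraps to
  // zero, which is also what a flag-less `shl` in the original loop would
  // have produced for %x.next on its last iteration.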

  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  // Step 3: Adjust the successor basic block to receive the computed
  //         recurrence's final value instead of the recurrence itself.

  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);

  // Step 4: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  // Note that NUW is always safe here, while NSW only holds for bitwidths != 2.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  //         loop. The loop would otherwise not be deleted even if it becomes
  //         empty.

  SE->forgetLoop(CurLoop);

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");

  ++NumShiftUntilBitTest;
  return MadeChange;
}

/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %loop
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
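///
/// Informally, and only as an illustrative sketch (shown for the lshr
/// flavour; the matcher itself works purely on the IR shape above), this is
/// roughly:
/// \code
///   uint8_t iv = start;
///   while ((uint8_t)(val >> (iv + extraoffset)) != 0)
///     ++iv;
///   // 'iv' now holds the value that %iv.res receives.
/// \endcode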
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
                                      Instruction *&ValShiftedIsZero,
                                      Intrinsic::ID &IntrinID, Instruction *&IV,
                                      Value *&Start, Value *&Val,
                                      const SCEV *&ExtraOffsetExpr,
                                      bool &InvertedCond) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-zero idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  Instruction *ValShifted, *NBits, *IVNext;
  Value *ExtraOffset;

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge and its condition are in desirable
  //         form.

  ICmpInst::Predicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
                  m_BasicBlock(FalseBB))) ||
      !match(ValShiftedIsZero,
             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
      !ICmpInst::isEquality(Pred)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the comparison's operand is in desirable form.
  // FIXME: Val could be a one-input PHI node, which we should look past.

  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
                                 m_Instruction(NBits)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
    return false;
  }
  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
                                                         : Intrinsic::ctlz;

  // Step 3: Check if the shift amount is in desirable form.

  if (match(NBits, m_c_Add(m_Instruction(IV),
                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
  else if (match(NBits,
                 m_Sub(m_Instruction(IV),
                       m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
           NBits->hasNoSignedWrap())
    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
  else {
    IV = NBits;
    ExtraOffsetExpr = SE->getZero(NBits->getType());
  }
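
  // For illustration: with the lshr flavour, %val >> %nbits first becomes
  // zero when %nbits equals the number of active bits in %val, so for
  // `%nbits = add nsw i8 %iv, %off` that happens at
  // %iv == numactivebits - %off, hence ExtraOffsetExpr = -%off. Likewise
  // `%nbits = sub nsw i8 %iv, %off` gives ExtraOffsetExpr = %off, and any
  // other shape is handled as if the shift amount itself were the IV with a
  // zero extra offset.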

  // Step 4: Check if the recurrence is in desirable form.
  auto *IVPN = dyn_cast<PHINode>(IV);
  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 5: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
  if (InvertedCond) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit the loop when the comparison yields true,
  // so when it yields false we should branch back to the loop header.
  if (FalseBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // The new, countable, loop will certainly only run a known number of
  // iterations; it won't be infinite. But the old loop might be infinite
  // under certain conditions. For logical shifts, the value will become zero
  // after at most bitwidth(%Val) loop iterations. However, for arithmetic
  // right-shift, if the sign bit was set, the value will never become zero,
  // and the loop may never finish.
  if (ValShifted->getOpcode() == Instruction::AShr &&
      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
    return false;
  }
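
  // Illustrative instance of the hazard being rejected here: for i8
  // %val = 0x80 (sign bit set), `ashr i8 %val, %nbits` for increasing %nbits
  // yields 0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF and never reaches
  // zero, so the original loop would never terminate unless it is known to be
  // mustprogress or %val is known non-negative.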

  // Okay, idiom checks out.
  return true;
}

/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %loop
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
///     %val.numactivebits = sub i8 8, %val.numleadingzeros
///     %extraoffset.neg = sub i8 0, %extraoffset
///     %tmp = add i8 %val.numactivebits, %extraoffset.neg
///     %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
///     %loop.tripcount = sub i8 %iv.final, %start
///     br label %loop
///
///   loop:
///     %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add i8 %loop.iv, 1
///     %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
///     %iv = add i8 %loop.iv, %start
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv.final, %loop ] <...>
///     <...>
/// \endcode
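///
/// Worked instance of the final-value formula (illustrative only): for
/// i8 %val = 45 (0b00101101, so 6 active bits), %extraoffset = 2 and
/// %start = 0, the original loop exits with %iv.res = 4 (45 >> 6 is the
/// first zero shifted value), and smax(6 + (-2), 0) = 4 reproduces that
/// directly.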
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  bool MadeChange = false;

  Instruction *ValShiftedIsZero;
  Intrinsic::ID IntrID;
  Instruction *IV;
  Value *Start, *Val;
  const SCEV *ExtraOffsetExpr;
  bool InvertedCond;
  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-zero idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");

  // Ok, it is the idiom we were looking for; we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(IV->getDebugLoc());

  Type *Ty = Val->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic we'll use is not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  bool OffsetIsZero = false;
  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
    OffsetIsZero = ExtraOffsetExprC->isZero();

  // Step 1: Compute the loop's final IV value / trip count.

  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
      /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
  Value *ValNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
      Val->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);

  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  Expander.setInsertPoint(&*Builder.GetInsertPoint());
  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);

  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
                                           {ValNumActiveBitsOffset, Start},
                                           /*FMFSource=*/nullptr, "iv.final");

  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
  // FIXME: or when the offset was `add nuw`

  // We know the loop's backedge-taken count, but what's the loop's trip count?
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
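
  // Continuing the illustrative example from the function comment: with
  // %val = 45, %extraoffset = 2 and %start = 0 we get iv.final = 4,
  // backedgetakencount = 4 - 0 = 4 and tripcount = 5, matching the five
  // iterations (%iv = 0..4) the original loop would have executed.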

  // Step 2: Adjust the successor basic block to receive the original
  //         induction variable's final value instead of the orig. IV itself.

  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);

  // Step 3: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(&LoopHeaderBB->front());
  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
  auto *CIVNext =
      Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
                                        CurLoop->getName() + ".ivcheck");
  auto *NewIVCheck = CIVCheck;
  if (InvertedCond) {
    NewIVCheck = Builder.CreateNot(CIVCheck);
    NewIVCheck->takeName(ValShiftedIsZero);
  }
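
  // For example (illustrative): if the original exit test was the inverted
  // `icmp ne i8 %val.shifted, 0`, CIVCheck computes the canonicalized *exit*
  // condition, so the original condition's users are later redirected to its
  // negation, NewIVCheck (see Step 5 below).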

  // The original IV, but rebased to be an offset to the CIV.
  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
                                     /*HasNSW=*/true); // FIXME: what about NUW?
  IVDePHId->takeName(IV);

  // The loop terminator.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  CIV->addIncoming(CIVNext, LoopHeaderBB);

  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
  //         loop. The loop would otherwise not be deleted even if it becomes
  //         empty.

  SE->forgetLoop(CurLoop);

  // Step 5: Try to clean up the loop's body somewhat.

  IV->replaceAllUsesWith(IVDePHId);
  IV->eraseFromParent();
  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
  ValShiftedIsZero->eraseFromParent();

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");

  ++NumShiftUntilZero;
  return MadeChange;
}