//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <optional> // for std::optional used below
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}
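
// As an illustrative sketch (hypothetical IR, not from a test case), loop
// metadata attached to the latch branch is what drives this pass, e.g.:
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.distribute.enable", i1 true}
//
// and followup attributes for the distributed loops can be supplied under the
// followup_* names above.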
static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loops marked with #pragma loop "
             "distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallPtrSet<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }
  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, the original loop is
  /// returned.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used by
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print(raw_ostream &OS) const {
    if (DepCycle)
      OS << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      OS << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool SeenStore = false;
      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          SeenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return SeenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }
  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
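  ///
  /// For example (illustrative): if the same load is seeded into partitions
  /// P1 and P3, then P1, P2 and P3 are all merged into P1, so the load stays
  /// in a single partition and its original order is preserved.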
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(dbgs()
                       << "Merging partitions due to this load in multiple "
                       << "partitions: " << PartI << ", " << LoadToPart->second
                       << "\n"
                       << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                    ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }
  /// Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
      NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part.getVMap()[ExitBlock] = TopPH;
      Part.remapInstructions();
      setNewLoopID(OrigLoopID, &Part);
      --Index;
      TopPH = NewLoop->getLoopPreheader();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used
  /// in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print(OS);
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    std::optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(*PartitionID);
    }
  }
};

/// For each memory instruction, this class maintains the difference between
/// the number of unsafe dependences that start at this instruction and the
/// number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
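///
/// (For a worked example, see the Load1/Store3 table in
/// LoopDistributeForLoop::processLoop below.)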
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "Backward dependences:\n");
    for (const auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the
        // program order, i.e. source is always first. (The direction is
        // given by the DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
                        OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
  bool processLoop() {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: In \""
                      << L->getHeader()->getParent()->getName()
                      << "\" checking " << *L << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &LAIs.getInfo(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                     1                       0->1
    //  Load2    | /Unsafe/            0                       1
    //  Store3  -'                    -1                       1->0
    //  Load4                          0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this
    // case we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (const auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // any at all if versioning is illegal.
    const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.value_or(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple, have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "\nPointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID = *makeFollowupLoopID(
          OrigLoopID,
          {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
          "llvm.loop.distribute.", true);
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and
    // hook them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to
    // that partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }
  /// Provide diagnostics then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().value_or(false);

    LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Return whether distribution was forced to be enabled or disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value distribution was not forced either way.
  const std::optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups need
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    std::optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }
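
  // Note (illustrative): "#pragma clang loop distribute(enable)" in source
  // code is what typically produces the "llvm.loop.distribute.enable"
  // metadata read above.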
  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  LoopAccessInfoManager &LAIs;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value distribution was not forced either way.
  std::optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    LoopAccessInfoManager &LAIs) {
  // Build up a worklist of inner-most loops to distribute. This is necessary
  // as the act of distributing a loop creates new loops and can invalidate
  // iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().value_or(EnableLoopDistribute))
      Changed |= LDL.processLoop();
  }

  return Changed;
}

namespace {

/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();

    return runImpl(F, LI, DT, SE, ORE, LAIs);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}
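
// Usage note (illustrative, not part of the pass itself): the pass can be
// exercised directly with
//   opt -passes=loop-distribute -enable-loop-distribute -S input.ll
// where -enable-loop-distribute is the cl::opt defined above.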