1. //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
9. // The implementation of the loop memory dependence analysis that was
10. // originally developed for the loop vectorizer.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Analysis/LoopAccessAnalysis.h"
  14. #include "llvm/ADT/APInt.h"
  15. #include "llvm/ADT/DenseMap.h"
  16. #include "llvm/ADT/DepthFirstIterator.h"
  17. #include "llvm/ADT/EquivalenceClasses.h"
  18. #include "llvm/ADT/PointerIntPair.h"
  19. #include "llvm/ADT/STLExtras.h"
  20. #include "llvm/ADT/SetVector.h"
  21. #include "llvm/ADT/SmallPtrSet.h"
  22. #include "llvm/ADT/SmallSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/iterator_range.h"
  25. #include "llvm/Analysis/AliasAnalysis.h"
  26. #include "llvm/Analysis/AliasSetTracker.h"
  27. #include "llvm/Analysis/LoopAnalysisManager.h"
  28. #include "llvm/Analysis/LoopInfo.h"
  29. #include "llvm/Analysis/MemoryLocation.h"
  30. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  31. #include "llvm/Analysis/ScalarEvolution.h"
  32. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  33. #include "llvm/Analysis/TargetLibraryInfo.h"
  34. #include "llvm/Analysis/ValueTracking.h"
  35. #include "llvm/Analysis/VectorUtils.h"
  36. #include "llvm/IR/BasicBlock.h"
  37. #include "llvm/IR/Constants.h"
  38. #include "llvm/IR/DataLayout.h"
  39. #include "llvm/IR/DebugLoc.h"
  40. #include "llvm/IR/DerivedTypes.h"
  41. #include "llvm/IR/DiagnosticInfo.h"
  42. #include "llvm/IR/Dominators.h"
  43. #include "llvm/IR/Function.h"
  44. #include "llvm/IR/InstrTypes.h"
  45. #include "llvm/IR/Instruction.h"
  46. #include "llvm/IR/Instructions.h"
  47. #include "llvm/IR/Operator.h"
  48. #include "llvm/IR/PassManager.h"
  49. #include "llvm/IR/Type.h"
  50. #include "llvm/IR/Value.h"
  51. #include "llvm/IR/ValueHandle.h"
  52. #include "llvm/InitializePasses.h"
  53. #include "llvm/Pass.h"
  54. #include "llvm/Support/Casting.h"
  55. #include "llvm/Support/CommandLine.h"
  56. #include "llvm/Support/Debug.h"
  57. #include "llvm/Support/ErrorHandling.h"
  58. #include "llvm/Support/raw_ostream.h"
  59. #include <algorithm>
  60. #include <cassert>
  61. #include <cstdint>
  62. #include <cstdlib>
  63. #include <iterator>
  64. #include <utility>
  65. #include <vector>
  66. using namespace llvm;
  67. #define DEBUG_TYPE "loop-accesses"
  68. static cl::opt<unsigned, true>
  69. VectorizationFactor("force-vector-width", cl::Hidden,
  70. cl::desc("Sets the SIMD width. Zero is autoselect."),
  71. cl::location(VectorizerParams::VectorizationFactor));
  72. unsigned VectorizerParams::VectorizationFactor;
  73. static cl::opt<unsigned, true>
  74. VectorizationInterleave("force-vector-interleave", cl::Hidden,
  75. cl::desc("Sets the vectorization interleave count. "
  76. "Zero is autoselect."),
  77. cl::location(
  78. VectorizerParams::VectorizationInterleave));
  79. unsigned VectorizerParams::VectorizationInterleave;
  80. static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
  81. "runtime-memory-check-threshold", cl::Hidden,
  82. cl::desc("When performing memory disambiguation checks at runtime do not "
  83. "generate more than this number of comparisons (default = 8)."),
  84. cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
  85. unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
86. /// The maximum number of iterations used to merge memory checks.
  87. static cl::opt<unsigned> MemoryCheckMergeThreshold(
  88. "memory-check-merge-threshold", cl::Hidden,
  89. cl::desc("Maximum number of comparisons done when trying to merge "
  90. "runtime memory checks. (default = 100)"),
  91. cl::init(100));
  92. /// Maximum SIMD width.
  93. const unsigned VectorizerParams::MaxVectorWidth = 64;
  94. /// We collect dependences up to this threshold.
  95. static cl::opt<unsigned>
  96. MaxDependences("max-dependences", cl::Hidden,
  97. cl::desc("Maximum number of dependences collected by "
  98. "loop-access analysis (default = 100)"),
  99. cl::init(100));
  100. /// This enables versioning on the strides of symbolically striding memory
  101. /// accesses in code like the following.
  102. /// for (i = 0; i < N; ++i)
  103. /// A[i * Stride1] += B[i * Stride2] ...
  104. ///
  105. /// Will be roughly translated to
  106. /// if (Stride1 == 1 && Stride2 == 1) {
  107. /// for (i = 0; i < N; i+=4)
  108. /// A[i:i+3] += ...
  109. /// } else
  110. /// ...
  111. static cl::opt<bool> EnableMemAccessVersioning(
  112. "enable-mem-access-versioning", cl::init(true), cl::Hidden,
  113. cl::desc("Enable symbolic stride memory access versioning"));
  114. /// Enable store-to-load forwarding conflict detection. This option can
  115. /// be disabled for correctness testing.
  116. static cl::opt<bool> EnableForwardingConflictDetection(
  117. "store-to-load-forwarding-conflict-detection", cl::Hidden,
  118. cl::desc("Enable conflict detection in loop-access analysis"),
  119. cl::init(true));
  120. bool VectorizerParams::isInterleaveForced() {
  121. return ::VectorizationInterleave.getNumOccurrences() > 0;
  122. }
  123. Value *llvm::stripIntegerCast(Value *V) {
  124. if (auto *CI = dyn_cast<CastInst>(V))
  125. if (CI->getOperand(0)->getType()->isIntegerTy())
  126. return CI->getOperand(0);
  127. return V;
  128. }
  129. const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
  130. const ValueToValueMap &PtrToStride,
  131. Value *Ptr) {
  132. const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
  133. // If there is an entry in the map return the SCEV of the pointer with the
  134. // symbolic stride replaced by one.
  135. ValueToValueMap::const_iterator SI = PtrToStride.find(Ptr);
  136. if (SI == PtrToStride.end())
  137. // For a non-symbolic stride, just return the original expression.
  138. return OrigSCEV;
  139. Value *StrideVal = stripIntegerCast(SI->second);
  140. ScalarEvolution *SE = PSE.getSE();
  141. const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
  142. const auto *CT =
  143. static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
  144. PSE.addPredicate(*SE->getEqualPredicate(U, CT));
  145. auto *Expr = PSE.getSCEV(Ptr);
  146. LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
  147. << " by: " << *Expr << "\n");
  148. return Expr;
  149. }
  150. RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
  151. unsigned Index, RuntimePointerChecking &RtCheck)
  152. : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
  153. AddressSpace(RtCheck.Pointers[Index]
  154. .PointerValue->getType()
  155. ->getPointerAddressSpace()) {
  156. Members.push_back(Index);
  157. }
  158. /// Calculate Start and End points of memory access.
159. /// Let's assume A is the first access and B is a memory access on the N-th loop
  160. /// iteration. Then B is calculated as:
  161. /// B = A + Step*N .
  162. /// Step value may be positive or negative.
  163. /// N is a calculated back-edge taken count:
164. /// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
  165. /// Start and End points are calculated in the following way:
  166. /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
  167. /// where SizeOfElt is the size of single memory access in bytes.
  168. ///
  169. /// There is no conflict when the intervals are disjoint:
  170. /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
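/// Illustrative example (hypothetical values, not taken from the code):
/// for 4-byte elements with A = %base, Step = 4, TripCount = 100 and VF = 1
/// we get N = 99, so
///   B = %base + 396, Start = UMIN(A, B) = %base,
///   End = UMAX(A, B) + 4 = %base + 400,
/// and two accesses conflict only if their [Start, End) intervals overlap.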
  171. void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
  172. unsigned DepSetId, unsigned ASId,
  173. const ValueToValueMap &Strides,
  174. PredicatedScalarEvolution &PSE) {
  175. // Get the stride replaced scev.
  176. const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
  177. ScalarEvolution *SE = PSE.getSE();
  178. const SCEV *ScStart;
  179. const SCEV *ScEnd;
  180. if (SE->isLoopInvariant(Sc, Lp)) {
  181. ScStart = ScEnd = Sc;
  182. } else {
  183. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  184. assert(AR && "Invalid addrec expression");
  185. const SCEV *Ex = PSE.getBackedgeTakenCount();
  186. ScStart = AR->getStart();
  187. ScEnd = AR->evaluateAtIteration(Ex, *SE);
  188. const SCEV *Step = AR->getStepRecurrence(*SE);
  189. // For expressions with negative step, the upper bound is ScStart and the
  190. // lower bound is ScEnd.
  191. if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
  192. if (CStep->getValue()->isNegative())
  193. std::swap(ScStart, ScEnd);
  194. } else {
  195. // Fallback case: the step is not constant, but we can still
  196. // get the upper and lower bounds of the interval by using min/max
  197. // expressions.
  198. ScStart = SE->getUMinExpr(ScStart, ScEnd);
  199. ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
  200. }
  201. }
  202. // Add the size of the pointed element to ScEnd.
  203. auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  204. Type *IdxTy = DL.getIndexType(Ptr->getType());
  205. const SCEV *EltSizeSCEV =
  206. SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType());
  207. ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
  208. Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
  209. }
  210. SmallVector<RuntimePointerCheck, 4>
  211. RuntimePointerChecking::generateChecks() const {
  212. SmallVector<RuntimePointerCheck, 4> Checks;
  213. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  214. for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
  215. const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
  216. const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
  217. if (needsChecking(CGI, CGJ))
  218. Checks.push_back(std::make_pair(&CGI, &CGJ));
  219. }
  220. }
  221. return Checks;
  222. }
  223. void RuntimePointerChecking::generateChecks(
  224. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  225. assert(Checks.empty() && "Checks is not empty");
  226. groupChecks(DepCands, UseDependencies);
  227. Checks = generateChecks();
  228. }
  229. bool RuntimePointerChecking::needsChecking(
  230. const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
  231. for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
  232. for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
  233. if (needsChecking(M.Members[I], N.Members[J]))
  234. return true;
  235. return false;
  236. }
  237. /// Compare \p I and \p J and return the minimum.
  238. /// Return nullptr in case we couldn't find an answer.
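/// Illustrative example (hypothetical SCEVs): getMinFromExprs(%a, %a + 4, SE)
/// returns %a, because the difference (%a + 4) - %a is the non-negative
/// constant 4; if the difference is not a SCEVConstant, nullptr is returned.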
  239. static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
  240. ScalarEvolution *SE) {
  241. const SCEV *Diff = SE->getMinusSCEV(J, I);
  242. const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
  243. if (!C)
  244. return nullptr;
  245. if (C->getValue()->isNegative())
  246. return J;
  247. return I;
  248. }
  249. bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
  250. RuntimePointerChecking &RtCheck) {
  251. return addPointer(
  252. Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
  253. RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
  254. *RtCheck.SE);
  255. }
  256. bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
  257. const SCEV *End, unsigned AS,
  258. ScalarEvolution &SE) {
  259. assert(AddressSpace == AS &&
  260. "all pointers in a checking group must be in the same address space");
  261. // Compare the starts and ends with the known minimum and maximum
  262. // of this set. We need to know how we compare against the min/max
  263. // of the set in order to be able to emit memchecks.
  264. const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
  265. if (!Min0)
  266. return false;
  267. const SCEV *Min1 = getMinFromExprs(End, High, &SE);
  268. if (!Min1)
  269. return false;
  270. // Update the low bound expression if we've found a new min value.
  271. if (Min0 == Start)
  272. Low = Start;
  273. // Update the high bound expression if we've found a new max value.
  274. if (Min1 != End)
  275. High = End;
  276. Members.push_back(Index);
  277. return true;
  278. }
  279. void RuntimePointerChecking::groupChecks(
  280. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
281. // We build the groups from the dependency candidates' equivalence classes
  282. // because:
  283. // - We know that pointers in the same equivalence class share
  284. // the same underlying object and therefore there is a chance
  285. // that we can compare pointers
  286. // - We wouldn't be able to merge two pointers for which we need
  287. // to emit a memcheck. The classes in DepCands are already
  288. // conveniently built such that no two pointers in the same
  289. // class need checking against each other.
290. // We use the following (greedy) algorithm to construct the groups:
  291. // For every pointer in the equivalence class:
  292. // For each existing group:
  293. // - if the difference between this pointer and the min/max bounds
  294. // of the group is a constant, then make the pointer part of the
  295. // group and update the min/max bounds of that group as required.
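// Illustrative sketch (hypothetical intervals, not from the code): if an
// equivalence class contains pointers whose [Start, End) intervals are
// [%a, %a+4), [%a+4, %a+8) and [%a+64, %a+68), the first pointer seeds a
// group with Low = %a, High = %a+4. The other two are at a constant
// distance from those bounds, so addPointer merges them and the group
// bounds grow to [%a, %a+68). A pointer whose distance to the bounds is
// not a SCEVConstant (e.g. %a + i*m) would start a new group instead.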
  296. CheckingGroups.clear();
  297. // If we need to check two pointers to the same underlying object
  298. // with a non-constant difference, we shouldn't perform any pointer
  299. // grouping with those pointers. This is because we can easily get
  300. // into cases where the resulting check would return false, even when
  301. // the accesses are safe.
  302. //
  303. // The following example shows this:
  304. // for (i = 0; i < 1000; ++i)
  305. // a[5000 + i * m] = a[i] + a[i + 9000]
  306. //
  307. // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  308. // (0, 10000) which is always false. However, if m is 1, there is no
  309. // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  310. // us to perform an accurate check in this case.
  311. //
  312. // The above case requires that we have an UnknownDependence between
  313. // accesses to the same underlying object. This cannot happen unless
  314. // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  315. // is also false. In this case we will use the fallback path and create
  316. // separate checking groups for all pointers.
  317. // If we don't have the dependency partitions, construct a new
  318. // checking pointer group for each pointer. This is also required
  319. // for correctness, because in this case we can have checking between
  320. // pointers to the same underlying object.
  321. if (!UseDependencies) {
  322. for (unsigned I = 0; I < Pointers.size(); ++I)
  323. CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
  324. return;
  325. }
  326. unsigned TotalComparisons = 0;
  327. DenseMap<Value *, unsigned> PositionMap;
  328. for (unsigned Index = 0; Index < Pointers.size(); ++Index)
  329. PositionMap[Pointers[Index].PointerValue] = Index;
  330. // We need to keep track of what pointers we've already seen so we
  331. // don't process them twice.
  332. SmallSet<unsigned, 2> Seen;
  333. // Go through all equivalence classes, get the "pointer check groups"
  334. // and add them to the overall solution. We use the order in which accesses
  335. // appear in 'Pointers' to enforce determinism.
  336. for (unsigned I = 0; I < Pointers.size(); ++I) {
  337. // We've seen this pointer before, and therefore already processed
  338. // its equivalence class.
  339. if (Seen.count(I))
  340. continue;
  341. MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
  342. Pointers[I].IsWritePtr);
  343. SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
  344. auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
  345. // Because DepCands is constructed by visiting accesses in the order in
  346. // which they appear in alias sets (which is deterministic) and the
  347. // iteration order within an equivalence class member is only dependent on
  348. // the order in which unions and insertions are performed on the
  349. // equivalence class, the iteration order is deterministic.
  350. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
  351. MI != ME; ++MI) {
  352. auto PointerI = PositionMap.find(MI->getPointer());
  353. assert(PointerI != PositionMap.end() &&
  354. "pointer in equivalence class not found in PositionMap");
  355. unsigned Pointer = PointerI->second;
  356. bool Merged = false;
  357. // Mark this pointer as seen.
  358. Seen.insert(Pointer);
  359. // Go through all the existing sets and see if we can find one
  360. // which can include this pointer.
  361. for (RuntimeCheckingPtrGroup &Group : Groups) {
  362. // Don't perform more than a certain amount of comparisons.
  363. // This should limit the cost of grouping the pointers to something
  364. // reasonable. If we do end up hitting this threshold, the algorithm
  365. // will create separate groups for all remaining pointers.
  366. if (TotalComparisons > MemoryCheckMergeThreshold)
  367. break;
  368. TotalComparisons++;
  369. if (Group.addPointer(Pointer, *this)) {
  370. Merged = true;
  371. break;
  372. }
  373. }
  374. if (!Merged)
  375. // We couldn't add this pointer to any existing set or the threshold
  376. // for the number of comparisons has been reached. Create a new group
  377. // to hold the current pointer.
  378. Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
  379. }
  380. // We've computed the grouped checks for this partition.
  381. // Save the results and continue with the next one.
  382. llvm::copy(Groups, std::back_inserter(CheckingGroups));
  383. }
  384. }
  385. bool RuntimePointerChecking::arePointersInSamePartition(
  386. const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
  387. unsigned PtrIdx2) {
  388. return (PtrToPartition[PtrIdx1] != -1 &&
  389. PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
  390. }
  391. bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  392. const PointerInfo &PointerI = Pointers[I];
  393. const PointerInfo &PointerJ = Pointers[J];
  394. // No need to check if two readonly pointers intersect.
  395. if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
  396. return false;
  397. // Only need to check pointers between two different dependency sets.
  398. if (PointerI.DependencySetId == PointerJ.DependencySetId)
  399. return false;
  400. // Only need to check pointers in the same alias set.
  401. if (PointerI.AliasSetId != PointerJ.AliasSetId)
  402. return false;
  403. return true;
  404. }
  405. void RuntimePointerChecking::printChecks(
  406. raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
  407. unsigned Depth) const {
  408. unsigned N = 0;
  409. for (const auto &Check : Checks) {
  410. const auto &First = Check.first->Members, &Second = Check.second->Members;
  411. OS.indent(Depth) << "Check " << N++ << ":\n";
  412. OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
  413. for (unsigned K = 0; K < First.size(); ++K)
  414. OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
  415. OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
  416. for (unsigned K = 0; K < Second.size(); ++K)
  417. OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  418. }
  419. }
  420. void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  421. OS.indent(Depth) << "Run-time memory checks:\n";
  422. printChecks(OS, Checks, Depth);
  423. OS.indent(Depth) << "Grouped accesses:\n";
  424. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  425. const auto &CG = CheckingGroups[I];
  426. OS.indent(Depth + 2) << "Group " << &CG << ":\n";
  427. OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
  428. << ")\n";
  429. for (unsigned J = 0; J < CG.Members.size(); ++J) {
  430. OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
  431. << "\n";
  432. }
  433. }
  434. }
  435. namespace {
  436. /// Analyses memory accesses in a loop.
  437. ///
  438. /// Checks whether run time pointer checks are needed and builds sets for data
  439. /// dependence checking.
  440. class AccessAnalysis {
  441. public:
  442. /// Read or write access location.
  443. typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  444. typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
  445. AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
  446. MemoryDepChecker::DepCandidates &DA,
  447. PredicatedScalarEvolution &PSE)
  448. : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {}
  449. /// Register a load and whether it is only read from.
  450. void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
  451. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  452. AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
  453. Accesses.insert(MemAccessInfo(Ptr, false));
  454. if (IsReadOnly)
  455. ReadOnlyPtr.insert(Ptr);
  456. }
  457. /// Register a store.
  458. void addStore(MemoryLocation &Loc) {
  459. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  460. AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
  461. Accesses.insert(MemAccessInfo(Ptr, true));
  462. }
  463. /// Check if we can emit a run-time no-alias check for \p Access.
  464. ///
  465. /// Returns true if we can emit a run-time no alias check for \p Access.
  466. /// If we can check this access, this also adds it to a dependence set and
467. /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  468. /// we will attempt to use additional run-time checks in order to get
  469. /// the bounds of the pointer.
  470. bool createCheckForAccess(RuntimePointerChecking &RtCheck,
  471. MemAccessInfo Access,
  472. const ValueToValueMap &Strides,
  473. DenseMap<Value *, unsigned> &DepSetId,
  474. Loop *TheLoop, unsigned &RunningDepId,
  475. unsigned ASId, bool ShouldCheckStride,
  476. bool Assume);
  477. /// Check whether we can check the pointers at runtime for
  478. /// non-intersection.
  479. ///
  480. /// Returns true if we need no check or if we do and we can generate them
  481. /// (i.e. the pointers have computable bounds).
  482. bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
  483. Loop *TheLoop, const ValueToValueMap &Strides,
  484. bool ShouldCheckWrap = false);
485. /// Goes over all memory accesses, checks whether an RT check is needed
  486. /// and builds sets of dependent accesses.
  487. void buildDependenceSets() {
  488. processMemAccesses();
  489. }
  490. /// Initial processing of memory accesses determined that we need to
  491. /// perform dependency checking.
  492. ///
  493. /// Note that this can later be cleared if we retry memcheck analysis without
  494. /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  495. bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  496. /// We decided that no dependence analysis would be used. Reset the state.
  497. void resetDepChecks(MemoryDepChecker &DepChecker) {
  498. CheckDeps.clear();
  499. DepChecker.clearDependences();
  500. }
  501. MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
  502. private:
  503. typedef SetVector<MemAccessInfo> PtrAccessSet;
504. /// Go over all memory accesses and check whether runtime pointer checks
  505. /// are needed and build sets of dependency check candidates.
  506. void processMemAccesses();
  507. /// Set of all accesses.
  508. PtrAccessSet Accesses;
  509. /// The loop being checked.
  510. const Loop *TheLoop;
  511. /// List of accesses that need a further dependence check.
  512. MemAccessInfoList CheckDeps;
  513. /// Set of pointers that are read only.
  514. SmallPtrSet<Value*, 16> ReadOnlyPtr;
  515. /// An alias set tracker to partition the access set by underlying object and
516. /// intrinsic property (such as TBAA metadata).
  517. AliasSetTracker AST;
  518. LoopInfo *LI;
  519. /// Sets of potentially dependent accesses - members of one set share an
520. /// underlying pointer. The set "CheckDeps" identifies which sets really need a
  521. /// dependence check.
  522. MemoryDepChecker::DepCandidates &DepCands;
  523. /// Initial processing of memory accesses determined that we may need
  524. /// to add memchecks. Perform the analysis to determine the necessary checks.
  525. ///
526. /// Note that this is different from isDependencyCheckNeeded. When we retry
  527. /// memcheck analysis without dependency checking
  528. /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  529. /// cleared while this remains set if we have potentially dependent accesses.
  530. bool IsRTCheckAnalysisNeeded = false;
  531. /// The SCEV predicate containing all the SCEV-related assumptions.
  532. PredicatedScalarEvolution &PSE;
  533. };
  534. } // end anonymous namespace
  535. /// Check whether a pointer can participate in a runtime bounds check.
  536. /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
  537. /// by adding run-time checks (overflow checks) if necessary.
  538. static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
  539. const ValueToValueMap &Strides, Value *Ptr,
  540. Loop *L, bool Assume) {
  541. const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
542. // The bounds for a loop-invariant pointer are trivial.
  543. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  544. return true;
  545. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  546. if (!AR && Assume)
  547. AR = PSE.getAsAddRec(Ptr);
  548. if (!AR)
  549. return false;
  550. return AR->isAffine();
  551. }
  552. /// Check whether a pointer address cannot wrap.
  553. static bool isNoWrap(PredicatedScalarEvolution &PSE,
  554. const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
  555. const SCEV *PtrScev = PSE.getSCEV(Ptr);
  556. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  557. return true;
  558. Type *AccessTy = Ptr->getType()->getPointerElementType();
  559. int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides);
  560. if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
  561. return true;
  562. return false;
  563. }
  564. static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
  565. function_ref<void(Value *)> AddPointer) {
  566. SmallPtrSet<Value *, 8> Visited;
  567. SmallVector<Value *> WorkList;
  568. WorkList.push_back(StartPtr);
  569. while (!WorkList.empty()) {
  570. Value *Ptr = WorkList.pop_back_val();
  571. if (!Visited.insert(Ptr).second)
  572. continue;
  573. auto *PN = dyn_cast<PHINode>(Ptr);
  574. // SCEV does not look through non-header PHIs inside the loop. Such phis
  575. // can be analyzed by adding separate accesses for each incoming pointer
  576. // value.
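// Hypothetical example (for illustration only):
//   for.body:
//     %p = phi i32* [ %a, %if.then ], [ %b, %if.else ] ; non-header phi
//     %v = load i32, i32* %p
// Here %p is decomposed into its incoming pointers %a and %b, and
// AddPointer is invoked once for each of them.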
  577. if (PN && InnermostLoop.contains(PN->getParent()) &&
  578. PN->getParent() != InnermostLoop.getHeader()) {
  579. for (const Use &Inc : PN->incoming_values())
  580. WorkList.push_back(Inc);
  581. } else
  582. AddPointer(Ptr);
  583. }
  584. }
  585. bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
  586. MemAccessInfo Access,
  587. const ValueToValueMap &StridesMap,
  588. DenseMap<Value *, unsigned> &DepSetId,
  589. Loop *TheLoop, unsigned &RunningDepId,
  590. unsigned ASId, bool ShouldCheckWrap,
  591. bool Assume) {
  592. Value *Ptr = Access.getPointer();
  593. if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
  594. return false;
  595. // When we run after a failing dependency check we have to make sure
  596. // we don't have wrapping pointers.
  597. if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
  598. auto *Expr = PSE.getSCEV(Ptr);
  599. if (!Assume || !isa<SCEVAddRecExpr>(Expr))
  600. return false;
  601. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  602. }
  603. // The id of the dependence set.
  604. unsigned DepId;
  605. if (isDependencyCheckNeeded()) {
  606. Value *Leader = DepCands.getLeaderValue(Access).getPointer();
  607. unsigned &LeaderId = DepSetId[Leader];
  608. if (!LeaderId)
  609. LeaderId = RunningDepId++;
  610. DepId = LeaderId;
  611. } else
  612. // Each access has its own dependence set.
  613. DepId = RunningDepId++;
  614. bool IsWrite = Access.getInt();
  615. RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
  616. LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  617. return true;
  618. }
  619. bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
  620. ScalarEvolution *SE, Loop *TheLoop,
  621. const ValueToValueMap &StridesMap,
  622. bool ShouldCheckWrap) {
  623. // Find pointers with computable bounds. We are going to use this information
  624. // to place a runtime bound check.
  625. bool CanDoRT = true;
  626. bool MayNeedRTCheck = false;
  627. if (!IsRTCheckAnalysisNeeded) return true;
  628. bool IsDepCheckNeeded = isDependencyCheckNeeded();
629. // We assign a consecutive id to accesses from different alias sets.
630. // Accesses between different groups don't need to be checked.
  631. unsigned ASId = 0;
  632. for (auto &AS : AST) {
  633. int NumReadPtrChecks = 0;
  634. int NumWritePtrChecks = 0;
  635. bool CanDoAliasSetRT = true;
  636. ++ASId;
637. // We assign consecutive ids to accesses from different dependence sets.
  638. // Accesses within the same set don't need a runtime check.
  639. unsigned RunningDepId = 1;
  640. DenseMap<Value *, unsigned> DepSetId;
  641. SmallVector<MemAccessInfo, 4> Retries;
  642. // First, count how many write and read accesses are in the alias set. Also
  643. // collect MemAccessInfos for later.
  644. SmallVector<MemAccessInfo, 4> AccessInfos;
  645. for (const auto &A : AS) {
  646. Value *Ptr = A.getValue();
  647. bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
  648. if (IsWrite)
  649. ++NumWritePtrChecks;
  650. else
  651. ++NumReadPtrChecks;
  652. AccessInfos.emplace_back(Ptr, IsWrite);
  653. }
654. // We do not need runtime checks for this alias set if there are no writes,
655. // or only a single write and no reads.
  656. if (NumWritePtrChecks == 0 ||
  657. (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
  658. assert((AS.size() <= 1 ||
  659. all_of(AS,
  660. [this](auto AC) {
  661. MemAccessInfo AccessWrite(AC.getValue(), true);
  662. return DepCands.findValue(AccessWrite) == DepCands.end();
  663. })) &&
  664. "Can only skip updating CanDoRT below, if all entries in AS "
  665. "are reads or there is at most 1 entry");
  666. continue;
  667. }
  668. for (auto &Access : AccessInfos) {
  669. if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
  670. RunningDepId, ASId, ShouldCheckWrap, false)) {
  671. LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
  672. << *Access.getPointer() << '\n');
  673. Retries.push_back(Access);
  674. CanDoAliasSetRT = false;
  675. }
  676. }
  677. // Note that this function computes CanDoRT and MayNeedRTCheck
  678. // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
  679. // we have a pointer for which we couldn't find the bounds but we don't
  680. // actually need to emit any checks so it does not matter.
  681. //
  682. // We need runtime checks for this alias set, if there are at least 2
  683. // dependence sets (in which case RunningDepId > 2) or if we need to re-try
  684. // any bound checks (because in that case the number of dependence sets is
  685. // incomplete).
  686. bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
  687. // We need to perform run-time alias checks, but some pointers had bounds
  688. // that couldn't be checked.
  689. if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
690. // Reset the CanDoAliasSetRT flag and retry all accesses that have failed.
  691. // We know that we need these checks, so we can now be more aggressive
  692. // and add further checks if required (overflow checks).
  693. CanDoAliasSetRT = true;
  694. for (auto Access : Retries)
  695. if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
  696. TheLoop, RunningDepId, ASId,
  697. ShouldCheckWrap, /*Assume=*/true)) {
  698. CanDoAliasSetRT = false;
  699. break;
  700. }
  701. }
  702. CanDoRT &= CanDoAliasSetRT;
  703. MayNeedRTCheck |= NeedsAliasSetRTCheck;
  704. ++ASId;
  705. }
  706. // If the pointers that we would use for the bounds comparison have different
  707. // address spaces, assume the values aren't directly comparable, so we can't
  708. // use them for the runtime check. We also have to assume they could
  709. // overlap. In the future there should be metadata for whether address spaces
  710. // are disjoint.
  711. unsigned NumPointers = RtCheck.Pointers.size();
  712. for (unsigned i = 0; i < NumPointers; ++i) {
  713. for (unsigned j = i + 1; j < NumPointers; ++j) {
  714. // Only need to check pointers between two different dependency sets.
  715. if (RtCheck.Pointers[i].DependencySetId ==
  716. RtCheck.Pointers[j].DependencySetId)
  717. continue;
  718. // Only need to check pointers in the same alias set.
  719. if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
  720. continue;
  721. Value *PtrI = RtCheck.Pointers[i].PointerValue;
  722. Value *PtrJ = RtCheck.Pointers[j].PointerValue;
  723. unsigned ASi = PtrI->getType()->getPointerAddressSpace();
  724. unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
  725. if (ASi != ASj) {
  726. LLVM_DEBUG(
  727. dbgs() << "LAA: Runtime check would require comparison between"
  728. " different address spaces\n");
  729. return false;
  730. }
  731. }
  732. }
  733. if (MayNeedRTCheck && CanDoRT)
  734. RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
  735. LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
  736. << " pointer comparisons.\n");
  737. // If we can do run-time checks, but there are no checks, no runtime checks
  738. // are needed. This can happen when all pointers point to the same underlying
  739. // object for example.
  740. RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
  741. bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
  742. if (!CanDoRTIfNeeded)
  743. RtCheck.reset();
  744. return CanDoRTIfNeeded;
  745. }
  746. void AccessAnalysis::processMemAccesses() {
  747. // We process the set twice: first we process read-write pointers, last we
  748. // process read-only pointers. This allows us to skip dependence tests for
  749. // read-only pointers.
  750. LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  751. LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  752. LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  753. LLVM_DEBUG({
  754. for (auto A : Accesses)
  755. dbgs() << "\t" << *A.getPointer() << " (" <<
  756. (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
  757. "read-only" : "read")) << ")\n";
  758. });
  759. // The AliasSetTracker has nicely partitioned our pointers by metadata
  760. // compatibility and potential for underlying-object overlap. As a result, we
  761. // only need to check for potential pointer dependencies within each alias
  762. // set.
  763. for (const auto &AS : AST) {
764. // Note that both the alias-set tracker and the alias sets themselves use
  765. // linked lists internally and so the iteration order here is deterministic
  766. // (matching the original instruction order within each set).
  767. bool SetHasWrite = false;
  768. // Map of pointers to last access encountered.
  769. typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
  770. UnderlyingObjToAccessMap ObjToLastAccess;
771. // Set of accesses to check after all writes have been processed.
  772. PtrAccessSet DeferredAccesses;
  773. // Iterate over each alias set twice, once to process read/write pointers,
  774. // and then to process read-only pointers.
  775. for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
  776. bool UseDeferred = SetIteration > 0;
  777. PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
  778. for (const auto &AV : AS) {
  779. Value *Ptr = AV.getValue();
  780. // For a single memory access in AliasSetTracker, Accesses may contain
781. // both a read and a write, and both need to be handled for CheckDeps.
  782. for (const auto &AC : S) {
  783. if (AC.getPointer() != Ptr)
  784. continue;
  785. bool IsWrite = AC.getInt();
  786. // If we're using the deferred access set, then it contains only
  787. // reads.
  788. bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
  789. if (UseDeferred && !IsReadOnlyPtr)
  790. continue;
  791. // Otherwise, the pointer must be in the PtrAccessSet, either as a
  792. // read or a write.
  793. assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
  794. S.count(MemAccessInfo(Ptr, false))) &&
  795. "Alias-set pointer not in the access set?");
  796. MemAccessInfo Access(Ptr, IsWrite);
  797. DepCands.insert(Access);
  798. // Memorize read-only pointers for later processing and skip them in
  799. // the first round (they need to be checked after we have seen all
800. // write pointers). Note: we also mark pointers that are not
  801. // consecutive as "read-only" pointers (so that we check
  802. // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
  803. if (!UseDeferred && IsReadOnlyPtr) {
  804. DeferredAccesses.insert(Access);
  805. continue;
  806. }
807. // If this is a write, check other reads and writes for conflicts. If
808. // this is a read, only check other writes for conflicts (but only if
  809. // there is no other write to the ptr - this is an optimization to
  810. // catch "a[i] = a[i] + " without having to do a dependence check).
  811. if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
  812. CheckDeps.push_back(Access);
  813. IsRTCheckAnalysisNeeded = true;
  814. }
  815. if (IsWrite)
  816. SetHasWrite = true;
  817. // Create sets of pointers connected by a shared alias set and
  818. // underlying object.
  819. typedef SmallVector<const Value *, 16> ValueVector;
  820. ValueVector TempObjects;
  821. getUnderlyingObjects(Ptr, TempObjects, LI);
  822. LLVM_DEBUG(dbgs()
  823. << "Underlying objects for pointer " << *Ptr << "\n");
  824. for (const Value *UnderlyingObj : TempObjects) {
825. // nullptr never aliases; don't join sets for pointers that have "null"
  826. // in their UnderlyingObjects list.
  827. if (isa<ConstantPointerNull>(UnderlyingObj) &&
  828. !NullPointerIsDefined(
  829. TheLoop->getHeader()->getParent(),
  830. UnderlyingObj->getType()->getPointerAddressSpace()))
  831. continue;
  832. UnderlyingObjToAccessMap::iterator Prev =
  833. ObjToLastAccess.find(UnderlyingObj);
  834. if (Prev != ObjToLastAccess.end())
  835. DepCands.unionSets(Access, Prev->second);
  836. ObjToLastAccess[UnderlyingObj] = Access;
  837. LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
  838. }
  839. }
  840. }
  841. }
  842. }
  843. }
  844. static bool isInBoundsGep(Value *Ptr) {
  845. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
  846. return GEP->isInBounds();
  847. return false;
  848. }
  849. /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
  850. /// i.e. monotonically increasing/decreasing.
  851. static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
  852. PredicatedScalarEvolution &PSE, const Loop *L) {
  853. // FIXME: This should probably only return true for NUW.
  854. if (AR->getNoWrapFlags(SCEV::NoWrapMask))
  855. return true;
  856. // Scalar evolution does not propagate the non-wrapping flags to values that
  857. // are derived from a non-wrapping induction variable because non-wrapping
  858. // could be flow-sensitive.
  859. //
  860. // Look through the potentially overflowing instruction to try to prove
  861. // non-wrapping for the *specific* value of Ptr.
  862. // The arithmetic implied by an inbounds GEP can't overflow.
  863. auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  864. if (!GEP || !GEP->isInBounds())
  865. return false;
  866. // Make sure there is only one non-const index and analyze that.
  867. Value *NonConstIndex = nullptr;
  868. for (Value *Index : GEP->indices())
  869. if (!isa<ConstantInt>(Index)) {
  870. if (NonConstIndex)
  871. return false;
  872. NonConstIndex = Index;
  873. }
  874. if (!NonConstIndex)
  875. // The recurrence is on the pointer, ignore for now.
  876. return false;
  877. // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
  878. // AddRec using a NSW operation.
  879. if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
  880. if (OBO->hasNoSignedWrap() &&
881. // Assume the other operand is constant so that the AddRec can be
  882. // easily found.
  883. isa<ConstantInt>(OBO->getOperand(1))) {
  884. auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
  885. if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
  886. return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
  887. }
  888. return false;
  889. }
  890. /// Check whether the access through \p Ptr has a constant stride.
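/// Illustrative examples (hypothetical values): with AccessTy = i32
/// (alloc size 4) and a constant AddRec step of +8 bytes per iteration the
/// returned stride is 8 / 4 = 2; a step of -4 gives stride -1. A step that
/// is not a SCEVConstant, or not a multiple of the element size, makes the
/// function return 0, meaning no usable constant stride was found.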
  891. int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy,
  892. Value *Ptr, const Loop *Lp,
  893. const ValueToValueMap &StridesMap, bool Assume,
  894. bool ShouldCheckWrap) {
  895. Type *Ty = Ptr->getType();
  896. assert(Ty->isPointerTy() && "Unexpected non-ptr");
  897. if (isa<ScalableVectorType>(AccessTy)) {
  898. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
  899. << "\n");
  900. return 0;
  901. }
  902. const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
  903. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  904. if (Assume && !AR)
  905. AR = PSE.getAsAddRec(Ptr);
  906. if (!AR) {
  907. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
  908. << " SCEV: " << *PtrScev << "\n");
  909. return 0;
  910. }
  911. // The access function must stride over the innermost loop.
  912. if (Lp != AR->getLoop()) {
  913. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
  914. << *Ptr << " SCEV: " << *AR << "\n");
  915. return 0;
  916. }
  917. // The address calculation must not wrap. Otherwise, a dependence could be
  918. // inverted.
919. // An inbounds getelementptr that is an AddRec with a unit stride
920. // cannot wrap by definition. The unit stride requirement is checked later.
921. // A getelementptr without an inbounds attribute and unit stride would have
  922. // to access the pointer value "0" which is undefined behavior in address
  923. // space 0, therefore we can also vectorize this case.
  924. unsigned AddrSpace = Ty->getPointerAddressSpace();
  925. bool IsInBoundsGEP = isInBoundsGep(Ptr);
  926. bool IsNoWrapAddRec = !ShouldCheckWrap ||
  927. PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
  928. isNoWrapAddRec(Ptr, AR, PSE, Lp);
  929. if (!IsNoWrapAddRec && !IsInBoundsGEP &&
  930. NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace)) {
  931. if (Assume) {
  932. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  933. IsNoWrapAddRec = true;
  934. LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
  935. << "LAA: Pointer: " << *Ptr << "\n"
  936. << "LAA: SCEV: " << *AR << "\n"
  937. << "LAA: Added an overflow assumption\n");
  938. } else {
  939. LLVM_DEBUG(
  940. dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
  941. << *Ptr << " SCEV: " << *AR << "\n");
  942. return 0;
  943. }
  944. }
  945. // Check the step is constant.
  946. const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
  947. // Calculate the pointer stride and check if it is constant.
  948. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  949. if (!C) {
  950. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
  951. << " SCEV: " << *AR << "\n");
  952. return 0;
  953. }
  954. auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  955. TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
  956. int64_t Size = AllocSize.getFixedSize();
  957. const APInt &APStepVal = C->getAPInt();
  958. // Huge step value - give up.
  959. if (APStepVal.getBitWidth() > 64)
  960. return 0;
  961. int64_t StepVal = APStepVal.getSExtValue();
  962. // Strided access.
  963. int64_t Stride = StepVal / Size;
  964. int64_t Rem = StepVal % Size;
  965. if (Rem)
  966. return 0;
  967. // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  968. // know we can't "wrap around the address space". In case of address space
  969. // zero we know that this won't happen without triggering undefined behavior.
  970. if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
  971. (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
  972. AddrSpace))) {
  973. if (Assume) {
  974. // We can avoid this case by adding a run-time check.
  975. LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
  976. << "inbounds or in address space 0 may wrap:\n"
  977. << "LAA: Pointer: " << *Ptr << "\n"
  978. << "LAA: SCEV: " << *AR << "\n"
  979. << "LAA: Added an overflow assumption\n");
  980. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  981. } else
  982. return 0;
  983. }
  984. return Stride;
  985. }
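// Note (inferred from the body below, for illustration only): the result is
// the distance PtrB - PtrA measured in units of ElemTyA's store size, e.g.
// for i32 elements and PtrB == PtrA + 8 bytes the result is 2. None is
// returned when the address spaces differ, the element types mismatch while
// CheckType is set, the distance cannot be computed as a constant, or (with
// StrictCheck) the byte distance is not a multiple of the element size.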
Optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB,
                                    Value *PtrB, const DataLayout &DL,
                                    ScalarEvolution &SE, bool StrictCheck,
                                    bool CheckType) {
  assert(PtrA && PtrB && "Expected non-nullptr pointers.");
  assert(cast<PointerType>(PtrA->getType())
             ->isOpaqueOrPointeeTypeMatches(ElemTyA) && "Wrong PtrA type");
  assert(cast<PointerType>(PtrB->getType())
             ->isOpaqueOrPointeeTypeMatches(ElemTyB) && "Wrong PtrB type");

  // Make sure that A and B are different pointers.
  if (PtrA == PtrB)
    return 0;

  // Make sure that the element types are the same if required.
  if (CheckType && ElemTyA != ElemTyB)
    return None;

  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
  unsigned ASB = PtrB->getType()->getPointerAddressSpace();

  // Check that the address spaces match.
  if (ASA != ASB)
    return None;
  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);

  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  int Val;
  if (PtrA1 == PtrB1) {
    // Retrieve the address space again as pointer stripping now tracks through
    // `addrspacecast`.
    ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
    ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
    // Check that the address spaces match and that the pointers are valid.
    if (ASA != ASB)
      return None;

    IdxWidth = DL.getIndexSizeInBits(ASA);
    OffsetA = OffsetA.sextOrTrunc(IdxWidth);
    OffsetB = OffsetB.sextOrTrunc(IdxWidth);

    OffsetB -= OffsetA;
    Val = OffsetB.getSExtValue();
  } else {
    // Otherwise compute the distance with SCEV between the base pointers.
    const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
    const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
    const auto *Diff =
        dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
    if (!Diff)
      return None;
    Val = Diff->getAPInt().getSExtValue();
  }
  int Size = DL.getTypeStoreSize(ElemTyA);
  int Dist = Val / Size;

  // Ensure that the calculated distance matches the type-based one after all
  // bitcasts have been removed from the provided pointers.
  if (!StrictCheck || Dist * Size == Val)
    return Dist;
  return None;
}
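
/// Compute a permutation of the pointers in \p VL that orders them by
/// increasing distance from VL[0], storing it in \p SortedIndices (left empty
/// when the pointers are already consecutive). Returns false if any distance
/// is unknown or two pointers have the same offset.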
bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                           const DataLayout &DL, ScalarEvolution &SE,
                           SmallVectorImpl<unsigned> &SortedIndices) {
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  // Walk over the pointers, and map each of them to an offset relative to the
  // first pointer in the array.
  Value *Ptr0 = VL[0];

  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = [](const DistOrdPair &L, const DistOrdPair &R) {
    return L.first < R.first;
  };
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  Offsets.emplace(0, 0);
  int Cnt = 1;
  bool IsConsecutive = true;
  for (auto *Ptr : VL.drop_front()) {
    Optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
                                         /*StrictCheck=*/true);
    if (!Diff)
      return false;

    // Bail out if a pointer with the same offset has already been seen.
    int64_t Offset = *Diff;
    auto Res = Offsets.emplace(Offset, Cnt);
    if (!Res.second)
      return false;

    // The order stays consecutive only if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill the SortedIndices array only if the order is non-consecutive.
    SortedIndices.resize(VL.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return true;
}

/// Returns true if the memory operations \p A and \p B are consecutive.
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                               ScalarEvolution &SE, bool CheckType) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  if (!PtrA || !PtrB)
    return false;
  Type *ElemTyA = getLoadStoreType(A);
  Type *ElemTyB = getLoadStoreType(B);
  Optional<int> Diff = getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
                                       /*StrictCheck=*/true, CheckType);
  return Diff && *Diff == 1;
}
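
/// Register the pointer(s) written by store \p SI with the dependence
/// checker, remembering the instruction and its access index.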
void MemoryDepChecker::addAccess(StoreInst *SI) {
  visitPointers(SI->getPointerOperand(), *InnermostLoop,
                [this, SI](Value *Ptr) {
                  Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
                  InstMap.push_back(SI);
                  ++AccessIdx;
                });
}
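
/// Register the pointer(s) read by load \p LI with the dependence checker.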
void MemoryDepChecker::addAccess(LoadInst *LI) {
  visitPointers(LI->getPointerOperand(), *InnermostLoop,
                [this, LI](Value *Ptr) {
                  Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
                  InstMap.push_back(LI);
                  ++AccessIdx;
                });
}

MemoryDepChecker::VectorizationSafetyStatus
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  switch (Type) {
  case NoDep:
  case Forward:
  case BackwardVectorizable:
    return VectorizationSafetyStatus::Safe;
  case Unknown:
    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  case ForwardButPreventsForwarding:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return VectorizationSafetyStatus::Unsafe;
  }
  llvm_unreachable("unexpected DepType!");
}

bool MemoryDepChecker::Dependence::isBackward() const {
  switch (Type) {
  case NoDep:
  case Forward:
  case ForwardButPreventsForwarding:
  case Unknown:
    return false;
  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return true;
  }
  llvm_unreachable("unexpected DepType!");
}

bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  return isBackward() || Type == Unknown;
}

bool MemoryDepChecker::Dependence::isForward() const {
  switch (Type) {
  case Forward:
  case ForwardButPreventsForwarding:
    return true;
  case NoDep:
  case Unknown:
  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return false;
  }
  llvm_unreachable("unexpected DepType!");
}
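
/// Return true if a dependence distance of \p Distance bytes, for accesses of
/// \p TypeByteSize bytes, could defeat store-to-load forwarding once the loop
/// is vectorized; as a side effect, MaxSafeDepDistBytes may be lowered to the
/// largest vector factor that avoids the conflict.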
bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
                                                    uint64_t TypeByteSize) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor, store-load forwarding does not take place.
  // Positive dependences might cause trouble because vectorizing them might
  // prevent store-load forwarding, making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
  //   hence on your typical architecture store-load forwarding does not take
  //   place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.

  // After this many iterations store-to-load forwarding conflicts should not
  // cause any slowdowns.
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  // Maximum vector factor.
  uint64_t MaxVFWithoutSLForwardIssues = std::min(
      VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);

  // Compute the smallest VF at which the store and load would be misaligned.
  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
       VF *= 2) {
    // If the number of vector iterations between the store and the load is
    // small we could incur conflicts.
    if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
      MaxVFWithoutSLForwardIssues = (VF >> 1);
      break;
    }
  }

  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
    LLVM_DEBUG(
        dbgs() << "LAA: Distance " << Distance
               << " that could cause a store-load forwarding conflict\n");
    return true;
  }

  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
      MaxVFWithoutSLForwardIssues !=
          VectorizerParams::MaxVectorWidth * TypeByteSize)
    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  return false;
}
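
/// Merge \p S into the running safety status, keeping whichever value is
/// larger (i.e. more conservative) in the VectorizationSafetyStatus ordering.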
void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
  if (Status < S)
    Status = S;
}

/// Given a non-constant (unknown) dependence-distance \p Dist between two
/// memory accesses, that have the same stride whose absolute value is given
/// in \p Stride, and that have the same type size \p TypeByteSize, in a loop
/// whose backedge-taken count is \p BackedgeTakenCount, check if it is
/// possible to prove statically that the dependence distance is larger
/// than the range that the accesses will travel through the execution of
/// the loop. If so, return true; false otherwise. This is useful for
/// example in loops such as the following (PR31098):
///   for (i = 0; i < D; ++i) {
///              = out[i];
///     out[i+D] =
///   }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                     const SCEV &BackedgeTakenCount,
                                     const SCEV &Dist, uint64_t Stride,
                                     uint64_t TypeByteSize) {
  // If we can prove that
  //      (**) |Dist| > BackedgeTakenCount * Step
  // where Step is the absolute stride of the memory accesses in bytes,
  // then there is no dependence.
  //
  // Rationale:
  // We basically want to check if the absolute distance (|Dist/Step|)
  // is >= the loop iteration count (or > BackedgeTakenCount).
  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
  // Section 4.2.1). Note that for vectorization it is sufficient to prove
  // that the dependence distance is >= VF; This is checked elsewhere.
  // But in some cases we can prune unknown dependence distances early, and
  // even before selecting the VF, and without a runtime test, by comparing
  // the distance against the loop iteration count. Since the vectorized code
  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  // also guarantees that distance >= VF.
  //
  const uint64_t ByteStride = Stride * TypeByteSize;
  const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
  const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);

  const SCEV *CastedDist = &Dist;
  const SCEV *CastedProduct = Product;
  uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
  uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());

  // The dependence distance can be positive/negative, so we sign extend Dist;
  // The multiplication of the absolute stride in bytes and the
  // backedgeTakenCount is non-negative, so we zero extend Product.
  if (DistTypeSize > ProductTypeSize)
    CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
  else
    CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());

  // Is Dist - (BackedgeTakenCount * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= Dist)
  const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
  if (SE.isKnownPositive(Minus))
    return true;

  // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
  Minus = SE.getMinusSCEV(NegDist, CastedProduct);
  if (SE.isKnownPositive(Minus))
    return true;

  return false;
}

/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is the type size
/// in bytes.
///
/// \returns true if they are independent.
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
                                          uint64_t TypeByteSize) {
  assert(Stride > 1 && "The stride must be greater than 1");
  assert(TypeByteSize > 0 && "The type size in bytes must be non-zero");
  assert(Distance > 0 && "The distance must be non-zero");

  // Skip if the distance is not a multiple of the type byte size.
  if (Distance % TypeByteSize)
    return false;

  uint64_t ScaledDist = Distance / TypeByteSize;

  // No dependence if the scaled distance is not a multiple of the stride.
  // E.g.
  //      for (i = 0; i < 1024 ; i += 4)
  //        A[i+2] = A[i] + 1;
  //
  // Two accesses in memory (scaled distance is 2, stride is 4):
  //     | A[0] |      |      |      | A[4] |      |      |      |
  //     |      |      | A[2] |      |      |      | A[6] |      |
  //
  // E.g.
  //      for (i = 0; i < 1024 ; i += 3)
  //        A[i+4] = A[i] + 1;
  //
  // Two accesses in memory (scaled distance is 4, stride is 3):
  //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
  //     |      |      |      |      | A[4] |      |      | A[7] |      |
  return ScaledDist % Stride;
}
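
/// Classify the dependence between access \p A (at program-order index
/// \p AIdx) and access \p B (at index \p BIdx), consulting \p Strides for
/// symbolic stride information.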
MemoryDepChecker::Dependence::DepType
MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                              const MemAccessInfo &B, unsigned BIdx,
                              const ValueToValueMap &Strides) {
  assert(AIdx < BIdx && "Must pass arguments in program order");

  Value *APtr = A.getPointer();
  Value *BPtr = B.getPointer();
  bool AIsWrite = A.getInt();
  bool BIsWrite = B.getInt();
  Type *ATy = APtr->getType()->getPointerElementType();
  Type *BTy = BPtr->getType()->getPointerElementType();

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return Dependence::NoDep;

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return Dependence::Unknown;

  int64_t StrideAPtr =
      getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true);
  int64_t StrideBPtr =
      getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true);

  const SCEV *Src = PSE.getSCEV(APtr);
  const SCEV *Sink = PSE.getSCEV(BPtr);

  // If the induction step is negative we have to invert source and sink of the
  // dependence.
  if (StrideAPtr < 0) {
    std::swap(APtr, BPtr);
    std::swap(ATy, BTy);
    std::swap(Src, Sink);
    std::swap(AIsWrite, BIsWrite);
    std::swap(AIdx, BIdx);
    std::swap(StrideAPtr, StrideBPtr);
  }

  const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src);

  LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
                    << "(Induction step: " << StrideAPtr << ")\n");
  LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
                    << *InstMap[BIdx] << ": " << *Dist << "\n");

  // Need accesses with constant stride. We don't want to vectorize
  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  // the address space.
  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
    return Dependence::Unknown;
  }
  auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
  uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
  bool HasSameSize =
      DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
  uint64_t Stride = std::abs(StrideAPtr);
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  if (!C) {
    if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
        isSafeDependenceDistance(DL, *(PSE.getSE()),
                                 *(PSE.getBackedgeTakenCount()), *Dist, Stride,
                                 TypeByteSize))
      return Dependence::NoDep;

    LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
    FoundNonConstantDistanceDependence = true;
    return Dependence::Unknown;
  }

  const APInt &Val = C->getAPInt();
  int64_t Distance = Val.getSExtValue();

  // Attempt to prove strided accesses independent.
  if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
      areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
    LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
    return Dependence::NoDep;
  }

  // Negative distances are not plausible dependencies.
  if (Val.isNegative()) {
    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
    if (IsTrueDataDependence && EnableForwardingConflictDetection &&
        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
         !HasSameSize)) {
      LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
      return Dependence::ForwardButPreventsForwarding;
    }

    LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
    return Dependence::Forward;
  }

  // Write to the same location with the same size.
  if (Val == 0) {
    if (HasSameSize)
      return Dependence::Forward;
    LLVM_DEBUG(
        dbgs() << "LAA: Zero dependence difference but different type sizes\n");
    return Dependence::Unknown;
  }

  assert(Val.isStrictlyPositive() && "Expect a positive value");

  if (!HasSameSize) {
    LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
                         "different type sizes\n");
    return Dependence::Unknown;
  }
  // Bail out early if passed-in parameters make vectorization not feasible.
  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                           VectorizerParams::VectorizationFactor : 1);
  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
                           VectorizerParams::VectorizationInterleave : 1);
  // The minimum number of iterations for a vectorized/unrolled version.
  unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);

  // It's not vectorizable if the distance is smaller than the minimum distance
  // needed for a vectorized/unrolled version. Vectorizing one iteration in
  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
  // TypeByteSize (no need to add the trailing gap distance).
  //
  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  //      foo(int *A) {
  //        int *B = (int *)((char *)A + 14);
  //        for (i = 0 ; i < 1024 ; i += 2)
  //          B[i] = A[i] + 1;
  //      }
  //
  // Two accesses in memory (stride is 2):
  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
  //                          | B[0] |      | B[2] |      | B[4] |
  //
  // The distance needed to vectorize all iterations except the last one is
  // 4 * 2 * (MinNumIter - 1); the last iteration needs an extra 4 bytes.
  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
  //
  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
  // 12, which is less than the distance of 14.
  //
  // If MinNumIter is 4 (say, if a user forces the vectorization factor to be
  // 4), the minimum distance needed is 28, which is greater than the distance.
  // It is not safe to do vectorization.
  uint64_t MinDistanceNeeded =
      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
  if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
    LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
                      << Distance << '\n');
    return Dependence::Backward;
  }

  // Unsafe if the minimum distance needed is greater than the max safe
  // distance.
  if (MinDistanceNeeded > MaxSafeDepDistBytes) {
    LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                      << MinDistanceNeeded << " size in bytes");
    return Dependence::Backward;
  }

  // Positive distance bigger than max vectorization factor.
  // FIXME: Should use max factor instead of max distance in bytes, which could
  // not handle different types.
  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  //      void foo (int *A, char *B) {
  //        for (unsigned i = 0; i < 1024; i++) {
  //          A[i+2] = A[i] + 1;
  //          B[i+2] = B[i] + 1;
  //        }
  //      }
  //
  // This case is currently unsafe according to the max safe distance. If we
  // analyze the two accesses on array B first, the max safe dependence
  // distance is 2. Then we analyze the accesses on array A, where the minimum
  // distance needed is 8, which exceeds the max safe distance of 2, so
  // vectorization is forbidden. But actually both A and B could safely be
  // vectorized with a factor of 2.
  MaxSafeDepDistBytes =
      std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);

  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  if (IsTrueDataDependence && EnableForwardingConflictDetection &&
      couldPreventStoreLoadForward(Distance, TypeByteSize))
    return Dependence::BackwardVectorizableButPreventsForwarding;

  uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
  LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
                    << " with max VF = " << MaxVF << '\n');
  uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
  MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
  return Dependence::BackwardVectorizable;
}
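
/// Walk every pair of potentially dependent accesses in \p CheckDeps (grouped
/// by the equivalence classes in \p AccessSets), recording the dependences
/// found (up to MaxDependences). Returns true if no dependence makes
/// vectorization unsafe.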
bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                                   MemAccessInfoList &CheckDeps,
                                   const ValueToValueMap &Strides) {

  MaxSafeDepDistBytes = -1;
  SmallPtrSet<MemAccessInfo, 8> Visited;
  for (MemAccessInfo CurAccess : CheckDeps) {
    if (Visited.count(CurAccess))
      continue;

    // Get the relevant memory access set.
    EquivalenceClasses<MemAccessInfo>::iterator I =
        AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI =
        AccessSets.member_begin(I);
    EquivalenceClasses<MemAccessInfo>::member_iterator AE =
        AccessSets.member_end();

    // Check every access pair.
    while (AI != AE) {
      Visited.insert(*AI);
      bool AIIsWrite = AI->getInt();
      // Check loads only against the next equivalence class, but stores also
      // against other stores in the same equivalence class - to the same
      // address.
      EquivalenceClasses<MemAccessInfo>::member_iterator OI =
          (AIIsWrite ? AI : std::next(AI));
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
          // Scan all accesses of another equivalence class, but only the next
          // accesses of the same equivalence class.
          for (std::vector<unsigned>::iterator
               I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
               I2E = (OI == AI ? I1E : Accesses[*OI].end());
               I2 != I2E; ++I2) {
            auto A = std::make_pair(&*AI, *I1);
            auto B = std::make_pair(&*OI, *I2);

            assert(*I1 != *I2);
            if (*I1 > *I2)
              std::swap(A, B);

            Dependence::DepType Type =
                isDependent(*A.first, A.second, *B.first, B.second, Strides);
            mergeInStatus(Dependence::isSafeForVectorization(Type));

            // Gather dependences unless we accumulated MaxDependences
            // dependences. In that case return as soon as we find the first
            // unsafe dependence. This puts a limit on this quadratic
            // algorithm.
            if (RecordDependences) {
              if (Type != Dependence::NoDep)
                Dependences.push_back(Dependence(A.second, B.second, Type));

              if (Dependences.size() >= MaxDependences) {
                RecordDependences = false;
                Dependences.clear();
                LLVM_DEBUG(dbgs()
                           << "Too many dependences, stopped recording\n");
              }
            }
            if (!RecordDependences && !isSafeForVectorization())
              return false;
          }
        ++OI;
      }
      ++AI;
    }
  }

  LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  return isSafeForVectorization();
}
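
/// Return the memory instructions recorded for the access (\p Ptr, \p isWrite),
/// in the order they were added.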
SmallVector<Instruction *, 4>
MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
  MemAccessInfo Access(Ptr, isWrite);
  auto &IndexVector = Accesses.find(Access)->second;

  SmallVector<Instruction *, 4> Insts;
  transform(IndexVector,
            std::back_inserter(Insts),
            [&](unsigned Idx) { return this->InstMap[Idx]; });
  return Insts;
}

const char *MemoryDepChecker::Dependence::DepName[] = {
    "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
    "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};

void MemoryDepChecker::Dependence::print(
    raw_ostream &OS, unsigned Depth,
    const SmallVectorImpl<Instruction *> &Instrs) const {
  OS.indent(Depth) << DepName[Type] << ":\n";
  OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
  OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
}
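
/// Check that the loop has a shape the analysis can handle: an innermost loop
/// with a single backedge and a backedge-taken count that ScalarEvolution can
/// compute.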
bool LoopAccessInfo::canAnalyzeLoop() {
  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
                    << TheLoop->getHeader()->getParent()->getName() << ": "
                    << TheLoop->getHeader()->getName() << '\n');

  // We can only analyze innermost loops.
  if (!TheLoop->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
    recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
    recordAnalysis("CFGNotUnderstood")
        << "loop control flow is not understood by analyzer";
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = PSE->getBackedgeTakenCount();
  if (isa<SCEVCouldNotCompute>(ExitCount)) {
    recordAnalysis("CantComputeNumberOfIterations")
        << "could not determine number of loop iterations";
    LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
    return false;
  }

  return true;
}
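
/// Walk the loop, collect its loads and stores, and run the pointer
/// bounds-check and memory-dependence analyses, recording the verdict in
/// CanVecMem and related state such as HasConvergentOp.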
void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
                                 const TargetLibraryInfo *TLI,
                                 DominatorTree *DT) {
  typedef SmallPtrSet<Value*, 16> ValueSet;

  // Holds the Load and Store instructions.
  SmallVector<LoadInst *, 16> Loads;
  SmallVector<StoreInst *, 16> Stores;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  bool HasComplexMemInst = false;

  // A runtime check is only legal to insert if there are no convergent calls.
  HasConvergentOp = false;

  PtrRtChecking->Pointers.clear();
  PtrRtChecking->Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  const bool EnableMemAccessVersioningOfLoop =
      EnableMemAccessVersioning &&
      !TheLoop->getHeader()->getParent()->hasOptSize();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the BB and collect legal loads and stores. Also detect any
    // convergent instructions.
    for (Instruction &I : *BB) {
      if (auto *Call = dyn_cast<CallBase>(&I)) {
        if (Call->isConvergent())
          HasConvergentOp = true;
      }

      // Once we have found both a non-vectorizable memory instruction and a
      // convergent operation in this loop, there is no reason to continue the
      // search.
      if (HasComplexMemInst && HasConvergentOp) {
        CanVecMem = false;
        return;
      }

      // Avoid hitting recordAnalysis multiple times.
      if (HasComplexMemInst)
        continue;

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (I.mayReadFromMemory()) {
        // Many math library functions read the rounding mode. We will only
        // vectorize a loop if it contains known function calls that don't set
        // the flag. Therefore, it is safe to ignore this read from memory.
        auto *Call = dyn_cast<CallInst>(&I);
        if (Call && getVectorIntrinsicIDForCall(Call, TLI))
          continue;

        // If the function has an explicit vectorized counterpart, we can
        // safely assume that it can be vectorized.
        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
            !VFDatabase::getMappings(*Call).empty())
          continue;

        auto *Ld = dyn_cast<LoadInst>(&I);
        if (!Ld) {
          recordAnalysis("CantVectorizeInstruction", Ld)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis("NonSimpleLoad", Ld)
              << "read with atomic ordering or volatile read";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumLoads++;
        Loads.push_back(Ld);
        DepChecker->addAccess(Ld);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(Ld);
        continue;
      }
      // Save 'store' instructions. Abort if other instructions write to memory.
      if (I.mayWriteToMemory()) {
        auto *St = dyn_cast<StoreInst>(&I);
        if (!St) {
          recordAnalysis("CantVectorizeInstruction", St)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis("NonSimpleStore", St)
              << "write with atomic ordering or volatile write";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumStores++;
        Stores.push_back(St);
        DepChecker->addAccess(St);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(St);
      }
    } // Next instr.
  } // Next block.
  if (HasComplexMemInst) {
    CanVecMem = false;
    return;
  }

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
    CanVecMem = true;
    return;
  }

  MemoryDepChecker::DepCandidates DependentAccesses;
  AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE);

  // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  // Record uniform store addresses to identify if we have multiple stores
  // to the same address.
  ValueSet UniformStores;

  for (StoreInst *ST : Stores) {
    Value *Ptr = ST->getPointerOperand();

    if (isUniform(Ptr))
      HasDependenceInvolvingLoopInvariantAddress |=
          !UniformStores.insert(Ptr).second;

    // If we did *not* see this pointer before, insert it to the read-write
    // list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr).second) {
      ++NumReadWrites;

      MemoryLocation Loc = MemoryLocation::get(ST);

      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
        Loc.AATags.TBAA = nullptr;

      visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
                    [&Accesses, Loc](Value *Ptr) {
                      MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
                      Accesses.addStore(NewLoc);
                    });
    }
  }

  if (IsAnnotatedParallel) {
    LLVM_DEBUG(
        dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
               << "checks.\n");
    CanVecMem = true;
    return;
  }
  for (LoadInst *LD : Loads) {
    Value *Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    if (Seen.insert(Ptr).second ||
        !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    // See if there is an unsafe dependency between a load from a uniform
    // address and a store to the same uniform address.
    if (UniformStores.count(Ptr)) {
      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                           "load and uniform store to the same address!\n");
      HasDependenceInvolvingLoopInvariantAddress = true;
    }

    MemoryLocation Loc = MemoryLocation::get(LD);

    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
      Loc.AATags.TBAA = nullptr;

    visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
                  [&Accesses, Loc, IsReadOnlyPtr](Value *Ptr) {
                    MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
                    Accesses.addLoad(NewLoc, IsReadOnlyPtr);
                  });
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
    CanVecMem = true;
    return;
  }
  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
                                                  TheLoop, SymbolicStrides);
  if (!CanDoRTIfNeeded) {
    recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    CanVecMem = false;
    return;
  }

  LLVM_DEBUG(
      dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

  CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    CanVecMem = DepChecker->areDepsSafe(
        DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
    MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();

    if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks(*DepChecker);

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      auto *SE = PSE->getSE();
      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
                                                 SymbolicStrides, true);

      // Check that we found the bounds for the pointer.
      if (!CanDoRTIfNeeded) {
        recordAnalysis("CantCheckMemDepsAtRunTime")
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        CanVecMem = false;
        return;
      }

      CanVecMem = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    CanVecMem = false;
    return;
  }

  if (CanVecMem)
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
  else {
    recordAnalysis("UnsafeMemDep")
        << "unsafe dependent memory operations in loop. Use "
           "#pragma loop distribute(enable) to allow loop distribution "
           "to attempt to isolate the offending operations into a separate "
           "loop";
    LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
  }
}

bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                           DominatorTree *DT) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}
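
/// Create and cache an optimization remark for this loop, attached to
/// instruction \p I when one is provided and to the loop header otherwise.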
OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
                                                           Instruction *I) {
  assert(!Report && "Multiple reports generated");

  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName,
                                                        DL, CodeRegion);
  return *Report;
}
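
/// Return true if \p V is loop-invariant in TheLoop according to SCEV, and
/// hence uniform across loop iterations.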
bool LoopAccessInfo::isUniform(Value *V) const {
  auto *SE = PSE->getSE();
  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  // TODO: Is this really what we want? Even without FP SCEV, we may want some
  // trivially loop-invariant FP values to be considered uniform.
  if (!SE->isSCEVable(V->getType()))
    return false;
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
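
/// If the pointer of \p MemAccess has a symbolic stride, record it in
/// SymbolicStrides so the access can later be versioned on "Stride == 1",
/// unless that predicate would imply the loop executes at most once.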
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(MemAccess);
  if (!Ptr)
    return;

  Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  if (!Stride)
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense that the
  // users of this analysis (such as the vectorizer) will trigger it, based on
  // their specific cost considerations; For example, in cases where stride
  // versioning does not help resolving memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

  const SCEV *StrideExpr = PSE->getSCEV(Stride);
  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();

  // Match the types so we can compare the stride and the BETakenCount.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
  uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = BETakenCount;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSize >= StrideTypeSize)
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
  // Since TripCount == BackEdgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - BETakenCount > 0
  if (SE->isKnownPositive(StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");

  SymbolicStrides[Ptr] = Stride;
  StrideSet.insert(Stride);
}

LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI)
    : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
      PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
      DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
  if (canAnalyzeLoop())
    analyzeLoop(AA, LI, TLI, DT);
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    OS.indent(Depth) << "Memory dependences are safe";
    if (MaxSafeDepDistBytes != -1ULL)
      OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
         << " bytes";
    if (PtrRtChecking->Need)
      OS << " with run-time checks";
    OS << "\n";
  }

  if (HasConvergentOp)
    OS.indent(Depth) << "Has convergent operation in loop\n";

  if (Report)
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

  if (auto *Dependences = DepChecker->getDependences()) {
    OS.indent(Depth) << "Dependences:\n";
    for (auto &Dep : *Dependences) {
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
      OS << "\n";
    }
  } else
    OS.indent(Depth) << "Too many dependences, not recorded\n";

  // List the pairs of accesses that need run-time checks to prove
  // independence.
  PtrRtChecking->print(OS, Depth);
  OS << "\n";

  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
                   << "found in loop.\n";

  OS.indent(Depth) << "SCEV assumptions:\n";
  PSE->getUnionPredicate().print(OS, Depth);

  OS << "\n";

  OS.indent(Depth) << "Expressions re-written:\n";
  PSE->print(OS, Depth);
}

LoopAccessLegacyAnalysis::LoopAccessLegacyAnalysis() : FunctionPass(ID) {
  initializeLoopAccessLegacyAnalysisPass(*PassRegistry::getPassRegistry());
}

const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) {
  auto &LAI = LoopAccessInfoMap[L];

  if (!LAI)
    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);

  return *LAI.get();
}

void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const {
  LoopAccessLegacyAnalysis &LAA = *const_cast<LoopAccessLegacyAnalysis *>(this);

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop)) {
      OS.indent(2) << L->getHeader()->getName() << ":\n";
      auto &LAI = LAA.getInfo(L);
      LAI.print(OS, 4);
    }
}

bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  return false;
}

void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequiredTransitive<ScalarEvolutionWrapperPass>();
  AU.addRequiredTransitive<AAResultsWrapperPass>();
  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
  AU.addRequiredTransitive<LoopInfoWrapperPass>();

  AU.setPreservesAll();
}

char LoopAccessLegacyAnalysis::ID = 0;
static const char laa_name[] = "Loop Access Analysis";
#define LAA_NAME "loop-accesses"

INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)

AnalysisKey LoopAccessAnalysis::Key;

LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                       LoopStandardAnalysisResults &AR) {
  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
}

namespace llvm {

Pass *createLAAPass() {
  return new LoopAccessLegacyAnalysis();
}

} // end namespace llvm