  1. //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // The implementation of the loop memory dependence analysis that was
  10. // originally developed for the loop vectorizer.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Analysis/LoopAccessAnalysis.h"
  14. #include "llvm/ADT/APInt.h"
  15. #include "llvm/ADT/DenseMap.h"
  16. #include "llvm/ADT/DepthFirstIterator.h"
  17. #include "llvm/ADT/EquivalenceClasses.h"
  18. #include "llvm/ADT/PointerIntPair.h"
  19. #include "llvm/ADT/STLExtras.h"
  20. #include "llvm/ADT/SetVector.h"
  21. #include "llvm/ADT/SmallPtrSet.h"
  22. #include "llvm/ADT/SmallSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/iterator_range.h"
  25. #include "llvm/Analysis/AliasAnalysis.h"
  26. #include "llvm/Analysis/AliasSetTracker.h"
  27. #include "llvm/Analysis/LoopAnalysisManager.h"
  28. #include "llvm/Analysis/LoopInfo.h"
  29. #include "llvm/Analysis/LoopIterator.h"
  30. #include "llvm/Analysis/MemoryLocation.h"
  31. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  32. #include "llvm/Analysis/ScalarEvolution.h"
  33. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  34. #include "llvm/Analysis/TargetLibraryInfo.h"
  35. #include "llvm/Analysis/ValueTracking.h"
  36. #include "llvm/Analysis/VectorUtils.h"
  37. #include "llvm/IR/BasicBlock.h"
  38. #include "llvm/IR/Constants.h"
  39. #include "llvm/IR/DataLayout.h"
  40. #include "llvm/IR/DebugLoc.h"
  41. #include "llvm/IR/DerivedTypes.h"
  42. #include "llvm/IR/DiagnosticInfo.h"
  43. #include "llvm/IR/Dominators.h"
  44. #include "llvm/IR/Function.h"
  45. #include "llvm/IR/InstrTypes.h"
  46. #include "llvm/IR/Instruction.h"
  47. #include "llvm/IR/Instructions.h"
  48. #include "llvm/IR/Operator.h"
  49. #include "llvm/IR/PassManager.h"
  50. #include "llvm/IR/PatternMatch.h"
  51. #include "llvm/IR/Type.h"
  52. #include "llvm/IR/Value.h"
  53. #include "llvm/IR/ValueHandle.h"
  54. #include "llvm/InitializePasses.h"
  55. #include "llvm/Pass.h"
  56. #include "llvm/Support/Casting.h"
  57. #include "llvm/Support/CommandLine.h"
  58. #include "llvm/Support/Debug.h"
  59. #include "llvm/Support/ErrorHandling.h"
  60. #include "llvm/Support/raw_ostream.h"
  61. #include <algorithm>
  62. #include <cassert>
  63. #include <cstdint>
  64. #include <iterator>
  65. #include <utility>
  66. #include <vector>
  67. using namespace llvm;
  68. using namespace llvm::PatternMatch;
  69. #define DEBUG_TYPE "loop-accesses"
  70. static cl::opt<unsigned, true>
  71. VectorizationFactor("force-vector-width", cl::Hidden,
  72. cl::desc("Sets the SIMD width. Zero is autoselect."),
  73. cl::location(VectorizerParams::VectorizationFactor));
  74. unsigned VectorizerParams::VectorizationFactor;
  75. static cl::opt<unsigned, true>
  76. VectorizationInterleave("force-vector-interleave", cl::Hidden,
  77. cl::desc("Sets the vectorization interleave count. "
  78. "Zero is autoselect."),
  79. cl::location(
  80. VectorizerParams::VectorizationInterleave));
  81. unsigned VectorizerParams::VectorizationInterleave;
  82. static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
  83. "runtime-memory-check-threshold", cl::Hidden,
  84. cl::desc("When performing memory disambiguation checks at runtime do not "
  85. "generate more than this number of comparisons (default = 8)."),
  86. cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
  87. unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
  88. /// The maximum number of iterations used to merge memory checks.
  89. static cl::opt<unsigned> MemoryCheckMergeThreshold(
  90. "memory-check-merge-threshold", cl::Hidden,
  91. cl::desc("Maximum number of comparisons done when trying to merge "
  92. "runtime memory checks. (default = 100)"),
  93. cl::init(100));
  94. /// Maximum SIMD width.
  95. const unsigned VectorizerParams::MaxVectorWidth = 64;
  96. /// We collect dependences up to this threshold.
  97. static cl::opt<unsigned>
  98. MaxDependences("max-dependences", cl::Hidden,
  99. cl::desc("Maximum number of dependences collected by "
  100. "loop-access analysis (default = 100)"),
  101. cl::init(100));
  102. /// This enables versioning on the strides of symbolically striding memory
  103. /// accesses in code like the following.
  104. /// for (i = 0; i < N; ++i)
  105. /// A[i * Stride1] += B[i * Stride2] ...
  106. ///
  107. /// Will be roughly translated to
  108. /// if (Stride1 == 1 && Stride2 == 1) {
  109. /// for (i = 0; i < N; i+=4)
  110. /// A[i:i+3] += ...
  111. /// } else
  112. /// ...
  113. static cl::opt<bool> EnableMemAccessVersioning(
  114. "enable-mem-access-versioning", cl::init(true), cl::Hidden,
  115. cl::desc("Enable symbolic stride memory access versioning"));
  116. /// Enable store-to-load forwarding conflict detection. This option can
  117. /// be disabled for correctness testing.
  118. static cl::opt<bool> EnableForwardingConflictDetection(
  119. "store-to-load-forwarding-conflict-detection", cl::Hidden,
  120. cl::desc("Enable conflict detection in loop-access analysis"),
  121. cl::init(true));
  122. static cl::opt<unsigned> MaxForkedSCEVDepth(
  123. "max-forked-scev-depth", cl::Hidden,
  124. cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
  125. cl::init(5));
  126. bool VectorizerParams::isInterleaveForced() {
  127. return ::VectorizationInterleave.getNumOccurrences() > 0;
  128. }
  129. Value *llvm::stripIntegerCast(Value *V) {
  130. if (auto *CI = dyn_cast<CastInst>(V))
  131. if (CI->getOperand(0)->getType()->isIntegerTy())
  132. return CI->getOperand(0);
  133. return V;
  134. }
  135. const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
  136. const ValueToValueMap &PtrToStride,
  137. Value *Ptr) {
  138. const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
  139. // If there is an entry in the map return the SCEV of the pointer with the
  140. // symbolic stride replaced by one.
  141. ValueToValueMap::const_iterator SI = PtrToStride.find(Ptr);
  142. if (SI == PtrToStride.end())
  143. // For a non-symbolic stride, just return the original expression.
  144. return OrigSCEV;
  145. Value *StrideVal = stripIntegerCast(SI->second);
  146. ScalarEvolution *SE = PSE.getSE();
  147. const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
  148. const auto *CT =
  149. static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
  150. PSE.addPredicate(*SE->getEqualPredicate(U, CT));
  151. auto *Expr = PSE.getSCEV(Ptr);
  152. LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
  153. << " by: " << *Expr << "\n");
  154. return Expr;
  155. }
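// Illustrative sketch for replaceSymbolicStrideSCEV above (assumed SCEV
// shapes; %A, %Stride and the element size of 4 are not taken from any
// particular test): for a pointer whose SCEV is {%A,+,(4 * %Stride)}<%loop>
// and a PtrToStride entry mapping it to %Stride, the predicate
// "%Stride == 1" is added to PSE and the rewritten expression becomes
// {%A,+,4}<%loop>, which downstream code can treat as a unit-strided access.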
  156. RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
  157. unsigned Index, RuntimePointerChecking &RtCheck)
  158. : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
  159. AddressSpace(RtCheck.Pointers[Index]
  160. .PointerValue->getType()
  161. ->getPointerAddressSpace()),
  162. NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  163. Members.push_back(Index);
  164. }
  165. /// Calculate Start and End points of memory access.
  166. /// Let's assume A is the first access and B is a memory access on the N-th
  167. /// loop iteration. Then B is calculated as:
  168. /// B = A + Step*N.
  169. /// Step value may be positive or negative.
  170. /// N is a calculated back-edge taken count:
  171. /// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
  172. /// Start and End points are calculated in the following way:
  173. /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
  174. /// where SizeOfElt is the size of single memory access in bytes.
  175. ///
  176. /// There is no conflict when the intervals are disjoint:
  177. /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
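/// Worked example (illustrative numbers only, not from the source): with
/// A = %a, a constant Step of 4 bytes and a backedge-taken count N = 99,
/// B = %a + 4 * 99 = %a + 396. Then Start = UMIN(%a, %a + 396) = %a and
/// End = UMAX(%a, %a + 396) + 4 = %a + 400, i.e. the half-open byte range
/// [%a, %a + 400) covered by the access across the loop.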
  178. void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
  179. Type *AccessTy, bool WritePtr,
  180. unsigned DepSetId, unsigned ASId,
  181. PredicatedScalarEvolution &PSE,
  182. bool NeedsFreeze) {
  183. ScalarEvolution *SE = PSE.getSE();
  184. const SCEV *ScStart;
  185. const SCEV *ScEnd;
  186. if (SE->isLoopInvariant(PtrExpr, Lp)) {
  187. ScStart = ScEnd = PtrExpr;
  188. } else {
  189. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
  190. assert(AR && "Invalid addrec expression");
  191. const SCEV *Ex = PSE.getBackedgeTakenCount();
  192. ScStart = AR->getStart();
  193. ScEnd = AR->evaluateAtIteration(Ex, *SE);
  194. const SCEV *Step = AR->getStepRecurrence(*SE);
  195. // For expressions with negative step, the upper bound is ScStart and the
  196. // lower bound is ScEnd.
  197. if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
  198. if (CStep->getValue()->isNegative())
  199. std::swap(ScStart, ScEnd);
  200. } else {
  201. // Fallback case: the step is not constant, but we can still
  202. // get the upper and lower bounds of the interval by using min/max
  203. // expressions.
  204. ScStart = SE->getUMinExpr(ScStart, ScEnd);
  205. ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
  206. }
  207. }
  208. // Add the size of the pointed element to ScEnd.
  209. auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  210. Type *IdxTy = DL.getIndexType(Ptr->getType());
  211. const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  212. ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
  213. Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
  214. NeedsFreeze);
  215. }
  216. void RuntimePointerChecking::tryToCreateDiffCheck(
  217. const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  218. if (!CanUseDiffCheck)
  219. return;
  220. // If either group contains multiple different pointers, bail out.
  221. // TODO: Support multiple pointers by using the minimum or maximum pointer,
  222. // depending on src & sink.
  223. if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
  224. CanUseDiffCheck = false;
  225. return;
  226. }
  227. PointerInfo *Src = &Pointers[CGI.Members[0]];
  228. PointerInfo *Sink = &Pointers[CGJ.Members[0]];
  229. // If either pointer is read and written, multiple checks may be needed. Bail
  230. // out.
  231. if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
  232. !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
  233. CanUseDiffCheck = false;
  234. return;
  235. }
  236. ArrayRef<unsigned> AccSrc =
  237. DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
  238. ArrayRef<unsigned> AccSink =
  239. DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
  240. // If either pointer is accessed multiple times, there may not be a clear
  241. // src/sink relation. Bail out for now.
  242. if (AccSrc.size() != 1 || AccSink.size() != 1) {
  243. CanUseDiffCheck = false;
  244. return;
  245. }
  246. // If the sink is accessed before src, swap src/sink.
  247. if (AccSink[0] < AccSrc[0])
  248. std::swap(Src, Sink);
  249. auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
  250. auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
  251. if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
  252. SinkAR->getLoop() != DC.getInnermostLoop()) {
  253. CanUseDiffCheck = false;
  254. return;
  255. }
  256. SmallVector<Instruction *, 4> SrcInsts =
  257. DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  258. SmallVector<Instruction *, 4> SinkInsts =
  259. DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
  260. Type *SrcTy = getLoadStoreType(SrcInsts[0]);
  261. Type *DstTy = getLoadStoreType(SinkInsts[0]);
  262. if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
  263. CanUseDiffCheck = false;
  264. return;
  265. }
  266. const DataLayout &DL =
  267. SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
  268. unsigned AllocSize =
  269. std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
  270. // Only constant steps matching the AllocSize are supported at the moment.
  271. // This simplifies the difference computation. It can be extended in the
  272. // future.
  273. auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
  274. if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
  275. Step->getAPInt().abs() != AllocSize) {
  276. CanUseDiffCheck = false;
  277. return;
  278. }
  279. IntegerType *IntTy =
  280. IntegerType::get(Src->PointerValue->getContext(),
  281. DL.getPointerSizeInBits(CGI.AddressSpace));
  282. // When counting down, the dependence distance needs to be swapped.
  283. if (Step->getValue()->isNegative())
  284. std::swap(SinkAR, SrcAR);
  285. const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
  286. const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
  287. if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
  288. isa<SCEVCouldNotCompute>(SrcStartInt)) {
  289. CanUseDiffCheck = false;
  290. return;
  291. }
  292. DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
  293. Src->NeedsFreeze || Sink->NeedsFreeze);
  294. }
  295. SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  296. SmallVector<RuntimePointerCheck, 4> Checks;
  297. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  298. for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
  299. const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
  300. const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
  301. if (needsChecking(CGI, CGJ)) {
  302. tryToCreateDiffCheck(CGI, CGJ);
  303. Checks.push_back(std::make_pair(&CGI, &CGJ));
  304. }
  305. }
  306. }
  307. return Checks;
  308. }
  309. void RuntimePointerChecking::generateChecks(
  310. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  311. assert(Checks.empty() && "Checks is not empty");
  312. groupChecks(DepCands, UseDependencies);
  313. Checks = generateChecks();
  314. }
  315. bool RuntimePointerChecking::needsChecking(
  316. const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
  317. for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
  318. for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
  319. if (needsChecking(M.Members[I], N.Members[J]))
  320. return true;
  321. return false;
  322. }
  323. /// Compare \p I and \p J and return the minimum.
  324. /// Return nullptr in case we couldn't find an answer.
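/// For example (illustrative SCEVs): with I = %a and J = (%a + 16), the
/// difference J - I is the constant 16, which is non-negative, so I is the
/// minimum. If the difference is not a constant (e.g. J = %b at an unknown
/// offset from %a), nullptr is returned and the caller gives up on merging.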
  325. static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
  326. ScalarEvolution *SE) {
  327. const SCEV *Diff = SE->getMinusSCEV(J, I);
  328. const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
  329. if (!C)
  330. return nullptr;
  331. if (C->getValue()->isNegative())
  332. return J;
  333. return I;
  334. }
  335. bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
  336. RuntimePointerChecking &RtCheck) {
  337. return addPointer(
  338. Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
  339. RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
  340. RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
  341. }
  342. bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
  343. const SCEV *End, unsigned AS,
  344. bool NeedsFreeze,
  345. ScalarEvolution &SE) {
  346. assert(AddressSpace == AS &&
  347. "all pointers in a checking group must be in the same address space");
  348. // Compare the starts and ends with the known minimum and maximum
  349. // of this set. We need to know how we compare against the min/max
  350. // of the set in order to be able to emit memchecks.
  351. const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
  352. if (!Min0)
  353. return false;
  354. const SCEV *Min1 = getMinFromExprs(End, High, &SE);
  355. if (!Min1)
  356. return false;
  357. // Update the low bound expression if we've found a new min value.
  358. if (Min0 == Start)
  359. Low = Start;
  360. // Update the high bound expression if we've found a new max value.
  361. if (Min1 != End)
  362. High = End;
  363. Members.push_back(Index);
  364. this->NeedsFreeze |= NeedsFreeze;
  365. return true;
  366. }
  367. void RuntimePointerChecking::groupChecks(
  368. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  369. // We build the groups from dependency candidates equivalence classes
  370. // because:
  371. // - We know that pointers in the same equivalence class share
  372. // the same underlying object and therefore there is a chance
  373. // that we can compare pointers
  374. // - We wouldn't be able to merge two pointers for which we need
  375. // to emit a memcheck. The classes in DepCands are already
  376. // conveniently built such that no two pointers in the same
  377. // class need checking against each other.
  378. // We use the following (greedy) algorithm to construct the groups
  379. // For every pointer in the equivalence class:
  380. // For each existing group:
  381. // - if the difference between this pointer and the min/max bounds
  382. // of the group is a constant, then make the pointer part of the
  383. // group and update the min/max bounds of that group as required.
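  // Illustrative outcome (assumed loop-invariant pointers and i32 accesses,
  // not from a test case): pointers %a, %a + 4 and %a + 8 in the same
  // equivalence class differ from each other by constants, so they can all
  // be folded into one group with Low = %a and High = %a + 12, and a single
  // pair of bound comparisons then covers all three pointers.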
  384. CheckingGroups.clear();
  385. // If we need to check two pointers to the same underlying object
  386. // with a non-constant difference, we shouldn't perform any pointer
  387. // grouping with those pointers. This is because we can easily get
  388. // into cases where the resulting check would return false, even when
  389. // the accesses are safe.
  390. //
  391. // The following example shows this:
  392. // for (i = 0; i < 1000; ++i)
  393. // a[5000 + i * m] = a[i] + a[i + 9000]
  394. //
  395. // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  396. // (0, 10000) which is always false. However, if m is 1, there is no
  397. // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  398. // us to perform an accurate check in this case.
  399. //
  400. // The above case requires that we have an UnknownDependence between
  401. // accesses to the same underlying object. This cannot happen unless
  402. // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  403. // is also false. In this case we will use the fallback path and create
  404. // separate checking groups for all pointers.
  405. // If we don't have the dependency partitions, construct a new
  406. // checking pointer group for each pointer. This is also required
  407. // for correctness, because in this case we can have checking between
  408. // pointers to the same underlying object.
  409. if (!UseDependencies) {
  410. for (unsigned I = 0; I < Pointers.size(); ++I)
  411. CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
  412. return;
  413. }
  414. unsigned TotalComparisons = 0;
  415. DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  416. for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
  417. auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
  418. Iter.first->second.push_back(Index);
  419. }
  420. // We need to keep track of what pointers we've already seen so we
  421. // don't process them twice.
  422. SmallSet<unsigned, 2> Seen;
  423. // Go through all equivalence classes, get the "pointer check groups"
  424. // and add them to the overall solution. We use the order in which accesses
  425. // appear in 'Pointers' to enforce determinism.
  426. for (unsigned I = 0; I < Pointers.size(); ++I) {
  427. // We've seen this pointer before, and therefore already processed
  428. // its equivalence class.
  429. if (Seen.count(I))
  430. continue;
  431. MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
  432. Pointers[I].IsWritePtr);
  433. SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
  434. auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
  435. // Because DepCands is constructed by visiting accesses in the order in
  436. // which they appear in alias sets (which is deterministic) and the
  437. // iteration order within an equivalence class member is only dependent on
  438. // the order in which unions and insertions are performed on the
  439. // equivalence class, the iteration order is deterministic.
  440. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
  441. MI != ME; ++MI) {
  442. auto PointerI = PositionMap.find(MI->getPointer());
  443. assert(PointerI != PositionMap.end() &&
  444. "pointer in equivalence class not found in PositionMap");
  445. for (unsigned Pointer : PointerI->second) {
  446. bool Merged = false;
  447. // Mark this pointer as seen.
  448. Seen.insert(Pointer);
  449. // Go through all the existing sets and see if we can find one
  450. // which can include this pointer.
  451. for (RuntimeCheckingPtrGroup &Group : Groups) {
  452. // Don't perform more than a certain amount of comparisons.
  453. // This should limit the cost of grouping the pointers to something
  454. // reasonable. If we do end up hitting this threshold, the algorithm
  455. // will create separate groups for all remaining pointers.
  456. if (TotalComparisons > MemoryCheckMergeThreshold)
  457. break;
  458. TotalComparisons++;
  459. if (Group.addPointer(Pointer, *this)) {
  460. Merged = true;
  461. break;
  462. }
  463. }
  464. if (!Merged)
  465. // We couldn't add this pointer to any existing set or the threshold
  466. // for the number of comparisons has been reached. Create a new group
  467. // to hold the current pointer.
  468. Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
  469. }
  470. }
  471. // We've computed the grouped checks for this partition.
  472. // Save the results and continue with the next one.
  473. llvm::copy(Groups, std::back_inserter(CheckingGroups));
  474. }
  475. }
  476. bool RuntimePointerChecking::arePointersInSamePartition(
  477. const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
  478. unsigned PtrIdx2) {
  479. return (PtrToPartition[PtrIdx1] != -1 &&
  480. PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
  481. }
  482. bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  483. const PointerInfo &PointerI = Pointers[I];
  484. const PointerInfo &PointerJ = Pointers[J];
  485. // No need to check if two readonly pointers intersect.
  486. if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
  487. return false;
  488. // Only need to check pointers between two different dependency sets.
  489. if (PointerI.DependencySetId == PointerJ.DependencySetId)
  490. return false;
  491. // Only need to check pointers in the same alias set.
  492. if (PointerI.AliasSetId != PointerJ.AliasSetId)
  493. return false;
  494. return true;
  495. }
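// Illustrative reading of needsChecking above (pointer names are made up):
// two pointers that are both only read, or two pointers that ended up in the
// same dependence set (they share an underlying object and are covered by the
// dependence analysis), never need a runtime check here; a write through %a
// and a read through %b in the same alias set but different dependence sets
// do.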
  496. void RuntimePointerChecking::printChecks(
  497. raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
  498. unsigned Depth) const {
  499. unsigned N = 0;
  500. for (const auto &Check : Checks) {
  501. const auto &First = Check.first->Members, &Second = Check.second->Members;
  502. OS.indent(Depth) << "Check " << N++ << ":\n";
  503. OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
  504. for (unsigned K = 0; K < First.size(); ++K)
  505. OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
  506. OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
  507. for (unsigned K = 0; K < Second.size(); ++K)
  508. OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  509. }
  510. }
  511. void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  512. OS.indent(Depth) << "Run-time memory checks:\n";
  513. printChecks(OS, Checks, Depth);
  514. OS.indent(Depth) << "Grouped accesses:\n";
  515. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  516. const auto &CG = CheckingGroups[I];
  517. OS.indent(Depth + 2) << "Group " << &CG << ":\n";
  518. OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
  519. << ")\n";
  520. for (unsigned J = 0; J < CG.Members.size(); ++J) {
  521. OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
  522. << "\n";
  523. }
  524. }
  525. }
  526. namespace {
  527. /// Analyses memory accesses in a loop.
  528. ///
  529. /// Checks whether run time pointer checks are needed and builds sets for data
  530. /// dependence checking.
  531. class AccessAnalysis {
  532. public:
  533. /// Read or write access location.
  534. typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  535. typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
  536. AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
  537. MemoryDepChecker::DepCandidates &DA,
  538. PredicatedScalarEvolution &PSE)
  539. : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE) {
  540. // We're analyzing dependences across loop iterations.
  541. BAA.enableCrossIterationMode();
  542. }
  543. /// Register a load and whether it is only read from.
  544. void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
  545. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  546. AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
  547. Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
  548. if (IsReadOnly)
  549. ReadOnlyPtr.insert(Ptr);
  550. }
  551. /// Register a store.
  552. void addStore(MemoryLocation &Loc, Type *AccessTy) {
  553. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  554. AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
  555. Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
  556. }
  557. /// Check if we can emit a run-time no-alias check for \p Access.
  558. ///
  559. /// Returns true if we can emit a run-time no alias check for \p Access.
  560. /// If we can check this access, this also adds it to a dependence set and
  561. /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  562. /// we will attempt to use additional run-time checks in order to get
  563. /// the bounds of the pointer.
  564. bool createCheckForAccess(RuntimePointerChecking &RtCheck,
  565. MemAccessInfo Access, Type *AccessTy,
  566. const ValueToValueMap &Strides,
  567. DenseMap<Value *, unsigned> &DepSetId,
  568. Loop *TheLoop, unsigned &RunningDepId,
  569. unsigned ASId, bool ShouldCheckStride, bool Assume);
  570. /// Check whether we can check the pointers at runtime for
  571. /// non-intersection.
  572. ///
  573. /// Returns true if we need no check or if we do and we can generate them
  574. /// (i.e. the pointers have computable bounds).
  575. bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
  576. Loop *TheLoop, const ValueToValueMap &Strides,
  577. Value *&UncomputablePtr, bool ShouldCheckWrap = false);
  578. /// Goes over all memory accesses, checks whether a RT check is needed
  579. /// and builds sets of dependent accesses.
  580. void buildDependenceSets() {
  581. processMemAccesses();
  582. }
  583. /// Initial processing of memory accesses determined that we need to
  584. /// perform dependency checking.
  585. ///
  586. /// Note that this can later be cleared if we retry memcheck analysis without
  587. /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  588. bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  589. /// We decided that no dependence analysis would be used. Reset the state.
  590. void resetDepChecks(MemoryDepChecker &DepChecker) {
  591. CheckDeps.clear();
  592. DepChecker.clearDependences();
  593. }
  594. MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
  595. private:
  596. typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
  597. /// Go over all memory accesses and check whether runtime pointer checks
  598. /// are needed and build sets of dependency check candidates.
  599. void processMemAccesses();
  600. /// Map of all accesses. Values are the types used to access memory pointed to
  601. /// by the pointer.
  602. PtrAccessMap Accesses;
  603. /// The loop being checked.
  604. const Loop *TheLoop;
  605. /// List of accesses that need a further dependence check.
  606. MemAccessInfoList CheckDeps;
  607. /// Set of pointers that are read only.
  608. SmallPtrSet<Value*, 16> ReadOnlyPtr;
  609. /// Batched alias analysis results.
  610. BatchAAResults BAA;
  611. /// An alias set tracker to partition the access set by underlying object and
  612. /// intrinsic property (such as TBAA metadata).
  613. AliasSetTracker AST;
  614. LoopInfo *LI;
  615. /// Sets of potentially dependent accesses - members of one set share an
  616. /// underlying pointer. The set "CheckDeps" identifies which sets really need a
  617. /// dependence check.
  618. MemoryDepChecker::DepCandidates &DepCands;
  619. /// Initial processing of memory accesses determined that we may need
  620. /// to add memchecks. Perform the analysis to determine the necessary checks.
  621. ///
  622. /// Note that this is different from isDependencyCheckNeeded. When we retry
  623. /// memcheck analysis without dependency checking
  624. /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  625. /// cleared while this remains set if we have potentially dependent accesses.
  626. bool IsRTCheckAnalysisNeeded = false;
  627. /// The SCEV predicate containing all the SCEV-related assumptions.
  628. PredicatedScalarEvolution &PSE;
  629. };
  630. } // end anonymous namespace
  631. /// Check whether a pointer can participate in a runtime bounds check.
  632. /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
  633. /// by adding run-time checks (overflow checks) if necessary.
  634. static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
  635. const SCEV *PtrScev, Loop *L, bool Assume) {
  636. // The bounds for a loop-invariant pointer are trivial.
  637. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  638. return true;
  639. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  640. if (!AR && Assume)
  641. AR = PSE.getAsAddRec(Ptr);
  642. if (!AR)
  643. return false;
  644. return AR->isAffine();
  645. }
  646. /// Check whether a pointer address cannot wrap.
  647. static bool isNoWrap(PredicatedScalarEvolution &PSE,
  648. const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy,
  649. Loop *L) {
  650. const SCEV *PtrScev = PSE.getSCEV(Ptr);
  651. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  652. return true;
  653. int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
  654. if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
  655. return true;
  656. return false;
  657. }
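// Illustrative sketch for isNoWrap above (assumed SCEV shape, not from a
// test case): for an i32 access whose pointer SCEV is {%a,+,4}<%loop>,
// getPtrStride returns 1 (the stride in units of the access type), so the
// address is treated as non-wrapping. A pointer that fails both tests can
// still be accepted later by adding an SCEVWrapPredicate (IncrementNUSW)
// through PSE when the caller passes Assume.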
  658. static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
  659. function_ref<void(Value *)> AddPointer) {
  660. SmallPtrSet<Value *, 8> Visited;
  661. SmallVector<Value *> WorkList;
  662. WorkList.push_back(StartPtr);
  663. while (!WorkList.empty()) {
  664. Value *Ptr = WorkList.pop_back_val();
  665. if (!Visited.insert(Ptr).second)
  666. continue;
  667. auto *PN = dyn_cast<PHINode>(Ptr);
  668. // SCEV does not look through non-header PHIs inside the loop. Such phis
  669. // can be analyzed by adding separate accesses for each incoming pointer
  670. // value.
  671. if (PN && InnermostLoop.contains(PN->getParent()) &&
  672. PN->getParent() != InnermostLoop.getHeader()) {
  673. for (const Use &Inc : PN->incoming_values())
  674. WorkList.push_back(Inc);
  675. } else
  676. AddPointer(Ptr);
  677. }
  678. }
  679. // Walk back through the IR for a pointer, looking for a select like the
  680. // following:
  681. //
  682. // %offset = select i1 %cmp, i64 %a, i64 %b
  683. // %addr = getelementptr double, double* %base, i64 %offset
  684. // %ld = load double, double* %addr, align 8
  685. //
  686. // We won't be able to form a single SCEVAddRecExpr from this since the
  687. // address for each loop iteration depends on %cmp. We could potentially
  688. // produce multiple valid SCEVAddRecExprs, though, and check all of them for
  689. // memory safety/aliasing if needed.
  690. //
  691. // If we encounter some IR we don't yet handle, or something obviously fine
  692. // like a constant, then we just add the SCEV for that term to the list passed
  693. // in by the caller. If we have a node that may potentially yield a valid
  694. // SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
  695. // ourselves before adding to the list.
  696. static void findForkedSCEVs(
  697. ScalarEvolution *SE, const Loop *L, Value *Ptr,
  698. SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
  699. unsigned Depth) {
  700. // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  701. // we've exceeded our limit on recursion, just return whatever we have
  702. // regardless of whether it can be used for a forked pointer or not, along
  703. // with an indication of whether it might be a poison or undef value.
  704. const SCEV *Scev = SE->getSCEV(Ptr);
  705. if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
  706. !isa<Instruction>(Ptr) || Depth == 0) {
  707. ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
  708. return;
  709. }
  710. Depth--;
  711. auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
  712. return get<1>(S);
  713. };
  714. auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
  715. switch (Opcode) {
  716. case Instruction::Add:
  717. return SE->getAddExpr(L, R);
  718. case Instruction::Sub:
  719. return SE->getMinusSCEV(L, R);
  720. default:
  721. llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
  722. }
  723. };
  724. Instruction *I = cast<Instruction>(Ptr);
  725. unsigned Opcode = I->getOpcode();
  726. switch (Opcode) {
  727. case Instruction::GetElementPtr: {
  728. GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
  729. Type *SourceTy = GEP->getSourceElementType();
  730. // We only handle base + single offset GEPs here for now.
  731. // Not dealing with preexisting gathers yet, so no vectors.
  732. if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
  733. ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
  734. break;
  735. }
  736. SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
  737. SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
  738. findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
  739. findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
  740. // See if we need to freeze our fork...
  741. bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
  742. any_of(OffsetScevs, UndefPoisonCheck);
  743. // Check that we only have a single fork, on either the base or the offset.
  744. // Copy the SCEV across for the one without a fork in order to generate
  745. // the full SCEV for both sides of the GEP.
  746. if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
  747. BaseScevs.push_back(BaseScevs[0]);
  748. else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
  749. OffsetScevs.push_back(OffsetScevs[0]);
  750. else {
  751. ScevList.emplace_back(Scev, NeedsFreeze);
  752. break;
  753. }
  754. // Find the pointer type we need to extend to.
  755. Type *IntPtrTy = SE->getEffectiveSCEVType(
  756. SE->getSCEV(GEP->getPointerOperand())->getType());
  757. // Find the size of the type being pointed to. We only have a single
  758. // index term (guarded above) so we don't need to index into arrays or
  759. // structures, just get the size of the scalar value.
  760. const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
  761. // Scale up the offsets by the size of the type, then add to the bases.
  762. const SCEV *Scaled1 = SE->getMulExpr(
  763. Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
  764. const SCEV *Scaled2 = SE->getMulExpr(
  765. Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
  766. ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
  767. NeedsFreeze);
  768. ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
  769. NeedsFreeze);
  770. break;
  771. }
  772. case Instruction::Select: {
  773. SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
  774. // A select means we've found a forked pointer, but we currently only
  775. // support a single select per pointer so if there's another behind this
  776. // then we just bail out and return the generic SCEV.
  777. findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
  778. findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
  779. if (ChildScevs.size() == 2) {
  780. ScevList.push_back(ChildScevs[0]);
  781. ScevList.push_back(ChildScevs[1]);
  782. } else
  783. ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
  784. break;
  785. }
  786. case Instruction::Add:
  787. case Instruction::Sub: {
  788. SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
  789. SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
  790. findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
  791. findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
  792. // See if we need to freeze our fork...
  793. bool NeedsFreeze =
  794. any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
  795. // Check that we only have a single fork, on either the left or right side.
  796. // Copy the SCEV across for the one without a fork in order to generate
  797. // the full SCEV for both sides of the BinOp.
  798. if (LScevs.size() == 2 && RScevs.size() == 1)
  799. RScevs.push_back(RScevs[0]);
  800. else if (RScevs.size() == 2 && LScevs.size() == 1)
  801. LScevs.push_back(LScevs[0]);
  802. else {
  803. ScevList.emplace_back(Scev, NeedsFreeze);
  804. break;
  805. }
  806. ScevList.emplace_back(
  807. GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
  808. NeedsFreeze);
  809. ScevList.emplace_back(
  810. GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
  811. NeedsFreeze);
  812. break;
  813. }
  814. default:
  815. // Just return the current SCEV if we haven't handled the instruction yet.
  816. LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
  817. ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
  818. break;
  819. }
  820. }
  821. static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
  822. findForkedPointer(PredicatedScalarEvolution &PSE,
  823. const ValueToValueMap &StridesMap, Value *Ptr,
  824. const Loop *L) {
  825. ScalarEvolution *SE = PSE.getSE();
  826. assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  827. SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  828. findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
  829. // For now, we will only accept a forked pointer with two possible SCEVs
  830. // that are either SCEVAddRecExprs or loop invariant.
  831. if (Scevs.size() == 2 &&
  832. (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
  833. SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
  834. (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
  835. SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
  836. LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
  837. LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
  838. LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
  839. return Scevs;
  840. }
  841. return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
  842. }
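// When two candidate SCEVs are returned above, createCheckForAccess below
// checks bounds for each candidate and inserts a separate runtime-check
// pointer entry per candidate, so both sides of the fork are covered; with a
// single candidate it falls back to the symbolic-stride-rewritten SCEV as
// before.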
  843. bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
  844. MemAccessInfo Access, Type *AccessTy,
  845. const ValueToValueMap &StridesMap,
  846. DenseMap<Value *, unsigned> &DepSetId,
  847. Loop *TheLoop, unsigned &RunningDepId,
  848. unsigned ASId, bool ShouldCheckWrap,
  849. bool Assume) {
  850. Value *Ptr = Access.getPointer();
  851. SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
  852. findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
  853. for (auto &P : TranslatedPtrs) {
  854. const SCEV *PtrExpr = get<0>(P);
  855. if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
  856. return false;
  857. // When we run after a failing dependency check we have to make sure
  858. // we don't have wrapping pointers.
  859. if (ShouldCheckWrap) {
  860. // Skip wrap checking when translating pointers.
  861. if (TranslatedPtrs.size() > 1)
  862. return false;
  863. if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
  864. auto *Expr = PSE.getSCEV(Ptr);
  865. if (!Assume || !isa<SCEVAddRecExpr>(Expr))
  866. return false;
  867. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  868. }
  869. }
  870. // If there's only one option for Ptr, look it up after bounds and wrap
  871. // checking, because assumptions might have been added to PSE.
  872. if (TranslatedPtrs.size() == 1)
  873. TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
  874. false};
  875. }
  876. for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
  877. // The id of the dependence set.
  878. unsigned DepId;
  879. if (isDependencyCheckNeeded()) {
  880. Value *Leader = DepCands.getLeaderValue(Access).getPointer();
  881. unsigned &LeaderId = DepSetId[Leader];
  882. if (!LeaderId)
  883. LeaderId = RunningDepId++;
  884. DepId = LeaderId;
  885. } else
  886. // Each access has its own dependence set.
  887. DepId = RunningDepId++;
  888. bool IsWrite = Access.getInt();
  889. RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
  890. NeedsFreeze);
  891. LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  892. }
  893. return true;
  894. }
  895. bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
  896. ScalarEvolution *SE, Loop *TheLoop,
  897. const ValueToValueMap &StridesMap,
  898. Value *&UncomputablePtr, bool ShouldCheckWrap) {
  899. // Find pointers with computable bounds. We are going to use this information
  900. // to place a runtime bound check.
  901. bool CanDoRT = true;
  902. bool MayNeedRTCheck = false;
  903. if (!IsRTCheckAnalysisNeeded) return true;
  904. bool IsDepCheckNeeded = isDependencyCheckNeeded();
  905. // We assign a consecutive id to accesses from different alias sets.
  906. // Accesses between different groups don't need to be checked.
  907. unsigned ASId = 0;
  908. for (auto &AS : AST) {
  909. int NumReadPtrChecks = 0;
  910. int NumWritePtrChecks = 0;
  911. bool CanDoAliasSetRT = true;
  912. ++ASId;
  913. // We assign a consecutive id to accesses from different dependence sets.
  914. // Accesses within the same set don't need a runtime check.
  915. unsigned RunningDepId = 1;
  916. DenseMap<Value *, unsigned> DepSetId;
  917. SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
  918. // First, count how many write and read accesses are in the alias set. Also
  919. // collect MemAccessInfos for later.
  920. SmallVector<MemAccessInfo, 4> AccessInfos;
  921. for (const auto &A : AS) {
  922. Value *Ptr = A.getValue();
  923. bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
  924. if (IsWrite)
  925. ++NumWritePtrChecks;
  926. else
  927. ++NumReadPtrChecks;
  928. AccessInfos.emplace_back(Ptr, IsWrite);
  929. }
  930. // We do not need runtime checks for this alias set if there are no writes,
  931. // or only a single write and no reads.
  932. if (NumWritePtrChecks == 0 ||
  933. (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
  934. assert((AS.size() <= 1 ||
  935. all_of(AS,
  936. [this](auto AC) {
  937. MemAccessInfo AccessWrite(AC.getValue(), true);
  938. return DepCands.findValue(AccessWrite) == DepCands.end();
  939. })) &&
  940. "Can only skip updating CanDoRT below, if all entries in AS "
  941. "are reads or there is at most 1 entry");
  942. continue;
  943. }
  944. for (auto &Access : AccessInfos) {
  945. for (const auto &AccessTy : Accesses[Access]) {
  946. if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
  947. DepSetId, TheLoop, RunningDepId, ASId,
  948. ShouldCheckWrap, false)) {
  949. LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
  950. << *Access.getPointer() << '\n');
  951. Retries.push_back({Access, AccessTy});
  952. CanDoAliasSetRT = false;
  953. }
  954. }
  955. }
  956. // Note that this function computes CanDoRT and MayNeedRTCheck
  957. // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
  958. // we have a pointer for which we couldn't find the bounds but we don't
  959. // actually need to emit any checks so it does not matter.
  960. //
  961. // We need runtime checks for this alias set, if there are at least 2
  962. // dependence sets (in which case RunningDepId > 2) or if we need to re-try
  963. // any bound checks (because in that case the number of dependence sets is
  964. // incomplete).
  965. bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
  966. // We need to perform run-time alias checks, but some pointers had bounds
  967. // that couldn't be checked.
  968. if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
  969. // Reset the CanDoSetRt flag and retry all accesses that have failed.
  970. // We know that we need these checks, so we can now be more aggressive
  971. // and add further checks if required (overflow checks).
  972. CanDoAliasSetRT = true;
  973. for (auto Retry : Retries) {
  974. MemAccessInfo Access = Retry.first;
  975. Type *AccessTy = Retry.second;
  976. if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
  977. DepSetId, TheLoop, RunningDepId, ASId,
  978. ShouldCheckWrap, /*Assume=*/true)) {
  979. CanDoAliasSetRT = false;
  980. UncomputablePtr = Access.getPointer();
  981. break;
  982. }
  983. }
  984. }
  985. CanDoRT &= CanDoAliasSetRT;
  986. MayNeedRTCheck |= NeedsAliasSetRTCheck;
  987. ++ASId;
  988. }
  989. // If the pointers that we would use for the bounds comparison have different
  990. // address spaces, assume the values aren't directly comparable, so we can't
  991. // use them for the runtime check. We also have to assume they could
  992. // overlap. In the future there should be metadata for whether address spaces
  993. // are disjoint.
  994. unsigned NumPointers = RtCheck.Pointers.size();
  995. for (unsigned i = 0; i < NumPointers; ++i) {
  996. for (unsigned j = i + 1; j < NumPointers; ++j) {
  997. // Only need to check pointers between two different dependency sets.
  998. if (RtCheck.Pointers[i].DependencySetId ==
  999. RtCheck.Pointers[j].DependencySetId)
  1000. continue;
  1001. // Only need to check pointers in the same alias set.
  1002. if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
  1003. continue;
  1004. Value *PtrI = RtCheck.Pointers[i].PointerValue;
  1005. Value *PtrJ = RtCheck.Pointers[j].PointerValue;
  1006. unsigned ASi = PtrI->getType()->getPointerAddressSpace();
  1007. unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
  1008. if (ASi != ASj) {
  1009. LLVM_DEBUG(
  1010. dbgs() << "LAA: Runtime check would require comparison between"
  1011. " different address spaces\n");
  1012. return false;
  1013. }
  1014. }
  1015. }
  1016. if (MayNeedRTCheck && CanDoRT)
  1017. RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
  1018. LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
  1019. << " pointer comparisons.\n");
  1020. // If we can do run-time checks, but there are no checks, no runtime checks
  1021. // are needed. This can happen when all pointers point to the same underlying
  1022. // object for example.
  1023. RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
  1024. bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
  1025. if (!CanDoRTIfNeeded)
  1026. RtCheck.reset();
  1027. return CanDoRTIfNeeded;
  1028. }
  1029. void AccessAnalysis::processMemAccesses() {
  1030. // We process the set twice: first we process read-write pointers, last we
  1031. // process read-only pointers. This allows us to skip dependence tests for
  1032. // read-only pointers.
  1033. LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  1034. LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  1035. LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  1036. LLVM_DEBUG({
  1037. for (auto A : Accesses)
  1038. dbgs() << "\t" << *A.first.getPointer() << " ("
  1039. << (A.first.getInt()
  1040. ? "write"
  1041. : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
  1042. : "read"))
  1043. << ")\n";
  1044. });
  1045. // The AliasSetTracker has nicely partitioned our pointers by metadata
  1046. // compatibility and potential for underlying-object overlap. As a result, we
  1047. // only need to check for potential pointer dependencies within each alias
  1048. // set.
  1049. for (const auto &AS : AST) {
// Note that both the alias-set tracker and the alias sets themselves use
// linked lists internally, so the iteration order here is deterministic
// (matching the original instruction order within each set).
  1053. bool SetHasWrite = false;
  1054. // Map of pointers to last access encountered.
  1055. typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
  1056. UnderlyingObjToAccessMap ObjToLastAccess;
// Set of accesses to check after all writes have been processed.
  1058. PtrAccessMap DeferredAccesses;
  1059. // Iterate over each alias set twice, once to process read/write pointers,
  1060. // and then to process read-only pointers.
  1061. for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
  1062. bool UseDeferred = SetIteration > 0;
  1063. PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
  1064. for (const auto &AV : AS) {
  1065. Value *Ptr = AV.getValue();
  1066. // For a single memory access in AliasSetTracker, Accesses may contain
  1067. // both read and write, and they both need to be handled for CheckDeps.
  1068. for (const auto &AC : S) {
  1069. if (AC.first.getPointer() != Ptr)
  1070. continue;
  1071. bool IsWrite = AC.first.getInt();
  1072. // If we're using the deferred access set, then it contains only
  1073. // reads.
  1074. bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
  1075. if (UseDeferred && !IsReadOnlyPtr)
  1076. continue;
  1077. // Otherwise, the pointer must be in the PtrAccessSet, either as a
  1078. // read or a write.
  1079. assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
  1080. S.count(MemAccessInfo(Ptr, false))) &&
  1081. "Alias-set pointer not in the access set?");
  1082. MemAccessInfo Access(Ptr, IsWrite);
  1083. DepCands.insert(Access);
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
  1089. if (!UseDeferred && IsReadOnlyPtr) {
  1090. // We only use the pointer keys, the types vector values don't
  1091. // matter.
  1092. DeferredAccesses.insert({Access, {}});
  1093. continue;
  1094. }
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + ..." without having to do a dependence check).
  1099. if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
  1100. CheckDeps.push_back(Access);
  1101. IsRTCheckAnalysisNeeded = true;
  1102. }
  1103. if (IsWrite)
  1104. SetHasWrite = true;
  1105. // Create sets of pointers connected by a shared alias set and
  1106. // underlying object.
  1107. typedef SmallVector<const Value *, 16> ValueVector;
  1108. ValueVector TempObjects;
  1109. getUnderlyingObjects(Ptr, TempObjects, LI);
  1110. LLVM_DEBUG(dbgs()
  1111. << "Underlying objects for pointer " << *Ptr << "\n");
  1112. for (const Value *UnderlyingObj : TempObjects) {
// nullptr never aliases; don't join sets for pointers that have "null"
// in their UnderlyingObjects list.
  1115. if (isa<ConstantPointerNull>(UnderlyingObj) &&
  1116. !NullPointerIsDefined(
  1117. TheLoop->getHeader()->getParent(),
  1118. UnderlyingObj->getType()->getPointerAddressSpace()))
  1119. continue;
  1120. UnderlyingObjToAccessMap::iterator Prev =
  1121. ObjToLastAccess.find(UnderlyingObj);
  1122. if (Prev != ObjToLastAccess.end())
  1123. DepCands.unionSets(Access, Prev->second);
  1124. ObjToLastAccess[UnderlyingObj] = Access;
  1125. LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
  1126. }
  1127. }
  1128. }
  1129. }
  1130. }
  1131. }
  1132. static bool isInBoundsGep(Value *Ptr) {
  1133. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
  1134. return GEP->isInBounds();
  1135. return false;
  1136. }
  1137. /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
  1138. /// i.e. monotonically increasing/decreasing.
  1139. static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
  1140. PredicatedScalarEvolution &PSE, const Loop *L) {
  1141. // FIXME: This should probably only return true for NUW.
  1142. if (AR->getNoWrapFlags(SCEV::NoWrapMask))
  1143. return true;
  1144. // Scalar evolution does not propagate the non-wrapping flags to values that
  1145. // are derived from a non-wrapping induction variable because non-wrapping
  1146. // could be flow-sensitive.
  1147. //
  1148. // Look through the potentially overflowing instruction to try to prove
  1149. // non-wrapping for the *specific* value of Ptr.
  1150. // The arithmetic implied by an inbounds GEP can't overflow.
  1151. auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  1152. if (!GEP || !GEP->isInBounds())
  1153. return false;
  1154. // Make sure there is only one non-const index and analyze that.
  1155. Value *NonConstIndex = nullptr;
  1156. for (Value *Index : GEP->indices())
  1157. if (!isa<ConstantInt>(Index)) {
  1158. if (NonConstIndex)
  1159. return false;
  1160. NonConstIndex = Index;
  1161. }
  1162. if (!NonConstIndex)
  1163. // The recurrence is on the pointer, ignore for now.
  1164. return false;
  1165. // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
  1166. // AddRec using a NSW operation.
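// Illustrative IR pattern this is meant to match (an assumption, not taken
// from the source):
//   %mul = mul nsw i32 %iv, 4
//   %gep = getelementptr inbounds float, ptr %A, i32 %mul
// Here the only non-constant GEP index is %mul, an NSW multiply by a
// constant; if %iv is an NSW AddRec of this loop, the address is treated as
// non-wrapping.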
  1167. if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
  1168. if (OBO->hasNoSignedWrap() &&
// Assume the other operand is constant so that the AddRec can be
// easily found.
  1171. isa<ConstantInt>(OBO->getOperand(1))) {
  1172. auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
  1173. if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
  1174. return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
  1175. }
  1176. return false;
  1177. }
  1178. /// Check whether the access through \p Ptr has a constant stride.
  1179. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
  1180. Type *AccessTy, Value *Ptr,
  1181. const Loop *Lp,
  1182. const ValueToValueMap &StridesMap,
  1183. bool Assume, bool ShouldCheckWrap) {
  1184. Type *Ty = Ptr->getType();
  1185. assert(Ty->isPointerTy() && "Unexpected non-ptr");
  1186. if (isa<ScalableVectorType>(AccessTy)) {
  1187. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
  1188. << "\n");
  1189. return std::nullopt;
  1190. }
  1191. const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
  1192. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  1193. if (Assume && !AR)
  1194. AR = PSE.getAsAddRec(Ptr);
  1195. if (!AR) {
  1196. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
  1197. << " SCEV: " << *PtrScev << "\n");
  1198. return std::nullopt;
  1199. }
  1200. // The access function must stride over the innermost loop.
  1201. if (Lp != AR->getLoop()) {
  1202. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
  1203. << *Ptr << " SCEV: " << *AR << "\n");
  1204. return std::nullopt;
  1205. }
  1206. // The address calculation must not wrap. Otherwise, a dependence could be
  1207. // inverted.
// An inbounds getelementptr that is an AddRec with a unit stride
// cannot wrap by definition. The unit stride requirement is checked later.
// A getelementptr without an inbounds attribute and a unit stride would have
// to access the pointer value "0", which is undefined behavior in address
// space 0, therefore we can also vectorize this case.
  1213. unsigned AddrSpace = Ty->getPointerAddressSpace();
  1214. bool IsInBoundsGEP = isInBoundsGep(Ptr);
  1215. bool IsNoWrapAddRec = !ShouldCheckWrap ||
  1216. PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
  1217. isNoWrapAddRec(Ptr, AR, PSE, Lp);
  1218. if (!IsNoWrapAddRec && !IsInBoundsGEP &&
  1219. NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace)) {
  1220. if (Assume) {
  1221. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  1222. IsNoWrapAddRec = true;
  1223. LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
  1224. << "LAA: Pointer: " << *Ptr << "\n"
  1225. << "LAA: SCEV: " << *AR << "\n"
  1226. << "LAA: Added an overflow assumption\n");
  1227. } else {
  1228. LLVM_DEBUG(
  1229. dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
  1230. << *Ptr << " SCEV: " << *AR << "\n");
  1231. return std::nullopt;
  1232. }
  1233. }
  1234. // Check the step is constant.
  1235. const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
  1236. // Calculate the pointer stride and check if it is constant.
  1237. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  1238. if (!C) {
  1239. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
  1240. << " SCEV: " << *AR << "\n");
  1241. return std::nullopt;
  1242. }
  1243. auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  1244. TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
  1245. int64_t Size = AllocSize.getFixedValue();
  1246. const APInt &APStepVal = C->getAPInt();
  1247. // Huge step value - give up.
  1248. if (APStepVal.getBitWidth() > 64)
  1249. return std::nullopt;
  1250. int64_t StepVal = APStepVal.getSExtValue();
  1251. // Strided access.
  1252. int64_t Stride = StepVal / Size;
  1253. int64_t Rem = StepVal % Size;
  1254. if (Rem)
  1255. return std::nullopt;
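// Worked example (illustrative only): for an i32 access with an AddRec step
// of 12 bytes, Size == 4, so Stride == 3 and Rem == 0. A step of 6 bytes
// over i32 elements would leave Rem == 2 and is rejected just above.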
  1256. // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  1257. // know we can't "wrap around the address space". In case of address space
  1258. // zero we know that this won't happen without triggering undefined behavior.
  1259. if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
  1260. (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
  1261. AddrSpace))) {
  1262. if (Assume) {
  1263. // We can avoid this case by adding a run-time check.
  1264. LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
  1265. << "inbounds or in address space 0 may wrap:\n"
  1266. << "LAA: Pointer: " << *Ptr << "\n"
  1267. << "LAA: SCEV: " << *AR << "\n"
  1268. << "LAA: Added an overflow assumption\n");
  1269. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  1270. } else
  1271. return std::nullopt;
  1272. }
  1273. return Stride;
  1274. }
  1275. std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
  1276. Type *ElemTyB, Value *PtrB,
  1277. const DataLayout &DL,
  1278. ScalarEvolution &SE, bool StrictCheck,
  1279. bool CheckType) {
  1280. assert(PtrA && PtrB && "Expected non-nullptr pointers.");
  1281. assert(cast<PointerType>(PtrA->getType())
  1282. ->isOpaqueOrPointeeTypeMatches(ElemTyA) && "Wrong PtrA type");
  1283. assert(cast<PointerType>(PtrB->getType())
  1284. ->isOpaqueOrPointeeTypeMatches(ElemTyB) && "Wrong PtrB type");
  1285. // Make sure that A and B are different pointers.
  1286. if (PtrA == PtrB)
  1287. return 0;
  1288. // Make sure that the element types are the same if required.
  1289. if (CheckType && ElemTyA != ElemTyB)
  1290. return std::nullopt;
  1291. unsigned ASA = PtrA->getType()->getPointerAddressSpace();
  1292. unsigned ASB = PtrB->getType()->getPointerAddressSpace();
  1293. // Check that the address spaces match.
  1294. if (ASA != ASB)
  1295. return std::nullopt;
  1296. unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
  1297. APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  1298. Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  1299. Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  1300. int Val;
  1301. if (PtrA1 == PtrB1) {
  1302. // Retrieve the address space again as pointer stripping now tracks through
  1303. // `addrspacecast`.
  1304. ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
  1305. ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
  1306. // Check that the address spaces match and that the pointers are valid.
  1307. if (ASA != ASB)
  1308. return std::nullopt;
  1309. IdxWidth = DL.getIndexSizeInBits(ASA);
  1310. OffsetA = OffsetA.sextOrTrunc(IdxWidth);
  1311. OffsetB = OffsetB.sextOrTrunc(IdxWidth);
  1312. OffsetB -= OffsetA;
  1313. Val = OffsetB.getSExtValue();
  1314. } else {
  1315. // Otherwise compute the distance with SCEV between the base pointers.
  1316. const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  1317. const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  1318. const auto *Diff =
  1319. dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
  1320. if (!Diff)
  1321. return std::nullopt;
  1322. Val = Diff->getAPInt().getSExtValue();
  1323. }
  1324. int Size = DL.getTypeStoreSize(ElemTyA);
  1325. int Dist = Val / Size;
// Ensure that the calculated distance matches the type-based one after
// removing all bitcasts from the provided pointers.
  1328. if (!StrictCheck || Dist * Size == Val)
  1329. return Dist;
  1330. return std::nullopt;
  1331. }
  1332. bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
  1333. const DataLayout &DL, ScalarEvolution &SE,
  1334. SmallVectorImpl<unsigned> &SortedIndices) {
  1335. assert(llvm::all_of(
  1336. VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
  1337. "Expected list of pointer operands.");
// Walk over the pointers, and map each of them to an offset relative to
// the first pointer in the array.
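// Example (illustrative, not from the source): for ElemTy == i32 and
// VL == {A, A+2, A+1}, the element offsets are {0, 2, 1}. The insertion
// order is not already consecutive, so SortedIndices becomes {0, 2, 1},
// i.e. VL[0], VL[2], VL[1] is the increasing-address order.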
  1340. Value *Ptr0 = VL[0];
  1341. using DistOrdPair = std::pair<int64_t, int>;
  1342. auto Compare = llvm::less_first();
  1343. std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  1344. Offsets.emplace(0, 0);
  1345. int Cnt = 1;
  1346. bool IsConsecutive = true;
  1347. for (auto *Ptr : VL.drop_front()) {
  1348. std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
  1349. /*StrictCheck=*/true);
  1350. if (!Diff)
  1351. return false;
// Check if a pointer with the same offset has already been seen.
  1353. int64_t Offset = *Diff;
  1354. auto Res = Offsets.emplace(Offset, Cnt);
  1355. if (!Res.second)
  1356. return false;
  1357. // Consecutive order if the inserted element is the last one.
  1358. IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  1359. ++Cnt;
  1360. }
  1361. SortedIndices.clear();
  1362. if (!IsConsecutive) {
// Fill the SortedIndices array only if the accesses are not consecutive.
  1364. SortedIndices.resize(VL.size());
  1365. Cnt = 0;
  1366. for (const std::pair<int64_t, int> &Pair : Offsets) {
  1367. SortedIndices[Cnt] = Pair.second;
  1368. ++Cnt;
  1369. }
  1370. }
  1371. return true;
  1372. }
  1373. /// Returns true if the memory operations \p A and \p B are consecutive.
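// For example (illustrative): two i32 loads from A[i] and A[i+1], in that
// order, are consecutive (element distance 1); loads from A[i] and A[i+2],
// or from A[i+1] and then A[i], are not.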
  1374. bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
  1375. ScalarEvolution &SE, bool CheckType) {
  1376. Value *PtrA = getLoadStorePointerOperand(A);
  1377. Value *PtrB = getLoadStorePointerOperand(B);
  1378. if (!PtrA || !PtrB)
  1379. return false;
  1380. Type *ElemTyA = getLoadStoreType(A);
  1381. Type *ElemTyB = getLoadStoreType(B);
  1382. std::optional<int> Diff =
  1383. getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
  1384. /*StrictCheck=*/true, CheckType);
  1385. return Diff && *Diff == 1;
  1386. }
  1387. void MemoryDepChecker::addAccess(StoreInst *SI) {
  1388. visitPointers(SI->getPointerOperand(), *InnermostLoop,
  1389. [this, SI](Value *Ptr) {
  1390. Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
  1391. InstMap.push_back(SI);
  1392. ++AccessIdx;
  1393. });
  1394. }
  1395. void MemoryDepChecker::addAccess(LoadInst *LI) {
  1396. visitPointers(LI->getPointerOperand(), *InnermostLoop,
  1397. [this, LI](Value *Ptr) {
  1398. Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
  1399. InstMap.push_back(LI);
  1400. ++AccessIdx;
  1401. });
  1402. }
  1403. MemoryDepChecker::VectorizationSafetyStatus
  1404. MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  1405. switch (Type) {
  1406. case NoDep:
  1407. case Forward:
  1408. case BackwardVectorizable:
  1409. return VectorizationSafetyStatus::Safe;
  1410. case Unknown:
  1411. return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  1412. case ForwardButPreventsForwarding:
  1413. case Backward:
  1414. case BackwardVectorizableButPreventsForwarding:
  1415. return VectorizationSafetyStatus::Unsafe;
  1416. }
  1417. llvm_unreachable("unexpected DepType!");
  1418. }
  1419. bool MemoryDepChecker::Dependence::isBackward() const {
  1420. switch (Type) {
  1421. case NoDep:
  1422. case Forward:
  1423. case ForwardButPreventsForwarding:
  1424. case Unknown:
  1425. return false;
  1426. case BackwardVectorizable:
  1427. case Backward:
  1428. case BackwardVectorizableButPreventsForwarding:
  1429. return true;
  1430. }
  1431. llvm_unreachable("unexpected DepType!");
  1432. }
  1433. bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  1434. return isBackward() || Type == Unknown;
  1435. }
  1436. bool MemoryDepChecker::Dependence::isForward() const {
  1437. switch (Type) {
  1438. case Forward:
  1439. case ForwardButPreventsForwarding:
  1440. return true;
  1441. case NoDep:
  1442. case Unknown:
  1443. case BackwardVectorizable:
  1444. case Backward:
  1445. case BackwardVectorizableButPreventsForwarding:
  1446. return false;
  1447. }
  1448. llvm_unreachable("unexpected DepType!");
  1449. }
  1450. bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
  1451. uint64_t TypeByteSize) {
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
//   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
//   hence on your typical architecture store-load forwarding does not take
//   place. Vectorizing in such cases does not make sense.
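// Worked example (illustrative): with TypeByteSize == 4 and Distance == 12,
// the search below starts at VF == 8; 12 is not a multiple of 8 and
// 12 / 8 == 1 is far below the forwarding threshold, so the maximum safe VF
// drops to 4 bytes (a single element) and a conflict is reported.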
  1460. // Store-load forwarding distance.
  1461. // After this many iterations store-to-load forwarding conflicts should not
  1462. // cause any slowdowns.
  1463. const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  1464. // Maximum vector factor.
  1465. uint64_t MaxVFWithoutSLForwardIssues = std::min(
  1466. VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
  1467. // Compute the smallest VF at which the store and load would be misaligned.
  1468. for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
  1469. VF *= 2) {
// If the number of vector iterations between the store and the load is
// small, we could incur conflicts.
  1472. if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
  1473. MaxVFWithoutSLForwardIssues = (VF >> 1);
  1474. break;
  1475. }
  1476. }
  1477. if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
  1478. LLVM_DEBUG(
  1479. dbgs() << "LAA: Distance " << Distance
  1480. << " that could cause a store-load forwarding conflict\n");
  1481. return true;
  1482. }
  1483. if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
  1484. MaxVFWithoutSLForwardIssues !=
  1485. VectorizerParams::MaxVectorWidth * TypeByteSize)
  1486. MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  1487. return false;
  1488. }
  1489. void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
  1490. if (Status < S)
  1491. Status = S;
  1492. }
  1493. /// Given a dependence-distance \p Dist between two
/// memory accesses that have the same stride, whose absolute value is given
/// in \p Stride, and that have the same type size \p TypeByteSize,
/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
  1497. /// possible to prove statically that the dependence distance is larger
  1498. /// than the range that the accesses will travel through the execution of
  1499. /// the loop. If so, return true; false otherwise. This is useful for
  1500. /// example in loops such as the following (PR31098):
  1501. /// for (i = 0; i < D; ++i) {
  1502. /// = out[i];
  1503. /// out[i+D] =
  1504. /// }
  1505. static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
  1506. const SCEV &BackedgeTakenCount,
  1507. const SCEV &Dist, uint64_t Stride,
  1508. uint64_t TypeByteSize) {
  1509. // If we can prove that
  1510. // (**) |Dist| > BackedgeTakenCount * Step
  1511. // where Step is the absolute stride of the memory accesses in bytes,
  1512. // then there is no dependence.
  1513. //
  1514. // Rationale:
  1515. // We basically want to check if the absolute distance (|Dist/Step|)
  1516. // is >= the loop iteration count (or > BackedgeTakenCount).
// This is equivalent to the Strong SIV Test (Practical Dependence Testing,
// Section 4.2.1). Note that for vectorization it is sufficient to prove
// that the dependence distance is >= VF; this is checked elsewhere.
  1520. // But in some cases we can prune dependence distances early, and
  1521. // even before selecting the VF, and without a runtime test, by comparing
  1522. // the distance against the loop iteration count. Since the vectorized code
  1523. // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  1524. // also guarantees that distance >= VF.
  1525. //
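// Worked example (illustrative, using the loop from the header comment):
// for i32 elements, Dist == 4 * D, Stride == 1, TypeByteSize == 4 and
// BackedgeTakenCount == D - 1, so Dist - BackedgeTakenCount * 4 simplifies
// to 4 > 0 and the two accesses are independent for any D.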
  1526. const uint64_t ByteStride = Stride * TypeByteSize;
  1527. const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
  1528. const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
  1529. const SCEV *CastedDist = &Dist;
  1530. const SCEV *CastedProduct = Product;
  1531. uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
  1532. uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
  1533. // The dependence distance can be positive/negative, so we sign extend Dist;
  1534. // The multiplication of the absolute stride in bytes and the
  1535. // backedgeTakenCount is non-negative, so we zero extend Product.
  1536. if (DistTypeSizeBits > ProductTypeSizeBits)
  1537. CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
  1538. else
  1539. CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
  1540. // Is Dist - (BackedgeTakenCount * Step) > 0 ?
  1541. // (If so, then we have proven (**) because |Dist| >= Dist)
  1542. const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
  1543. if (SE.isKnownPositive(Minus))
  1544. return true;
  1545. // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
  1546. // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  1547. const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
  1548. Minus = SE.getMinusSCEV(NegDist, CastedProduct);
  1549. if (SE.isKnownPositive(Minus))
  1550. return true;
  1551. return false;
  1552. }
  1553. /// Check the dependence for two accesses with the same stride \p Stride.
  1554. /// \p Distance is the positive distance and \p TypeByteSize is type size in
  1555. /// bytes.
  1556. ///
  1557. /// \returns true if they are independent.
  1558. static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
  1559. uint64_t TypeByteSize) {
  1560. assert(Stride > 1 && "The stride must be greater than 1");
  1561. assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
  1562. assert(Distance > 0 && "The distance must be non-zero");
// Skip if the distance is not a multiple of the type byte size.
  1564. if (Distance % TypeByteSize)
  1565. return false;
  1566. uint64_t ScaledDist = Distance / TypeByteSize;
// No dependence if the scaled distance is not a multiple of the stride.
  1568. // E.g.
  1569. // for (i = 0; i < 1024 ; i += 4)
  1570. // A[i+2] = A[i] + 1;
  1571. //
  1572. // Two accesses in memory (scaled distance is 2, stride is 4):
  1573. // | A[0] | | | | A[4] | | | |
  1574. // | | | A[2] | | | | A[6] | |
  1575. //
  1576. // E.g.
  1577. // for (i = 0; i < 1024 ; i += 3)
  1578. // A[i+4] = A[i] + 1;
  1579. //
  1580. // Two accesses in memory (scaled distance is 4, stride is 3):
  1581. // | A[0] | | | A[3] | | | A[6] | | |
  1582. // | | | | | A[4] | | | A[7] | |
  1583. return ScaledDist % Stride;
  1584. }
  1585. MemoryDepChecker::Dependence::DepType
  1586. MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
  1587. const MemAccessInfo &B, unsigned BIdx,
  1588. const ValueToValueMap &Strides) {
  1589. assert (AIdx < BIdx && "Must pass arguments in program order");
  1590. auto [APtr, AIsWrite] = A;
  1591. auto [BPtr, BIsWrite] = B;
  1592. Type *ATy = getLoadStoreType(InstMap[AIdx]);
  1593. Type *BTy = getLoadStoreType(InstMap[BIdx]);
  1594. // Two reads are independent.
  1595. if (!AIsWrite && !BIsWrite)
  1596. return Dependence::NoDep;
  1597. // We cannot check pointers in different address spaces.
  1598. if (APtr->getType()->getPointerAddressSpace() !=
  1599. BPtr->getType()->getPointerAddressSpace())
  1600. return Dependence::Unknown;
  1601. int64_t StrideAPtr =
  1602. getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
  1603. int64_t StrideBPtr =
  1604. getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
  1605. const SCEV *Src = PSE.getSCEV(APtr);
  1606. const SCEV *Sink = PSE.getSCEV(BPtr);
  1607. // If the induction step is negative we have to invert source and sink of the
  1608. // dependence.
  1609. if (StrideAPtr < 0) {
  1610. std::swap(APtr, BPtr);
  1611. std::swap(ATy, BTy);
  1612. std::swap(Src, Sink);
  1613. std::swap(AIsWrite, BIsWrite);
  1614. std::swap(AIdx, BIdx);
  1615. std::swap(StrideAPtr, StrideBPtr);
  1616. }
  1617. ScalarEvolution &SE = *PSE.getSE();
  1618. const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
  1619. LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
  1620. << "(Induction step: " << StrideAPtr << ")\n");
  1621. LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
  1622. << *InstMap[BIdx] << ": " << *Dist << "\n");
  1623. // Need accesses with constant stride. We don't want to vectorize
  1624. // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  1625. // the address space.
if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
  1627. LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
  1628. return Dependence::Unknown;
  1629. }
  1630. auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
  1631. uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
  1632. bool HasSameSize =
  1633. DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
  1634. uint64_t Stride = std::abs(StrideAPtr);
  1635. if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
  1636. isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
  1637. Stride, TypeByteSize))
  1638. return Dependence::NoDep;
  1639. const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  1640. if (!C) {
  1641. LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
  1642. FoundNonConstantDistanceDependence = true;
  1643. return Dependence::Unknown;
  1644. }
  1645. const APInt &Val = C->getAPInt();
  1646. int64_t Distance = Val.getSExtValue();
  1647. // Attempt to prove strided accesses independent.
  1648. if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
  1649. areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
  1650. LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
  1651. return Dependence::NoDep;
  1652. }
  1653. // Negative distances are not plausible dependencies.
  1654. if (Val.isNegative()) {
  1655. bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
  1656. if (IsTrueDataDependence && EnableForwardingConflictDetection &&
  1657. (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
  1658. !HasSameSize)) {
  1659. LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
  1660. return Dependence::ForwardButPreventsForwarding;
  1661. }
  1662. LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
  1663. return Dependence::Forward;
  1664. }
  1665. // Write to the same location with the same size.
  1666. if (Val == 0) {
  1667. if (HasSameSize)
  1668. return Dependence::Forward;
  1669. LLVM_DEBUG(
  1670. dbgs() << "LAA: Zero dependence difference but different type sizes\n");
  1671. return Dependence::Unknown;
  1672. }
  1673. assert(Val.isStrictlyPositive() && "Expect a positive value");
  1674. if (!HasSameSize) {
  1675. LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
  1676. "different type sizes\n");
  1677. return Dependence::Unknown;
  1678. }
  1679. // Bail out early if passed-in parameters make vectorization not feasible.
  1680. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
  1681. VectorizerParams::VectorizationFactor : 1);
  1682. unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
  1683. VectorizerParams::VectorizationInterleave : 1);
  1684. // The minimum number of iterations for a vectorized/unrolled version.
  1685. unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
// TypeByteSize (no need to add the last gap distance).
  1690. //
  1691. // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  1692. // foo(int *A) {
  1693. // int *B = (int *)((char *)A + 14);
  1694. // for (i = 0 ; i < 1024 ; i += 2)
  1695. // B[i] = A[i] + 1;
  1696. // }
  1697. //
  1698. // Two accesses in memory (stride is 2):
  1699. // | A[0] | | A[2] | | A[4] | | A[6] | |
  1700. // | B[0] | | B[2] | | B[4] |
  1701. //
// Distance needed for vectorizing iterations except the last iteration:
// 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
  1704. // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
  1705. //
// If MinNumIter is 2, it is vectorizable as the minimum distance needed is
// 12, which is less than the distance of 14.
//
// If MinNumIter is 4 (say, if a user forces the vectorization factor to be
// 4), the minimum distance needed is 28, which is greater than the distance.
// It is not safe to do vectorization.
  1712. uint64_t MinDistanceNeeded =
  1713. TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
  1714. if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
  1715. LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
  1716. << Distance << '\n');
  1717. return Dependence::Backward;
  1718. }
  1719. // Unsafe if the minimum distance needed is greater than max safe distance.
  1720. if (MinDistanceNeeded > MaxSafeDepDistBytes) {
  1721. LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
  1722. << MinDistanceNeeded << " size in bytes\n");
  1723. return Dependence::Backward;
  1724. }
  1725. // Positive distance bigger than max vectorization factor.
// FIXME: Should use max factor instead of max distance in bytes, which
// cannot handle different types.
  1728. // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  1729. // void foo (int *A, char *B) {
  1730. // for (unsigned i = 0; i < 1024; i++) {
  1731. // A[i+2] = A[i] + 1;
  1732. // B[i+2] = B[i] + 1;
  1733. // }
  1734. // }
  1735. //
  1736. // This case is currently unsafe according to the max safe distance. If we
  1737. // analyze the two accesses on array B, the max safe dependence distance
  1738. // is 2. Then we analyze the accesses on array A, the minimum distance needed
// is 8, which is greater than 2 and forbids vectorization. But actually
// both A and B could be vectorized by 2 iterations.
  1741. MaxSafeDepDistBytes =
  1742. std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
  1743. bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  1744. if (IsTrueDataDependence && EnableForwardingConflictDetection &&
  1745. couldPreventStoreLoadForward(Distance, TypeByteSize))
  1746. return Dependence::BackwardVectorizableButPreventsForwarding;
  1747. uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
  1748. LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
  1749. << " with max VF = " << MaxVF << '\n');
  1750. uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
  1751. MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
  1752. return Dependence::BackwardVectorizable;
  1753. }
  1754. bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
  1755. MemAccessInfoList &CheckDeps,
  1756. const ValueToValueMap &Strides) {
  1757. MaxSafeDepDistBytes = -1;
  1758. SmallPtrSet<MemAccessInfo, 8> Visited;
  1759. for (MemAccessInfo CurAccess : CheckDeps) {
  1760. if (Visited.count(CurAccess))
  1761. continue;
  1762. // Get the relevant memory access set.
  1763. EquivalenceClasses<MemAccessInfo>::iterator I =
  1764. AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
  1765. // Check accesses within this set.
  1766. EquivalenceClasses<MemAccessInfo>::member_iterator AI =
  1767. AccessSets.member_begin(I);
  1768. EquivalenceClasses<MemAccessInfo>::member_iterator AE =
  1769. AccessSets.member_end();
  1770. // Check every access pair.
  1771. while (AI != AE) {
  1772. Visited.insert(*AI);
  1773. bool AIIsWrite = AI->getInt();
// Check loads only against the next equivalence class, but stores also
// against other stores in the same equivalence class - to the same address.
  1776. EquivalenceClasses<MemAccessInfo>::member_iterator OI =
  1777. (AIIsWrite ? AI : std::next(AI));
  1778. while (OI != AE) {
  1779. // Check every accessing instruction pair in program order.
  1780. for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
  1781. I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
// Scan all accesses of another equivalence class, but only the
// subsequent accesses of the same equivalence class.
  1784. for (std::vector<unsigned>::iterator
  1785. I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
  1786. I2E = (OI == AI ? I1E : Accesses[*OI].end());
  1787. I2 != I2E; ++I2) {
  1788. auto A = std::make_pair(&*AI, *I1);
  1789. auto B = std::make_pair(&*OI, *I2);
  1790. assert(*I1 != *I2);
  1791. if (*I1 > *I2)
  1792. std::swap(A, B);
  1793. Dependence::DepType Type =
  1794. isDependent(*A.first, A.second, *B.first, B.second, Strides);
  1795. mergeInStatus(Dependence::isSafeForVectorization(Type));
  1796. // Gather dependences unless we accumulated MaxDependences
  1797. // dependences. In that case return as soon as we find the first
  1798. // unsafe dependence. This puts a limit on this quadratic
  1799. // algorithm.
  1800. if (RecordDependences) {
  1801. if (Type != Dependence::NoDep)
  1802. Dependences.push_back(Dependence(A.second, B.second, Type));
  1803. if (Dependences.size() >= MaxDependences) {
  1804. RecordDependences = false;
  1805. Dependences.clear();
  1806. LLVM_DEBUG(dbgs()
  1807. << "Too many dependences, stopped recording\n");
  1808. }
  1809. }
  1810. if (!RecordDependences && !isSafeForVectorization())
  1811. return false;
  1812. }
  1813. ++OI;
  1814. }
  1815. AI++;
  1816. }
  1817. }
  1818. LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  1819. return isSafeForVectorization();
  1820. }
  1821. SmallVector<Instruction *, 4>
  1822. MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
  1823. MemAccessInfo Access(Ptr, isWrite);
  1824. auto &IndexVector = Accesses.find(Access)->second;
  1825. SmallVector<Instruction *, 4> Insts;
  1826. transform(IndexVector,
  1827. std::back_inserter(Insts),
  1828. [&](unsigned Idx) { return this->InstMap[Idx]; });
  1829. return Insts;
  1830. }
  1831. const char *MemoryDepChecker::Dependence::DepName[] = {
  1832. "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
  1833. "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
  1834. void MemoryDepChecker::Dependence::print(
  1835. raw_ostream &OS, unsigned Depth,
  1836. const SmallVectorImpl<Instruction *> &Instrs) const {
  1837. OS.indent(Depth) << DepName[Type] << ":\n";
  1838. OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
  1839. OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
  1840. }
  1841. bool LoopAccessInfo::canAnalyzeLoop() {
  1842. // We need to have a loop header.
  1843. LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
  1844. << TheLoop->getHeader()->getParent()->getName() << ": "
  1845. << TheLoop->getHeader()->getName() << '\n');
  1846. // We can only analyze innermost loops.
  1847. if (!TheLoop->isInnermost()) {
  1848. LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
  1849. recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
  1850. return false;
  1851. }
  1852. // We must have a single backedge.
  1853. if (TheLoop->getNumBackEdges() != 1) {
  1854. LLVM_DEBUG(
  1855. dbgs() << "LAA: loop control flow is not understood by analyzer\n");
  1856. recordAnalysis("CFGNotUnderstood")
  1857. << "loop control flow is not understood by analyzer";
  1858. return false;
  1859. }
  1860. // ScalarEvolution needs to be able to find the exit count.
  1861. const SCEV *ExitCount = PSE->getBackedgeTakenCount();
  1862. if (isa<SCEVCouldNotCompute>(ExitCount)) {
  1863. recordAnalysis("CantComputeNumberOfIterations")
  1864. << "could not determine number of loop iterations";
  1865. LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
  1866. return false;
  1867. }
  1868. return true;
  1869. }
  1870. void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
  1871. const TargetLibraryInfo *TLI,
  1872. DominatorTree *DT) {
  1873. // Holds the Load and Store instructions.
  1874. SmallVector<LoadInst *, 16> Loads;
  1875. SmallVector<StoreInst *, 16> Stores;
  1876. // Holds all the different accesses in the loop.
  1877. unsigned NumReads = 0;
  1878. unsigned NumReadWrites = 0;
  1879. bool HasComplexMemInst = false;
  1880. // A runtime check is only legal to insert if there are no convergent calls.
  1881. HasConvergentOp = false;
  1882. PtrRtChecking->Pointers.clear();
  1883. PtrRtChecking->Need = false;
  1884. const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  1885. const bool EnableMemAccessVersioningOfLoop =
  1886. EnableMemAccessVersioning &&
  1887. !TheLoop->getHeader()->getParent()->hasOptSize();
  1888. // Traverse blocks in fixed RPOT order, regardless of their storage in the
  1889. // loop info, as it may be arbitrary.
  1890. LoopBlocksRPO RPOT(TheLoop);
  1891. RPOT.perform(LI);
  1892. for (BasicBlock *BB : RPOT) {
  1893. // Scan the BB and collect legal loads and stores. Also detect any
  1894. // convergent instructions.
  1895. for (Instruction &I : *BB) {
  1896. if (auto *Call = dyn_cast<CallBase>(&I)) {
  1897. if (Call->isConvergent())
  1898. HasConvergentOp = true;
  1899. }
// If both a non-vectorizable memory instruction and a convergent
// operation have been found in this loop, there is no reason to continue
// the search.
  1902. if (HasComplexMemInst && HasConvergentOp) {
  1903. CanVecMem = false;
  1904. return;
  1905. }
  1906. // Avoid hitting recordAnalysis multiple times.
  1907. if (HasComplexMemInst)
  1908. continue;
  1909. // If this is a load, save it. If this instruction can read from memory
  1910. // but is not a load, then we quit. Notice that we don't handle function
  1911. // calls that read or write.
  1912. if (I.mayReadFromMemory()) {
  1913. // Many math library functions read the rounding mode. We will only
  1914. // vectorize a loop if it contains known function calls that don't set
  1915. // the flag. Therefore, it is safe to ignore this read from memory.
  1916. auto *Call = dyn_cast<CallInst>(&I);
  1917. if (Call && getVectorIntrinsicIDForCall(Call, TLI))
  1918. continue;
  1919. // If the function has an explicit vectorized counterpart, we can safely
  1920. // assume that it can be vectorized.
  1921. if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
  1922. !VFDatabase::getMappings(*Call).empty())
  1923. continue;
  1924. auto *Ld = dyn_cast<LoadInst>(&I);
  1925. if (!Ld) {
  1926. recordAnalysis("CantVectorizeInstruction", Ld)
  1927. << "instruction cannot be vectorized";
  1928. HasComplexMemInst = true;
  1929. continue;
  1930. }
  1931. if (!Ld->isSimple() && !IsAnnotatedParallel) {
  1932. recordAnalysis("NonSimpleLoad", Ld)
  1933. << "read with atomic ordering or volatile read";
  1934. LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
  1935. HasComplexMemInst = true;
  1936. continue;
  1937. }
  1938. NumLoads++;
  1939. Loads.push_back(Ld);
  1940. DepChecker->addAccess(Ld);
  1941. if (EnableMemAccessVersioningOfLoop)
  1942. collectStridedAccess(Ld);
  1943. continue;
  1944. }
  1945. // Save 'store' instructions. Abort if other instructions write to memory.
  1946. if (I.mayWriteToMemory()) {
  1947. auto *St = dyn_cast<StoreInst>(&I);
  1948. if (!St) {
  1949. recordAnalysis("CantVectorizeInstruction", St)
  1950. << "instruction cannot be vectorized";
  1951. HasComplexMemInst = true;
  1952. continue;
  1953. }
  1954. if (!St->isSimple() && !IsAnnotatedParallel) {
  1955. recordAnalysis("NonSimpleStore", St)
  1956. << "write with atomic ordering or volatile write";
  1957. LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
  1958. HasComplexMemInst = true;
  1959. continue;
  1960. }
  1961. NumStores++;
  1962. Stores.push_back(St);
  1963. DepChecker->addAccess(St);
  1964. if (EnableMemAccessVersioningOfLoop)
  1965. collectStridedAccess(St);
  1966. }
  1967. } // Next instr.
  1968. } // Next block.
  1969. if (HasComplexMemInst) {
  1970. CanVecMem = false;
  1971. return;
  1972. }
  1973. // Now we have two lists that hold the loads and the stores.
  1974. // Next, we find the pointers that they use.
  1975. // Check if we see any stores. If there are no stores, then we don't
  1976. // care if the pointers are *restrict*.
  1977. if (!Stores.size()) {
  1978. LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
  1979. CanVecMem = true;
  1980. return;
  1981. }
  1982. MemoryDepChecker::DepCandidates DependentAccesses;
  1983. AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE);
  1984. // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
  1985. // multiple times on the same object. If the ptr is accessed twice, once
  1986. // for read and once for write, it will only appear once (on the write
  1987. // list). This is okay, since we are going to check for conflicts between
  1988. // writes and between reads and writes, but not between reads and reads.
  1989. SmallSet<std::pair<Value *, Type *>, 16> Seen;
  1990. // Record uniform store addresses to identify if we have multiple stores
  1991. // to the same address.
  1992. SmallPtrSet<Value *, 16> UniformStores;
  1993. for (StoreInst *ST : Stores) {
  1994. Value *Ptr = ST->getPointerOperand();
  1995. if (isUniform(Ptr)) {
  1996. // Record store instructions to loop invariant addresses
  1997. StoresToInvariantAddresses.push_back(ST);
  1998. HasDependenceInvolvingLoopInvariantAddress |=
  1999. !UniformStores.insert(Ptr).second;
  2000. }
// If we did *not* see this pointer before, insert it into the read-write
// list. At this phase, it is only a 'write' list.
  2003. Type *AccessTy = getLoadStoreType(ST);
  2004. if (Seen.insert({Ptr, AccessTy}).second) {
  2005. ++NumReadWrites;
  2006. MemoryLocation Loc = MemoryLocation::get(ST);
  2007. // The TBAA metadata could have a control dependency on the predication
  2008. // condition, so we cannot rely on it when determining whether or not we
  2009. // need runtime pointer checks.
  2010. if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
  2011. Loc.AATags.TBAA = nullptr;
  2012. visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
  2013. [&Accesses, AccessTy, Loc](Value *Ptr) {
  2014. MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
  2015. Accesses.addStore(NewLoc, AccessTy);
  2016. });
  2017. }
  2018. }
  2019. if (IsAnnotatedParallel) {
  2020. LLVM_DEBUG(
  2021. dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
  2022. << "checks.\n");
  2023. CanVecMem = true;
  2024. return;
  2025. }
  2026. for (LoadInst *LD : Loads) {
  2027. Value *Ptr = LD->getPointerOperand();
// If we did *not* see this pointer before, insert it into the
// read list. If we *did* see it before, then it is already in
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
// If the index is unknown (for example A[B[i]]), then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
  2036. bool IsReadOnlyPtr = false;
  2037. Type *AccessTy = getLoadStoreType(LD);
  2038. if (Seen.insert({Ptr, AccessTy}).second ||
  2039. !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
  2040. ++NumReads;
  2041. IsReadOnlyPtr = true;
  2042. }
// See if there is an unsafe dependency between a load from a uniform
// address and a store to the same uniform address.
  2045. if (UniformStores.count(Ptr)) {
  2046. LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
  2047. "load and uniform store to the same address!\n");
  2048. HasDependenceInvolvingLoopInvariantAddress = true;
  2049. }
  2050. MemoryLocation Loc = MemoryLocation::get(LD);
  2051. // The TBAA metadata could have a control dependency on the predication
  2052. // condition, so we cannot rely on it when determining whether or not we
  2053. // need runtime pointer checks.
  2054. if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
  2055. Loc.AATags.TBAA = nullptr;
  2056. visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
  2057. [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
  2058. MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
  2059. Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
  2060. });
  2061. }
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
  2064. if (NumReadWrites == 1 && NumReads == 0) {
  2065. LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
  2066. CanVecMem = true;
  2067. return;
  2068. }
  2069. // Build dependence sets and check whether we need a runtime pointer bounds
  2070. // check.
  2071. Accesses.buildDependenceSets();
  2072. // Find pointers with computable bounds. We are going to use this information
  2073. // to place a runtime bound check.
  2074. Value *UncomputablePtr = nullptr;
  2075. bool CanDoRTIfNeeded =
  2076. Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
  2077. SymbolicStrides, UncomputablePtr, false);
  2078. if (!CanDoRTIfNeeded) {
  2079. auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
  2080. recordAnalysis("CantIdentifyArrayBounds", I)
  2081. << "cannot identify array bounds";
  2082. LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
  2083. << "the array bounds.\n");
  2084. CanVecMem = false;
  2085. return;
  2086. }
  2087. LLVM_DEBUG(
  2088. dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
  2089. CanVecMem = true;
  2090. if (Accesses.isDependencyCheckNeeded()) {
  2091. LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
  2092. CanVecMem = DepChecker->areDepsSafe(
  2093. DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
  2094. MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
  2095. if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
  2096. LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
  2097. // Clear the dependency checks. We assume they are not needed.
  2098. Accesses.resetDepChecks(*DepChecker);
  2099. PtrRtChecking->reset();
  2100. PtrRtChecking->Need = true;
  2101. auto *SE = PSE->getSE();
  2102. UncomputablePtr = nullptr;
  2103. CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
  2104. *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
  2105. // Check that we found the bounds for the pointer.
  2106. if (!CanDoRTIfNeeded) {
  2107. auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
  2108. recordAnalysis("CantCheckMemDepsAtRunTime", I)
  2109. << "cannot check memory dependencies at runtime";
  2110. LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
  2111. CanVecMem = false;
  2112. return;
  2113. }
  2114. CanVecMem = true;
  2115. }
  2116. }
  2117. if (HasConvergentOp) {
  2118. recordAnalysis("CantInsertRuntimeCheckWithConvergent")
  2119. << "cannot add control dependency to convergent operation";
  2120. LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
  2121. "would be needed with a convergent operation\n");
  2122. CanVecMem = false;
  2123. return;
  2124. }
  2125. if (CanVecMem)
  2126. LLVM_DEBUG(
  2127. dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
  2128. << (PtrRtChecking->Need ? "" : " don't")
  2129. << " need runtime memory checks.\n");
  2130. else
  2131. emitUnsafeDependenceRemark();
  2132. }
  2133. void LoopAccessInfo::emitUnsafeDependenceRemark() {
  2134. auto Deps = getDepChecker().getDependences();
  2135. if (!Deps)
  2136. return;
  2137. auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
  2138. return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
  2139. MemoryDepChecker::VectorizationSafetyStatus::Safe;
  2140. });
  2141. if (Found == Deps->end())
  2142. return;
  2143. MemoryDepChecker::Dependence Dep = *Found;
  2144. LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
  2145. // Emit remark for first unsafe dependence
  2146. OptimizationRemarkAnalysis &R =
  2147. recordAnalysis("UnsafeDep", Dep.getDestination(*this))
  2148. << "unsafe dependent memory operations in loop. Use "
  2149. "#pragma loop distribute(enable) to allow loop distribution "
  2150. "to attempt to isolate the offending operations into a separate "
  2151. "loop";
  2152. switch (Dep.Type) {
  2153. case MemoryDepChecker::Dependence::NoDep:
  2154. case MemoryDepChecker::Dependence::Forward:
  2155. case MemoryDepChecker::Dependence::BackwardVectorizable:
  2156. llvm_unreachable("Unexpected dependence");
  2157. case MemoryDepChecker::Dependence::Backward:
  2158. R << "\nBackward loop carried data dependence.";
  2159. break;
  2160. case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
  2161. R << "\nForward loop carried data dependence that prevents "
  2162. "store-to-load forwarding.";
  2163. break;
  2164. case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
  2165. R << "\nBackward loop carried data dependence that prevents "
  2166. "store-to-load forwarding.";
  2167. break;
  2168. case MemoryDepChecker::Dependence::Unknown:
  2169. R << "\nUnknown data dependence.";
  2170. break;
  2171. }
  2172. if (Instruction *I = Dep.getSource(*this)) {
  2173. DebugLoc SourceLoc = I->getDebugLoc();
  2174. if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
  2175. SourceLoc = DD->getDebugLoc();
  2176. if (SourceLoc)
  2177. R << " Memory location is the same as accessed at "
  2178. << ore::NV("Location", SourceLoc);
  2179. }
  2180. }
  2181. bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
  2182. DominatorTree *DT) {
  2183. assert(TheLoop->contains(BB) && "Unknown block used");
  2184. // Blocks that do not dominate the latch need predication.
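// For example (illustrative): in
//   for (i = 0; i < n; i++)
//     if (c[i])
//       a[i] = 0;
// the block containing "a[i] = 0" does not dominate the latch, so its
// access is treated as predicated by the callers above (e.g. its TBAA
// metadata is not trusted for runtime-check placement).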
  2185. BasicBlock* Latch = TheLoop->getLoopLatch();
  2186. return !DT->dominates(BB, Latch);
  2187. }
  2188. OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
  2189. Instruction *I) {
  2190. assert(!Report && "Multiple reports generated");
  2191. Value *CodeRegion = TheLoop->getHeader();
  2192. DebugLoc DL = TheLoop->getStartLoc();
  2193. if (I) {
  2194. CodeRegion = I->getParent();
// If there is no debug location attached to the instruction, fall back to
// using the loop's.
  2197. if (I->getDebugLoc())
  2198. DL = I->getDebugLoc();
  2199. }
  2200. Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
  2201. CodeRegion);
  2202. return *Report;
  2203. }
  2204. bool LoopAccessInfo::isUniform(Value *V) const {
  2205. auto *SE = PSE->getSE();
  2206. // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  2207. // never considered uniform.
  2208. // TODO: Is this really what we want? Even without FP SCEV, we may want some
  2209. // trivially loop-invariant FP values to be considered uniform.
  2210. if (!SE->isSCEVable(V->getType()))
  2211. return false;
  2212. return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
  2213. }
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(MemAccess);
  if (!Ptr)
    return;

  Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  if (!Stride)
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense that the
  // users of this analysis (such as the vectorizer) will trigger it, based on
  // their specific cost considerations; for example, in cases where stride
  // versioning does not help resolve memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

  const SCEV *StrideExpr = PSE->getSCEV(Stride);
  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();

  // Match the types so we can compare the stride and the BETakenCount.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = BETakenCount;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
  const SCEV *StrideMinusBETaken =
      SE->getMinusSCEV(CastedStride, CastedBECount);
  // Since TripCount == BackedgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - BETakenCount > 0
  if (SE->isKnownPositive(StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  SymbolicStrides[Ptr] = Stride;
  StrideSet.insert(Stride);
}

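// The constructor sets up predicated SCEV, the dependence checker and the
// run-time pointer checking, then runs the analysis if the loop has a shape
// we can handle (see canAnalyzeLoop()).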
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI)
    : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
      PtrRtChecking(nullptr),
      DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
  if (canAnalyzeLoop()) {
    analyzeLoop(AA, LI, TLI, DT);
  }
}

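// Pretty-print the analysis results: dependence safety, recorded dependences,
// run-time pointer checks, and the SCEV predicates/rewrites they rely on.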
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    OS.indent(Depth) << "Memory dependences are safe";
    if (MaxSafeDepDistBytes != -1ULL)
      OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
         << " bytes";
    if (PtrRtChecking->Need)
      OS << " with run-time checks";
    OS << "\n";
  }

  if (HasConvergentOp)
    OS.indent(Depth) << "Has convergent operation in loop\n";

  if (Report)
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

  if (auto *Dependences = DepChecker->getDependences()) {
    OS.indent(Depth) << "Dependences:\n";
    for (const auto &Dep : *Dependences) {
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
      OS << "\n";
    }
  } else
    OS.indent(Depth) << "Too many dependences, not recorded\n";

  // List the pairs of accesses that need run-time checks to prove independence.
  PtrRtChecking->print(OS, Depth);
  OS << "\n";

  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
                   << "found in loop.\n";

  OS.indent(Depth) << "SCEV assumptions:\n";
  PSE->getPredicate().print(OS, Depth);
  OS << "\n";

  OS.indent(Depth) << "Expressions re-written:\n";
  PSE->print(OS, Depth);
}

const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
  auto I = LoopAccessInfoMap.insert({&L, nullptr});

  if (I.second)
    I.first->second =
        std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);

  return *I.first->second;
}

LoopAccessLegacyAnalysis::LoopAccessLegacyAnalysis() : FunctionPass(ID) {
  initializeLoopAccessLegacyAnalysisPass(*PassRegistry::getPassRegistry());
}

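// The legacy pass only gathers the function-level analyses here; the per-loop
// work is deferred until a client calls getInfo() on the manager.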
bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
  auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  LAIs = std::make_unique<LoopAccessInfoManager>(SE, AA, DT, LI, TLI);
  return false;
}

void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequiredTransitive<ScalarEvolutionWrapperPass>();
  AU.addRequiredTransitive<AAResultsWrapperPass>();
  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
  AU.addRequiredTransitive<LoopInfoWrapperPass>();

  AU.setPreservesAll();
}

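// New pass manager entry point: build a LoopAccessInfoManager over the
// function-level analyses. As above, per-loop results are computed on demand.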
LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
                                              FunctionAnalysisManager &AM) {
  return LoopAccessInfoManager(
      AM.getResult<ScalarEvolutionAnalysis>(F), AM.getResult<AAManager>(F),
      AM.getResult<DominatorTreeAnalysis>(F), AM.getResult<LoopAnalysis>(F),
      &AM.getResult<TargetLibraryAnalysis>(F));
}

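// Usage sketch (not part of this file): a client pass under the new pass
// manager can query per-loop results through the manager returned above. The
// pass name "MyClientPass" below is hypothetical; the accessors used
// (getInfo, canVectorizeMemory, getRuntimePointerChecking) are the real ones.
//
//   PreservedAnalyses MyClientPass::run(Function &F,
//                                       FunctionAnalysisManager &AM) {
//     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
//     LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
//     for (Loop *L : LI) { // top-level loops only, for brevity
//       const LoopAccessInfo &LAI = LAIs.getInfo(*L);
//       if (LAI.canVectorizeMemory() &&
//           !LAI.getRuntimePointerChecking()->Need) {
//         // Memory accesses in *L are provably safe without run-time checks.
//       }
//     }
//     return PreservedAnalyses::all();
//   }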
char LoopAccessLegacyAnalysis::ID = 0;
static const char laa_name[] = "Loop Access Analysis";
#define LAA_NAME "loop-accesses"

INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)

AnalysisKey LoopAccessAnalysis::Key;

namespace llvm {

Pass *createLAAPass() {
  return new LoopAccessLegacyAnalysis();
}

} // end namespace llvm
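// To inspect these results on an IR file, the analysis is normally exercised
// through its printer pass; under the new pass manager this is spelled
// roughly as follows (the exact pass name can differ between LLVM versions):
//
//   opt -passes='print<access-info>' -disable-output input.ll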