LoopVectorizationLegality.cpp 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435
  1. //===- LoopVectorizationLegality.cpp --------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file provides loop vectorization legality analysis. Original code
  10. // resided in LoopVectorize.cpp for a long time.
  11. //
  12. // At this point, it is implemented as a utility class, not as an analysis
  13. // pass. It should be easy to create an analysis pass around it if there
  14. // is a need (but D45420 needs to happen first).
  15. //
  16. #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
  17. #include "llvm/Analysis/Loads.h"
  18. #include "llvm/Analysis/LoopInfo.h"
  19. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  20. #include "llvm/Analysis/TargetLibraryInfo.h"
  21. #include "llvm/Analysis/TargetTransformInfo.h"
  22. #include "llvm/Analysis/ValueTracking.h"
  23. #include "llvm/Analysis/VectorUtils.h"
  24. #include "llvm/IR/IntrinsicInst.h"
  25. #include "llvm/IR/PatternMatch.h"
  26. #include "llvm/Transforms/Utils/SizeOpts.h"
  27. #include "llvm/Transforms/Vectorize/LoopVectorize.h"
  28. using namespace llvm;
  29. using namespace PatternMatch;
  30. #define LV_NAME "loop-vectorize"
  31. #define DEBUG_TYPE LV_NAME
// Command-line option: allow disabling if-conversion during vectorization.
static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

// Non-static (exported via namespace llvm) — referenced by allowReordering()
// below to decide whether enabling hints permit FP reassociation.
namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
}

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

// A more permissive SCEV-check budget when the user explicitly requested
// vectorization with a pragma.
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

// Global override for the per-loop scalable-vectorization hints. Note that
// both "preferred" and "on" map to SK_PreferScalable.
static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
  69. namespace llvm {
  70. bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  71. switch (Kind) {
  72. case HK_WIDTH:
  73. return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  74. case HK_INTERLEAVE:
  75. return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  76. case HK_FORCE:
  77. return (Val <= 1);
  78. case HK_ISVECTORIZED:
  79. case HK_PREDICATE:
  80. case HK_SCALABLE:
  81. return (Val == 0 || Val == 1);
  82. }
  83. return false;
  84. }
/// Initialize the hint set for loop \p L: seed defaults, read the loop's
/// metadata, then apply command-line overrides and (for scalable
/// vectorization) the target's default preference.
LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //   - Target default
  //   - Metadata width
  //   - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}
/// Mark TheLoop as vectorized by attaching "llvm.loop.isvectorized" metadata
/// and dropping the consumed vectorize./interleave. hints, so a later run of
/// the vectorizer leaves the loop alone.
void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  // Build the {"llvm.loop.isvectorized", i32 1} marker node.
  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  // Strip all "llvm.loop.vectorize.*" / "llvm.loop.interleave.*" entries and
  // append the marker in one step.
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}
/// Decide whether this loop may be vectorized at all, based on the hints:
/// an explicit disable pragma, the pass manager's only-when-forced mode, or
/// already-vectorized metadata each veto vectorization (with a remark).
bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}
/// Emit an optimization-missed remark explaining that the loop was not
/// vectorized; when the user explicitly enabled vectorization, include the
/// requested Width/Interleave values in the remark for diagnosis.
void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";
    else {
      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
                                 TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      // Only attach the hint details when the pragma explicitly forced
      // vectorization — otherwise they carry no extra information.
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", getWidth());
        if (getInterleave() != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
        R << ")";
      }
      return R;
    }
  });
}
  203. const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  204. if (getWidth() == ElementCount::getFixed(1))
  205. return LV_NAME;
  206. if (getForce() == LoopVectorizeHints::FK_Disabled)
  207. return LV_NAME;
  208. if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
  209. return LV_NAME;
  210. return OptimizationRemarkAnalysis::AlwaysPrint;
  211. }
  212. bool LoopVectorizeHints::allowReordering() const {
  213. // Allow the vectorizer to change the order of operations if enabling
  214. // loop hints are provided
  215. ElementCount EC = getWidth();
  216. return HintsAllowReordering &&
  217. (getForce() == LoopVectorizeHints::FK_Enabled ||
  218. EC.getKnownMinValue() > 1);
  219. }
  220. void LoopVectorizeHints::getHintsFromMetadata() {
  221. MDNode *LoopID = TheLoop->getLoopID();
  222. if (!LoopID)
  223. return;
  224. // First operand should refer to the loop id itself.
  225. assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  226. assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
  227. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
  228. const MDString *S = nullptr;
  229. SmallVector<Metadata *, 4> Args;
  230. // The expected hint is either a MDString or a MDNode with the first
  231. // operand a MDString.
  232. if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
  233. if (!MD || MD->getNumOperands() == 0)
  234. continue;
  235. S = dyn_cast<MDString>(MD->getOperand(0));
  236. for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
  237. Args.push_back(MD->getOperand(i));
  238. } else {
  239. S = dyn_cast<MDString>(LoopID->getOperand(i));
  240. assert(Args.size() == 0 && "too many arguments for MDString");
  241. }
  242. if (!S)
  243. continue;
  244. // Check if the hint starts with the loop metadata prefix.
  245. StringRef Name = S->getString();
  246. if (Args.size() == 1)
  247. setHint(Name, Args[0]);
  248. }
  249. }
  250. void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  251. if (!Name.startswith(Prefix()))
  252. return;
  253. Name = Name.substr(Prefix().size(), StringRef::npos);
  254. const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  255. if (!C)
  256. return;
  257. unsigned Val = C->getZExtValue();
  258. Hint *Hints[] = {&Width, &Interleave, &Force,
  259. &IsVectorized, &Predicate, &Scalable};
  260. for (auto *H : Hints) {
  261. if (Name == H->Name) {
  262. if (H->validate(Val))
  263. H->Value = Val;
  264. else
  265. LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
  266. break;
  267. }
  268. }
  269. }
  270. // Return true if the inner loop \p Lp is uniform with regard to the outer loop
  271. // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
  272. // executing the inner loop will execute the same iterations). This check is
  273. // very constrained for now but it will be relaxed in the future. \p Lp is
  274. // considered uniform if it meets all the following conditions:
  275. // 1) it has a canonical IV (starting from 0 and with stride 1),
  276. // 2) its latch terminator is a conditional branch and,
  277. // 3) its latch condition is a compare instruction whose operands are the
  278. // canonical IV and an OuterLp invariant.
  279. // This check doesn't take into account the uniformity of other conditions not
  280. // related to the loop latch because they don't affect the loop uniformity.
  281. //
  282. // NOTE: We decided to keep all these checks and its associated documentation
  283. // together so that we can easily have a picture of the current supported loop
  284. // nests. However, some of the current checks don't depend on \p OuterLp and
  285. // would be redundantly executed for each \p Lp if we invoked this function for
  286. // different candidate outer loops. This is not the case for now because we
  287. // don't currently have the infrastructure to evaluate multiple candidate outer
  288. // loops and \p OuterLp will be a fixed parameter while we only support explicit
  289. // outer loop vectorization. It's also very likely that these checks go away
  290. // before introducing the aforementioned infrastructure. However, if this is not
  291. // the case, we should move the \p OuterLp independent checks to a separate
  292. // function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1. Lp must have a canonical IV (starting from 0 with stride 1).
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2. The latch must terminate in a conditional branch.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3. The latch condition must be a compare of the IV update against an
  // OuterLp-invariant value (accepting either operand order).
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}
  329. // Return true if \p Lp and all its nested loops are uniform with regard to \p
  330. // OuterLp.
  331. static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  332. if (!isUniformLoop(Lp, OuterLp))
  333. return false;
  334. // Check if nested loops are uniform.
  335. for (Loop *SubLp : *Lp)
  336. if (!isUniformLoopNest(SubLp, OuterLp))
  337. return false;
  338. return true;
  339. }
  340. static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  341. if (Ty->isPointerTy())
  342. return DL.getIntPtrType(Ty);
  343. // It is possible that char's or short's overflow when we ask for the loop's
  344. // trip count, work around this by changing the type size.
  345. if (Ty->getScalarSizeInBits() < 32)
  346. return Type::getInt32Ty(Ty->getContext());
  347. return Ty;
  348. }
  349. static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  350. Ty0 = convertPointerToIntegerType(DL, Ty0);
  351. Ty1 = convertPointerToIntegerType(DL, Ty1);
  352. if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
  353. return Ty0;
  354. return Ty1;
  355. }
  356. /// Check that the instruction has outside loop users and is not an
  357. /// identified reduction variable.
  358. static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
  359. SmallPtrSetImpl<Value *> &AllowedExit) {
  360. // Reductions, Inductions and non-header phis are allowed to have exit users. All
  361. // other instructions must not have external users.
  362. if (!AllowedExit.count(Inst))
  363. // Check that all of the users of the loop are inside the BB.
  364. for (User *U : Inst->users()) {
  365. Instruction *UI = cast<Instruction>(U);
  366. // This user may be a reduction exit value.
  367. if (!TheLoop->contains(UI)) {
  368. LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
  369. return true;
  370. }
  371. }
  372. return false;
  373. }
  374. /// Returns true if A and B have same pointer operands or same SCEVs addresses
  375. static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
  376. StoreInst *B) {
  377. // Compare store
  378. if (A == B)
  379. return true;
  380. // Otherwise Compare pointers
  381. Value *APtr = A->getPointerOperand();
  382. Value *BPtr = B->getPointerOperand();
  383. if (APtr == BPtr)
  384. return true;
  385. // Otherwise compare address SCEVs
  386. if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
  387. return true;
  388. return false;
  389. }
  390. int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
  391. Value *Ptr) const {
  392. const ValueToValueMap &Strides =
  393. getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
  394. Function *F = TheLoop->getHeader()->getParent();
  395. bool OptForSize = F->hasOptSize() ||
  396. llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
  397. PGSOQueryType::IRPass);
  398. bool CanAddPredicate = !OptForSize;
  399. int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
  400. CanAddPredicate, false).value_or(0);
  401. if (Stride == 1 || Stride == -1)
  402. return Stride;
  403. return 0;
  404. }
/// Uniformity query, forwarded to the LoopAccessInfo analysis.
bool LoopVectorizationLegality::isUniform(Value *V) const {
  return LAI->isUniform(V);
}
  408. bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
  409. Value *Ptr = getLoadStorePointerOperand(&I);
  410. if (!Ptr)
  411. return false;
  412. // Note: There's nothing inherent which prevents predicated loads and
  413. // stores from being uniform. The current lowering simply doesn't handle
  414. // it; in particular, the cost model distinguishes scatter/gather from
  415. // scalar w/predication, and we currently rely on the scalar path.
  416. return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
  417. }
/// Check the control-flow legality constraints for explicit outer-loop
/// vectorization: branch-only terminators, non-divergent conditions, uniform
/// inner loops, and supported outer-loop header phis.
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure("Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure("Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loops scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure("Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}
/// Record \p Phi as an induction described by \p ID: track ignorable casts,
/// update the widest induction type, select a primary (canonical) induction
/// when one qualifies, and allow exit uses when SCEV predicates permit.
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getModule()->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
  524. bool LoopVectorizationLegality::setupOuterLoopInductions() {
  525. BasicBlock *Header = TheLoop->getHeader();
  526. // Returns true if a given Phi is a supported induction.
  527. auto isSupportedPhi = [&](PHINode &Phi) -> bool {
  528. InductionDescriptor ID;
  529. if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
  530. ID.getKind() == InductionDescriptor::IK_IntInduction) {
  531. addInductionPhi(&Phi, ID, AllowedExit);
  532. return true;
  533. } else {
  534. // Bail out for any Phi in the outer loop header that is not a supported
  535. // induction.
  536. LLVM_DEBUG(
  537. dbgs()
  538. << "LV: Found unsupported PHI for outer loop vectorization.\n");
  539. return false;
  540. }
  541. };
  542. if (llvm::all_of(Header->phis(), isSupportedPhi))
  543. return true;
  544. else
  545. return false;
  546. }
  547. /// Checks if a function is scalarizable according to the TLI, in
  548. /// the sense that it should be vectorized and then expanded in
  549. /// multiple scalar calls. This is represented in the
  550. /// TLI via mappings that do not specify a vector name, as in the
  551. /// following example:
  552. ///
  553. /// const VecDesc VecIntrinsics[] = {
  554. /// {"llvm.phx.abs.i32", "", 4}
  555. /// };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated to a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    // Walk the fixed-width VFs (powers of two from 2) and scalable VFs
    // (powers of two from vscale x 1); any named vector variant means the
    // call is not a scalarize-only mapping.
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}
// Walk every instruction of every block in the loop and verify that each one
// is something the vectorizer knows how to widen. Header phis are classified
// as reductions, inductions or fixed-order recurrences; calls must map to an
// intrinsic or a vector library variant; types must be valid vector element
// types; values used outside the loop are only allowed in specific cases.
// Returns false (with a remark) on the first hazard found.
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed. Only int, FP and pointer phis
        // can be turned into vector phis/selects.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure("Found a non-int non-pointer PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure("Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        // Try to classify the header phi as a reduction first; record any
        // exact-FP-math requirement and allow its exit instruction to have
        // users outside the loop.
        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        // Phis that only depend on their value from a previous iteration can
        // be handled by shuffling values across iterations.
        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop,
                                                         SinkAfter, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to a AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure("Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);
      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized (i.e. loop invariant).
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure("Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop is
        // same as outside the loop. Allowing the exit means reusing the SCEV
        // outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  // A loop without an integer induction variable that we can use for the
  // vector trip count cannot be vectorized.
  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure("Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    } else if (!WidestIndTy) {
      reportVectorizationFailure("Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    } else {
      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    }
  }

  // For fixed order recurrences, we use the previous value (incoming value from
  // the latch) to check if it dominates all users of the recurrence. Bail out
  // if we have to sink such an instruction for another recurrence, as the
  // dominance requirement may not hold after sinking.
  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
  if (any_of(FixedOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
        Instruction *V =
            cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
        return SinkAfter.find(V) != SinkAfter.end();
      }))
    return false;

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}
// Query LoopAccessInfo for this loop and verify the memory behavior is
// vectorizable. On top of LAI's own dependence analysis, stores to loop
// invariant addresses are only permitted when they are the intermediate
// store of a recognized reduction, are unconditional, and use an address
// computed outside the loop. Merges LAI's SCEV predicate into PSE on success.
bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    // Forward LAI's failure remark under the vectorizer's pass name.
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return false;

  // We can vectorize stores to invariant address when final reduction value is
  // guaranteed to be stored at the end of the loop. Also, if decision to
  // vectorize loop is made, runtime checks are added so as to make sure that
  // invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if last stored value is unconditional
    // and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      // A predicated intermediate store may not execute on the final
      // iteration, so the reduction result could be lost.
      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // Invariant address should be defined outside of loop. LICM pass usually
      // makes sure it happens, but in rare cases it does not, we do not want
      // to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because they are
      // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
      // behaviour changes we have to modify this code.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively deadcode.
          // With opaque pointers it is possible for one pointer to be used with
          // different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in the
          // example. That is why we have to make sure that types of stored
          // values are same.
          // TODO: Check that bitwidth of unhandled store is smaller than the
          // one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  // Carry over any runtime SCEV assumptions LAI needed.
  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}
  903. bool LoopVectorizationLegality::canVectorizeFPMath(
  904. bool EnableStrictReductions) {
  905. // First check if there is any ExactFP math or if we allow reassociations
  906. if (!Requirements->getExactFPInst() || Hints->allowReordering())
  907. return true;
  908. // If the above is false, we have ExactFPMath & do not allow reordering.
  909. // If the EnableStrictReductions flag is set, first check if we have any
  910. // Exact FP induction vars, which we cannot vectorize.
  911. if (!EnableStrictReductions ||
  912. any_of(getInductionVars(), [&](auto &Induction) -> bool {
  913. InductionDescriptor IndDesc = Induction.second;
  914. return IndDesc.getExactFPMathInst();
  915. }))
  916. return false;
  917. // We can now only vectorize if all reductions with Exact FP math also
  918. // have the isOrdered flag set, which indicates that we can move the
  919. // reduction operations in-loop.
  920. return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
  921. const RecurrenceDescriptor &RdxDesc = Reduction.second;
  922. return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  923. }));
  924. }
  925. bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  926. return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
  927. const RecurrenceDescriptor &RdxDesc = Reduction.second;
  928. return RdxDesc.IntermediateStore == SI;
  929. });
  930. }
  931. bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  932. return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
  933. const RecurrenceDescriptor &RdxDesc = Reduction.second;
  934. if (!RdxDesc.IntermediateStore)
  935. return false;
  936. ScalarEvolution *SE = PSE.getSE();
  937. Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
  938. return V == InvariantAddress ||
  939. SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  940. });
  941. }
  942. bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  943. Value *In0 = const_cast<Value *>(V);
  944. PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  945. if (!PN)
  946. return false;
  947. return Inductions.count(PN);
  948. }
  949. const InductionDescriptor *
  950. LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  951. if (!isInductionPhi(Phi))
  952. return nullptr;
  953. auto &ID = getInductionVars().find(Phi)->second;
  954. if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
  955. ID.getKind() == InductionDescriptor::IK_FpInduction)
  956. return &ID;
  957. return nullptr;
  958. }
  959. const InductionDescriptor *
  960. LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  961. if (!isInductionPhi(Phi))
  962. return nullptr;
  963. auto &ID = getInductionVars().find(Phi)->second;
  964. if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
  965. return &ID;
  966. return nullptr;
  967. }
  968. bool LoopVectorizationLegality::isCastedInductionVariable(
  969. const Value *V) const {
  970. auto *Inst = dyn_cast<Instruction>(V);
  971. return (Inst && InductionCastsToIgnore.count(Inst));
  972. }
  973. bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  974. return isInductionPhi(V) || isCastedInductionVariable(V);
  975. }
  976. bool LoopVectorizationLegality::isFixedOrderRecurrence(
  977. const PHINode *Phi) const {
  978. return FixedOrderRecurrences.count(Phi);
  979. }
// Returns true if BB is not guaranteed to execute on every iteration of
// TheLoop and would therefore need predication if the CFG is flattened.
// Delegates the dominance-based check to LoopAccessInfo.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
  983. bool LoopVectorizationLegality::blockCanBePredicated(
  984. BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
  985. SmallPtrSetImpl<const Instruction *> &MaskedOp,
  986. SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
  987. for (Instruction &I : *BB) {
  988. // We can predicate blocks with calls to assume, as long as we drop them in
  989. // case we flatten the CFG via predication.
  990. if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
  991. ConditionalAssumes.insert(&I);
  992. continue;
  993. }
  994. // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
  995. // TODO: there might be cases that it should block the vectorization. Let's
  996. // ignore those for now.
  997. if (isa<NoAliasScopeDeclInst>(&I))
  998. continue;
  999. // Loads are handled via masking (or speculated if safe to do so.)
  1000. if (auto *LI = dyn_cast<LoadInst>(&I)) {
  1001. if (!SafePtrs.count(LI->getPointerOperand()))
  1002. MaskedOp.insert(LI);
  1003. continue;
  1004. }
  1005. // Predicated store requires some form of masking:
  1006. // 1) masked store HW instruction,
  1007. // 2) emulation via load-blend-store (only if safe and legal to do so,
  1008. // be aware on the race conditions), or
  1009. // 3) element-by-element predicate check and scalar store.
  1010. if (auto *SI = dyn_cast<StoreInst>(&I)) {
  1011. MaskedOp.insert(SI);
  1012. continue;
  1013. }
  1014. if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
  1015. return false;
  1016. }
  1017. return true;
  1018. }
// Check whether a multi-block loop body can be flattened (if-converted) into
// straight-line predicated code: every terminator must be a branch and every
// block that needs predication must pass blockCanBePredicated. Pointers
// proven dereferenceable on all iterations are collected first so that loads
// through them can be speculated instead of masked.
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Blocks executing unconditionally: every access in them is trivially
    // safe to keep unmasked.
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
                                ConditionalAssumes)) {
        reportVectorizationFailure(
            "Control flow cannot be substituted for a select",
            "control flow cannot be substituted for a select",
            "NoCFGForSelect", ORE, TheLoop,
            BB->getTerminator());
        return false;
      }
    }
  }

  // We can if-convert this loop.
  return true;
}
  1081. // Helper function to canVectorizeLoopNestCFG.
  1082. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
  1083. bool UseVPlanNativePath) {
  1084. assert((UseVPlanNativePath || Lp->isInnermost()) &&
  1085. "VPlan-native path is not enabled.");
  1086. // TODO: ORE should be improved to show more accurate information when an
  1087. // outer loop can't be vectorized because a nested loop is not understood or
  1088. // legal. Something like: "outer_loop_location: loop not vectorized:
  1089. // (inner_loop_location) loop control flow is not understood by vectorizer".
  1090. // Store the result and return it at the end instead of exiting early, in case
  1091. // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  1092. bool Result = true;
  1093. bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  1094. // We must have a loop in canonical form. Loops with indirectbr in them cannot
  1095. // be canonicalized.
  1096. if (!Lp->getLoopPreheader()) {
  1097. reportVectorizationFailure("Loop doesn't have a legal pre-header",
  1098. "loop control flow is not understood by vectorizer",
  1099. "CFGNotUnderstood", ORE, TheLoop);
  1100. if (DoExtraAnalysis)
  1101. Result = false;
  1102. else
  1103. return false;
  1104. }
  1105. // We must have a single backedge.
  1106. if (Lp->getNumBackEdges() != 1) {
  1107. reportVectorizationFailure("The loop must have a single backedge",
  1108. "loop control flow is not understood by vectorizer",
  1109. "CFGNotUnderstood", ORE, TheLoop);
  1110. if (DoExtraAnalysis)
  1111. Result = false;
  1112. else
  1113. return false;
  1114. }
  1115. return Result;
  1116. }
  1117. bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
  1118. Loop *Lp, bool UseVPlanNativePath) {
  1119. // Store the result and return it at the end instead of exiting early, in case
  1120. // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  1121. bool Result = true;
  1122. bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  1123. if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
  1124. if (DoExtraAnalysis)
  1125. Result = false;
  1126. else
  1127. return false;
  1128. }
  1129. // Recursively check whether the loop control flow of nested loops is
  1130. // understood.
  1131. for (Loop *SubLp : *Lp)
  1132. if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
  1133. if (DoExtraAnalysis)
  1134. Result = false;
  1135. else
  1136. return false;
  1137. }
  1138. return Result;
  1139. }
// Top-level legality driver: checks loop-nest CFG, handles the outer-loop
// (VPlan-native) special case, then for inner loops runs if-conversion,
// instruction and memory legality, and finally bounds the number of runtime
// SCEV checks. Returns true iff the loop may be vectorized.
bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected by
  // vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at this
  // point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  // Cap the number of runtime SCEV assumptions; a pragma-forced loop gets a
  // higher budget than the default.
  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure("Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false. Otherwise
  // we can vectorize, and at this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return Result;
}
// Check whether the loop's tail can be folded by masking: no value (other
// than a reduction live-out) may be used outside the loop, and every block —
// including the header — must be predicable with an empty safe-pointer set.
// Only commits to MaskedOp/ConditionalAssumes after all blocks pass, using
// temporary sets so a failed attempt leaves legality state untouched.
bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect masked ops in temporary sets first so legality state is only
  // mutated once every block has been validated.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;

  // Check and mark all blocks for predication, including those that ordinarily
  // do not need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
                              TmpConditionalAssumes)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  // All blocks passed: commit the temporary sets.
  MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
  ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
                            TmpConditionalAssumes.end());

  return true;
}
  1262. } // namespace llvm