//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// This pass performs the following optimizations:
//
// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
//     ; (%1 can be replaced with a reinterpret of %2)
//
// - replaces stores and loads of fixed-width vectors that hold the bits of an
//   SVE predicate with direct stores and loads of the predicate type, when the
//   function's vscale_range attribute fixes the runtime vector length.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
namespace {
struct SVEIntrinsicOpts : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  SVEIntrinsicOpts() : ModulePass(ID) {
    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
  bool optimizePredicateStore(Instruction *I);
  bool optimizePredicateLoad(Instruction *I);

  bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);

  /// Operates at function scope, i.e., optimizations are applied locally to
  /// the functions themselves.
  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
};
} // end anonymous namespace
void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}

char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)

ModulePass *llvm::createSVEIntrinsicOptsPass() {
  return new SVEIntrinsicOpts();
}
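
// Since the pass is registered with the legacy pass manager under DEBUG_TYPE,
// it can be exercised in isolation (assuming a build that includes the AArch64
// target) with, for example:
//
//   opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu \
//       -mattr=+sve input.ll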
/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
/// ptrue will introduce zeroing. For example:
///
///   %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
///   %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
///   %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
///
/// %1 is promoted, because it is converted:
///
///   <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
///
/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
static bool isPTruePromoted(IntrinsicInst *PTrue) {
  // Find all users of this intrinsic that are calls to convert-to-svbool
  // reinterpret intrinsics.
  SmallVector<IntrinsicInst *, 4> ConvertToUses;
  for (User *User : PTrue->users()) {
    if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
      ConvertToUses.push_back(cast<IntrinsicInst>(User));
    }
  }

  // If no such calls were found, this ptrue is not promoted.
  if (ConvertToUses.empty())
    return false;

  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
  // calls to the convert-from-svbool intrinsic, and would result in some lanes
  // being zeroed.
  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
    for (User *User : ConvertToUse->users()) {
      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
      if (IntrUser && IntrUser->getIntrinsicID() ==
                          Intrinsic::aarch64_sve_convert_from_svbool) {
        const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());

        // Would some lanes become zeroed by the conversion?
        if (IntrUserVTy->getElementCount().getKnownMinValue() >
            PTrueVTy->getElementCount().getKnownMinValue())
          // This is a promoted ptrue.
          return true;
      }
    }
  }

  // If no matching calls were found, this is not a promoted ptrue.
  return false;
}
/// Attempts to coalesce ptrues in a basic block.
bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
  if (PTrues.size() <= 1)
    return false;

  // Find the ptrue with the most lanes.
  auto *MostEncompassingPTrue = *std::max_element(
      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
        return PTrue1VTy->getElementCount().getKnownMinValue() <
               PTrue2VTy->getElementCount().getKnownMinValue();
      });

  // Remove the most encompassing ptrue, as well as any promoted ptrues,
  // leaving behind only the ptrues to be coalesced.
  PTrues.remove(MostEncompassingPTrue);
  PTrues.remove_if(isPTruePromoted);

  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
  // predecessors.
  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());

  LLVMContext &Ctx = BB.getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());

  auto *MostEncompassingPTrueVTy =
      cast<VectorType>(MostEncompassingPTrue->getType());
  auto *ConvertToSVBool = Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
      {MostEncompassingPTrue});

  bool ConvertFromCreated = false;
  for (auto *PTrue : PTrues) {
    auto *PTrueVTy = cast<VectorType>(PTrue->getType());

    // Only create the converts if the types are not already the same,
    // otherwise just use the most encompassing ptrue.
    if (MostEncompassingPTrueVTy != PTrueVTy) {
      ConvertFromCreated = true;

      Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
      auto *ConvertFromSVBool =
          Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                  {PTrueVTy}, {ConvertToSVBool});
      PTrue->replaceAllUsesWith(ConvertFromSVBool);
    } else
      PTrue->replaceAllUsesWith(MostEncompassingPTrue);

    PTrue->eraseFromParent();
  }

  // We never used ConvertToSVBool, so remove it.
  if (!ConvertFromCreated)
    ConvertToSVBool->eraseFromParent();

  return true;
}
/// The goal of this function is to remove redundant calls to the SVE ptrue
/// intrinsic in each basic block within the given functions.
///
/// SVE ptrues have two representations in LLVM IR:
/// - a logical representation -- an arbitrary-width scalable vector of i1s,
///   i.e. <vscale x N x i1>.
/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
///
/// The SVE ptrue intrinsic is used to create a logical representation of an
/// SVE predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and
/// P2. If P1 creates a logical SVE predicate that is at least as wide as the
/// logical SVE predicate created by P2, then all of the bits that are true in
/// the physical representation of P2 are necessarily also true in the physical
/// representation of P1. P1 'encompasses' P2; therefore, the intrinsic call to
/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
/// convert.{to,from}.svbool.
///
/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
/// if they match the following conditions:
///
/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
///   SV_ALL indicates that all bits of the predicate vector are to be set to
///   true. SV_POW2 indicates that all bits of the predicate vector up to the
///   largest power-of-two are to be set to true.
/// - the result of the call to the intrinsic is not promoted to a wider
///   predicate. In this case, keeping the extra ptrue leads to better codegen
///   -- coalescing here would create an irreducible chain of SVE reinterprets
///   via convert.{to,from}.svbool.
///
/// EXAMPLE:
///
///   %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
///   ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
///   ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
///   ...
///
///   %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
///   ; Logical:  <1, 1, 1, 1>
///   ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
///   ...
///
/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
///
///   %1 = <vscale x 8 x i1> ptrue(i32 31)
///   %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
///   %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
///
bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  for (auto *F : Functions) {
    for (auto &BB : *F) {
      SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
      SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;

      // For each basic block, collect the used ptrues and try to coalesce
      // them.
      for (Instruction &I : BB) {
        if (I.use_empty())
          continue;

        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
        if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
          continue;

        const auto PTruePattern =
            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();

        if (PTruePattern == AArch64SVEPredPattern::all)
          SVAllPTrues.insert(IntrI);
        if (PTruePattern == AArch64SVEPredPattern::pow2)
          SVPow2PTrues.insert(IntrI);
      }

      Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
      Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
    }
  }

  return Changed;
}
// This is done in SVEIntrinsicOpts rather than InstCombine so that we
// introduce scalable stores as late as possible.
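//
// For illustration, assuming a function attributed vscale_range(2,2) (so
// MinVScale == MaxVScale == 2 and FixedPredType is <4 x i8>; the value names
// below are hypothetical), the transform rewrites a store of the form:
//
//   %bc = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
//   %extract = call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(
//                  <vscale x 2 x i8> %bc, i64 0)
//   store <4 x i8> %extract, <4 x i8>* %ptr
//
// into a direct store of the predicate value:
//
//   %ptr.bc = bitcast <4 x i8>* %ptr to <vscale x 16 x i1>*
//   store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* %ptr.bc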
bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
  auto *F = I->getFunction();
  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
  if (!Attr.isValid())
    return false;

  unsigned MinVScale = Attr.getVScaleRangeMin();
  Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
  // The transform needs to know the exact runtime length of scalable vectors.
  if (!MaxVScale || MinVScale != MaxVScale)
    return false;

  auto *PredType =
      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
  auto *FixedPredType =
      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);

  // If we have a store..
  auto *Store = dyn_cast<StoreInst>(I);
  if (!Store || !Store->isSimple())
    return false;

  // ..that is storing a predicate vector's worth of bits..
  if (Store->getOperand(0)->getType() != FixedPredType)
    return false;

  // ..where the value stored comes from a vector extract..
  auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
  if (!IntrI ||
      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
    return false;

  // ..that is extracting from index 0..
  if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
    return false;

  // ..where the vector being extracted from comes from a bitcast..
  auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
  if (!BitCast)
    return false;

  // ..and the bitcast is casting from predicate type.
  if (BitCast->getOperand(0)->getType() != PredType)
    return false;

  IRBuilder<> Builder(I->getContext());
  Builder.SetInsertPoint(I);

  auto *PtrBitCast = Builder.CreateBitCast(
      Store->getPointerOperand(),
      PredType->getPointerTo(Store->getPointerAddressSpace()));
  Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);

  Store->eraseFromParent();
  if (IntrI->getNumUses() == 0)
    IntrI->eraseFromParent();
  if (BitCast->getNumUses() == 0)
    BitCast->eraseFromParent();

  return true;
}
// This is done in SVEIntrinsicOpts rather than InstCombine so that we
// introduce scalable loads as late as possible.
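//
// Mirroring the store case above (again assuming vscale_range(2,2), with
// hypothetical value names), the transform rewrites:
//
//   %load = load <4 x i8>, <4 x i8>* %ptr
//   %insert = call <vscale x 2 x i8>
//                 @llvm.experimental.vector.insert.nxv2i8.v4i8(
//                 <vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
//   %bc = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
//
// into a direct load of the predicate type:
//
//   %ptr.bc = bitcast <4 x i8>* %ptr to <vscale x 16 x i1>*
//   %pred = load <vscale x 16 x i1>, <vscale x 16 x i1>* %ptr.bc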
bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
  auto *F = I->getFunction();
  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
  if (!Attr.isValid())
    return false;

  unsigned MinVScale = Attr.getVScaleRangeMin();
  Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
  // The transform needs to know the exact runtime length of scalable vectors.
  if (!MaxVScale || MinVScale != MaxVScale)
    return false;

  auto *PredType =
      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
  auto *FixedPredType =
      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);

  // If we have a bitcast..
  auto *BitCast = dyn_cast<BitCastInst>(I);
  if (!BitCast || BitCast->getType() != PredType)
    return false;

  // ..whose operand is a vector_insert..
  auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
  if (!IntrI ||
      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
    return false;

  // ..that is inserting into index zero of an undef vector..
  if (!isa<UndefValue>(IntrI->getOperand(0)) ||
      !cast<ConstantInt>(IntrI->getOperand(2))->isZero())
    return false;

  // ..where the value inserted comes from a load..
  auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
  if (!Load || !Load->isSimple())
    return false;

  // ..that is loading a predicate vector's worth of bits.
  if (Load->getType() != FixedPredType)
    return false;

  IRBuilder<> Builder(I->getContext());
  Builder.SetInsertPoint(Load);

  auto *PtrBitCast = Builder.CreateBitCast(
      Load->getPointerOperand(),
      PredType->getPointerTo(Load->getPointerAddressSpace()));
  auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);

  BitCast->replaceAllUsesWith(LoadPred);
  BitCast->eraseFromParent();
  if (IntrI->getNumUses() == 0)
    IntrI->eraseFromParent();
  if (Load->getNumUses() == 0)
    Load->eraseFromParent();

  return true;
}
bool SVEIntrinsicOpts::optimizeInstructions(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  for (auto *F : Functions) {
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();

    // Traverse the DT with an rpo walk so we see defs before uses, allowing
    // simplification to be done incrementally.
    BasicBlock *Root = DT->getRoot();
    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
    for (auto *BB : RPOT) {
      for (Instruction &I : make_early_inc_range(*BB)) {
        switch (I.getOpcode()) {
        case Instruction::Store:
          Changed |= optimizePredicateStore(&I);
          break;
        case Instruction::BitCast:
          Changed |= optimizePredicateLoad(&I);
          break;
        }
      }
    }
  }

  return Changed;
}
bool SVEIntrinsicOpts::optimizeFunctions(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;

  Changed |= optimizePTrueIntrinsicCalls(Functions);
  Changed |= optimizeInstructions(Functions);

  return Changed;
}
bool SVEIntrinsicOpts::runOnModule(Module &M) {
  bool Changed = false;
  SmallSetVector<Function *, 4> Functions;

  // Check for SVE intrinsic declarations first so that we only iterate over
  // relevant functions. Where an appropriate declaration is found, store the
  // function(s) where it is used so we can target these only.
  for (auto &F : M.getFunctionList()) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::experimental_vector_extract:
    case Intrinsic::experimental_vector_insert:
    case Intrinsic::aarch64_sve_ptrue:
      for (User *U : F.users())
        Functions.insert(cast<Instruction>(U)->getFunction());
      break;
    default:
      break;
    }
  }

  if (!Functions.empty())
    Changed |= optimizeFunctions(Functions);

  return Changed;
}