VPlanSLP.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// This file implements SLP analysis based on VPlan. The analysis is based on
  9. /// the ideas described in
  10. ///
  11. /// Look-ahead SLP: auto-vectorization in the presence of commutative
  12. /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
  13. /// Luís F. W. Góes
  14. ///
  15. //===----------------------------------------------------------------------===//
  16. #include "VPlan.h"
  17. #include "llvm/ADT/DepthFirstIterator.h"
  18. #include "llvm/ADT/PostOrderIterator.h"
  19. #include "llvm/ADT/SmallVector.h"
  20. #include "llvm/ADT/Twine.h"
  21. #include "llvm/Analysis/LoopInfo.h"
  22. #include "llvm/Analysis/VectorUtils.h"
  23. #include "llvm/IR/BasicBlock.h"
  24. #include "llvm/IR/CFG.h"
  25. #include "llvm/IR/Dominators.h"
  26. #include "llvm/IR/InstrTypes.h"
  27. #include "llvm/IR/Instruction.h"
  28. #include "llvm/IR/Instructions.h"
  29. #include "llvm/IR/Type.h"
  30. #include "llvm/IR/Value.h"
  31. #include "llvm/Support/Casting.h"
  32. #include "llvm/Support/Debug.h"
  33. #include "llvm/Support/ErrorHandling.h"
  34. #include "llvm/Support/GraphWriter.h"
  35. #include "llvm/Support/raw_ostream.h"
  36. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  37. #include <algorithm>
  38. #include <cassert>
  39. #include <iterator>
  40. #include <utility>
  41. using namespace llvm;
  42. #define DEBUG_TYPE "vplan-slp"
  43. // Number of levels to look ahead when re-ordering multi node operands.
  44. static unsigned LookaheadMaxDepth = 5;
  45. VPInstruction *VPlanSlp::markFailed() {
  46. // FIXME: Currently this is used to signal we hit instructions we cannot
  47. // trivially SLP'ize.
  48. CompletelySLP = false;
  49. return nullptr;
  50. }
  51. void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  52. if (all_of(Operands, [](VPValue *V) {
  53. return cast<VPInstruction>(V)->getUnderlyingInstr();
  54. })) {
  55. unsigned BundleSize = 0;
  56. for (VPValue *V : Operands) {
  57. Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
  58. assert(!T->isVectorTy() && "Only scalar types supported for now");
  59. BundleSize += T->getScalarSizeInBits();
  60. }
  61. WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  62. }
  63. auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  64. assert(Res.second &&
  65. "Already created a combined instruction for the operand bundle");
  66. (void)Res;
  67. }
/// Returns true if the bundle \p Operands can be combined into one wide
/// instruction: all operands are VPInstructions with underlying IR
/// instructions, agree in opcode and primitive type width, are defined in this
/// analysis' basic block, have at most one unique user, and memory accesses
/// are "simple" with no writes interleaved between bundled loads.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  // instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  // An operand with more than one distinct user would make the combined graph
  // a DAG rather than a tree (see the tree check in buildGraph), so bail out.
  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  // interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    // Walk the block top-down. Once the first bundled load has been seen, any
    // may-write instruction before the last bundled load invalidates the
    // bundle. Note the walk stops at the first non-VPInstruction recipe.
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;
      // All bundled loads seen: nothing after this point can interfere.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    // Only simple (non-volatile, non-atomic) loads are supported.
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  // Likewise, only simple (non-volatile, non-atomic) stores are supported.
  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}
  145. static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
  146. unsigned OperandIndex) {
  147. SmallVector<VPValue *, 4> Operands;
  148. for (VPValue *V : Values) {
  149. // Currently we only support VPInstructions.
  150. auto *U = cast<VPInstruction>(V);
  151. Operands.push_back(U->getOperand(OperandIndex));
  152. }
  153. return Operands;
  154. }
  155. static bool areCommutative(ArrayRef<VPValue *> Values) {
  156. return Instruction::isCommutative(
  157. cast<VPInstruction>(Values[0])->getOpcode());
  158. }
  159. static SmallVector<SmallVector<VPValue *, 4>, 4>
  160. getOperands(ArrayRef<VPValue *> Values) {
  161. SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  162. auto *VPI = cast<VPInstruction>(Values[0]);
  163. switch (VPI->getOpcode()) {
  164. case Instruction::Load:
  165. llvm_unreachable("Loads terminate a tree, no need to get operands");
  166. case Instruction::Store:
  167. Result.push_back(getOperands(Values, 0));
  168. break;
  169. default:
  170. for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
  171. Result.push_back(getOperands(Values, I));
  172. break;
  173. }
  174. return Result;
  175. }
  176. /// Returns the opcode of Values or ~0 if they do not all agree.
  177. static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  178. unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  179. if (any_of(Values, [Opcode](VPValue *V) {
  180. return cast<VPInstruction>(V)->getOpcode() != Opcode;
  181. }))
  182. return None;
  183. return {Opcode};
  184. }
  185. /// Returns true if A and B access sequential memory if they are loads or
  186. /// stores or if they have identical opcodes otherwise.
  187. static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
  188. VPInterleavedAccessInfo &IAI) {
  189. if (A->getOpcode() != B->getOpcode())
  190. return false;
  191. if (A->getOpcode() != Instruction::Load &&
  192. A->getOpcode() != Instruction::Store)
  193. return true;
  194. auto *GA = IAI.getInterleaveGroup(A);
  195. auto *GB = IAI.getInterleaveGroup(B);
  196. return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
  197. }
  198. /// Implements getLAScore from Listing 7 in the paper.
  199. /// Traverses and compares operands of V1 and V2 to MaxLevel.
  200. static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
  201. VPInterleavedAccessInfo &IAI) {
  202. auto *I1 = dyn_cast<VPInstruction>(V1);
  203. auto *I2 = dyn_cast<VPInstruction>(V2);
  204. // Currently we only support VPInstructions.
  205. if (!I1 || !I2)
  206. return 0;
  207. if (MaxLevel == 0)
  208. return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
  209. unsigned Score = 0;
  210. for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
  211. for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
  212. Score +=
  213. getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  214. return Score;
  215. }
  216. std::pair<VPlanSlp::OpMode, VPValue *>
  217. VPlanSlp::getBest(OpMode Mode, VPValue *Last,
  218. SmallPtrSetImpl<VPValue *> &Candidates,
  219. VPInterleavedAccessInfo &IAI) {
  220. assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
  221. "Currently we only handle load and commutative opcodes");
  222. LLVM_DEBUG(dbgs() << " getBest\n");
  223. SmallVector<VPValue *, 4> BestCandidates;
  224. LLVM_DEBUG(dbgs() << " Candidates for "
  225. << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  226. for (auto *Candidate : Candidates) {
  227. auto *LastI = cast<VPInstruction>(Last);
  228. auto *CandidateI = cast<VPInstruction>(Candidate);
  229. if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
  230. LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
  231. << " ");
  232. BestCandidates.push_back(Candidate);
  233. }
  234. }
  235. LLVM_DEBUG(dbgs() << "\n");
  236. if (BestCandidates.empty())
  237. return {OpMode::Failed, nullptr};
  238. if (BestCandidates.size() == 1)
  239. return {Mode, BestCandidates[0]};
  240. VPValue *Best = nullptr;
  241. unsigned BestScore = 0;
  242. for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
  243. unsigned PrevScore = ~0u;
  244. bool AllSame = true;
  245. // FIXME: Avoid visiting the same operands multiple times.
  246. for (auto *Candidate : BestCandidates) {
  247. unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
  248. if (PrevScore == ~0u)
  249. PrevScore = Score;
  250. if (PrevScore != Score)
  251. AllSame = false;
  252. PrevScore = Score;
  253. if (Score > BestScore) {
  254. BestScore = Score;
  255. Best = Candidate;
  256. }
  257. }
  258. if (!AllSame)
  259. break;
  260. }
  261. LLVM_DEBUG(dbgs() << "Found best "
  262. << *cast<VPInstruction>(Best)->getUnderlyingInstr()
  263. << "\n");
  264. Candidates.erase(Best);
  265. return {Mode, Best};
  266. }
  267. SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  268. SmallVector<MultiNodeOpTy, 4> FinalOrder;
  269. SmallVector<OpMode, 4> Mode;
  270. FinalOrder.reserve(MultiNodeOps.size());
  271. Mode.reserve(MultiNodeOps.size());
  272. LLVM_DEBUG(dbgs() << "Reordering multinode\n");
  273. for (auto &Operands : MultiNodeOps) {
  274. FinalOrder.push_back({Operands.first, {Operands.second[0]}});
  275. if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
  276. Instruction::Load)
  277. Mode.push_back(OpMode::Load);
  278. else
  279. Mode.push_back(OpMode::Opcode);
  280. }
  281. for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
  282. LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
  283. SmallPtrSet<VPValue *, 4> Candidates;
  284. LLVM_DEBUG(dbgs() << " Candidates ");
  285. for (auto Ops : MultiNodeOps) {
  286. LLVM_DEBUG(
  287. dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
  288. << " ");
  289. Candidates.insert(Ops.second[Lane]);
  290. }
  291. LLVM_DEBUG(dbgs() << "\n");
  292. for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
  293. LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
  294. if (Mode[Op] == OpMode::Failed)
  295. continue;
  296. VPValue *Last = FinalOrder[Op].second[Lane - 1];
  297. std::pair<OpMode, VPValue *> Res =
  298. getBest(Mode[Op], Last, Candidates, IAI);
  299. if (Res.second)
  300. FinalOrder[Op].second.push_back(Res.second);
  301. else
  302. // TODO: handle this case
  303. FinalOrder[Op].second.push_back(markFailed());
  304. }
  305. }
  306. return FinalOrder;
  307. }
  308. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  309. void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  310. dbgs() << " Ops: ";
  311. for (auto Op : Values) {
  312. if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
  313. if (auto *Instr = VPInstr->getUnderlyingInstr()) {
  314. dbgs() << *Instr << " | ";
  315. continue;
  316. }
  317. dbgs() << " nullptr | ";
  318. }
  319. dbgs() << "\n";
  320. }
  321. #endif
/// Recursively build the SLP graph for the bundle \p Values, combining the
/// bundle into a single new VPInstruction. Returns the combined instruction,
/// or nullptr (via markFailed) if the bundle cannot be SLP'ized.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = getOpcode(Values).getValue();

  SmallVector<VPValue *, 4> CombinedOperands;
  if (areCommutative(Values)) {
    // Commutative bundles may need their operands re-ordered across lanes.
    // Operand bundles that do not share the parent's opcode are collected
    // into MultiNodeOps behind a dummy placeholder; only the outermost
    // commutative bundle (MultiNodeRoot) performs the re-ordering.
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });
      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
        // Create dummy VPInstruction, which will we replace later by the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;
      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        // Build the real node from the re-ordered operand row, splice it in
        // where the dummy placeholder was used, and free the placeholder.
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load)
      // Loads terminate the tree: bundle their address operands directly
      // instead of recursing.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

  // Map IR load/store opcodes to the dedicated SLP VPInstruction opcodes;
  // every other opcode is carried over unchanged.
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  // A recursive buildGraph call above may have marked the tree as failed.
  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
  VPI->setUnderlyingInstr(Inst);

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, VPI);
  return VPI;
}