SeparateConstOffsetFromGEP.cpp

  1. //===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Loop unrolling may create many similar GEPs for array accesses.
  10. // e.g., a 2-level loop
  11. //
  12. // float a[32][32]; // global variable
  13. //
  14. // for (int i = 0; i < 2; ++i) {
  15. // for (int j = 0; j < 2; ++j) {
  16. // ...
  17. // ... = a[x + i][y + j];
  18. // ...
  19. // }
  20. // }
  21. //
  22. // will probably be unrolled to:
  23. //
  24. // gep %a, 0, %x, %y; load
  25. // gep %a, 0, %x, %y + 1; load
  26. // gep %a, 0, %x + 1, %y; load
  27. // gep %a, 0, %x + 1, %y + 1; load
  28. //
  29. // LLVM's GVN does not use partial redundancy elimination yet, and is thus
  30. // unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
  31. // significant slowdown in targets with limited addressing modes. For instance,
  32. // because the PTX target does not support the reg+reg addressing mode, the
  33. // NVPTX backend emits PTX code that literally computes the pointer address of
  34. // each GEP, wasting tons of registers. It emits the following PTX for the
  35. // first load and similar PTX for other loads.
  36. //
  37. // mov.u32 %r1, %x;
  38. // mov.u32 %r2, %y;
  39. // mul.wide.u32 %rl2, %r1, 128;
  40. // mov.u64 %rl3, a;
  41. // add.s64 %rl4, %rl3, %rl2;
  42. // mul.wide.u32 %rl5, %r2, 4;
  43. // add.s64 %rl6, %rl4, %rl5;
  44. // ld.global.f32 %f1, [%rl6];
  45. //
  46. // To reduce the register pressure, the optimization implemented in this file
  47. // merges the common part of a group of GEPs, so we can compute each pointer
  48. // address by adding a simple offset to the common part, saving many registers.
  49. //
  50. // It works by splitting each GEP into a variadic base and a constant offset.
  51. // The variadic base can be computed once and reused by multiple GEPs, and the
  52. // constant offsets can be nicely folded into the reg+immediate addressing mode
  53. // (supported by most targets) without using any extra register.
  54. //
  55. // For instance, we transform the four GEPs and four loads in the above example
  56. // into:
  57. //
  58. // base = gep a, 0, x, y
  59. // load base
  60. // load base + 1 * sizeof(float)
  61. // load base + 32 * sizeof(float)
  62. // load base + 33 * sizeof(float)
  63. //
  64. // Given the transformed IR, a backend that supports the reg+immediate
  65. // addressing mode can easily fold the pointer arithmetics into the loads. For
  66. // example, the NVPTX backend can easily fold the pointer arithmetics into the
  67. // ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
  68. //
  69. // mov.u32 %r1, %x;
  70. // mov.u32 %r2, %y;
  71. // mul.wide.u32 %rl2, %r1, 128;
  72. // mov.u64 %rl3, a;
  73. // add.s64 %rl4, %rl3, %rl2;
  74. // mul.wide.u32 %rl5, %r2, 4;
  75. // add.s64 %rl6, %rl4, %rl5;
  76. // ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
  77. // ld.global.f32 %f2, [%rl6+4]; // much better
  78. // ld.global.f32 %f3, [%rl6+128]; // much better
  79. // ld.global.f32 %f4, [%rl6+132]; // much better
  80. //
  81. // Another improvement enabled by the LowerGEP flag is to lower a GEP with
  82. // multiple indices to either multiple GEPs with a single index or arithmetic
  83. // operations (depending on whether the target uses alias analysis in codegen).
  84. // Such a transformation can have the following benefits:
  85. // (1) It can always extract constants in the indices of structure type.
  86. // (2) After such lowering, there are more optimization opportunities such as
  87. // CSE, LICM and CGP.
  88. //
  89. // E.g. The following GEPs have multiple indices:
  90. // BB1:
  91. // %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
  92. // load %p
  93. // ...
  94. // BB2:
  95. // %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
  96. // load %p2
  97. // ...
  98. //
  99. // We cannot do CSE on the common part related to index "i64 %i" while the
  100. // GEPs are in this form. Lowering the GEPs makes such CSE possible.
  101. // If the target does not use alias analysis in codegen, this pass will
  102. // lower a GEP with multiple indices into arithmetic operations:
  103. // BB1:
  104. // %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
  105. // %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  106. // %3 = add i64 %1, %2 ; CSE opportunity
  107. // %4 = mul i64 %j1, length_of_struct
  108. // %5 = add i64 %3, %4
  109. // %6 = add i64 %3, struct_field_3 ; Constant offset
  110. // %p = inttoptr i64 %6 to i32*
  111. // load %p
  112. // ...
  113. // BB2:
  114. // %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
  115. // %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  116. // %9 = add i64 %7, %8 ; CSE opportunity
  117. // %10 = mul i64 %j2, length_of_struct
  118. // %11 = add i64 %9, %10
  119. // %12 = add i64 %11, struct_field_2 ; Constant offset
  120. // %p2 = inttoptr i64 %12 to i32*
  121. // load %p2
  122. // ...
  123. //
  124. // If the target uses alias analysis in codegen, this pass will lower a GEP
  125. // with multiple indices into multiple GEPs with a single index:
  126. // BB1:
  127. // %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
  128. // %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  129. // %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
  130. // %4 = mul i64 %j1, length_of_struct
  131. // %5 = getelementptr i8* %3, i64 %4
  132. // %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
  133. // %p = bitcast i8* %6 to i32*
  134. // load %p
  135. // ...
  136. // BB2:
  137. // %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
  138. // %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  139. // %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
  140. // %10 = mul i64 %j2, length_of_struct
  141. // %11 = getelementptr i8* %9, i64 %10
  142. // %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
  143. // %p2 = bitcast i8* %12 to i32*
  144. // load %p2
  145. // ...
  146. //
  147. // Lowering GEPs can also benefit other passes such as LICM and CGP.
  148. // LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
  149. // indices if any of the indices is loop-variant. If we lower such a GEP into
  150. // invariant parts and variant parts, LICM can hoist/sink those invariant parts.
  151. // CGP (CodeGen Prepare) tries to sink address calculations that match the
  152. // target's addressing modes. A GEP with multiple indices may not match and will
  153. // not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
  154. // them, so we end up with better addressing modes.
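//
// As an illustrative sketch (pseudo-IR with hypothetical names, not from a
// specific test), with %i loop-invariant and %j loop-variant,
//
// %p = gep [32 x float], %a, i64 %i, i64 %j
//
// lowering to single-index GEPs yields roughly
//
// %i.scaled = shl i64 %i, 7 ; 128 == sizeof([32 x float])
// %base = gep i8, %a, i64 %i.scaled ; loop-invariant
// %j.scaled = shl i64 %j, 2 ; 4 == sizeof(float)
// %p = gep i8, %base, i64 %j.scaled
//
// LICM can then hoist %i.scaled and %base out of the loop, and CGP can sink
// the remaining single-index GEP toward its uses.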
  155. //
  156. //===----------------------------------------------------------------------===//
  157. #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
  158. #include "llvm/ADT/APInt.h"
  159. #include "llvm/ADT/DenseMap.h"
  160. #include "llvm/ADT/DepthFirstIterator.h"
  161. #include "llvm/ADT/SmallVector.h"
  162. #include "llvm/Analysis/LoopInfo.h"
  163. #include "llvm/Analysis/MemoryBuiltins.h"
  164. #include "llvm/Analysis/ScalarEvolution.h"
  165. #include "llvm/Analysis/TargetLibraryInfo.h"
  166. #include "llvm/Analysis/TargetTransformInfo.h"
  167. #include "llvm/Analysis/ValueTracking.h"
  168. #include "llvm/IR/BasicBlock.h"
  169. #include "llvm/IR/Constant.h"
  170. #include "llvm/IR/Constants.h"
  171. #include "llvm/IR/DataLayout.h"
  172. #include "llvm/IR/DerivedTypes.h"
  173. #include "llvm/IR/Dominators.h"
  174. #include "llvm/IR/Function.h"
  175. #include "llvm/IR/GetElementPtrTypeIterator.h"
  176. #include "llvm/IR/IRBuilder.h"
  177. #include "llvm/IR/Instruction.h"
  178. #include "llvm/IR/Instructions.h"
  179. #include "llvm/IR/Module.h"
  180. #include "llvm/IR/PassManager.h"
  181. #include "llvm/IR/PatternMatch.h"
  182. #include "llvm/IR/Type.h"
  183. #include "llvm/IR/User.h"
  184. #include "llvm/IR/Value.h"
  185. #include "llvm/InitializePasses.h"
  186. #include "llvm/Pass.h"
  187. #include "llvm/Support/Casting.h"
  188. #include "llvm/Support/CommandLine.h"
  189. #include "llvm/Support/ErrorHandling.h"
  190. #include "llvm/Support/raw_ostream.h"
  191. #include "llvm/Transforms/Scalar.h"
  192. #include "llvm/Transforms/Utils/Local.h"
  193. #include <cassert>
  194. #include <cstdint>
  195. #include <string>
  196. using namespace llvm;
  197. using namespace llvm::PatternMatch;
  198. static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
  199. "disable-separate-const-offset-from-gep", cl::init(false),
  200. cl::desc("Do not separate the constant offset from a GEP instruction"),
  201. cl::Hidden);
  202. // Setting this flag may emit false positives when the input module already
  203. // contains dead instructions. Therefore, we set it only in unit tests that are
  204. // free of dead code.
  205. static cl::opt<bool>
  206. VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
  207. cl::desc("Verify this pass produces no dead code"),
  208. cl::Hidden);
  209. namespace {
  210. /// A helper class for separating a constant offset from a GEP index.
  211. ///
  212. /// In real programs, a GEP index may be more complicated than a simple addition
  213. /// of something and a constant integer which can be trivially split. For
  214. /// example, to split ((a << 3) | 5) + b, we need to search deeper for the
  215. /// constant offset, so that we can separate the index into (a << 3) + b and 5.
  216. ///
  217. /// Therefore, this class looks into the expression that computes a given GEP
  218. /// index, and tries to find a constant integer that can be hoisted to the
  219. /// outermost level of the expression as an addition. Not every constant in an
  220. /// expression can be hoisted out. e.g., we cannot transform (b * (a + 5)) to
  221. /// (b * a + 5), nor (3 * (a + 5)) to (3 * a + 5); in the latter case, however,
  222. /// -instcombine has probably already optimized (3 * (a + 5)) to (3 * a + 15).
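///
/// As an illustrative sketch (hypothetical IR, not taken from a test), if a
/// GEP index is computed as
///   %shl = shl i64 %a, 3
///   %or  = or i64 %shl, 5      ; no common bits with %shl, so behaves as add
///   %idx = add i64 %or, %b
/// this class can rebuild the index as (%shl + %b) and report the constant 5,
/// which the caller then folds into the GEP's accumulated byte offset.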
  223. class ConstantOffsetExtractor {
  224. public:
  225. /// Extracts a constant offset from the given GEP index. It returns the
  226. /// new index representing the remainder (equal to the original index minus
  227. /// the constant offset), or nullptr if we cannot extract a constant offset.
  228. /// \p Idx The given GEP index
  229. /// \p GEP The given GEP
  230. /// \p UserChainTail Outputs the tail of UserChain so that we can
  231. /// garbage-collect unused instructions in UserChain.
  232. static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
  233. User *&UserChainTail, const DominatorTree *DT);
  234. /// Looks for a constant offset from the given GEP index without extracting
  235. /// it. It returns the numeric value of the constant offset it finds (0 if it
  236. /// fails). The meanings of the arguments are the same as in Extract.
  237. static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
  238. const DominatorTree *DT);
  239. private:
  240. ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
  241. : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
  242. }
  243. /// Searches the expression that computes V for a non-zero constant C s.t.
  244. /// V can be reassociated into the form V' + C. If the search is
  245. /// successful, returns C and updates UserChain as a def-use chain from C to V;
  246. /// otherwise, UserChain is left empty.
  247. ///
  248. /// \p V The given expression
  249. /// \p SignExtended Whether V will be sign-extended in the computation of the
  250. /// GEP index
  251. /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  252. /// GEP index
  253. /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
  254. /// an index of an inbounds GEP is guaranteed to be
  255. /// non-negative. Leveraging this, we can better split
  256. /// inbounds GEPs.
  257. APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
  258. /// A helper function to look into both operands of a binary operator.
  259. APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
  260. bool ZeroExtended);
  261. /// After finding the constant offset C from the GEP index I, we build a new
  262. /// index I' s.t. I' + C = I. This function builds and returns the new
  263. /// index I' according to UserChain produced by function "find".
  264. ///
  265. /// The building conceptually takes two steps:
  266. /// 1) iteratively distribute s/zext towards the leaves of the expression tree
  267. /// that computes I
  268. /// 2) reassociate the expression tree to the form I' + C.
  269. ///
  270. /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  271. /// sext to a, b and 5 so that we have
  272. /// sext(a) + (sext(b) + 5).
  273. /// Then, we reassociate it to
  274. /// (sext(a) + sext(b)) + 5.
  275. /// Given this form, we know I' is sext(a) + sext(b).
  276. Value *rebuildWithoutConstOffset();
  277. /// After the first step of rebuilding the GEP index without the constant
  278. /// offset, distribute s/zext to the operands of all operators in UserChain.
  279. /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
  280. /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  281. ///
  282. /// The function also updates UserChain to point to new subexpressions after
  283. /// distributing s/zext. e.g., the old UserChain of the above example is
  284. /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  285. /// and the new UserChain is
  286. /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  287. /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
  288. ///
  289. /// \p ChainIndex The index to UserChain. ChainIndex is initially
  290. /// UserChain.size() - 1, and is decremented during
  291. /// the recursion.
  292. Value *distributeExtsAndCloneChain(unsigned ChainIndex);
  293. /// Reassociates the GEP index to the form I' + C and returns I'.
  294. Value *removeConstOffset(unsigned ChainIndex);
  295. /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  296. /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  297. /// returns "sext i32 (zext i16 V to i32) to i64".
  298. Value *applyExts(Value *V);
  299. /// A helper function that returns whether we can trace into the operands
  300. /// of binary operator BO for a constant offset.
  301. ///
  302. /// \p SignExtended Whether BO is surrounded by sext
  303. /// \p ZeroExtended Whether BO is surrounded by zext
  304. /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
  305. /// array index.
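///
/// For example (an illustrative sketch with hypothetical values), given
///   %add = add i32 %a, 1              ; no nsw
///   %idx = sext i32 %add to i64
/// we must not trace into %add, because if %add wraps (e.g. %a == INT_MAX),
/// sext(%a + 1) != sext(%a) + 1.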
  306. bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
  307. bool NonNegative);
  308. /// The path from the constant offset to the old GEP index. e.g., if the GEP
  309. /// index is "a * b + (c + 5)", then after running function find, UserChain[0] will
  310. /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
  311. /// UserChain[2] will be the entire expression "a * b + (c + 5)".
  312. ///
  313. /// This path helps to rebuild the new GEP index.
  314. SmallVector<User *, 8> UserChain;
  315. /// A data structure used in rebuildWithoutConstOffset. Contains all
  316. /// sext/zext instructions along UserChain.
  317. SmallVector<CastInst *, 16> ExtInsts;
  318. /// Insertion position of cloned instructions.
  319. Instruction *IP;
  320. const DataLayout &DL;
  321. const DominatorTree *DT;
  322. };
  323. /// A pass that tries to split every GEP in the function into a variadic
  324. /// base and a constant offset. It is a FunctionPass because searching for the
  325. /// constant offset may inspect other basic blocks.
  326. class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
  327. public:
  328. static char ID;
  329. SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
  330. : FunctionPass(ID), LowerGEP(LowerGEP) {
  331. initializeSeparateConstOffsetFromGEPLegacyPassPass(
  332. *PassRegistry::getPassRegistry());
  333. }
  334. void getAnalysisUsage(AnalysisUsage &AU) const override {
  335. AU.addRequired<DominatorTreeWrapperPass>();
  336. AU.addRequired<ScalarEvolutionWrapperPass>();
  337. AU.addRequired<TargetTransformInfoWrapperPass>();
  338. AU.addRequired<LoopInfoWrapperPass>();
  339. AU.setPreservesCFG();
  340. AU.addRequired<TargetLibraryInfoWrapperPass>();
  341. }
  342. bool runOnFunction(Function &F) override;
  343. private:
  344. bool LowerGEP;
  345. };
  346. /// The implementation shared by the legacy and new pass managers. It tries to
  347. /// split every GEP in the function into a variadic base and a constant offset;
  348. /// searching for the constant offset may inspect other basic blocks.
  349. class SeparateConstOffsetFromGEP {
  350. public:
  351. SeparateConstOffsetFromGEP(
  352. DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI,
  353. TargetLibraryInfo *TLI,
  354. function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
  355. : DT(DT), SE(SE), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}
  356. bool run(Function &F);
  357. private:
  358. /// Tries to split the given GEP into a variadic base and a constant offset,
  359. /// and returns true if the splitting succeeds.
  360. bool splitGEP(GetElementPtrInst *GEP);
  361. /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  362. /// Function splitGEP already split the original GEP into a variadic part and
  363. /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  364. /// variadic part into a set of GEPs with a single index and applies
  365. /// AccumulativeByteOffset to it.
  366. /// \p Variadic The variadic part of the original GEP.
  367. /// \p AccumulativeByteOffset The constant offset.
  368. void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
  369. int64_t AccumulativeByteOffset);
  370. /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
  371. /// Function splitGEP already split the original GEP into a variadic part and
  372. /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  373. /// variadic part into a set of arithmetic operations and applies
  374. /// AccumulativeByteOffset to it.
  375. /// \p Variadic The variadic part of the original GEP.
  376. /// \p AccumulativeByteOffset The constant offset.
  377. void lowerToArithmetics(GetElementPtrInst *Variadic,
  378. int64_t AccumulativeByteOffset);
  379. /// Finds the constant offset within each index and accumulates them. If
  380. /// LowerGEP is true, it looks in indices of both sequential and structure
  381. /// types; otherwise, it only looks in sequential indices. The output
  382. /// NeedsExtraction indicates whether we successfully found a non-zero constant
  383. /// offset.
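///
/// For example (an illustrative sketch in pseudo-IR, hypothetical names), given
///   %j = add i64 %y, 3
///   %p = gep [10 x [20 x float]], %a, i64 %i, i64 %x, i64 %j
/// the constant 3 found in the last index contributes 3 * sizeof(float) == 12
/// bytes to the accumulated byte offset.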
  384. int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
  385. /// Canonicalize array indices to pointer-size integers. This helps to
  386. /// simplify the logic of splitting a GEP. For example, if a + b is a
  387. /// pointer-size integer, we have
  388. /// gep base, a + b = gep (gep base, a), b
  389. /// However, this equality may not hold if the size of a + b is smaller than
  390. /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  391. /// pointer size before computing the address
  392. /// (http://llvm.org/docs/LangRef.html#id181).
  393. ///
  394. /// This canonicalization is very likely already done in clang and
  395. /// instcombine. Therefore, the program will probably remain the same.
  396. ///
  397. /// Returns true if the module changes.
  398. ///
  399. /// Verified in @i32_add in split-gep.ll
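///
/// An illustrative sketch (hypothetical IR) with 64-bit pointers:
///   %p = getelementptr inbounds float, float* %a, i32 %i
/// is canonicalized to
///   %idxprom = sext i32 %i to i64
///   %p = getelementptr inbounds float, float* %a, i64 %idxprom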
  400. bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
  401. /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  402. /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  403. /// the constant offset. After extraction, it becomes desirable to reunite the
  404. /// distributed sexts. For example,
  405. ///
  406. /// &a[sext(i +nsw (j +nsw 5))]
  407. /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
  408. /// => constant extraction &a[sext(i) + sext(j)] + 5
  409. /// => reunion &a[sext(i +nsw j)] + 5
  410. bool reuniteExts(Function &F);
  411. /// A helper that reunites sexts in an instruction.
  412. bool reuniteExts(Instruction *I);
  413. /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
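/// For instance (a hypothetical sketch), if <Key> is the SCEV of (%a + %b) and
/// DominatingExprs contains
///   %d = add nsw i64 %a, %b
/// where %d dominates <Dominatee>, then %d is returned and the caller can
/// rewrite sext(%a) + sext(%b) as sext(%d).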
  414. Instruction *findClosestMatchingDominator(
  415. const SCEV *Key, Instruction *Dominatee,
  416. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
  417. /// Verify F is free of dead code.
  418. void verifyNoDeadCode(Function &F);
  419. bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
  420. // Swap the index operands of two GEPs.
  421. void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
  422. // Check if it is safe to swap the index operands of two GEPs.
  423. bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
  424. Loop *CurLoop);
  425. const DataLayout *DL = nullptr;
  426. DominatorTree *DT = nullptr;
  427. ScalarEvolution *SE;
  428. LoopInfo *LI;
  429. TargetLibraryInfo *TLI;
  430. // Retrieved lazily since not always used.
  431. function_ref<TargetTransformInfo &(Function &)> GetTTI;
  432. /// Whether to lower a GEP with multiple indices into arithmetic operations or
  433. /// multiple GEPs with a single index.
  434. bool LowerGEP;
  435. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
  436. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
  437. };
  438. } // end anonymous namespace
  439. char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
  440. INITIALIZE_PASS_BEGIN(
  441. SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
  442. "Split GEPs to a variadic base and a constant offset for better CSE", false,
  443. false)
  444. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  445. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  446. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  447. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  448. INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
  449. INITIALIZE_PASS_END(
  450. SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
  451. "Split GEPs to a variadic base and a constant offset for better CSE", false,
  452. false)
  453. FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
  454. return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
  455. }
  456. bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
  457. bool ZeroExtended,
  458. BinaryOperator *BO,
  459. bool NonNegative) {
  460. // We only consider ADD, SUB and OR, because a non-zero constant found in
  461. // expressions composed of these operations can be easily hoisted as a
  462. // constant offset by reassociation.
  463. if (BO->getOpcode() != Instruction::Add &&
  464. BO->getOpcode() != Instruction::Sub &&
  465. BO->getOpcode() != Instruction::Or) {
  466. return false;
  467. }
  468. Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
  469. // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
  470. // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
  471. // FIXME: this does not appear to be covered by any tests
  472. // (with x86/aarch64 backends at least)
  473. if (BO->getOpcode() == Instruction::Or &&
  474. !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
  475. return false;
  476. // In addition, tracing into BO requires that its surrounding s/zext (if
  477. // any) is distributable to both operands.
  478. //
  479. // Suppose BO = A op B.
  480. // SignExtended | ZeroExtended | Distributable?
  481. // --------------+--------------+----------------------------------
  482. // 0 | 0 | true because no s/zext exists
  483. // 0 | 1 | zext(BO) == zext(A) op zext(B)
  484. // 1 | 0 | sext(BO) == sext(A) op sext(B)
  485. // 1 | 1 | zext(sext(BO)) ==
  486. // | | zext(sext(A)) op zext(sext(B))
  487. if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
  488. // If a + b >= 0 and (a >= 0 or b >= 0), then
  489. // sext(a + b) = sext(a) + sext(b)
  490. // even if the addition is not marked nsw.
  491. //
  492. // Leveraging this invariant, we can trace into an sext'ed inbound GEP
  493. // index if the constant offset is non-negative.
  494. //
  495. // Verified in @sext_add in split-gep.ll.
  496. if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
  497. if (!ConstLHS->isNegative())
  498. return true;
  499. }
  500. if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
  501. if (!ConstRHS->isNegative())
  502. return true;
  503. }
  504. }
  505. // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
  506. // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
  507. if (BO->getOpcode() == Instruction::Add ||
  508. BO->getOpcode() == Instruction::Sub) {
  509. if (SignExtended && !BO->hasNoSignedWrap())
  510. return false;
  511. if (ZeroExtended && !BO->hasNoUnsignedWrap())
  512. return false;
  513. }
  514. return true;
  515. }
  516. APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
  517. bool SignExtended,
  518. bool ZeroExtended) {
  519. // Save off the current height of the chain, in case we need to restore it.
  520. size_t ChainLength = UserChain.size();
  521. // BO being non-negative does not shed light on whether its operands are
  522. // non-negative. Clear the NonNegative flag here.
  523. APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
  524. /* NonNegative */ false);
  525. // If we found a constant offset in the left operand, stop and return that.
  526. // This shortcut might cause us to miss opportunities of combining the
  527. // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
  528. // However, such cases are probably already handled by -instcombine,
  529. // given this pass runs after the standard optimizations.
  530. if (ConstantOffset != 0) return ConstantOffset;
  531. // Reset the chain back to where it was when we started exploring this node,
  532. // since visiting the LHS didn't pan out.
  533. UserChain.resize(ChainLength);
  534. ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
  535. /* NonNegative */ false);
  536. // If BO is a sub operator, negate the constant offset found in the right
  537. // operand.
  538. if (BO->getOpcode() == Instruction::Sub)
  539. ConstantOffset = -ConstantOffset;
  540. // If RHS wasn't a suitable candidate either, reset the chain again.
  541. if (ConstantOffset == 0)
  542. UserChain.resize(ChainLength);
  543. return ConstantOffset;
  544. }
  545. APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
  546. bool ZeroExtended, bool NonNegative) {
  547. // TODO(jingyue): We could trace into integer/pointer casts, such as
  548. // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
  549. // integers because it gives good enough results for our benchmarks.
  550. unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
  551. // We cannot do much with Values that are not a User, such as an Argument.
  552. User *U = dyn_cast<User>(V);
  553. if (U == nullptr) return APInt(BitWidth, 0);
  554. APInt ConstantOffset(BitWidth, 0);
  555. if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
  556. // Hooray, we found it!
  557. ConstantOffset = CI->getValue();
  558. } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
  559. // Trace into subexpressions for more hoisting opportunities.
  560. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
  561. ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
  562. } else if (isa<TruncInst>(V)) {
  563. ConstantOffset =
  564. find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
  565. .trunc(BitWidth);
  566. } else if (isa<SExtInst>(V)) {
  567. ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
  568. ZeroExtended, NonNegative).sext(BitWidth);
  569. } else if (isa<ZExtInst>(V)) {
  570. // As an optimization, we can clear the SignExtended flag because
  571. // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
  572. //
  573. // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
  574. ConstantOffset =
  575. find(U->getOperand(0), /* SignExtended */ false,
  576. /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
  577. }
  578. // If we found a non-zero constant offset, add it to the path for
  579. // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
  580. // help this optimization.
  581. if (ConstantOffset != 0)
  582. UserChain.push_back(U);
  583. return ConstantOffset;
  584. }
  585. Value *ConstantOffsetExtractor::applyExts(Value *V) {
  586. Value *Current = V;
  587. // ExtInsts is built in the use-def order. Therefore, we apply them to V
  588. // in the reversed order.
  589. for (CastInst *I : llvm::reverse(ExtInsts)) {
  590. if (Constant *C = dyn_cast<Constant>(Current)) {
  591. // If Current is a constant, apply s/zext using ConstantExpr::getCast.
  592. // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
  593. Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType());
  594. } else {
  595. Instruction *Ext = I->clone();
  596. Ext->setOperand(0, Current);
  597. Ext->insertBefore(IP);
  598. Current = Ext;
  599. }
  600. }
  601. return Current;
  602. }
  603. Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
  604. distributeExtsAndCloneChain(UserChain.size() - 1);
  605. // Remove all nullptrs (used to be s/zext) from UserChain.
  606. unsigned NewSize = 0;
  607. for (User *I : UserChain) {
  608. if (I != nullptr) {
  609. UserChain[NewSize] = I;
  610. NewSize++;
  611. }
  612. }
  613. UserChain.resize(NewSize);
  614. return removeConstOffset(UserChain.size() - 1);
  615. }
  616. Value *
  617. ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  618. User *U = UserChain[ChainIndex];
  619. if (ChainIndex == 0) {
  620. assert(isa<ConstantInt>(U));
  621. // If U is a ConstantInt, applyExts will return a ConstantInt as well.
  622. return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  623. }
  624. if (CastInst *Cast = dyn_cast<CastInst>(U)) {
  625. assert(
  626. (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
  627. "Only following instructions can be traced: sext, zext & trunc");
  628. ExtInsts.push_back(Cast);
  629. UserChain[ChainIndex] = nullptr;
  630. return distributeExtsAndCloneChain(ChainIndex - 1);
  631. }
  632. // Function find only traces into BinaryOperator and CastInst.
  633. BinaryOperator *BO = cast<BinaryOperator>(U);
  634. // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  635. unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  636. Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  637. Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
  638. BinaryOperator *NewBO = nullptr;
  639. if (OpNo == 0) {
  640. NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
  641. BO->getName(), IP);
  642. } else {
  643. NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
  644. BO->getName(), IP);
  645. }
  646. return UserChain[ChainIndex] = NewBO;
  647. }
  648. Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
  649. if (ChainIndex == 0) {
  650. assert(isa<ConstantInt>(UserChain[ChainIndex]));
  651. return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
  652. }
  653. BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
  654. assert((BO->use_empty() || BO->hasOneUse()) &&
  655. "distributeExtsAndCloneChain clones each BinaryOperator in "
  656. "UserChain, so no one should be used more than "
  657. "once");
  658. unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  659. assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
  660. Value *NextInChain = removeConstOffset(ChainIndex - 1);
  661. Value *TheOther = BO->getOperand(1 - OpNo);
  662. // If NextInChain is 0 and not the LHS of a sub, we can simplify the
  663. // sub-expression to be just TheOther.
  664. if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
  665. if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
  666. return TheOther;
  667. }
  668. BinaryOperator::BinaryOps NewOp = BO->getOpcode();
  669. if (BO->getOpcode() == Instruction::Or) {
  670. // Rebuild "or" as "add", because "or" may be invalid for the new
  671. // expression.
  672. //
  673. // For instance, given
  674. // a | (b + 5) where a and b + 5 have no common bits,
  675. // we can extract 5 as the constant offset.
  676. //
  677. // However, reusing the "or" in the new index would give us
  678. // (a | b) + 5
  679. // which does not equal a | (b + 5).
  680. //
  681. // Replacing the "or" with "add" is fine, because
  682. // a | (b + 5) = a + (b + 5) = (a + b) + 5
  683. NewOp = Instruction::Add;
  684. }
  685. BinaryOperator *NewBO;
  686. if (OpNo == 0) {
  687. NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
  688. } else {
  689. NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
  690. }
  691. NewBO->takeName(BO);
  692. return NewBO;
  693. }
  694. Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
  695. User *&UserChainTail,
  696. const DominatorTree *DT) {
  697. ConstantOffsetExtractor Extractor(GEP, DT);
  698. // Find a non-zero constant offset first.
  699. APInt ConstantOffset =
  700. Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
  701. GEP->isInBounds());
  702. if (ConstantOffset == 0) {
  703. UserChainTail = nullptr;
  704. return nullptr;
  705. }
  706. // Separates the constant offset from the GEP index.
  707. Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
  708. UserChainTail = Extractor.UserChain.back();
  709. return IdxWithoutConstOffset;
  710. }
  711. int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
  712. const DominatorTree *DT) {
  713. // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
  714. return ConstantOffsetExtractor(GEP, DT)
  715. .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
  716. GEP->isInBounds())
  717. .getSExtValue();
  718. }
  719. bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
  720. GetElementPtrInst *GEP) {
  721. bool Changed = false;
  722. Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  723. gep_type_iterator GTI = gep_type_begin(*GEP);
  724. for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
  725. I != E; ++I, ++GTI) {
  726. // Skip struct member indices which must be i32.
  727. if (GTI.isSequential()) {
  728. if ((*I)->getType() != IntPtrTy) {
  729. *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
  730. Changed = true;
  731. }
  732. }
  733. }
  734. return Changed;
  735. }
  736. int64_t
  737. SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
  738. bool &NeedsExtraction) {
  739. NeedsExtraction = false;
  740. int64_t AccumulativeByteOffset = 0;
  741. gep_type_iterator GTI = gep_type_begin(*GEP);
  742. for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
  743. if (GTI.isSequential()) {
  744. // Constant offsets of scalable types are not really constant.
  745. if (isa<ScalableVectorType>(GTI.getIndexedType()))
  746. continue;
  747. // Tries to extract a constant offset from this GEP index.
  748. int64_t ConstantOffset =
  749. ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
  750. if (ConstantOffset != 0) {
  751. NeedsExtraction = true;
  752. // A GEP may have multiple indices. We accumulate the extracted
  753. // constant offset to a byte offset, and later offset the remainder of
  754. // the original GEP with this byte offset.
  755. AccumulativeByteOffset +=
  756. ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
  757. }
  758. } else if (LowerGEP) {
  759. StructType *StTy = GTI.getStructType();
  760. uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
  761. // Skip field 0 as the offset is always 0.
  762. if (Field != 0) {
  763. NeedsExtraction = true;
  764. AccumulativeByteOffset +=
  765. DL->getStructLayout(StTy)->getElementOffset(Field);
  766. }
  767. }
  768. }
  769. return AccumulativeByteOffset;
  770. }
  771. void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
  772. GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  773. IRBuilder<> Builder(Variadic);
  774. Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  775. Type *I8PtrTy =
  776. Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
  777. Value *ResultPtr = Variadic->getOperand(0);
  778. Loop *L = LI->getLoopFor(Variadic->getParent());
  779. // The base is a swap candidate only if it is loop invariant and used at most once in the loop.
  780. bool isSwapCandidate =
  781. L && L->isLoopInvariant(ResultPtr) &&
  782. !hasMoreThanOneUseInLoop(ResultPtr, L);
  783. Value *FirstResult = nullptr;
  784. if (ResultPtr->getType() != I8PtrTy)
  785. ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
  786. gep_type_iterator GTI = gep_type_begin(*Variadic);
  787. // Create an ugly GEP for each sequential index. We don't create GEPs for
  788. // structure indices, as they are accumulated in the constant offset index.
  789. for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
  790. if (GTI.isSequential()) {
  791. Value *Idx = Variadic->getOperand(I);
  792. // Skip zero indices.
  793. if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
  794. if (CI->isZero())
  795. continue;
  796. APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
  797. DL->getTypeAllocSize(GTI.getIndexedType()));
  798. // Scale the index by element size.
  799. if (ElementSize != 1) {
  800. if (ElementSize.isPowerOf2()) {
  801. Idx = Builder.CreateShl(
  802. Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
  803. } else {
  804. Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
  805. }
  806. }
  807. // Create an ugly GEP with a single index for each index.
  808. ResultPtr =
  809. Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
  810. if (FirstResult == nullptr)
  811. FirstResult = ResultPtr;
  812. }
  813. }
  814. // Create a GEP with the constant offset index.
  815. if (AccumulativeByteOffset != 0) {
  816. Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
  817. ResultPtr =
  818. Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
  819. } else
  820. isSwapCandidate = false;
  821. // If we created a GEP with a constant index, and the base is loop invariant,
  822. // then we swap the first GEP's index with the constant one, so LICM can move
  823. // the constant-index GEP out of the loop later.
  824. auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  825. auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
  826. if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
  827. swapGEPOperand(FirstGEP, SecondGEP);
  828. if (ResultPtr->getType() != Variadic->getType())
  829. ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
  830. Variadic->replaceAllUsesWith(ResultPtr);
  831. Variadic->eraseFromParent();
  832. }
  833. void
  834. SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
  835. int64_t AccumulativeByteOffset) {
  836. IRBuilder<> Builder(Variadic);
  837. Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  838. Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
  839. gep_type_iterator GTI = gep_type_begin(*Variadic);
  840. // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  841. // don't create arithmetic for structure indices, as they are accumulated
  842. // in the constant offset.
  843. for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
  844. if (GTI.isSequential()) {
  845. Value *Idx = Variadic->getOperand(I);
  846. // Skip zero indices.
  847. if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
  848. if (CI->isZero())
  849. continue;
  850. APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
  851. DL->getTypeAllocSize(GTI.getIndexedType()));
  852. // Scale the index by element size.
  853. if (ElementSize != 1) {
  854. if (ElementSize.isPowerOf2()) {
  855. Idx = Builder.CreateShl(
  856. Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
  857. } else {
  858. Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
  859. }
  860. }
  861. // Create an ADD for each index.
  862. ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
  863. }
  864. }
  865. // Create an ADD for the constant offset index.
  866. if (AccumulativeByteOffset != 0) {
  867. ResultPtr = Builder.CreateAdd(
  868. ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
  869. }
  870. ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
  871. Variadic->replaceAllUsesWith(ResultPtr);
  872. Variadic->eraseFromParent();
  873. }
  874. bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  875. // Skip vector GEPs.
  876. if (GEP->getType()->isVectorTy())
  877. return false;
  878. // The backend can already nicely handle the case where all indices are
  879. // constant.
  880. if (GEP->hasAllConstantIndices())
  881. return false;
  882. bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
  883. bool NeedsExtraction;
  884. int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
  885. if (!NeedsExtraction)
  886. return Changed;
  887. TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
  888. // If LowerGEP is disabled, before really splitting the GEP, check whether the
  889. // backend supports the addressing mode we are about to produce. If not, this
  890. // splitting probably won't be beneficial.
  891. // If LowerGEP is enabled, even if the extracted constant offset cannot match
  892. // the addressing mode, we can still optimize the other lowered parts of the
  893. // variable indices. Therefore, we don't check for addressing modes in that
  894. // case.
  895. if (!LowerGEP) {
  896. unsigned AddrSpace = GEP->getPointerAddressSpace();
  897. if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
  898. /*BaseGV=*/nullptr, AccumulativeByteOffset,
  899. /*HasBaseReg=*/true, /*Scale=*/0,
  900. AddrSpace)) {
  901. return Changed;
  902. }
  903. }
  904. // Remove the constant offset in each sequential index. The resultant GEP
  905. // computes the variadic base.
  906. // Notice that we don't remove struct field indices here. If LowerGEP is
  907. // disabled, a structure index is not accumulated and we still use the old
  908. // one. If LowerGEP is enabled, a structure index is accumulated in the
  909. // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
  910. // handle the constant offset and won't need a new structure index.
  911. gep_type_iterator GTI = gep_type_begin(*GEP);
  912. for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
  913. if (GTI.isSequential()) {
  914. // Constant offsets of scalable types are not really constant.
  915. if (isa<ScalableVectorType>(GTI.getIndexedType()))
  916. continue;
  917. // Splits this GEP index into a variadic part and a constant offset, and
  918. // uses the variadic part as the new index.
  919. Value *OldIdx = GEP->getOperand(I);
  920. User *UserChainTail;
  921. Value *NewIdx =
  922. ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
  923. if (NewIdx != nullptr) {
  924. // Switches to the index with the constant offset removed.
  925. GEP->setOperand(I, NewIdx);
  926. // After switching to the new index, we can garbage-collect UserChain
  927. // and the old index if they are not used.
  928. RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
  929. RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
  930. }
  931. }
  932. }
  933. // Clear the inbounds attribute because the new index may be off-bound.
  934. // e.g.,
  935. //
  936. // b = add i64 a, 5
  937. // addr = gep inbounds float, float* p, i64 b
  938. //
  939. // is transformed to:
  940. //
  941. // addr2 = gep float, float* p, i64 a ; inbounds removed
  942. // addr = gep inbounds float, float* addr2, i64 5
  943. //
  944. // If a is -4, although the old index b is in bounds, the new index a is
  945. // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
  946. // inbounds keyword is not present, the offsets are added to the base
  947. // address with silently-wrapping two's complement arithmetic".
  948. // Therefore, the final code will be semantically equivalent.
  949. //
  950. // TODO(jingyue): do some range analysis to keep as many inbounds as
  951. // possible. GEPs with inbounds are more friendly to alias analysis.
  952. bool GEPWasInBounds = GEP->isInBounds();
  953. GEP->setIsInBounds(false);
  954. // Lowers a GEP to either GEPs with a single index or arithmetic operations.
  955. if (LowerGEP) {
  956. // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
  957. // arithmetic operations if the target uses alias analysis in codegen.
  958. if (TTI.useAA())
  959. lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
  960. else
  961. lowerToArithmetics(GEP, AccumulativeByteOffset);
  962. return true;
  963. }
  964. // No need to create another GEP if the accumulative byte offset is 0.
  965. if (AccumulativeByteOffset == 0)
  966. return true;
  967. // Offsets the base with the accumulative byte offset.
  968. //
  969. // %gep ; the base
  970. // ... %gep ...
  971. //
  972. // => add the offset
  973. //
  974. // %gep2 ; clone of %gep
  975. // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  976. // %gep ; will be removed
  977. // ... %gep ...
  978. //
  979. // => replace all uses of %gep with %new.gep and remove %gep
  980. //
  981. // %gep2 ; clone of %gep
  982. // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  983. // ... %new.gep ...
  984. //
  985. // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
  986. // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
  987. // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
  988. // type of %gep.
  989. //
  990. // %gep2 ; clone of %gep
  991. // %0 = bitcast %gep2 to i8*
  992. // %uglygep = gep %0, <offset>
  993. // %new.gep = bitcast %uglygep to <type of %gep>
  994. // ... %new.gep ...
  995. Instruction *NewGEP = GEP->clone();
  996. NewGEP->insertBefore(GEP);
  997. // Per the ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
  998. // unsigned. Therefore, we cast ElementTypeSizeOfGEP to a signed type because it
  999. // is used with signed integers (such as AccumulativeByteOffset) later.
  1000. int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
  1001. DL->getTypeAllocSize(GEP->getResultElementType()));
  1002. Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  1003. if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
  1004. // Very likely. As long as %gep is naturally aligned, the byte offset we
  1005. // extracted should be a multiple of sizeof(*%gep).
  1006. int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
  1007. NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
  1008. ConstantInt::get(IntPtrTy, Index, true),
  1009. GEP->getName(), GEP);
  1010. NewGEP->copyMetadata(*GEP);
  1011. // Inherit the inbounds attribute of the original GEP.
  1012. cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
  1013. } else {
  1014. // Unlikely but possible. For example,
  1015. // #pragma pack(1)
  1016. // struct S {
  1017. // int a[3];
  1018. // int64 b[8];
  1019. // };
  1020. // #pragma pack()
  1021. //
  1022. // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
  1023. // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
  1024. // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
  1025. // sizeof(int64).
  1026. //
  1027. // Emit an uglygep in this case.
  1028. IRBuilder<> Builder(GEP);
  1029. Type *I8PtrTy =
  1030. Builder.getInt8Ty()->getPointerTo(GEP->getPointerAddressSpace());
  1031. NewGEP = cast<Instruction>(Builder.CreateGEP(
  1032. Builder.getInt8Ty(), Builder.CreateBitCast(NewGEP, I8PtrTy),
  1033. {ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true)}, "uglygep",
  1034. GEPWasInBounds));
  1035. NewGEP->copyMetadata(*GEP);
  1036. NewGEP = cast<Instruction>(Builder.CreateBitCast(NewGEP, GEP->getType()));
  1037. }
  1038. GEP->replaceAllUsesWith(NewGEP);
  1039. GEP->eraseFromParent();
  1040. return true;
  1041. }
  1042. bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
  1043. if (skipFunction(F))
  1044. return false;
  1045. auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1046. auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1047. auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1048. auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  1049. auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
  1050. return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1051. };
  1052. SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  1053. return Impl.run(F);
  1054. }
  1055. bool SeparateConstOffsetFromGEP::run(Function &F) {
  1056. if (DisableSeparateConstOffsetFromGEP)
  1057. return false;
  1058. DL = &F.getParent()->getDataLayout();
  1059. bool Changed = false;
  1060. for (BasicBlock &B : F) {
  1061. if (!DT->isReachableFromEntry(&B))
  1062. continue;
  1063. for (Instruction &I : llvm::make_early_inc_range(B))
  1064. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
  1065. Changed |= splitGEP(GEP);
  1066. // No need to split GEP ConstantExprs because all their indices are constant
  1067. // already.
  1068. }
  1069. Changed |= reuniteExts(F);
  1070. if (VerifyNoDeadCode)
  1071. verifyNoDeadCode(F);
  1072. return Changed;
  1073. }
  1074. Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
  1075. const SCEV *Key, Instruction *Dominatee,
  1076. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
  1077. auto Pos = DominatingExprs.find(Key);
  1078. if (Pos == DominatingExprs.end())
  1079. return nullptr;
  1080. auto &Candidates = Pos->second;
  1081. // Because we process the basic blocks in pre-order of the dominator tree, a
  1082. // candidate that doesn't dominate the current instruction won't dominate any
  1083. // future instruction either. Therefore, we pop it out of the stack. This
  1084. // optimization makes the algorithm O(n).
  1085. while (!Candidates.empty()) {
  1086. Instruction *Candidate = Candidates.back();
  1087. if (DT->dominates(Candidate, Dominatee))
  1088. return Candidate;
  1089. Candidates.pop_back();
  1090. }
  1091. return nullptr;
  1092. }
  1093. bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  1094. if (!SE->isSCEVable(I->getType()))
  1095. return false;
  1096. // Dom: LHS+RHS
  1097. // I: sext(LHS)+sext(RHS)
  1098. // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
  1099. // TODO: handle zext
  1100. Value *LHS = nullptr, *RHS = nullptr;
  1101. if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
  1102. if (LHS->getType() == RHS->getType()) {
  1103. const SCEV *Key =
  1104. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1105. if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
  1106. Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
  1107. NewSExt->takeName(I);
  1108. I->replaceAllUsesWith(NewSExt);
  1109. RecursivelyDeleteTriviallyDeadInstructions(I);
  1110. return true;
  1111. }
  1112. }
  1113. } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
  1114. if (LHS->getType() == RHS->getType()) {
  1115. const SCEV *Key =
  1116. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1117. if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
  1118. Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
  1119. NewSExt->takeName(I);
  1120. I->replaceAllUsesWith(NewSExt);
  1121. RecursivelyDeleteTriviallyDeadInstructions(I);
  1122. return true;
  1123. }
  1124. }
  1125. }
  1126. // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
  1127. if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
  1128. if (programUndefinedIfPoison(I)) {
  1129. const SCEV *Key =
  1130. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1131. DominatingAdds[Key].push_back(I);
  1132. }
  1133. } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
  1134. if (programUndefinedIfPoison(I)) {
  1135. const SCEV *Key =
  1136. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1137. DominatingSubs[Key].push_back(I);
  1138. }
  1139. }
  1140. return false;
  1141. }
  1142. bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
  1143. bool Changed = false;
  1144. DominatingAdds.clear();
  1145. DominatingSubs.clear();
  1146. for (const auto Node : depth_first(DT)) {
  1147. BasicBlock *BB = Node->getBlock();
  1148. for (Instruction &I : llvm::make_early_inc_range(*BB))
  1149. Changed |= reuniteExts(&I);
  1150. }
  1151. return Changed;
  1152. }
  1153. void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
  1154. for (BasicBlock &B : F) {
  1155. for (Instruction &I : B) {
  1156. if (isInstructionTriviallyDead(&I)) {
  1157. std::string ErrMessage;
  1158. raw_string_ostream RSO(ErrMessage);
  1159. RSO << "Dead instruction detected!\n" << I << "\n";
  1160. llvm_unreachable(RSO.str().c_str());
  1161. }
  1162. }
  1163. }
  1164. }
  1165. bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
  1166. GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  1167. if (!FirstGEP || !FirstGEP->hasOneUse())
  1168. return false;
  1169. if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
  1170. return false;
  1171. if (FirstGEP == SecondGEP)
  1172. return false;
  1173. unsigned FirstNum = FirstGEP->getNumOperands();
  1174. unsigned SecondNum = SecondGEP->getNumOperands();
  1175. // Give up if the number of operands is not 2.
  1176. if (FirstNum != SecondNum || FirstNum != 2)
  1177. return false;
  1178. Value *FirstBase = FirstGEP->getOperand(0);
  1179. Value *SecondBase = SecondGEP->getOperand(0);
  1180. Value *FirstOffset = FirstGEP->getOperand(1);
  1181. // Give up if the index of the first GEP is loop invariant.
  1182. if (CurLoop->isLoopInvariant(FirstOffset))
  1183. return false;
  1184. // Give up if the bases don't have the same type.
  1185. if (FirstBase->getType() != SecondBase->getType())
  1186. return false;
  1187. Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
  1188. // Check if the second operand of the first GEP has a constant coefficient.
  1189. // For example, in the following code, we won't gain anything by hoisting
  1190. // the second GEP out because the second GEP can be folded away.
  1191. // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  1192. // %67 = shl i64 %scevgep.sum.ur159, 2
  1193. // %uglygep160 = getelementptr i8* %65, i64 %67
  1194. // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
  1195. // Skip a constant shift instruction, which may be generated by splitting GEPs.
  1196. if (FirstOffsetDef && FirstOffsetDef->isShift() &&
  1197. isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
  1198. FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
  1199. // Give up if FirstOffsetDef is an Add or Sub with a constant,
  1200. // because it may not be profitable at all due to constant folding.
  1201. if (FirstOffsetDef)
  1202. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
  1203. unsigned opc = BO->getOpcode();
  1204. if ((opc == Instruction::Add || opc == Instruction::Sub) &&
  1205. (isa<ConstantInt>(BO->getOperand(0)) ||
  1206. isa<ConstantInt>(BO->getOperand(1))))
  1207. return false;
  1208. }
  1209. return true;
  1210. }
  1211. bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
  1212. int UsesInLoop = 0;
  1213. for (User *U : V->users()) {
  1214. if (Instruction *User = dyn_cast<Instruction>(U))
  1215. if (L->contains(User))
  1216. if (++UsesInLoop > 1)
  1217. return true;
  1218. }
  1219. return false;
  1220. }
  1221. void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
  1222. GetElementPtrInst *Second) {
  1223. Value *Offset1 = First->getOperand(1);
  1224. Value *Offset2 = Second->getOperand(1);
  1225. First->setOperand(1, Offset2);
  1226. Second->setOperand(1, Offset1);
  1227. // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
  1228. const DataLayout &DAL = First->getModule()->getDataLayout();
  1229. APInt Offset(DAL.getIndexSizeInBits(
  1230. cast<PointerType>(First->getType())->getAddressSpace()),
  1231. 0);
  1232. Value *NewBase =
  1233. First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  1234. uint64_t ObjectSize;
  1235. if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
  1236. Offset.ugt(ObjectSize)) {
  1237. First->setIsInBounds(false);
  1238. Second->setIsInBounds(false);
  1239. } else
  1240. First->setIsInBounds(true);
  1241. }
  1242. PreservedAnalyses
  1243. SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
  1244. auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  1245. auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  1246. auto *LI = &AM.getResult<LoopAnalysis>(F);
  1247. auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  1248. auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
  1249. return AM.getResult<TargetIRAnalysis>(F);
  1250. };
  1251. SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  1252. if (!Impl.run(F))
  1253. return PreservedAnalyses::all();
  1254. PreservedAnalyses PA;
  1255. PA.preserveSet<CFGAnalyses>();
  1256. return PA;
  1257. }