  1. //===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Loop unrolling may create many similar GEPs for array accesses.
  10. // e.g., a 2-level loop
  11. //
  12. // float a[32][32]; // global variable
  13. //
  14. // for (int i = 0; i < 2; ++i) {
  15. // for (int j = 0; j < 2; ++j) {
  16. // ...
  17. // ... = a[x + i][y + j];
  18. // ...
  19. // }
  20. // }
  21. //
  22. // will probably be unrolled to:
  23. //
  24. // gep %a, 0, %x, %y; load
  25. // gep %a, 0, %x, %y + 1; load
  26. // gep %a, 0, %x + 1, %y; load
  27. // gep %a, 0, %x + 1, %y + 1; load
  28. //
  29. // LLVM's GVN does not use partial redundancy elimination yet, and is thus
  30. // unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization
  31. // incurs a significant slowdown on targets with limited addressing modes. For instance,
  32. // because the PTX target does not support the reg+reg addressing mode, the
  33. // NVPTX backend emits PTX code that literally computes the pointer address of
  34. // each GEP, wasting tons of registers. It emits the following PTX for the
  35. // first load and similar PTX for other loads.
  36. //
  37. // mov.u32 %r1, %x;
  38. // mov.u32 %r2, %y;
  39. // mul.wide.u32 %rl2, %r1, 128;
  40. // mov.u64 %rl3, a;
  41. // add.s64 %rl4, %rl3, %rl2;
  42. // mul.wide.u32 %rl5, %r2, 4;
  43. // add.s64 %rl6, %rl4, %rl5;
  44. // ld.global.f32 %f1, [%rl6];
  45. //
  46. // To reduce the register pressure, the optimization implemented in this file
  47. // merges the common part of a group of GEPs, so we can compute each pointer
  48. // address by adding a simple offset to the common part, saving many registers.
  49. //
  50. // It works by splitting each GEP into a variadic base and a constant offset.
  51. // The variadic base can be computed once and reused by multiple GEPs, and the
  52. // constant offsets can be nicely folded into the reg+immediate addressing mode
  53. // (supported by most targets) without using any extra register.
  54. //
  55. // For instance, we transform the four GEPs and four loads in the above example
  56. // into:
  57. //
  58. // base = gep a, 0, x, y
  59. // load base
  60. // load base + 1 * sizeof(float)
  61. // load base + 32 * sizeof(float)
  62. // load base + 33 * sizeof(float)
  63. //
  64. // Given the transformed IR, a backend that supports the reg+immediate
  65. // addressing mode can easily fold the pointer arithmetics into the loads. For
  66. // example, the NVPTX backend can easily fold the pointer arithmetics into the
  67. // ld.global.f32 instructions, and the resultant PTX uses far fewer registers.
  68. //
  69. // mov.u32 %r1, %x;
  70. // mov.u32 %r2, %y;
  71. // mul.wide.u32 %rl2, %r1, 128;
  72. // mov.u64 %rl3, a;
  73. // add.s64 %rl4, %rl3, %rl2;
  74. // mul.wide.u32 %rl5, %r2, 4;
  75. // add.s64 %rl6, %rl4, %rl5;
  76. // ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
  77. // ld.global.f32 %f2, [%rl6+4]; // much better
  78. // ld.global.f32 %f3, [%rl6+128]; // much better
  79. // ld.global.f32 %f4, [%rl6+132]; // much better
  80. //
  81. // Another improvement enabled by the LowerGEP flag is to lower a GEP with
  82. // multiple indices to either multiple GEPs with a single index or arithmetic
  83. // operations (depending on whether the target uses alias analysis in codegen).
  84. // Such a transformation can have the following benefits:
  85. // (1) It can always extract constants in the indices of structure types.
  86. // (2) After such lowering, there are more optimization opportunities such as
  87. // CSE, LICM and CGP.
  88. //
  89. // E.g. The following GEPs have multiple indices:
  90. // BB1:
  91. // %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
  92. // load %p
  93. // ...
  94. // BB2:
  95. // %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
  96. // load %p2
  97. // ...
  98. //
  99. // While in multi-index form, we cannot CSE the computation shared through the
  100. // common index "i64 %i" across the two GEPs. Lowering the GEPs makes it possible.
  101. // If the target does not use alias analysis in codegen, this pass will
  102. // lower a GEP with multiple indices into arithmetic operations:
  103. // BB1:
  104. // %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
  105. // %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  106. // %3 = add i64 %1, %2 ; CSE opportunity
  107. // %4 = mul i64 %j1, length_of_struct
  108. // %5 = add i64 %3, %4
  109. // %6 = add i64 %5, struct_field_3 ; Constant offset
  110. // %p = inttoptr i64 %6 to i32*
  111. // load %p
  112. // ...
  113. // BB2:
  114. // %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
  115. // %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  116. // %9 = add i64 %7, %8 ; CSE opportunity
  117. // %10 = mul i64 %j2, length_of_struct
  118. // %11 = add i64 %9, %10
  119. // %12 = add i64 %11, struct_field_2 ; Constant offset
  120. // %p2 = inttoptr i64 %12 to i32*
  121. // load %p2
  122. // ...
  123. //
  124. // If the target uses alias analysis in codegen, this pass will lower a GEP
  125. // with multiple indices into multiple GEPs with a single index:
  126. // BB1:
  127. // %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
  128. // %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  129. // %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
  130. // %4 = mul i64 %j1, length_of_struct
  131. // %5 = getelementptr i8* %3, i64 %4
  132. // %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
  133. // %p = bitcast i8* %6 to i32*
  134. // load %p
  135. // ...
  136. // BB2:
  137. // %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
  138. // %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
  139. // %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
  140. // %10 = mul i64 %j2, length_of_struct
  141. // %11 = getelementptr i8* %9, i64 %10
  142. // %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
  143. // %p2 = bitcast i8* %12 to i32*
  144. // load %p2
  145. // ...
  146. //
  147. // Lowering GEPs can also benefit other passes such as LICM and CGP.
  148. // LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
  149. // indices if any of the indices is loop-variant. If we lower such a GEP into
  150. // invariant parts and variant parts, LICM can hoist/sink the invariant parts.
  151. // CGP (CodeGen Prepare) tries to sink address calculations that match the
  152. // target's addressing modes. A GEP with multiple indices may not match and will
  153. // not be sunk. If we lower such a GEP into smaller parts, CGP may sink some of
  154. // them, so we end up with better addressing modes.
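//
// For illustration only (a hypothetical sketch; %inv is loop invariant and
// %iv is not): the multi-index GEP
//   %p = getelementptr [32 x float], [32 x float]* %a, i64 %inv, i64 %iv
// is lowered to roughly
//   %row = getelementptr i8, i8* %base, i64 %inv.scaled  ; loop invariant, hoistable
//   %p   = getelementptr i8, i8* %row, i64 %iv.scaled    ; stays in the loop
// where %base is %a cast to i8* and the .scaled values are the indices
// multiplied by the corresponding element sizes.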
  155. //
  156. //===----------------------------------------------------------------------===//
  157. #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
  158. #include "llvm/ADT/APInt.h"
  159. #include "llvm/ADT/DenseMap.h"
  160. #include "llvm/ADT/DepthFirstIterator.h"
  161. #include "llvm/ADT/SmallVector.h"
  162. #include "llvm/Analysis/LoopInfo.h"
  163. #include "llvm/Analysis/MemoryBuiltins.h"
  164. #include "llvm/Analysis/ScalarEvolution.h"
  165. #include "llvm/Analysis/TargetLibraryInfo.h"
  166. #include "llvm/Analysis/TargetTransformInfo.h"
  167. #include "llvm/Analysis/ValueTracking.h"
  168. #include "llvm/IR/BasicBlock.h"
  169. #include "llvm/IR/Constant.h"
  170. #include "llvm/IR/Constants.h"
  171. #include "llvm/IR/DataLayout.h"
  172. #include "llvm/IR/DerivedTypes.h"
  173. #include "llvm/IR/Dominators.h"
  174. #include "llvm/IR/Function.h"
  175. #include "llvm/IR/GetElementPtrTypeIterator.h"
  176. #include "llvm/IR/IRBuilder.h"
  177. #include "llvm/IR/Instruction.h"
  178. #include "llvm/IR/Instructions.h"
  179. #include "llvm/IR/Module.h"
  180. #include "llvm/IR/PassManager.h"
  181. #include "llvm/IR/PatternMatch.h"
  182. #include "llvm/IR/Type.h"
  183. #include "llvm/IR/User.h"
  184. #include "llvm/IR/Value.h"
  185. #include "llvm/InitializePasses.h"
  186. #include "llvm/Pass.h"
  187. #include "llvm/Support/Casting.h"
  188. #include "llvm/Support/CommandLine.h"
  189. #include "llvm/Support/ErrorHandling.h"
  190. #include "llvm/Support/raw_ostream.h"
  191. #include "llvm/Target/TargetMachine.h"
  192. #include "llvm/Transforms/Scalar.h"
  193. #include "llvm/Transforms/Utils/Local.h"
  194. #include <cassert>
  195. #include <cstdint>
  196. #include <string>
  197. using namespace llvm;
  198. using namespace llvm::PatternMatch;
  199. static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
  200. "disable-separate-const-offset-from-gep", cl::init(false),
  201. cl::desc("Do not separate the constant offset from a GEP instruction"),
  202. cl::Hidden);
  203. // Setting this flag may cause false positives when the input module already
  204. // contains dead instructions. Therefore, we set it only in unit tests that are
  205. // free of dead code.
  206. static cl::opt<bool>
  207. VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
  208. cl::desc("Verify this pass produces no dead code"),
  209. cl::Hidden);
  210. namespace {
  211. /// A helper class for separating a constant offset from a GEP index.
  212. ///
  213. /// In real programs, a GEP index may be more complicated than a simple addition
  214. /// of something and a constant integer that can be trivially split. For
  215. /// example, to split ((a << 3) | 5) + b, we need to search deeper for the
  216. /// constant offset, so that we can separate the index to (a << 3) + b and 5.
  217. ///
  218. /// Therefore, this class looks into the expression that computes a given GEP
  219. /// index, and tries to find a constant integer that can be hoisted to the
  220. /// outermost level of the expression as an addition. Not every constant in an
  221. /// expression can be hoisted out. E.g., we cannot transform (b * (a + 5)) to
  222. /// (b * a + 5); nor can we transform (3 * (a + 5)) to (3 * a + 5). In the latter
  223. /// case, however, -instcombine has probably already folded it into (3 * a + 15).
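///
/// For illustration only (a hypothetical sketch), the ((a << 3) | 5) + b case
/// above corresponds to IR such as
///   %shl = shl i64 %a, 3
///   %or  = or i64 %shl, 5               ; acts as an add: %shl and 5 share no bits
///   %idx = add i64 %or, %b
///   %gep = getelementptr inbounds float, float* %p, i64 %idx
/// The index is rebuilt as (a << 3) + b, and the extracted 5 later becomes a
/// constant byte offset of 5 * sizeof(float) on the resulting GEP.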
  224. class ConstantOffsetExtractor {
  225. public:
  226. /// Extracts a constant offset from the given GEP index. It returns the
  227. /// new index representing the remainder (equal to the original index minus
  228. /// the constant offset), or nullptr if we cannot extract a constant offset.
  229. /// \p Idx The given GEP index
  230. /// \p GEP The given GEP
  231. /// \p UserChainTail Outputs the tail of UserChain so that we can
  232. /// garbage-collect unused instructions in UserChain.
  233. static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
  234. User *&UserChainTail, const DominatorTree *DT);
  235. /// Looks for a constant offset from the given GEP index without extracting
  236. /// it. It returns the numeric value of the constant offset it finds (0 if
  237. /// none is found). The meanings of the arguments are the same as in Extract.
  238. static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
  239. const DominatorTree *DT);
  240. private:
  241. ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
  242. : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
  243. }
  244. /// Searches the expression that computes V for a non-zero constant C s.t.
  245. /// V can be reassociated into the form V' + C. If the search is
  246. /// successful, returns C and updates UserChain as a def-use chain from C to V;
  247. /// otherwise, UserChain is empty.
  248. ///
  249. /// \p V The given expression
  250. /// \p SignExtended Whether V will be sign-extended in the computation of the
  251. /// GEP index
  252. /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  253. /// GEP index
  254. /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
  255. /// an index of an inbounds GEP is guaranteed to be
  256. /// non-negative. Leveraging this, we can better split
  257. /// inbounds GEPs.
  258. APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
  259. /// A helper function to look into both operands of a binary operator.
  260. APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
  261. bool ZeroExtended);
  262. /// After finding the constant offset C from the GEP index I, we build a new
  263. /// index I' s.t. I' + C = I. This function builds and returns the new
  264. /// index I' according to UserChain produced by function "find".
  265. ///
  266. /// The building conceptually takes two steps:
  267. /// 1) iteratively distribute s/zext towards the leaves of the expression tree
  268. /// that computes I
  269. /// 2) reassociate the expression tree to the form I' + C.
  270. ///
  271. /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  272. /// sext to a, b and 5 so that we have
  273. /// sext(a) + (sext(b) + 5).
  274. /// Then, we reassociate it to
  275. /// (sext(a) + sext(b)) + 5.
  276. /// Given this form, we know I' is sext(a) + sext(b).
  277. Value *rebuildWithoutConstOffset();
  278. /// After the first step of rebuilding the GEP index without the constant
  279. /// offset, distribute s/zext to the operands of all operators in UserChain.
  280. /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
  281. /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  282. ///
  283. /// The function also updates UserChain to point to new subexpressions after
  284. /// distributing s/zext. e.g., the old UserChain of the above example is
  285. /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  286. /// and the new UserChain is
  287. /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  288. /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
  289. ///
  290. /// \p ChainIndex The index to UserChain. ChainIndex is initially
  291. /// UserChain.size() - 1, and is decremented during
  292. /// the recursion.
  293. Value *distributeExtsAndCloneChain(unsigned ChainIndex);
  294. /// Reassociates the GEP index to the form I' + C and returns I'.
  295. Value *removeConstOffset(unsigned ChainIndex);
  296. /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  297. /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  298. /// returns "sext i32 (zext i16 V to i32) to i64".
  299. Value *applyExts(Value *V);
  300. /// A helper function that returns whether we can trace into the operands
  301. /// of binary operator BO for a constant offset.
  302. ///
  303. /// \p SignExtended Whether BO is surrounded by sext
  304. /// \p ZeroExtended Whether BO is surrounded by zext
  305. /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
  306. /// array index.
  307. bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
  308. bool NonNegative);
  309. /// The path from the constant offset to the old GEP index. E.g., if the GEP
  310. /// index is "a * b + (c + 5)", then after running function find, UserChain[0] will
  311. /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
  312. /// UserChain[2] will be the entire expression "a * b + (c + 5)".
  313. ///
  314. /// This path helps to rebuild the new GEP index.
  315. SmallVector<User *, 8> UserChain;
  316. /// A data structure used in rebuildWithoutConstOffset. Contains all
  317. /// sext/zext instructions along UserChain.
  318. SmallVector<CastInst *, 16> ExtInsts;
  319. /// Insertion position of cloned instructions.
  320. Instruction *IP;
  321. const DataLayout &DL;
  322. const DominatorTree *DT;
  323. };
  324. /// A pass that tries to split every GEP in the function into a variadic
  325. /// base and a constant offset. It is a FunctionPass because searching for the
  326. /// constant offset may inspect other basic blocks.
  327. class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
  328. public:
  329. static char ID;
  330. SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
  331. : FunctionPass(ID), LowerGEP(LowerGEP) {
  332. initializeSeparateConstOffsetFromGEPLegacyPassPass(
  333. *PassRegistry::getPassRegistry());
  334. }
  335. void getAnalysisUsage(AnalysisUsage &AU) const override {
  336. AU.addRequired<DominatorTreeWrapperPass>();
  337. AU.addRequired<ScalarEvolutionWrapperPass>();
  338. AU.addRequired<TargetTransformInfoWrapperPass>();
  339. AU.addRequired<LoopInfoWrapperPass>();
  340. AU.setPreservesCFG();
  341. AU.addRequired<TargetLibraryInfoWrapperPass>();
  342. }
  343. bool runOnFunction(Function &F) override;
  344. private:
  345. bool LowerGEP;
  346. };
  347. /// A pass that tries to split every GEP in the function into a variadic
  348. /// base and a constant offset. It is a FunctionPass because searching for the
  349. /// constant offset may inspect other basic blocks.
  350. class SeparateConstOffsetFromGEP {
  351. public:
  352. SeparateConstOffsetFromGEP(
  353. DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI,
  354. TargetLibraryInfo *TLI,
  355. function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
  356. : DT(DT), SE(SE), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}
  357. bool run(Function &F);
  358. private:
  359. /// Tries to split the given GEP into a variadic base and a constant offset,
  360. /// and returns true if the splitting succeeds.
  361. bool splitGEP(GetElementPtrInst *GEP);
  362. /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  363. /// Function splitGEP already split the original GEP into a variadic part and
  364. /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  365. /// variadic part into a set of GEPs with a single index and applies
  366. /// AccumulativeByteOffset to it.
  367. /// \p Variadic The variadic part of the original GEP.
  368. /// \p AccumulativeByteOffset The constant offset.
  369. void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
  370. int64_t AccumulativeByteOffset);
  371. /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
  372. /// Function splitGEP already split the original GEP into a variadic part and
  373. /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  374. /// variadic part into a set of arithmetic operations and applies
  375. /// AccumulativeByteOffset to it.
  376. /// \p Variadic The variadic part of the original GEP.
  377. /// \p AccumulativeByteOffset The constant offset.
  378. void lowerToArithmetics(GetElementPtrInst *Variadic,
  379. int64_t AccumulativeByteOffset);
  380. /// Finds the constant offset within each index and accumulates them. If
  381. /// LowerGEP is true, it searches the indices of both sequential and structure
  382. /// types; otherwise, it searches only the sequential indices. The output
  383. /// NeedsExtraction indicates whether we successfully found a non-zero constant
  384. /// offset.
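///
/// For instance, in the float a[32][32] example from the file header
/// (illustrative arithmetic only): extracting 1 from the row index and 1 from
/// the column index accumulates 1 * 32 * sizeof(float) + 1 * sizeof(float) =
/// 128 + 4 = 132 bytes, which matches the +132 immediate in the optimized PTX.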
  385. int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
  386. /// Canonicalize array indices to pointer-size integers. This helps to
  387. /// simplify the logic of splitting a GEP. For example, if a + b is a
  388. /// pointer-size integer, we have
  389. /// gep base, a + b = gep (gep base, a), b
  390. /// However, this equality may not hold if the size of a + b is smaller than
  391. /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  392. /// pointer size before computing the address
  393. /// (http://llvm.org/docs/LangRef.html#id181).
  394. ///
  395. /// This canonicalization is very likely already done in clang and
  396. /// instcombine. Therefore, the program will probably remain the same.
  397. ///
  398. /// Returns true if the module changes.
  399. ///
  400. /// Verified in @i32_add in split-gep.ll
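///
/// A minimal sketch (hypothetical IR, assuming 64-bit pointers): the i32 index in
///   %p = getelementptr inbounds float, float* %a, i32 %i
/// is canonicalized to
///   %idxprom = sext i32 %i to i64
///   %p = getelementptr inbounds float, float* %a, i64 %idxprom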
  401. bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
  402. /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  403. /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  404. /// the constant offset. After extraction, it becomes desirable to reunite the
  405. /// distributed sexts. For example,
  406. ///
  407. /// &a[sext(i +nsw (j +nsw 5))]
  408. /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
  409. /// => constant extraction &a[sext(i) + sext(j)] + 5
  410. /// => reunion &a[sext(i +nsw j)] + 5
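///
/// At the IR level this corresponds to (a sketch with hypothetical names):
///   %d = add nsw i32 %a, %b          ; dominating add that cannot sign overflow
///   ...
///   %x = sext i32 %a to i64
///   %y = sext i32 %b to i64
///   %s = add i64 %x, %y              ; rewritten to: %s = sext i32 %d to i64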
  411. bool reuniteExts(Function &F);
  412. /// A helper that reunites sexts in an instruction.
  413. bool reuniteExts(Instruction *I);
  414. /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
  415. Instruction *findClosestMatchingDominator(
  416. const SCEV *Key, Instruction *Dominatee,
  417. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
  418. /// Verify F is free of dead code.
  419. void verifyNoDeadCode(Function &F);
  420. bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
  421. // Swap the index operands of the two GEPs.
  422. void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
  423. // Check if it is safe to swap the index operands of the two GEPs.
  424. bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
  425. Loop *CurLoop);
  426. const DataLayout *DL = nullptr;
  427. DominatorTree *DT = nullptr;
  428. ScalarEvolution *SE;
  429. LoopInfo *LI;
  430. TargetLibraryInfo *TLI;
  431. // Retrieved lazily since not always used.
  432. function_ref<TargetTransformInfo &(Function &)> GetTTI;
  433. /// Whether to lower a GEP with multiple indices into arithmetic operations or
  434. /// multiple GEPs with a single index.
  435. bool LowerGEP;
  436. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
  437. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
  438. };
  439. } // end anonymous namespace
  440. char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
  441. INITIALIZE_PASS_BEGIN(
  442. SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
  443. "Split GEPs to a variadic base and a constant offset for better CSE", false,
  444. false)
  445. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  446. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  447. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  448. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  449. INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
  450. INITIALIZE_PASS_END(
  451. SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
  452. "Split GEPs to a variadic base and a constant offset for better CSE", false,
  453. false)
  454. FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
  455. return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
  456. }
  457. bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
  458. bool ZeroExtended,
  459. BinaryOperator *BO,
  460. bool NonNegative) {
  461. // We only consider ADD, SUB and OR, because a non-zero constant found in
  462. // expressions composed of these operations can be easily hoisted as a
  463. // constant offset by reassociation.
  464. if (BO->getOpcode() != Instruction::Add &&
  465. BO->getOpcode() != Instruction::Sub &&
  466. BO->getOpcode() != Instruction::Or) {
  467. return false;
  468. }
  469. Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
  470. // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
  471. // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
  472. // FIXME: this does not appear to be covered by any tests
  473. // (with x86/aarch64 backends at least)
  474. if (BO->getOpcode() == Instruction::Or &&
  475. !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
  476. return false;
  477. // In addition, tracing into BO requires that its surrounding s/zext (if
  478. // any) is distributable to both operands.
  479. //
  480. // Suppose BO = A op B.
  481. // SignExtended | ZeroExtended | Distributable?
  482. // --------------+--------------+----------------------------------
  483. // 0 | 0 | true because no s/zext exists
  484. // 0 | 1 | zext(BO) == zext(A) op zext(B)
  485. // 1 | 0 | sext(BO) == sext(A) op sext(B)
  486. // 1 | 1 | zext(sext(BO)) ==
  487. // | | zext(sext(A)) op zext(sext(B))
  488. if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
  489. // If a + b >= 0 and (a >= 0 or b >= 0), then
  490. // sext(a + b) = sext(a) + sext(b)
  491. // even if the addition is not marked nsw.
  492. //
  493. // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
  494. // index if the constant offset is non-negative.
  495. //
  496. // Verified in @sext_add in split-gep.ll.
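// Why this holds (a brief argument, not from the original comment): w.l.o.g.
// let the constant operand be the non-negative one. If the other operand is
// also non-negative, a wrapped (negative) result would contradict a + b >= 0;
// if the other operand is negative, the exact sum lies between the two
// operands and stays in range. Either way the addition cannot sign-wrap, so
// sext distributes over it.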
  497. if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
  498. if (!ConstLHS->isNegative())
  499. return true;
  500. }
  501. if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
  502. if (!ConstRHS->isNegative())
  503. return true;
  504. }
  505. }
  506. // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
  507. // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
  508. if (BO->getOpcode() == Instruction::Add ||
  509. BO->getOpcode() == Instruction::Sub) {
  510. if (SignExtended && !BO->hasNoSignedWrap())
  511. return false;
  512. if (ZeroExtended && !BO->hasNoUnsignedWrap())
  513. return false;
  514. }
  515. return true;
  516. }
  517. APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
  518. bool SignExtended,
  519. bool ZeroExtended) {
  520. // Save off the current height of the chain, in case we need to restore it.
  521. size_t ChainLength = UserChain.size();
  522. // BO being non-negative does not shed light on whether its operands are
  523. // non-negative. Clear the NonNegative flag here.
  524. APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
  525. /* NonNegative */ false);
  526. // If we found a constant offset in the left operand, stop and return that.
  527. // This shortcut might cause us to miss opportunities of combining the
  528. // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
  529. // However, such cases are probably already handled by -instcombine,
  530. // given this pass runs after the standard optimizations.
  531. if (ConstantOffset != 0) return ConstantOffset;
  532. // Reset the chain back to where it was when we started exploring this node,
  533. // since visiting the LHS didn't pan out.
  534. UserChain.resize(ChainLength);
  535. ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
  536. /* NonNegative */ false);
  537. // If BO is a sub operator, negate the constant offset found in the right
  538. // operand.
  539. if (BO->getOpcode() == Instruction::Sub)
  540. ConstantOffset = -ConstantOffset;
  541. // If RHS wasn't a suitable candidate either, reset the chain again.
  542. if (ConstantOffset == 0)
  543. UserChain.resize(ChainLength);
  544. return ConstantOffset;
  545. }
  546. APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
  547. bool ZeroExtended, bool NonNegative) {
  548. // TODO(jingyue): We could trace into integer/pointer casts, such as
  549. // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
  550. // integers because it gives good enough results for our benchmarks.
  551. unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
  552. // We cannot do much with Values that are not a User, such as an Argument.
  553. User *U = dyn_cast<User>(V);
  554. if (U == nullptr) return APInt(BitWidth, 0);
  555. APInt ConstantOffset(BitWidth, 0);
  556. if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
  557. // Hooray, we found it!
  558. ConstantOffset = CI->getValue();
  559. } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
  560. // Trace into subexpressions for more hoisting opportunities.
  561. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
  562. ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
  563. } else if (isa<TruncInst>(V)) {
  564. ConstantOffset =
  565. find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
  566. .trunc(BitWidth);
  567. } else if (isa<SExtInst>(V)) {
  568. ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
  569. ZeroExtended, NonNegative).sext(BitWidth);
  570. } else if (isa<ZExtInst>(V)) {
  571. // As an optimization, we can clear the SignExtended flag because
  572. // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
  573. //
  574. // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
  575. ConstantOffset =
  576. find(U->getOperand(0), /* SignExtended */ false,
  577. /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
  578. }
  579. // If we found a non-zero constant offset, add it to the path for
  580. // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
  581. // help this optimization.
  582. if (ConstantOffset != 0)
  583. UserChain.push_back(U);
  584. return ConstantOffset;
  585. }
  586. Value *ConstantOffsetExtractor::applyExts(Value *V) {
  587. Value *Current = V;
  588. // ExtInsts is built in the use-def order. Therefore, we apply them to V
  589. // in reverse order.
  590. for (CastInst *I : llvm::reverse(ExtInsts)) {
  591. if (Constant *C = dyn_cast<Constant>(Current)) {
  592. // If Current is a constant, apply s/zext using ConstantExpr::getCast.
  593. // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
  594. Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType());
  595. } else {
  596. Instruction *Ext = I->clone();
  597. Ext->setOperand(0, Current);
  598. Ext->insertBefore(IP);
  599. Current = Ext;
  600. }
  601. }
  602. return Current;
  603. }
  604. Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
  605. distributeExtsAndCloneChain(UserChain.size() - 1);
  606. // Remove all nullptrs (used to be s/zext) from UserChain.
  607. unsigned NewSize = 0;
  608. for (User *I : UserChain) {
  609. if (I != nullptr) {
  610. UserChain[NewSize] = I;
  611. NewSize++;
  612. }
  613. }
  614. UserChain.resize(NewSize);
  615. return removeConstOffset(UserChain.size() - 1);
  616. }
  617. Value *
  618. ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  619. User *U = UserChain[ChainIndex];
  620. if (ChainIndex == 0) {
  621. assert(isa<ConstantInt>(U));
  622. // If U is a ConstantInt, applyExts will return a ConstantInt as well.
  623. return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  624. }
  625. if (CastInst *Cast = dyn_cast<CastInst>(U)) {
  626. assert(
  627. (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
  628. "Only following instructions can be traced: sext, zext & trunc");
  629. ExtInsts.push_back(Cast);
  630. UserChain[ChainIndex] = nullptr;
  631. return distributeExtsAndCloneChain(ChainIndex - 1);
  632. }
  633. // Function find only traces into BinaryOperator and CastInst.
  634. BinaryOperator *BO = cast<BinaryOperator>(U);
  635. // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  636. unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  637. Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  638. Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
  639. BinaryOperator *NewBO = nullptr;
  640. if (OpNo == 0) {
  641. NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
  642. BO->getName(), IP);
  643. } else {
  644. NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
  645. BO->getName(), IP);
  646. }
  647. return UserChain[ChainIndex] = NewBO;
  648. }
  649. Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
  650. if (ChainIndex == 0) {
  651. assert(isa<ConstantInt>(UserChain[ChainIndex]));
  652. return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
  653. }
  654. BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
  655. assert((BO->use_empty() || BO->hasOneUse()) &&
  656. "distributeExtsAndCloneChain clones each BinaryOperator in "
  657. "UserChain, so no one should be used more than "
  658. "once");
  659. unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  660. assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
  661. Value *NextInChain = removeConstOffset(ChainIndex - 1);
  662. Value *TheOther = BO->getOperand(1 - OpNo);
  663. // If NextInChain is 0 and not the LHS of a sub, we can simplify the
  664. // sub-expression to be just TheOther.
  665. if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
  666. if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
  667. return TheOther;
  668. }
  669. BinaryOperator::BinaryOps NewOp = BO->getOpcode();
  670. if (BO->getOpcode() == Instruction::Or) {
  671. // Rebuild "or" as "add", because "or" may be invalid for the new
  672. // expression.
  673. //
  674. // For instance, given
  675. // a | (b + 5) where a and b + 5 have no common bits,
  676. // we can extract 5 as the constant offset.
  677. //
  678. // However, reusing the "or" in the new index would give us
  679. // (a | b) + 5
  680. // which does not equal a | (b + 5).
  681. //
  682. // Replacing the "or" with "add" is fine, because
  683. // a | (b + 5) = a + (b + 5) = (a + b) + 5
  684. NewOp = Instruction::Add;
  685. }
  686. BinaryOperator *NewBO;
  687. if (OpNo == 0) {
  688. NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
  689. } else {
  690. NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
  691. }
  692. NewBO->takeName(BO);
  693. return NewBO;
  694. }
  695. Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
  696. User *&UserChainTail,
  697. const DominatorTree *DT) {
  698. ConstantOffsetExtractor Extractor(GEP, DT);
  699. // Find a non-zero constant offset first.
  700. APInt ConstantOffset =
  701. Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
  702. GEP->isInBounds());
  703. if (ConstantOffset == 0) {
  704. UserChainTail = nullptr;
  705. return nullptr;
  706. }
  707. // Separates the constant offset from the GEP index.
  708. Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
  709. UserChainTail = Extractor.UserChain.back();
  710. return IdxWithoutConstOffset;
  711. }
  712. int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
  713. const DominatorTree *DT) {
  714. // If Idx is an index of an inbounds GEP, Idx is guaranteed to be non-negative.
  715. return ConstantOffsetExtractor(GEP, DT)
  716. .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
  717. GEP->isInBounds())
  718. .getSExtValue();
  719. }
  720. bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
  721. GetElementPtrInst *GEP) {
  722. bool Changed = false;
  723. Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  724. gep_type_iterator GTI = gep_type_begin(*GEP);
  725. for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
  726. I != E; ++I, ++GTI) {
  727. // Skip struct member indices which must be i32.
  728. if (GTI.isSequential()) {
  729. if ((*I)->getType() != IntPtrTy) {
  730. *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
  731. Changed = true;
  732. }
  733. }
  734. }
  735. return Changed;
  736. }
  737. int64_t
  738. SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
  739. bool &NeedsExtraction) {
  740. NeedsExtraction = false;
  741. int64_t AccumulativeByteOffset = 0;
  742. gep_type_iterator GTI = gep_type_begin(*GEP);
  743. for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
  744. if (GTI.isSequential()) {
  745. // Tries to extract a constant offset from this GEP index.
  746. int64_t ConstantOffset =
  747. ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
  748. if (ConstantOffset != 0) {
  749. NeedsExtraction = true;
  750. // A GEP may have multiple indices. We accumulate the extracted
  751. // constant offset to a byte offset, and later offset the remainder of
  752. // the original GEP with this byte offset.
  753. AccumulativeByteOffset +=
  754. ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
  755. }
  756. } else if (LowerGEP) {
  757. StructType *StTy = GTI.getStructType();
  758. uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
  759. // Skip field 0 as the offset is always 0.
  760. if (Field != 0) {
  761. NeedsExtraction = true;
  762. AccumulativeByteOffset +=
  763. DL->getStructLayout(StTy)->getElementOffset(Field);
  764. }
  765. }
  766. }
  767. return AccumulativeByteOffset;
  768. }
  769. void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
  770. GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  771. IRBuilder<> Builder(Variadic);
  772. Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  773. Type *I8PtrTy =
  774. Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
  775. Value *ResultPtr = Variadic->getOperand(0);
  776. Loop *L = LI->getLoopFor(Variadic->getParent());
  777. // Only consider swapping GEPs if the base is loop invariant and used at most once in the loop.
  778. bool isSwapCandidate =
  779. L && L->isLoopInvariant(ResultPtr) &&
  780. !hasMoreThanOneUseInLoop(ResultPtr, L);
  781. Value *FirstResult = nullptr;
  782. if (ResultPtr->getType() != I8PtrTy)
  783. ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
  784. gep_type_iterator GTI = gep_type_begin(*Variadic);
  785. // Create an ugly GEP for each sequential index. We don't create GEPs for
  786. // structure indices, as they are accumulated in the constant offset index.
  787. for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
  788. if (GTI.isSequential()) {
  789. Value *Idx = Variadic->getOperand(I);
  790. // Skip zero indices.
  791. if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
  792. if (CI->isZero())
  793. continue;
  794. APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
  795. DL->getTypeAllocSize(GTI.getIndexedType()));
  796. // Scale the index by element size.
  797. if (ElementSize != 1) {
  798. if (ElementSize.isPowerOf2()) {
  799. Idx = Builder.CreateShl(
  800. Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
  801. } else {
  802. Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
  803. }
  804. }
  805. // Create an ugly GEP with a single index for each index.
  806. ResultPtr =
  807. Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
  808. if (FirstResult == nullptr)
  809. FirstResult = ResultPtr;
  810. }
  811. }
  812. // Create a GEP with the constant offset index.
  813. if (AccumulativeByteOffset != 0) {
  814. Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
  815. ResultPtr =
  816. Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
  817. } else
  818. isSwapCandidate = false;
  819. // If we created a GEP with a constant index, and the base is loop invariant,
  820. // then we swap the index of that first GEP with the constant one, so LICM can
  821. // later hoist the constant GEP out of the loop.
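// A minimal sketch (hypothetical IR; %base is loop invariant, %scaled is not):
//   %uglygep  = getelementptr i8, i8* %base, i64 %scaled
//   %uglygep2 = getelementptr i8, i8* %uglygep, i64 132
// becomes, after swapping the index operands,
//   %uglygep  = getelementptr i8, i8* %base, i64 132       ; loop invariant, hoistable
//   %uglygep2 = getelementptr i8, i8* %uglygep, i64 %scaled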
  822. auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  823. auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
  824. if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
  825. swapGEPOperand(FirstGEP, SecondGEP);
  826. if (ResultPtr->getType() != Variadic->getType())
  827. ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
  828. Variadic->replaceAllUsesWith(ResultPtr);
  829. Variadic->eraseFromParent();
  830. }
  831. void
  832. SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
  833. int64_t AccumulativeByteOffset) {
  834. IRBuilder<> Builder(Variadic);
  835. Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  836. Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
  837. gep_type_iterator GTI = gep_type_begin(*Variadic);
  838. // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  839. // don't create arithmetics for structure indices, as they are accumulated
  840. // in the constant offset index.
  841. for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
  842. if (GTI.isSequential()) {
  843. Value *Idx = Variadic->getOperand(I);
  844. // Skip zero indices.
  845. if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
  846. if (CI->isZero())
  847. continue;
  848. APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
  849. DL->getTypeAllocSize(GTI.getIndexedType()));
  850. // Scale the index by element size.
  851. if (ElementSize != 1) {
  852. if (ElementSize.isPowerOf2()) {
  853. Idx = Builder.CreateShl(
  854. Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
  855. } else {
  856. Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
  857. }
  858. }
  859. // Create an ADD for each index.
  860. ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
  861. }
  862. }
  863. // Create an ADD for the constant offset index.
  864. if (AccumulativeByteOffset != 0) {
  865. ResultPtr = Builder.CreateAdd(
  866. ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
  867. }
  868. ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
  869. Variadic->replaceAllUsesWith(ResultPtr);
  870. Variadic->eraseFromParent();
  871. }
  872. bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  873. // Skip vector GEPs.
  874. if (GEP->getType()->isVectorTy())
  875. return false;
  876. // The backend can already nicely handle the case where all indices are
  877. // constant.
  878. if (GEP->hasAllConstantIndices())
  879. return false;
  880. bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
  881. bool NeedsExtraction;
  882. int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
  883. if (!NeedsExtraction)
  884. return Changed;
  885. TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
  886. // If LowerGEP is disabled, before really splitting the GEP, check whether the
  887. // backend supports the addressing mode we are about to produce. If not, this
  888. // splitting probably won't be beneficial.
  889. // If LowerGEP is enabled, even if the extracted constant offset cannot match
  890. // the addressing mode, we can still optimize the other lowered parts of the
  891. // variable indices. Therefore, we don't check for addressing modes in that
  892. // case.
  893. if (!LowerGEP) {
  894. unsigned AddrSpace = GEP->getPointerAddressSpace();
  895. if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
  896. /*BaseGV=*/nullptr, AccumulativeByteOffset,
  897. /*HasBaseReg=*/true, /*Scale=*/0,
  898. AddrSpace)) {
  899. return Changed;
  900. }
  901. }
  902. // Remove the constant offset in each sequential index. The resultant GEP
  903. // computes the variadic base.
  904. // Notice that we don't remove struct field indices here. If LowerGEP is
  905. // disabled, a structure index is not accumulated and we still use the old
  906. // one. If LowerGEP is enabled, a structure index is accumulated in the
  907. // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
  908. // handle the constant offset and won't need a new structure index.
  909. gep_type_iterator GTI = gep_type_begin(*GEP);
  910. for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
  911. if (GTI.isSequential()) {
  912. // Splits this GEP index into a variadic part and a constant offset, and
  913. // uses the variadic part as the new index.
  914. Value *OldIdx = GEP->getOperand(I);
  915. User *UserChainTail;
  916. Value *NewIdx =
  917. ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
  918. if (NewIdx != nullptr) {
  919. // Switches to the index with the constant offset removed.
  920. GEP->setOperand(I, NewIdx);
  921. // After switching to the new index, we can garbage-collect UserChain
  922. // and the old index if they are not used.
  923. RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
  924. RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
  925. }
  926. }
  927. }
  928. // Clear the inbounds attribute because the new index may be off-bound.
  929. // e.g.,
  930. //
  931. // b = add i64 a, 5
  932. // addr = gep inbounds float, float* p, i64 b
  933. //
  934. // is transformed to:
  935. //
  936. // addr2 = gep float, float* p, i64 a ; inbounds removed
  937. // addr = gep inbounds float, float* addr2, i64 5
  938. //
  939. // If a is -4, although the old index b is in bounds, the new index a is
  940. // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
  941. // inbounds keyword is not present, the offsets are added to the base
  942. // address with silently-wrapping two's complement arithmetic".
  943. // Therefore, the final code will be semantically equivalent.
  944. //
  945. // TODO(jingyue): do some range analysis to keep as many inbounds as
  946. // possible. GEPs with inbounds are more friendly to alias analysis.
  947. bool GEPWasInBounds = GEP->isInBounds();
  948. GEP->setIsInBounds(false);
  949. // Lowers a GEP to either GEPs with a single index or arithmetic operations.
  950. if (LowerGEP) {
  951. // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
  952. // arithmetic operations if the target uses alias analysis in codegen.
  953. if (TTI.useAA())
  954. lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
  955. else
  956. lowerToArithmetics(GEP, AccumulativeByteOffset);
  957. return true;
  958. }
  959. // No need to create another GEP if the accumulative byte offset is 0.
  960. if (AccumulativeByteOffset == 0)
  961. return true;
  962. // Offsets the base with the accumulative byte offset.
  963. //
  964. // %gep ; the base
  965. // ... %gep ...
  966. //
  967. // => add the offset
  968. //
  969. // %gep2 ; clone of %gep
  970. // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  971. // %gep ; will be removed
  972. // ... %gep ...
  973. //
  974. // => replace all uses of %gep with %new.gep and remove %gep
  975. //
  976. // %gep2 ; clone of %gep
  977. // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
  978. // ... %new.gep ...
  979. //
  980. // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
  981. // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
  982. // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
  983. // type of %gep.
  984. //
  985. // %gep2 ; clone of %gep
  986. // %0 = bitcast %gep2 to i8*
  987. // %uglygep = gep %0, <offset>
  988. // %new.gep = bitcast %uglygep to <type of %gep>
  989. // ... %new.gep ...
  990. Instruction *NewGEP = GEP->clone();
  991. NewGEP->insertBefore(GEP);
  992. // Per the ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
  993. // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
  994. // used with the signed AccumulativeByteOffset in the division and modulo below.
  995. int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
  996. DL->getTypeAllocSize(GEP->getResultElementType()));
  997. Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
  998. if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
  999. // Very likely. As long as %gep is naturally aligned, the byte offset we
  1000. // extracted should be a multiple of sizeof(*%gep).
  1001. int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
  1002. NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
  1003. ConstantInt::get(IntPtrTy, Index, true),
  1004. GEP->getName(), GEP);
  1005. NewGEP->copyMetadata(*GEP);
  1006. // Inherit the inbounds attribute of the original GEP.
  1007. cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
  1008. } else {
  1009. // Unlikely but possible. For example,
  1010. // #pragma pack(1)
  1011. // struct S {
  1012. // int a[3];
  1013. // int64 b[8];
  1014. // };
  1015. // #pragma pack()
  1016. //
  1017. // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
  1018. // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
  1019. // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
  1020. // sizeof(int64).
  1021. //
  1022. // Emit an uglygep in this case.
  1023. Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
  1024. GEP->getPointerAddressSpace());
  1025. NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
  1026. NewGEP = GetElementPtrInst::Create(
  1027. Type::getInt8Ty(GEP->getContext()), NewGEP,
  1028. ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
  1029. GEP);
  1030. NewGEP->copyMetadata(*GEP);
  1031. // Inherit the inbounds attribute of the original GEP.
  1032. cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
  1033. if (GEP->getType() != I8PtrTy)
  1034. NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
  1035. }
  1036. GEP->replaceAllUsesWith(NewGEP);
  1037. GEP->eraseFromParent();
  1038. return true;
  1039. }
  1040. bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
  1041. if (skipFunction(F))
  1042. return false;
  1043. auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1044. auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1045. auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1046. auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  1047. auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
  1048. return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1049. };
  1050. SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  1051. return Impl.run(F);
  1052. }
  1053. bool SeparateConstOffsetFromGEP::run(Function &F) {
  1054. if (DisableSeparateConstOffsetFromGEP)
  1055. return false;
  1056. DL = &F.getParent()->getDataLayout();
  1057. bool Changed = false;
  1058. for (BasicBlock &B : F) {
  1059. if (!DT->isReachableFromEntry(&B))
  1060. continue;
  1061. for (Instruction &I : llvm::make_early_inc_range(B))
  1062. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
  1063. Changed |= splitGEP(GEP);
  1064. // No need to split GEP ConstantExprs because all their indices are constant
  1065. // already.
  1066. }
  1067. Changed |= reuniteExts(F);
  1068. if (VerifyNoDeadCode)
  1069. verifyNoDeadCode(F);
  1070. return Changed;
  1071. }
  1072. Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
  1073. const SCEV *Key, Instruction *Dominatee,
  1074. DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
  1075. auto Pos = DominatingExprs.find(Key);
  1076. if (Pos == DominatingExprs.end())
  1077. return nullptr;
  1078. auto &Candidates = Pos->second;
  1079. // Because we process the basic blocks in pre-order of the dominator tree, a
  1080. // candidate that doesn't dominate the current instruction won't dominate any
  1081. // future instruction either. Therefore, we pop it out of the stack. This
  1082. // optimization makes the algorithm O(n).
  1083. while (!Candidates.empty()) {
  1084. Instruction *Candidate = Candidates.back();
  1085. if (DT->dominates(Candidate, Dominatee))
  1086. return Candidate;
  1087. Candidates.pop_back();
  1088. }
  1089. return nullptr;
  1090. }
  1091. bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  1092. if (!SE->isSCEVable(I->getType()))
  1093. return false;
  1094. // Dom: LHS+RHS
  1095. // I: sext(LHS)+sext(RHS)
  1096. // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
  1097. // TODO: handle zext
  1098. Value *LHS = nullptr, *RHS = nullptr;
  1099. if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
  1100. if (LHS->getType() == RHS->getType()) {
  1101. const SCEV *Key =
  1102. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1103. if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
  1104. Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
  1105. NewSExt->takeName(I);
  1106. I->replaceAllUsesWith(NewSExt);
  1107. RecursivelyDeleteTriviallyDeadInstructions(I);
  1108. return true;
  1109. }
  1110. }
  1111. } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
  1112. if (LHS->getType() == RHS->getType()) {
  1113. const SCEV *Key =
  1114. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1115. if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
  1116. Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
  1117. NewSExt->takeName(I);
  1118. I->replaceAllUsesWith(NewSExt);
  1119. RecursivelyDeleteTriviallyDeadInstructions(I);
  1120. return true;
  1121. }
  1122. }
  1123. }
  1124. // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
  1125. if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
  1126. if (programUndefinedIfPoison(I)) {
  1127. const SCEV *Key =
  1128. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1129. DominatingAdds[Key].push_back(I);
  1130. }
  1131. } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
  1132. if (programUndefinedIfPoison(I)) {
  1133. const SCEV *Key =
  1134. SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  1135. DominatingSubs[Key].push_back(I);
  1136. }
  1137. }
  1138. return false;
  1139. }
  1140. bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
  1141. bool Changed = false;
  1142. DominatingAdds.clear();
  1143. DominatingSubs.clear();
  1144. for (const auto Node : depth_first(DT)) {
  1145. BasicBlock *BB = Node->getBlock();
  1146. for (Instruction &I : llvm::make_early_inc_range(*BB))
  1147. Changed |= reuniteExts(&I);
  1148. }
  1149. return Changed;
  1150. }
  1151. void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
  1152. for (BasicBlock &B : F) {
  1153. for (Instruction &I : B) {
  1154. if (isInstructionTriviallyDead(&I)) {
  1155. std::string ErrMessage;
  1156. raw_string_ostream RSO(ErrMessage);
  1157. RSO << "Dead instruction detected!\n" << I << "\n";
  1158. llvm_unreachable(RSO.str().c_str());
  1159. }
  1160. }
  1161. }
  1162. }
  1163. bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
  1164. GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  1165. if (!FirstGEP || !FirstGEP->hasOneUse())
  1166. return false;
  1167. if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
  1168. return false;
  1169. if (FirstGEP == SecondGEP)
  1170. return false;
  1171. unsigned FirstNum = FirstGEP->getNumOperands();
  1172. unsigned SecondNum = SecondGEP->getNumOperands();
  1173. // Give up if the number of operands is not 2.
  1174. if (FirstNum != SecondNum || FirstNum != 2)
  1175. return false;
  1176. Value *FirstBase = FirstGEP->getOperand(0);
  1177. Value *SecondBase = SecondGEP->getOperand(0);
  1178. Value *FirstOffset = FirstGEP->getOperand(1);
  1179. // Give up if the index of the first GEP is loop invariant.
  1180. if (CurLoop->isLoopInvariant(FirstOffset))
  1181. return false;
  1182. // Give up if the bases don't have the same type.
  1183. if (FirstBase->getType() != SecondBase->getType())
  1184. return false;
  1185. Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
  1186. // Check if the second operand of the first GEP has a constant coefficient.
  1187. // For example, for the following code, we won't gain anything by
  1188. // hoisting the second GEP out because the second GEP can be folded away.
  1189. // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  1190. // %67 = shl i64 %scevgep.sum.ur159, 2
  1191. // %uglygep160 = getelementptr i8* %65, i64 %67
  1192. // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
  1193. // Skip a constant shift instruction which may be generated by splitting GEPs.
  1194. if (FirstOffsetDef && FirstOffsetDef->isShift() &&
  1195. isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
  1196. FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
  1197. // Give up if FirstOffsetDef is an Add or Sub with a constant operand,
  1198. // because it may not be profitable at all due to constant folding.
  1199. if (FirstOffsetDef)
  1200. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
  1201. unsigned opc = BO->getOpcode();
  1202. if ((opc == Instruction::Add || opc == Instruction::Sub) &&
  1203. (isa<ConstantInt>(BO->getOperand(0)) ||
  1204. isa<ConstantInt>(BO->getOperand(1))))
  1205. return false;
  1206. }
  1207. return true;
  1208. }
  1209. bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
  1210. int UsesInLoop = 0;
  1211. for (User *U : V->users()) {
  1212. if (Instruction *User = dyn_cast<Instruction>(U))
  1213. if (L->contains(User))
  1214. if (++UsesInLoop > 1)
  1215. return true;
  1216. }
  1217. return false;
  1218. }
  1219. void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
  1220. GetElementPtrInst *Second) {
  1221. Value *Offset1 = First->getOperand(1);
  1222. Value *Offset2 = Second->getOperand(1);
  1223. First->setOperand(1, Offset2);
  1224. Second->setOperand(1, Offset1);
  1225. // We changed p+o+c to p+c+o; p+c may no longer be in bounds.
  1226. const DataLayout &DAL = First->getModule()->getDataLayout();
  1227. APInt Offset(DAL.getIndexSizeInBits(
  1228. cast<PointerType>(First->getType())->getAddressSpace()),
  1229. 0);
  1230. Value *NewBase =
  1231. First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  1232. uint64_t ObjectSize;
  1233. if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
  1234. Offset.ugt(ObjectSize)) {
  1235. First->setIsInBounds(false);
  1236. Second->setIsInBounds(false);
  1237. } else
  1238. First->setIsInBounds(true);
  1239. }
  1240. PreservedAnalyses
  1241. SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
  1242. auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  1243. auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  1244. auto *LI = &AM.getResult<LoopAnalysis>(F);
  1245. auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  1246. auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
  1247. return AM.getResult<TargetIRAnalysis>(F);
  1248. };
  1249. SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
  1250. if (!Impl.run(F))
  1251. return PreservedAnalyses::all();
  1252. PreservedAnalyses PA;
  1253. PA.preserveSet<CFGAnalyses>();
  1254. return PA;
  1255. }