IRSimilarityIdentifier.h 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // \file
  15. // Interface file for the IRSimilarityIdentifier for identifying similarities in
  16. // IR including the IRInstructionMapper, which maps an Instruction to unsigned
  17. // integers.
  18. //
  19. // Two sequences of instructions are called "similar" if they perform the same
  20. // series of operations for all inputs.
  21. //
  22. // \code
  23. // %1 = add i32 %a, 10
  24. // %2 = add i32 %a, %1
  25. // %3 = icmp slt icmp %1, %2
  26. // \endcode
  27. //
  28. // and
  29. //
  30. // \code
  31. // %1 = add i32 11, %a
  32. // %2 = sub i32 %a, %1
  33. // %3 = icmp sgt icmp %2, %1
  34. // \endcode
  35. //
  36. // ultimately have the same result, even if the inputs, and structure are
  37. // slightly different.
  38. //
  39. // For instructions, we do not worry about operands that do not have fixed
  40. // semantic meaning to the program. We consider the opcode that the instruction
  41. // has, the types, parameters, and extra information such as the function name,
  42. // or comparison predicate. These are used to create a hash to map instructions
  43. // to integers to be used in similarity matching in sequences of instructions
  44. //
  45. // Terminology:
  46. // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
  47. // Instructions), usually used to denote a region of similarity has been found.
  48. //
  49. // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
  50. // similar to one another.
  51. //
  52. //===----------------------------------------------------------------------===//
  53. #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  54. #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  55. #include "llvm/IR/InstVisitor.h"
  56. #include "llvm/IR/Instructions.h"
  57. #include "llvm/IR/Module.h"
  58. #include "llvm/IR/PassManager.h"
  59. #include "llvm/Pass.h"
  60. #include "llvm/Support/Allocator.h"
  61. namespace llvm {
  62. namespace IRSimilarity {
  63. struct IRInstructionDataList;
  64. /// This represents what is and is not supported when finding similarity in
  65. /// Instructions.
  66. ///
  67. /// Legal Instructions are considered when looking at similarity between
  68. /// Instructions.
  69. ///
  70. /// Illegal Instructions cannot be considered when looking for similarity
  71. /// between Instructions. They act as boundaries between similarity regions.
  72. ///
  73. /// Invisible Instructions are skipped over during analysis.
  74. // TODO: Shared with MachineOutliner
  75. enum InstrType { Legal, Illegal, Invisible };
  76. /// This provides the utilities for hashing an Instruction to an unsigned
  77. /// integer. Two IRInstructionDatas produce the same hash value when their
  78. /// underlying Instructions perform the same operation (even if they don't have
  79. /// the same input operands.)
  80. /// As a more concrete example, consider the following:
  81. ///
  82. /// \code
  83. /// %add1 = add i32 %a, %b
  84. /// %add2 = add i32 %c, %d
  85. /// %add3 = add i64 %e, %f
  86. /// \endcode
  87. ///
  88. // Then the IRInstructionData wrappers for these Instructions may be hashed like
  89. /// so:
  90. ///
  91. /// \code
  92. /// ; These two adds have the same types and operand types, so they hash to the
  93. /// ; same number.
  94. /// %add1 = add i32 %a, %b ; Hash: 1
  95. /// %add2 = add i32 %c, %d ; Hash: 1
  96. /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
  97. /// ; it hashes to a different number.
  98. /// %add3 = add i64 %e, %f; Hash: 2
  99. /// \endcode
  100. ///
  101. ///
  102. /// This hashing scheme will be used to represent the program as a very long
  103. /// string. This string can then be placed in a data structure which can be used
  104. /// for similarity queries.
  105. ///
  106. /// TODO: Handle types of Instructions which can be equal even with different
  107. /// operands. (E.g. comparisons with swapped predicates.)
  108. /// TODO: Handle CallInsts, which are only checked for function type
  109. /// by \ref isSameOperationAs.
  110. /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
  111. /// exact same, and some do not.
  112. struct IRInstructionData : ilist_node<IRInstructionData> {
  113. /// The source Instruction that is being wrapped.
  114. Instruction *Inst = nullptr;
  115. /// The values of the operands in the Instruction.
  116. SmallVector<Value *, 4> OperVals;
  117. /// The legality of the wrapped instruction. This is informed by InstrType,
  118. /// and is used when checking when two instructions are considered similar.
  119. /// If either instruction is not legal, the instructions are automatically not
  120. /// considered similar.
  121. bool Legal;
  122. /// This is only relevant if we are wrapping a CmpInst where we needed to
  123. /// change the predicate of a compare instruction from a greater than form
  124. /// to a less than form. It is None otherwise.
  125. Optional<CmpInst::Predicate> RevisedPredicate;
  126. /// Gather the information that is difficult to gather for an Instruction, or
  127. /// is changed. i.e. the operands of an Instruction and the Types of those
  128. /// operands. This extra information allows for similarity matching to make
  129. /// assertions that allow for more flexibility when checking for whether an
  130. /// Instruction performs the same operation.
  131. IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
  132. /// Get the predicate that the compare instruction is using for hashing the
  133. /// instruction. the IRInstructionData must be wrapping a CmpInst.
  134. CmpInst::Predicate getPredicate() const;
  135. /// A function that swaps the predicates to their less than form if they are
  136. /// in a greater than form. Otherwise, the predicate is unchanged.
  137. ///
  138. /// \param CI - The comparison operation to find a consistent preidcate for.
  139. /// \return the consistent comparison predicate.
  140. static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
  141. /// Hashes \p Value based on its opcode, types, and operand types.
  142. /// Two IRInstructionData instances produce the same hash when they perform
  143. /// the same operation.
  144. ///
  145. /// As a simple example, consider the following instructions.
  146. ///
  147. /// \code
  148. /// %add1 = add i32 %x1, %y1
  149. /// %add2 = add i32 %x2, %y2
  150. ///
  151. /// %sub = sub i32 %x1, %y1
  152. ///
  153. /// %add_i64 = add i64 %x2, %y2
  154. /// \endcode
  155. ///
  156. /// Because the first two adds operate the same types, and are performing the
  157. /// same action, they will be hashed to the same value.
  158. ///
  159. /// However, the subtraction instruction is not the same as an addition, and
  160. /// will be hashed to a different value.
  161. ///
  162. /// Finally, the last add has a different type compared to the first two add
  163. /// instructions, so it will also be hashed to a different value that any of
  164. /// the previous instructions.
  165. ///
  166. /// \param [in] ID - The IRInstructionData instance to be hashed.
  167. /// \returns A hash_value of the IRInstructionData.
  168. friend hash_code hash_value(const IRInstructionData &ID) {
  169. SmallVector<Type *, 4> OperTypes;
  170. for (Value *V : ID.OperVals)
  171. OperTypes.push_back(V->getType());
  172. if (isa<CmpInst>(ID.Inst))
  173. return llvm::hash_combine(
  174. llvm::hash_value(ID.Inst->getOpcode()),
  175. llvm::hash_value(ID.Inst->getType()),
  176. llvm::hash_value(ID.getPredicate()),
  177. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  178. else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
  179. return llvm::hash_combine(
  180. llvm::hash_value(ID.Inst->getOpcode()),
  181. llvm::hash_value(ID.Inst->getType()),
  182. llvm::hash_value(CI->getCalledFunction()->getName().str()),
  183. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  184. return llvm::hash_combine(
  185. llvm::hash_value(ID.Inst->getOpcode()),
  186. llvm::hash_value(ID.Inst->getType()),
  187. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  188. }
  189. IRInstructionDataList *IDL = nullptr;
  190. };
  191. struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
  192. /// Compare one IRInstructionData class to another IRInstructionData class for
  193. /// whether they are performing a the same operation, and can mapped to the
  194. /// same value. For regular instructions if the hash value is the same, then
  195. /// they will also be close.
  196. ///
  197. /// \param A - The first IRInstructionData class to compare
  198. /// \param B - The second IRInstructionData class to compare
  199. /// \returns true if \p A and \p B are similar enough to be mapped to the same
  200. /// value.
  201. bool isClose(const IRInstructionData &A, const IRInstructionData &B);
  202. struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
  203. static inline IRInstructionData *getEmptyKey() { return nullptr; }
  204. static inline IRInstructionData *getTombstoneKey() {
  205. return reinterpret_cast<IRInstructionData *>(-1);
  206. }
  207. static unsigned getHashValue(const IRInstructionData *E) {
  208. using llvm::hash_value;
  209. assert(E && "IRInstructionData is a nullptr?");
  210. return hash_value(*E);
  211. }
  212. static bool isEqual(const IRInstructionData *LHS,
  213. const IRInstructionData *RHS) {
  214. if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
  215. LHS == getEmptyKey() || LHS == getTombstoneKey())
  216. return LHS == RHS;
  217. assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
  218. return isClose(*LHS, *RHS);
  219. }
  220. };
  221. /// Helper struct for converting the Instructions in a Module into a vector of
  222. /// unsigned integers. This vector of unsigned integers can be thought of as a
  223. /// "numeric string". This numeric string can then be queried by, for example,
  224. /// data structures that find repeated substrings.
  225. ///
  226. /// This hashing is done per BasicBlock in the module. To hash Instructions
  227. /// based off of their operations, each Instruction is wrapped in an
  228. /// IRInstructionData struct. The unsigned integer for an IRInstructionData
  229. /// depends on:
  230. /// - The hash provided by the IRInstructionData.
  231. /// - Which member of InstrType the IRInstructionData is classified as.
  232. // See InstrType for more details on the possible classifications, and how they
  233. // manifest in the numeric string.
  234. ///
  235. /// The numeric string for an individual BasicBlock is terminated by an unique
  236. /// unsigned integer. This prevents data structures which rely on repetition
  237. /// from matching across BasicBlocks. (For example, the SuffixTree.)
  238. /// As a concrete example, if we have the following two BasicBlocks:
  239. /// \code
  240. /// bb0:
  241. /// %add1 = add i32 %a, %b
  242. /// %add2 = add i32 %c, %d
  243. /// %add3 = add i64 %e, %f
  244. /// bb1:
  245. /// %sub = sub i32 %c, %d
  246. /// \endcode
  247. /// We may hash the Instructions like this (via IRInstructionData):
  248. /// \code
  249. /// bb0:
  250. /// %add1 = add i32 %a, %b ; Hash: 1
  251. /// %add2 = add i32 %c, %d; Hash: 1
  252. /// %add3 = add i64 %e, %f; Hash: 2
  253. /// bb1:
  254. /// %sub = sub i32 %c, %d; Hash: 3
  255. /// %add4 = add i32 %c, %d ; Hash: 1
  256. /// \endcode
  257. /// And produce a "numeric string representation" like so:
  258. /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
  259. ///
  260. /// TODO: This is very similar to the MachineOutliner, and should be
  261. /// consolidated into the same interface.
  262. struct IRInstructionMapper {
  263. /// The starting illegal instruction number to map to.
  264. ///
  265. /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
  266. unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
  267. /// The next available integer to assign to a legal Instruction to.
  268. unsigned LegalInstrNumber = 0;
  269. /// Correspondence from IRInstructionData to unsigned integers.
  270. DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
  271. InstructionIntegerMap;
  272. /// Set if we added an illegal number in the previous step.
  273. /// Since each illegal number is unique, we only need one of them between
  274. /// each range of legal numbers. This lets us make sure we don't add more
  275. /// than one illegal number per range.
  276. bool AddedIllegalLastTime = false;
  277. /// Marks whether we found a illegal instruction in the previous step.
  278. bool CanCombineWithPrevInstr = false;
  279. /// Marks whether we have found a set of instructions that is long enough
  280. /// to be considered for similarity.
  281. bool HaveLegalRange = false;
  282. /// This allocator pointer is in charge of holding on to the IRInstructionData
  283. /// so it is not deallocated until whatever external tool is using it is done
  284. /// with the information.
  285. SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
  286. /// This allocator pointer is in charge of creating the IRInstructionDataList
  287. /// so it is not deallocated until whatever external tool is using it is done
  288. /// with the information.
  289. SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
  290. /// Get an allocated IRInstructionData struct using the InstDataAllocator.
  291. ///
  292. /// \param I - The Instruction to wrap with IRInstructionData.
  293. /// \param Legality - A boolean value that is true if the instruction is to
  294. /// be considered for similarity, and false if not.
  295. /// \param IDL - The InstructionDataList that the IRInstructionData is
  296. /// inserted into.
  297. /// \returns An allocated IRInstructionData struct.
  298. IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
  299. IRInstructionDataList &IDL);
  300. /// Get an allocated IRInstructionDataList object using the IDLAllocator.
  301. ///
  302. /// \returns An allocated IRInstructionDataList object.
  303. IRInstructionDataList *allocateIRInstructionDataList();
  304. IRInstructionDataList *IDL = nullptr;
  305. /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
  306. /// determined by \p InstrType. Two Instructions are mapped to the same value
  307. /// if they are close as defined by the InstructionData class above.
  308. ///
  309. /// \param [in] BB - The BasicBlock to be mapped to integers.
  310. /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
  311. /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
  312. void convertToUnsignedVec(BasicBlock &BB,
  313. std::vector<IRInstructionData *> &InstrList,
  314. std::vector<unsigned> &IntegerMapping);
  315. /// Maps an Instruction to a legal integer.
  316. ///
  317. /// \param [in] It - The Instruction to be mapped to an integer.
  318. /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
  319. /// append to.
  320. /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
  321. /// \returns The integer \p It was mapped to.
  322. unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
  323. std::vector<unsigned> &IntegerMappingForBB,
  324. std::vector<IRInstructionData *> &InstrListForBB);
  325. /// Maps an Instruction to an illegal integer.
  326. ///
  327. /// \param [in] It - The \p Instruction to be mapped to an integer.
  328. /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
  329. /// append to.
  330. /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
  331. /// \param End - true if creating a dummy IRInstructionData at the end of a
  332. /// basic block.
  333. /// \returns The integer \p It was mapped to.
  334. unsigned mapToIllegalUnsigned(
  335. BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
  336. std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
  337. IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
  338. SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
  339. : InstDataAllocator(IDA), IDLAllocator(IDLA) {
  340. // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
  341. // changed.
  342. assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
  343. "DenseMapInfo<unsigned>'s empty key isn't -1!");
  344. assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
  345. static_cast<unsigned>(-2) &&
  346. "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
  347. IDL = new (IDLAllocator->Allocate())
  348. IRInstructionDataList();
  349. }
  350. /// Custom InstVisitor to classify different instructions for whether it can
  351. /// be analyzed for similarity.
  352. struct InstructionClassification
  353. : public InstVisitor<InstructionClassification, InstrType> {
  354. InstructionClassification() {}
  355. // TODO: Determine a scheme to resolve when the label is similar enough.
  356. InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
  357. // TODO: Determine a scheme to resolve when the labels are similar enough.
  358. InstrType visitPHINode(PHINode &PN) { return Illegal; }
  359. // TODO: Handle allocas.
  360. InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
  361. // We exclude variable argument instructions since variable arguments
  362. // requires extra checking of the argument list.
  363. InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
  364. // We exclude all exception handling cases since they are so context
  365. // dependent.
  366. InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
  367. InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
  368. // DebugInfo should be included in the regions, but should not be
  369. // analyzed for similarity as it has no bearing on the outcome of the
  370. // program.
  371. InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
  372. // TODO: Handle specific intrinsics.
  373. InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
  374. // We only allow call instructions where the function has a name and
  375. // is not an indirect call.
  376. InstrType visitCallInst(CallInst &CI) {
  377. Function *F = CI.getCalledFunction();
  378. if (!F || CI.isIndirectCall() || !F->hasName())
  379. return Illegal;
  380. return Legal;
  381. }
  382. // TODO: We do not current handle similarity that changes the control flow.
  383. InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
  384. // TODO: We do not current handle similarity that changes the control flow.
  385. InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
  386. // TODO: Handle interblock similarity.
  387. InstrType visitTerminator(Instruction &I) { return Illegal; }
  388. InstrType visitInstruction(Instruction &I) { return Legal; }
  389. };
  390. /// Maps an Instruction to a member of InstrType.
  391. InstructionClassification InstClassifier;
  392. };
  393. /// This is a class that wraps a range of IRInstructionData from one point to
  394. /// another in the vector of IRInstructionData, which is a region of the
  395. /// program. It is also responsible for defining the structure within this
  396. /// region of instructions.
  397. ///
  398. /// The structure of a region is defined through a value numbering system
  399. /// assigned to each unique value in a region at the creation of the
  400. /// IRSimilarityCandidate.
  401. ///
  402. /// For example, for each Instruction we add a mapping for each new
  403. /// value seen in that Instruction.
  404. /// IR: Mapping Added:
  405. /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
  406. /// %add2 = add i32 %a, %1 %add2 -> 4
  407. /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
  408. ///
  409. /// We can compare IRSimilarityCandidates against one another.
  410. /// The \ref isSimilar function compares each IRInstructionData against one
  411. /// another and if we have the same sequences of IRInstructionData that would
  412. /// create the same hash, we have similar IRSimilarityCandidates.
  413. ///
  414. /// We can also compare the structure of IRSimilarityCandidates. If we can
  415. /// create a mapping of registers in the region contained by one
  416. /// IRSimilarityCandidate to the region contained by different
  417. /// IRSimilarityCandidate, they can be considered structurally similar.
  418. ///
  419. /// IRSimilarityCandidate1: IRSimilarityCandidate2:
  420. /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e
  421. /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
  422. /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
  423. ///
  424. /// Can have the following mapping from candidate to candidate of:
  425. /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
  426. /// and can be considered similar.
  427. ///
  428. /// IRSimilarityCandidate1: IRSimilarityCandidate2:
  429. /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4
  430. /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
  431. /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
  432. ///
  433. /// We cannot create the same mapping since the use of c4 is not used in the
  434. /// same way as %b or c2.
  435. class IRSimilarityCandidate {
  436. private:
  437. /// The start index of this IRSimilarityCandidate in the instruction list.
  438. unsigned StartIdx = 0;
  439. /// The number of instructions in this IRSimilarityCandidate.
  440. unsigned Len = 0;
  441. /// The first instruction in this IRSimilarityCandidate.
  442. IRInstructionData *FirstInst = nullptr;
  443. /// The last instruction in this IRSimilarityCandidate.
  444. IRInstructionData *LastInst = nullptr;
  445. /// Global Value Numbering structures
  446. /// @{
  447. /// Stores the mapping of the value to the number assigned to it in the
  448. /// IRSimilarityCandidate.
  449. DenseMap<Value *, unsigned> ValueToNumber;
  450. /// Stores the mapping of the number to the value assigned this number.
  451. DenseMap<unsigned, Value *> NumberToValue;
  452. /// @}
  453. public:
  454. /// \param StartIdx - The starting location of the region.
  455. /// \param Len - The length of the region.
  456. /// \param FirstInstIt - The starting IRInstructionData of the region.
  457. /// \param LastInstIt - The ending IRInstructionData of the region.
  458. IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
  459. IRInstructionData *FirstInstIt,
  460. IRInstructionData *LastInstIt);
  461. /// \param A - The first IRInstructionCandidate to compare.
  462. /// \param B - The second IRInstructionCandidate to compare.
  463. /// \returns True when every IRInstructionData in \p A is similar to every
  464. /// IRInstructionData in \p B.
  465. static bool isSimilar(const IRSimilarityCandidate &A,
  466. const IRSimilarityCandidate &B);
  467. /// \param A - The first IRInstructionCandidate to compare.
  468. /// \param B - The second IRInstructionCandidate to compare.
  469. /// \returns True when every IRInstructionData in \p A is structurally similar
  470. /// to \p B.
  471. static bool compareStructure(const IRSimilarityCandidate &A,
  472. const IRSimilarityCandidate &B);
  473. struct OperandMapping {
  474. /// The IRSimilarityCandidate that holds the instruction the OperVals were
  475. /// pulled from.
  476. const IRSimilarityCandidate &IRSC;
  477. /// The operand values to be analyzed.
  478. ArrayRef<Value *> &OperVals;
  479. /// The current mapping of global value numbers from one IRSimilarityCandidate
  480. /// to another IRSimilarityCandidate.
  481. DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
  482. };
  483. /// Compare the operands in \p A and \p B and check that the current mapping
  484. /// of global value numbers from \p A to \p B and \p B to \A is consistent.
  485. ///
  486. /// \param A - The first IRInstructionCandidate, operand values, and current
  487. /// operand mappings to compare.
  488. /// \param B - The second IRInstructionCandidate, operand values, and current
  489. /// operand mappings to compare.
  490. /// \returns true if the IRSimilarityCandidates operands are compatible.
  491. static bool compareNonCommutativeOperandMapping(OperandMapping A,
  492. OperandMapping B);
  493. /// Compare the operands in \p A and \p B and check that the current mapping
  494. /// of global value numbers from \p A to \p B and \p B to \A is consistent
  495. /// given that the operands are commutative.
  496. ///
  497. /// \param A - The first IRInstructionCandidate, operand values, and current
  498. /// operand mappings to compare.
  499. /// \param B - The second IRInstructionCandidate, operand values, and current
  500. /// operand mappings to compare.
  501. /// \returns true if the IRSimilarityCandidates operands are compatible.
  502. static bool compareCommutativeOperandMapping(OperandMapping A,
  503. OperandMapping B);
  504. /// Compare the start and end indices of the two IRSimilarityCandidates for
  505. /// whether they overlap. If the start instruction of one
  506. /// IRSimilarityCandidate is less than the end instruction of the other, and
  507. /// the start instruction of one is greater than the start instruction of the
  508. /// other, they overlap.
  509. ///
  510. /// \returns true if the IRSimilarityCandidates do not have overlapping
  511. /// instructions.
  512. static bool overlap(const IRSimilarityCandidate &A,
  513. const IRSimilarityCandidate &B);
  514. /// \returns the number of instructions in this Candidate.
  515. unsigned getLength() const { return Len; }
  516. /// \returns the start index of this IRSimilarityCandidate.
  517. unsigned getStartIdx() const { return StartIdx; }
  518. /// \returns the end index of this IRSimilarityCandidate.
  519. unsigned getEndIdx() const { return StartIdx + Len - 1; }
  520. /// \returns The first IRInstructionData.
  521. IRInstructionData *front() const { return FirstInst; }
  522. /// \returns The last IRInstructionData.
  523. IRInstructionData *back() const { return LastInst; }
  524. /// \returns The first Instruction.
  525. Instruction *frontInstruction() { return FirstInst->Inst; }
  526. /// \returns The last Instruction
  527. Instruction *backInstruction() { return LastInst->Inst; }
  528. /// \returns The BasicBlock the IRSimilarityCandidate starts in.
  529. BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
  530. /// \returns The BasicBlock the IRSimilarityCandidate ends in.
  531. BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
  532. /// \returns The Function that the IRSimilarityCandidate is located in.
  533. Function *getFunction() { return getStartBB()->getParent(); }
  534. /// Finds the positive number associated with \p V if it has been mapped.
  535. /// \param [in] V - the Value to find.
  536. /// \returns The positive number corresponding to the value.
  537. /// \returns None if not present.
  538. Optional<unsigned> getGVN(Value *V) {
  539. assert(V != nullptr && "Value is a nullptr?");
  540. DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
  541. if (VNIt == ValueToNumber.end())
  542. return None;
  543. return VNIt->second;
  544. }
  545. /// Finds the Value associate with \p Num if it exists.
  546. /// \param [in] Num - the number to find.
  547. /// \returns The Value associated with the number.
  548. /// \returns None if not present.
  549. Optional<Value *> fromGVN(unsigned Num) {
  550. DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
  551. if (VNIt == NumberToValue.end())
  552. return None;
  553. assert(VNIt->second != nullptr && "Found value is a nullptr!");
  554. return VNIt->second;
  555. }
  556. /// \param RHS -The IRSimilarityCandidate to compare against
  557. /// \returns true if the IRSimilarityCandidate is occurs after the
  558. /// IRSimilarityCandidate in the program.
  559. bool operator<(const IRSimilarityCandidate &RHS) const {
  560. return getStartIdx() > RHS.getStartIdx();
  561. }
  562. using iterator = IRInstructionDataList::iterator;
  563. iterator begin() const { return iterator(front()); }
  564. iterator end() const { return std::next(iterator(back())); }
  565. };
  566. typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
  567. typedef std::vector<SimilarityGroup> SimilarityGroupList;
  568. /// This class puts all the pieces of the IRInstructionData,
  569. /// IRInstructionMapper, IRSimilarityCandidate together.
  570. ///
  571. /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
  572. /// and puts all the mapped instructions into a single long list of
  573. /// IRInstructionData.
  574. ///
  575. /// The list of unsigned integers is given to the Suffix Tree or similar data
  576. /// structure to find repeated subsequences. We construct an
  577. /// IRSimilarityCandidate for each instance of the subsequence. We compare them
  578. /// against one another since These repeated subsequences can have different
  579. /// structure. For each different kind of structure found, we create a
  580. /// similarity group.
  581. ///
  582. /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
  583. /// structurally similar to one another, while C is different we would have two
  584. /// SimilarityGroups:
  585. ///
  586. /// SimilarityGroup 1: SimilarityGroup 2
  587. /// A, B, D C
  588. ///
  589. /// A list of the different similarity groups is then returned after
  590. /// analyzing the module.
  591. class IRSimilarityIdentifier {
  592. public:
  593. IRSimilarityIdentifier()
  594. : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
  595. /// \param M the module to find similarity in.
  596. explicit IRSimilarityIdentifier(Module &M)
  597. : Mapper(&InstDataAllocator, &InstDataListAllocator) {
  598. findSimilarity(M);
  599. }
  600. private:
  601. /// Map the instructions in the module to unsigned integers, using mapping
  602. /// already present in the Mapper if possible.
  603. ///
  604. /// \param [in] M Module - To map to integers.
  605. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  606. /// \param [in,out] IntegerMapping - The vector to append integers to.
  607. void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
  608. std::vector<unsigned> &IntegerMapping);
  609. /// Map the instructions in the modules vector to unsigned integers, using
  610. /// mapping already present in the mapper if possible.
  611. ///
  612. /// \param [in] Modules - The list of modules to use to populate the mapper
  613. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  614. /// \param [in,out] IntegerMapping - The vector to append integers to.
  615. void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
  616. std::vector<IRInstructionData *> &InstrList,
  617. std::vector<unsigned> &IntegerMapping);
  618. /// Find the similarity candidates in \p InstrList and corresponding
  619. /// \p UnsignedVec
  620. ///
  621. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  622. /// \param [in,out] IntegerMapping - The vector to append integers to.
  623. /// candidates found in the program.
  624. void findCandidates(std::vector<IRInstructionData *> &InstrList,
  625. std::vector<unsigned> &IntegerMapping);
  626. public:
  627. // Find the IRSimilarityCandidates in the \p Modules and group by structural
  628. // similarity in a SimilarityGroup, each group is returned in a
  629. // SimilarityGroupList.
  630. //
  631. // \param [in] Modules - the modules to analyze.
  632. // \returns The groups of similarity ranges found in the modules.
  633. SimilarityGroupList &
  634. findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
  635. // Find the IRSimilarityCandidates in the given Module grouped by structural
  636. // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
  637. //
  638. // \param [in] M - the module to analyze.
  639. // \returns The groups of similarity ranges found in the module.
  640. SimilarityGroupList &findSimilarity(Module &M);
  641. // Clears \ref SimilarityCandidates if it is already filled by a previous run.
  642. void resetSimilarityCandidates() {
  643. // If we've already analyzed a Module or set of Modules, so we must clear
  644. // the SimilarityCandidates to make sure we do not have only old values
  645. // hanging around.
  646. if (SimilarityCandidates.hasValue())
  647. SimilarityCandidates->clear();
  648. else
  649. SimilarityCandidates = SimilarityGroupList();
  650. }
  651. // \returns The groups of similarity ranges found in the most recently passed
  652. // set of modules.
  653. Optional<SimilarityGroupList> &getSimilarity() {
  654. return SimilarityCandidates;
  655. }
  656. private:
  657. /// The allocator for IRInstructionData.
  658. SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
  659. /// The allocator for IRInstructionDataLists.
  660. SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
  661. /// Map Instructions to unsigned integers and wraps the Instruction in an
  662. /// instance of IRInstructionData.
  663. IRInstructionMapper Mapper;
  664. /// The SimilarityGroups found with the most recent run of \ref
  665. /// findSimilarity. None if there is no recent run.
  666. Optional<SimilarityGroupList> SimilarityCandidates;
  667. };
  668. } // end namespace IRSimilarity
  669. /// An analysis pass based on legacy pass manager that runs and returns
  670. /// IRSimilarityIdentifier run on the Module.
  671. class IRSimilarityIdentifierWrapperPass : public ModulePass {
  672. std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
  673. public:
  674. static char ID;
  675. IRSimilarityIdentifierWrapperPass();
  676. IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
  677. const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
  678. bool doInitialization(Module &M) override;
  679. bool doFinalization(Module &M) override;
  680. bool runOnModule(Module &M) override;
  681. void getAnalysisUsage(AnalysisUsage &AU) const override {
  682. AU.setPreservesAll();
  683. }
  684. };
  685. /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
  686. /// Module.
  687. class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
  688. public:
  689. typedef IRSimilarity::IRSimilarityIdentifier Result;
  690. Result run(Module &M, ModuleAnalysisManager &);
  691. private:
  692. friend AnalysisInfoMixin<IRSimilarityAnalysis>;
  693. static AnalysisKey Key;
  694. };
  695. /// Printer pass that uses \c IRSimilarityAnalysis.
  696. class IRSimilarityAnalysisPrinterPass
  697. : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
  698. raw_ostream &OS;
  699. public:
  700. explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
  701. PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
  702. };
  703. } // end namespace llvm
  704. #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  705. #ifdef __GNUC__
  706. #pragma GCC diagnostic pop
  707. #endif