IRSimilarityIdentifier.h 48 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // \file
  15. // Interface file for the IRSimilarityIdentifier for identifying similarities in
  16. // IR including the IRInstructionMapper, which maps an Instruction to unsigned
  17. // integers.
  18. //
  19. // Two sequences of instructions are called "similar" if they perform the same
  20. // series of operations for all inputs.
  21. //
  22. // \code
  23. // %1 = add i32 %a, 10
  24. // %2 = add i32 %a, %1
  25. // %3 = icmp slt icmp %1, %2
  26. // \endcode
  27. //
  28. // and
  29. //
  30. // \code
  31. // %1 = add i32 11, %a
  32. // %2 = sub i32 %a, %1
  33. // %3 = icmp sgt icmp %2, %1
  34. // \endcode
  35. //
  36. // ultimately have the same result, even if the inputs, and structure are
  37. // slightly different.
  38. //
  39. // For instructions, we do not worry about operands that do not have fixed
  40. // semantic meaning to the program. We consider the opcode that the instruction
  41. // has, the types, parameters, and extra information such as the function name,
  42. // or comparison predicate. These are used to create a hash to map instructions
  43. // to integers to be used in similarity matching in sequences of instructions
  44. //
  45. // Terminology:
  46. // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
  47. // Instructions), usually used to denote a region of similarity has been found.
  48. //
  49. // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
  50. // similar to one another.
  51. //
  52. //===----------------------------------------------------------------------===//
  53. #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  54. #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  55. #include "llvm/IR/InstVisitor.h"
  56. #include "llvm/IR/Instructions.h"
  57. #include "llvm/IR/PassManager.h"
  58. #include "llvm/Pass.h"
  59. #include "llvm/Support/Allocator.h"
  60. #include <optional>
  61. namespace llvm {
  62. class Module;
  63. namespace IRSimilarity {
  64. struct IRInstructionDataList;
  65. /// This represents what is and is not supported when finding similarity in
  66. /// Instructions.
  67. ///
  68. /// Legal Instructions are considered when looking at similarity between
  69. /// Instructions.
  70. ///
  71. /// Illegal Instructions cannot be considered when looking for similarity
  72. /// between Instructions. They act as boundaries between similarity regions.
  73. ///
  74. /// Invisible Instructions are skipped over during analysis.
  75. // TODO: Shared with MachineOutliner
  76. enum InstrType { Legal, Illegal, Invisible };
  77. /// This provides the utilities for hashing an Instruction to an unsigned
  78. /// integer. Two IRInstructionDatas produce the same hash value when their
  79. /// underlying Instructions perform the same operation (even if they don't have
  80. /// the same input operands.)
  81. /// As a more concrete example, consider the following:
  82. ///
  83. /// \code
  84. /// %add1 = add i32 %a, %b
  85. /// %add2 = add i32 %c, %d
  86. /// %add3 = add i64 %e, %f
  87. /// \endcode
  88. ///
  89. // Then the IRInstructionData wrappers for these Instructions may be hashed like
  90. /// so:
  91. ///
  92. /// \code
  93. /// ; These two adds have the same types and operand types, so they hash to the
  94. /// ; same number.
  95. /// %add1 = add i32 %a, %b ; Hash: 1
  96. /// %add2 = add i32 %c, %d ; Hash: 1
  97. /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
  98. /// ; it hashes to a different number.
  99. /// %add3 = add i64 %e, %f; Hash: 2
  100. /// \endcode
  101. ///
  102. ///
  103. /// This hashing scheme will be used to represent the program as a very long
  104. /// string. This string can then be placed in a data structure which can be used
  105. /// for similarity queries.
  106. ///
  107. /// TODO: Handle types of Instructions which can be equal even with different
  108. /// operands. (E.g. comparisons with swapped predicates.)
  109. /// TODO: Handle CallInsts, which are only checked for function type
  110. /// by \ref isSameOperationAs.
  111. /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
  112. /// exact same, and some do not.
  113. struct IRInstructionData
  114. : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> {
  115. /// The source Instruction that is being wrapped.
  116. Instruction *Inst = nullptr;
  117. /// The values of the operands in the Instruction.
  118. SmallVector<Value *, 4> OperVals;
  119. /// The legality of the wrapped instruction. This is informed by InstrType,
  120. /// and is used when checking when two instructions are considered similar.
  121. /// If either instruction is not legal, the instructions are automatically not
  122. /// considered similar.
  123. bool Legal = false;
  124. /// This is only relevant if we are wrapping a CmpInst where we needed to
  125. /// change the predicate of a compare instruction from a greater than form
  126. /// to a less than form. It is None otherwise.
  127. std::optional<CmpInst::Predicate> RevisedPredicate;
  128. /// This is only relevant if we are wrapping a CallInst. If we are requiring
  129. /// that the function calls have matching names as well as types, and the
  130. /// call is not an indirect call, this will hold the name of the function. If
  131. /// it is an indirect string, it will be the empty string. However, if this
  132. /// requirement is not in place it will be the empty string regardless of the
  133. /// function call type. The value held here is used to create the hash of the
  134. /// instruction, and check to make sure two instructions are close to one
  135. /// another.
  136. std::optional<std::string> CalleeName;
  137. /// This structure holds the distances of how far "ahead of" or "behind" the
  138. /// target blocks of a branch, or the incoming blocks of a phi nodes are.
  139. /// If the value is negative, it means that the block was registered before
  140. /// the block of this instruction in terms of blocks in the function.
  141. /// Code Example:
  142. /// \code
  143. /// block_1:
  144. /// br i1 %0, label %block_2, label %block_3
  145. /// block_2:
  146. /// br i1 %1, label %block_1, label %block_2
  147. /// block_3:
  148. /// br i1 %2, label %block_2, label %block_1
  149. /// ; Replacing the labels with relative values, this becomes:
  150. /// block_1:
  151. /// br i1 %0, distance 1, distance 2
  152. /// block_2:
  153. /// br i1 %1, distance -1, distance 0
  154. /// block_3:
  155. /// br i1 %2, distance -1, distance -2
  156. /// \endcode
  157. /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is
  158. /// "ahead" of block_2.
  159. SmallVector<int, 4> RelativeBlockLocations;
  160. /// Gather the information that is difficult to gather for an Instruction, or
  161. /// is changed. i.e. the operands of an Instruction and the Types of those
  162. /// operands. This extra information allows for similarity matching to make
  163. /// assertions that allow for more flexibility when checking for whether an
  164. /// Instruction performs the same operation.
  165. IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
  166. IRInstructionData(IRInstructionDataList &IDL);
  167. /// Fills data stuctures for IRInstructionData when it is constructed from a
  168. // reference or a pointer.
  169. void initializeInstruction();
  170. /// Get the predicate that the compare instruction is using for hashing the
  171. /// instruction. the IRInstructionData must be wrapping a CmpInst.
  172. CmpInst::Predicate getPredicate() const;
  173. /// Get the callee name that the call instruction is using for hashing the
  174. /// instruction. The IRInstructionData must be wrapping a CallInst.
  175. StringRef getCalleeName() const;
  176. /// A function that swaps the predicates to their less than form if they are
  177. /// in a greater than form. Otherwise, the predicate is unchanged.
  178. ///
  179. /// \param CI - The comparison operation to find a consistent preidcate for.
  180. /// \return the consistent comparison predicate.
  181. static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
  182. /// For an IRInstructionData containing a branch, finds the
  183. /// relative distances from the source basic block to the target by taking
  184. /// the difference of the number assigned to the current basic block and the
  185. /// target basic block of the branch.
  186. ///
  187. /// \param BasicBlockToInteger - The mapping of basic blocks to their location
  188. /// in the module.
  189. void
  190. setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
  191. /// For an IRInstructionData containing a CallInst, set the function name
  192. /// appropriately. This will be an empty string if it is an indirect call,
  193. /// or we are not matching by name of the called function. It will be the
  194. /// name of the function if \p MatchByName is true and it is not an indirect
  195. /// call. We may decide not to match by name in order to expand the
  196. /// size of the regions we can match. If a function name has the same type
  197. /// signature, but the different name, the region of code is still almost the
  198. /// same. Since function names can be treated as constants, the name itself
  199. /// could be extrapolated away. However, matching by name provides a
  200. /// specificity and more "identical" code than not matching by name.
  201. ///
  202. /// \param MatchByName - A flag to mark whether we are using the called
  203. /// function name as a differentiating parameter.
  204. void setCalleeName(bool MatchByName = true);
  205. /// For an IRInstructionData containing a PHINode, finds the
  206. /// relative distances from the incoming basic block to the current block by
  207. /// taking the difference of the number assigned to the current basic block
  208. /// and the incoming basic block of the branch.
  209. ///
  210. /// \param BasicBlockToInteger - The mapping of basic blocks to their location
  211. /// in the module.
  212. void
  213. setPHIPredecessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
  214. /// Hashes \p Value based on its opcode, types, and operand types.
  215. /// Two IRInstructionData instances produce the same hash when they perform
  216. /// the same operation.
  217. ///
  218. /// As a simple example, consider the following instructions.
  219. ///
  220. /// \code
  221. /// %add1 = add i32 %x1, %y1
  222. /// %add2 = add i32 %x2, %y2
  223. ///
  224. /// %sub = sub i32 %x1, %y1
  225. ///
  226. /// %add_i64 = add i64 %x2, %y2
  227. /// \endcode
  228. ///
  229. /// Because the first two adds operate the same types, and are performing the
  230. /// same action, they will be hashed to the same value.
  231. ///
  232. /// However, the subtraction instruction is not the same as an addition, and
  233. /// will be hashed to a different value.
  234. ///
  235. /// Finally, the last add has a different type compared to the first two add
  236. /// instructions, so it will also be hashed to a different value that any of
  237. /// the previous instructions.
  238. ///
  239. /// \param [in] ID - The IRInstructionData instance to be hashed.
  240. /// \returns A hash_value of the IRInstructionData.
  241. friend hash_code hash_value(const IRInstructionData &ID) {
  242. SmallVector<Type *, 4> OperTypes;
  243. for (Value *V : ID.OperVals)
  244. OperTypes.push_back(V->getType());
  245. if (isa<CmpInst>(ID.Inst))
  246. return llvm::hash_combine(
  247. llvm::hash_value(ID.Inst->getOpcode()),
  248. llvm::hash_value(ID.Inst->getType()),
  249. llvm::hash_value(ID.getPredicate()),
  250. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  251. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(ID.Inst)) {
  252. // To hash intrinsics, we use the opcode, and types like the other
  253. // instructions, but also, the Intrinsic ID, and the Name of the
  254. // intrinsic.
  255. Intrinsic::ID IntrinsicID = II->getIntrinsicID();
  256. return llvm::hash_combine(
  257. llvm::hash_value(ID.Inst->getOpcode()),
  258. llvm::hash_value(ID.Inst->getType()), llvm::hash_value(IntrinsicID),
  259. llvm::hash_value(*ID.CalleeName),
  260. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  261. }
  262. if (isa<CallInst>(ID.Inst)) {
  263. std::string FunctionName = *ID.CalleeName;
  264. return llvm::hash_combine(
  265. llvm::hash_value(ID.Inst->getOpcode()),
  266. llvm::hash_value(ID.Inst->getType()),
  267. llvm::hash_value(ID.Inst->getType()), llvm::hash_value(FunctionName),
  268. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  269. }
  270. return llvm::hash_combine(
  271. llvm::hash_value(ID.Inst->getOpcode()),
  272. llvm::hash_value(ID.Inst->getType()),
  273. llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
  274. }
  275. IRInstructionDataList *IDL = nullptr;
  276. };
  277. struct IRInstructionDataList
  278. : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {};
  279. /// Compare one IRInstructionData class to another IRInstructionData class for
  280. /// whether they are performing a the same operation, and can mapped to the
  281. /// same value. For regular instructions if the hash value is the same, then
  282. /// they will also be close.
  283. ///
  284. /// \param A - The first IRInstructionData class to compare
  285. /// \param B - The second IRInstructionData class to compare
  286. /// \returns true if \p A and \p B are similar enough to be mapped to the same
  287. /// value.
  288. bool isClose(const IRInstructionData &A, const IRInstructionData &B);
  289. struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
  290. static inline IRInstructionData *getEmptyKey() { return nullptr; }
  291. static inline IRInstructionData *getTombstoneKey() {
  292. return reinterpret_cast<IRInstructionData *>(-1);
  293. }
  294. static unsigned getHashValue(const IRInstructionData *E) {
  295. using llvm::hash_value;
  296. assert(E && "IRInstructionData is a nullptr?");
  297. return hash_value(*E);
  298. }
  299. static bool isEqual(const IRInstructionData *LHS,
  300. const IRInstructionData *RHS) {
  301. if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
  302. LHS == getEmptyKey() || LHS == getTombstoneKey())
  303. return LHS == RHS;
  304. assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
  305. return isClose(*LHS, *RHS);
  306. }
  307. };
  308. /// Helper struct for converting the Instructions in a Module into a vector of
  309. /// unsigned integers. This vector of unsigned integers can be thought of as a
  310. /// "numeric string". This numeric string can then be queried by, for example,
  311. /// data structures that find repeated substrings.
  312. ///
  313. /// This hashing is done per BasicBlock in the module. To hash Instructions
  314. /// based off of their operations, each Instruction is wrapped in an
  315. /// IRInstructionData struct. The unsigned integer for an IRInstructionData
  316. /// depends on:
  317. /// - The hash provided by the IRInstructionData.
  318. /// - Which member of InstrType the IRInstructionData is classified as.
  319. // See InstrType for more details on the possible classifications, and how they
  320. // manifest in the numeric string.
  321. ///
  322. /// The numeric string for an individual BasicBlock is terminated by an unique
  323. /// unsigned integer. This prevents data structures which rely on repetition
  324. /// from matching across BasicBlocks. (For example, the SuffixTree.)
  325. /// As a concrete example, if we have the following two BasicBlocks:
  326. /// \code
  327. /// bb0:
  328. /// %add1 = add i32 %a, %b
  329. /// %add2 = add i32 %c, %d
  330. /// %add3 = add i64 %e, %f
  331. /// bb1:
  332. /// %sub = sub i32 %c, %d
  333. /// \endcode
  334. /// We may hash the Instructions like this (via IRInstructionData):
  335. /// \code
  336. /// bb0:
  337. /// %add1 = add i32 %a, %b ; Hash: 1
  338. /// %add2 = add i32 %c, %d; Hash: 1
  339. /// %add3 = add i64 %e, %f; Hash: 2
  340. /// bb1:
  341. /// %sub = sub i32 %c, %d; Hash: 3
  342. /// %add4 = add i32 %c, %d ; Hash: 1
  343. /// \endcode
  344. /// And produce a "numeric string representation" like so:
  345. /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
  346. ///
  347. /// TODO: This is very similar to the MachineOutliner, and should be
  348. /// consolidated into the same interface.
  349. struct IRInstructionMapper {
  350. /// The starting illegal instruction number to map to.
  351. ///
  352. /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
  353. unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
  354. /// The next available integer to assign to a legal Instruction to.
  355. unsigned LegalInstrNumber = 0;
  356. /// Correspondence from IRInstructionData to unsigned integers.
  357. DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
  358. InstructionIntegerMap;
  359. /// A mapping for a basic block in a module to its assigned number/location
  360. /// in the module.
  361. DenseMap<BasicBlock *, unsigned> BasicBlockToInteger;
  362. /// Set if we added an illegal number in the previous step.
  363. /// Since each illegal number is unique, we only need one of them between
  364. /// each range of legal numbers. This lets us make sure we don't add more
  365. /// than one illegal number per range.
  366. bool AddedIllegalLastTime = false;
  367. /// Marks whether we found a illegal instruction in the previous step.
  368. bool CanCombineWithPrevInstr = false;
  369. /// Marks whether we have found a set of instructions that is long enough
  370. /// to be considered for similarity.
  371. bool HaveLegalRange = false;
  372. /// Marks whether we should use exact function names, as well as types to
  373. /// find similarity between calls.
  374. bool EnableMatchCallsByName = false;
  375. /// This allocator pointer is in charge of holding on to the IRInstructionData
  376. /// so it is not deallocated until whatever external tool is using it is done
  377. /// with the information.
  378. SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
  379. /// This allocator pointer is in charge of creating the IRInstructionDataList
  380. /// so it is not deallocated until whatever external tool is using it is done
  381. /// with the information.
  382. SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
  383. /// Get an allocated IRInstructionData struct using the InstDataAllocator.
  384. ///
  385. /// \param I - The Instruction to wrap with IRInstructionData.
  386. /// \param Legality - A boolean value that is true if the instruction is to
  387. /// be considered for similarity, and false if not.
  388. /// \param IDL - The InstructionDataList that the IRInstructionData is
  389. /// inserted into.
  390. /// \returns An allocated IRInstructionData struct.
  391. IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
  392. IRInstructionDataList &IDL);
  393. /// Get an empty allocated IRInstructionData struct using the
  394. /// InstDataAllocator.
  395. ///
  396. /// \param IDL - The InstructionDataList that the IRInstructionData is
  397. /// inserted into.
  398. /// \returns An allocated IRInstructionData struct.
  399. IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL);
  400. /// Get an allocated IRInstructionDataList object using the IDLAllocator.
  401. ///
  402. /// \returns An allocated IRInstructionDataList object.
  403. IRInstructionDataList *allocateIRInstructionDataList();
  404. IRInstructionDataList *IDL = nullptr;
  405. /// Assigns values to all the basic blocks in function \p F starting from
  406. /// integer \p BBNumber.
  407. ///
  408. /// \param F - The function containing the basic blocks to assign numbers to.
  409. /// \param BBNumber - The number to start from.
  410. void initializeForBBs(Function &F, unsigned &BBNumber) {
  411. for (BasicBlock &BB : F)
  412. BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++));
  413. }
  414. /// Assigns values to all the basic blocks in Module \p M.
  415. /// \param M - The module containing the basic blocks to assign numbers to.
  416. void initializeForBBs(Module &M) {
  417. unsigned BBNumber = 0;
  418. for (Function &F : M)
  419. initializeForBBs(F, BBNumber);
  420. }
  421. /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
  422. /// determined by \p InstrType. Two Instructions are mapped to the same value
  423. /// if they are close as defined by the InstructionData class above.
  424. ///
  425. /// \param [in] BB - The BasicBlock to be mapped to integers.
  426. /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
  427. /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
  428. void convertToUnsignedVec(BasicBlock &BB,
  429. std::vector<IRInstructionData *> &InstrList,
  430. std::vector<unsigned> &IntegerMapping);
  431. /// Maps an Instruction to a legal integer.
  432. ///
  433. /// \param [in] It - The Instruction to be mapped to an integer.
  434. /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
  435. /// append to.
  436. /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
  437. /// \returns The integer \p It was mapped to.
  438. unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
  439. std::vector<unsigned> &IntegerMappingForBB,
  440. std::vector<IRInstructionData *> &InstrListForBB);
  441. /// Maps an Instruction to an illegal integer.
  442. ///
  443. /// \param [in] It - The \p Instruction to be mapped to an integer.
  444. /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
  445. /// append to.
  446. /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
  447. /// \param End - true if creating a dummy IRInstructionData at the end of a
  448. /// basic block.
  449. /// \returns The integer \p It was mapped to.
  450. unsigned mapToIllegalUnsigned(
  451. BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
  452. std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
  453. IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
  454. SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
  455. : InstDataAllocator(IDA), IDLAllocator(IDLA) {
  456. // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
  457. // changed.
  458. assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
  459. "DenseMapInfo<unsigned>'s empty key isn't -1!");
  460. assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
  461. static_cast<unsigned>(-2) &&
  462. "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
  463. IDL = new (IDLAllocator->Allocate())
  464. IRInstructionDataList();
  465. }
  466. /// Custom InstVisitor to classify different instructions for whether it can
  467. /// be analyzed for similarity.
  468. struct InstructionClassification
  469. : public InstVisitor<InstructionClassification, InstrType> {
  470. InstructionClassification() = default;
  471. // TODO: Determine a scheme to resolve when the label is similar enough.
  472. InstrType visitBranchInst(BranchInst &BI) {
  473. if (EnableBranches)
  474. return Legal;
  475. return Illegal;
  476. }
  477. InstrType visitPHINode(PHINode &PN) {
  478. if (EnableBranches)
  479. return Legal;
  480. return Illegal;
  481. }
  482. // TODO: Handle allocas.
  483. InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
  484. // We exclude variable argument instructions since variable arguments
  485. // requires extra checking of the argument list.
  486. InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
  487. // We exclude all exception handling cases since they are so context
  488. // dependent.
  489. InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
  490. InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
  491. // DebugInfo should be included in the regions, but should not be
  492. // analyzed for similarity as it has no bearing on the outcome of the
  493. // program.
  494. InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
  495. InstrType visitIntrinsicInst(IntrinsicInst &II) {
  496. // These are disabled due to complications in the CodeExtractor when
  497. // outlining these instructions. For instance, It is unclear what we
  498. // should do when moving only the start or end lifetime instruction into
  499. // an outlined function. Also, assume-like intrinsics could be removed
  500. // from the region, removing arguments, causing discrepencies in the
  501. // number of inputs between different regions.
  502. if (II.isAssumeLikeIntrinsic())
  503. return Illegal;
  504. return EnableIntrinsics ? Legal : Illegal;
  505. }
  506. // We only allow call instructions where the function has a name and
  507. // is not an indirect call.
  508. InstrType visitCallInst(CallInst &CI) {
  509. Function *F = CI.getCalledFunction();
  510. bool IsIndirectCall = CI.isIndirectCall();
  511. if (IsIndirectCall && !EnableIndirectCalls)
  512. return Illegal;
  513. if (!F && !IsIndirectCall)
  514. return Illegal;
  515. // Functions marked with the swifttailcc and tailcc calling conventions
  516. // require special handling when outlining musttail functions. The
  517. // calling convention must be passed down to the outlined function as
  518. // well. Further, there is special handling for musttail calls as well,
  519. // requiring a return call directly after. For now, the outliner does not
  520. // support this, so we do not handle matching this case either.
  521. if ((CI.getCallingConv() == CallingConv::SwiftTail ||
  522. CI.getCallingConv() == CallingConv::Tail) &&
  523. !EnableMustTailCalls)
  524. return Illegal;
  525. if (CI.isMustTailCall() && !EnableMustTailCalls)
  526. return Illegal;
  527. return Legal;
  528. }
  529. // TODO: We do not current handle similarity that changes the control flow.
  530. InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
  531. // TODO: We do not current handle similarity that changes the control flow.
  532. InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
  533. // TODO: Handle interblock similarity.
  534. InstrType visitTerminator(Instruction &I) { return Illegal; }
  535. InstrType visitInstruction(Instruction &I) { return Legal; }
  536. // The flag variable that lets the classifier know whether we should
  537. // allow branches to be checked for similarity.
  538. bool EnableBranches = false;
  539. // The flag variable that lets the classifier know whether we should
  540. // allow indirect calls to be considered legal instructions.
  541. bool EnableIndirectCalls = false;
  542. // Flag that lets the classifier know whether we should allow intrinsics to
  543. // be checked for similarity.
  544. bool EnableIntrinsics = false;
  545. // Flag that lets the classifier know whether we should allow tail calls to
  546. // be checked for similarity.
  547. bool EnableMustTailCalls = false;
  548. };
  549. /// Maps an Instruction to a member of InstrType.
  550. InstructionClassification InstClassifier;
  551. };
  552. /// This is a class that wraps a range of IRInstructionData from one point to
  553. /// another in the vector of IRInstructionData, which is a region of the
  554. /// program. It is also responsible for defining the structure within this
  555. /// region of instructions.
  556. ///
  557. /// The structure of a region is defined through a value numbering system
  558. /// assigned to each unique value in a region at the creation of the
  559. /// IRSimilarityCandidate.
  560. ///
  561. /// For example, for each Instruction we add a mapping for each new
  562. /// value seen in that Instruction.
  563. /// IR: Mapping Added:
  564. /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
  565. /// %add2 = add i32 %a, %1 %add2 -> 4
  566. /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
  567. ///
  568. /// We can compare IRSimilarityCandidates against one another.
  569. /// The \ref isSimilar function compares each IRInstructionData against one
  570. /// another and if we have the same sequences of IRInstructionData that would
  571. /// create the same hash, we have similar IRSimilarityCandidates.
  572. ///
  573. /// We can also compare the structure of IRSimilarityCandidates. If we can
  574. /// create a mapping of registers in the region contained by one
  575. /// IRSimilarityCandidate to the region contained by different
  576. /// IRSimilarityCandidate, they can be considered structurally similar.
  577. ///
  578. /// IRSimilarityCandidate1: IRSimilarityCandidate2:
  579. /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e
  580. /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
  581. /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
  582. ///
  583. /// Can have the following mapping from candidate to candidate of:
  584. /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
  585. /// and can be considered similar.
  586. ///
  587. /// IRSimilarityCandidate1: IRSimilarityCandidate2:
  588. /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4
  589. /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
  590. /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
  591. ///
  592. /// We cannot create the same mapping since the use of c4 is not used in the
  593. /// same way as %b or c2.
  594. class IRSimilarityCandidate {
  595. private:
  596. /// The start index of this IRSimilarityCandidate in the instruction list.
  597. unsigned StartIdx = 0;
  598. /// The number of instructions in this IRSimilarityCandidate.
  599. unsigned Len = 0;
  600. /// The first instruction in this IRSimilarityCandidate.
  601. IRInstructionData *FirstInst = nullptr;
  602. /// The last instruction in this IRSimilarityCandidate.
  603. IRInstructionData *LastInst = nullptr;
  604. /// Global Value Numbering structures
  605. /// @{
  606. /// Stores the mapping of the value to the number assigned to it in the
  607. /// IRSimilarityCandidate.
  608. DenseMap<Value *, unsigned> ValueToNumber;
  609. /// Stores the mapping of the number to the value assigned this number.
  610. DenseMap<unsigned, Value *> NumberToValue;
  611. /// Stores the mapping of a value's number to canonical numbering in the
  612. /// candidate's respective similarity group.
  613. DenseMap<unsigned, unsigned> NumberToCanonNum;
  614. /// Stores the mapping of canonical number in the candidate's respective
  615. /// similarity group to a value number.
  616. DenseMap<unsigned, unsigned> CanonNumToNumber;
  617. /// @}
  618. public:
  619. /// \param StartIdx - The starting location of the region.
  620. /// \param Len - The length of the region.
  621. /// \param FirstInstIt - The starting IRInstructionData of the region.
  622. /// \param LastInstIt - The ending IRInstructionData of the region.
  623. IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
  624. IRInstructionData *FirstInstIt,
  625. IRInstructionData *LastInstIt);
  626. /// \param A - The first IRInstructionCandidate to compare.
  627. /// \param B - The second IRInstructionCandidate to compare.
  628. /// \returns True when every IRInstructionData in \p A is similar to every
  629. /// IRInstructionData in \p B.
  630. static bool isSimilar(const IRSimilarityCandidate &A,
  631. const IRSimilarityCandidate &B);
  632. /// \param [in] A - The first IRInstructionCandidate to compare.
  633. /// \param [in] B - The second IRInstructionCandidate to compare.
  634. /// \returns True when every IRInstructionData in \p A is structurally similar
  635. /// to \p B.
  636. static bool compareStructure(const IRSimilarityCandidate &A,
  637. const IRSimilarityCandidate &B);
  638. /// \param [in] A - The first IRInstructionCandidate to compare.
  639. /// \param [in] B - The second IRInstructionCandidate to compare.
  640. /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
  641. /// candidate \p A to candidate \B.
  642. /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
  643. /// candidate \p B to candidate \A.
  644. /// \returns True when every IRInstructionData in \p A is structurally similar
  645. /// to \p B.
  646. static bool
  647. compareStructure(const IRSimilarityCandidate &A,
  648. const IRSimilarityCandidate &B,
  649. DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
  650. DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
  651. struct OperandMapping {
  652. /// The IRSimilarityCandidate that holds the instruction the OperVals were
  653. /// pulled from.
  654. const IRSimilarityCandidate &IRSC;
  655. /// The operand values to be analyzed.
  656. ArrayRef<Value *> &OperVals;
  657. /// The current mapping of global value numbers from one IRSimilarityCandidate
  658. /// to another IRSimilarityCandidate.
  659. DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
  660. };
  661. /// A helper struct to hold the candidate, for a branch instruction, the
  662. /// relative location of a label, and the label itself. This is mostly to
  663. /// group the values together before passing them as a bundle to a function.
  664. struct RelativeLocMapping {
  665. /// The IRSimilarityCandidate that holds the instruction the relative
  666. /// location was pulled from.
  667. const IRSimilarityCandidate &IRSC;
  668. /// The relative location to be analyzed.
  669. int RelativeLocation;
  670. /// The corresponding value.
  671. Value *OperVal;
  672. };
  673. /// Compare the operands in \p A and \p B and check that the current mapping
  674. /// of global value numbers from \p A to \p B and \p B to \A is consistent.
  675. ///
  676. /// \param A - The first IRInstructionCandidate, operand values, and current
  677. /// operand mappings to compare.
  678. /// \param B - The second IRInstructionCandidate, operand values, and current
  679. /// operand mappings to compare.
  680. /// \returns true if the IRSimilarityCandidates operands are compatible.
  681. static bool compareNonCommutativeOperandMapping(OperandMapping A,
  682. OperandMapping B);
  683. /// Compare the operands in \p A and \p B and check that the current mapping
  684. /// of global value numbers from \p A to \p B and \p B to \A is consistent
  685. /// given that the operands are commutative.
  686. ///
  687. /// \param A - The first IRInstructionCandidate, operand values, and current
  688. /// operand mappings to compare.
  689. /// \param B - The second IRInstructionCandidate, operand values, and current
  690. /// operand mappings to compare.
  691. /// \returns true if the IRSimilarityCandidates operands are compatible.
  692. static bool compareCommutativeOperandMapping(OperandMapping A,
  693. OperandMapping B);
  694. /// Compare the relative locations in \p A and \p B and check that the
  695. /// distances match if both locations are contained in the region, and that
  696. /// the branches both point outside the region if they do not.
  697. /// Example Region:
  698. /// \code
  699. /// entry:
  700. /// br i1 %0, label %block_1, label %block_3
  701. /// block_0:
  702. /// br i1 %0, label %block_1, label %block_2
  703. /// block_1:
  704. /// br i1 %0, label %block_2, label %block_3
  705. /// block_2:
  706. /// br i1 %1, label %block_1, label %block_4
  707. /// block_3:
  708. /// br i1 %2, label %block_2, label %block_5
  709. /// \endcode
  710. /// If we compare the branches in block_0 and block_1 the relative values are
  711. /// 1 and 2 for both, so we consider this a match.
  712. ///
  713. /// If we compare the branches in entry and block_0 the relative values are
  714. /// 2 and 3, and 1 and 2 respectively. Since these are not the same we do not
  715. /// consider them a match.
  716. ///
  717. /// If we compare the branches in block_1 and block_2 the relative values are
  718. /// 1 and 2, and -1 and None respectively. As a result we do not consider
  719. /// these to be the same
  720. ///
  721. /// If we compare the branches in block_2 and block_3 the relative values are
  722. /// -1 and None for both. We do consider these to be a match.
  723. ///
  724. /// \param A - The first IRInstructionCandidate, relative location value,
  725. /// and incoming block.
  726. /// \param B - The second IRInstructionCandidate, relative location value,
  727. /// and incoming block.
  728. /// \returns true if the relative locations match.
  729. static bool checkRelativeLocations(RelativeLocMapping A,
  730. RelativeLocMapping B);
  731. /// Create a mapping from the value numbering to a different separate set of
  732. /// numbers. This will serve as a guide for relating one candidate to another.
  733. /// The canonical number gives use the ability identify which global value
  734. /// number in one candidate relates to the global value number in the other.
  735. ///
  736. /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a
  737. /// canonical numbering for.
  738. static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand);
  739. /// Create a mapping for the value numbering of the calling
  740. /// IRSimilarityCandidate, to a different separate set of numbers, based on
  741. /// the canonical ordering in \p SourceCand. These are defined based on the
  742. /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of
  743. /// these relationships should have the same information, just in opposite
  744. /// directions.
  745. ///
  746. /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
  747. /// canonical numbering from.
  748. /// \param ToSourceMapping - The mapping of value numbers from this candidate
  749. /// to \p SourceCand.
  750. /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand
  751. /// to this candidate.
  752. void createCanonicalRelationFrom(
  753. IRSimilarityCandidate &SourceCand,
  754. DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
  755. DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
  756. /// \param [in,out] BBSet - The set to track the basic blocks.
  757. void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const {
  758. for (IRInstructionData &ID : *this) {
  759. BasicBlock *BB = ID.Inst->getParent();
  760. BBSet.insert(BB);
  761. }
  762. }
  763. /// \param [in,out] BBSet - The set to track the basic blocks.
  764. /// \param [in,out] BBList - A list in order of use to track the basic blocks.
  765. void getBasicBlocks(DenseSet<BasicBlock *> &BBSet,
  766. SmallVector<BasicBlock *> &BBList) const {
  767. for (IRInstructionData &ID : *this) {
  768. BasicBlock *BB = ID.Inst->getParent();
  769. if (BBSet.insert(BB).second)
  770. BBList.push_back(BB);
  771. }
  772. }
  773. /// Compare the start and end indices of the two IRSimilarityCandidates for
  774. /// whether they overlap. If the start instruction of one
  775. /// IRSimilarityCandidate is less than the end instruction of the other, and
  776. /// the start instruction of one is greater than the start instruction of the
  777. /// other, they overlap.
  778. ///
  779. /// \returns true if the IRSimilarityCandidates do not have overlapping
  780. /// instructions.
  781. static bool overlap(const IRSimilarityCandidate &A,
  782. const IRSimilarityCandidate &B);
  783. /// \returns the number of instructions in this Candidate.
  784. unsigned getLength() const { return Len; }
  785. /// \returns the start index of this IRSimilarityCandidate.
  786. unsigned getStartIdx() const { return StartIdx; }
  787. /// \returns the end index of this IRSimilarityCandidate.
  788. unsigned getEndIdx() const { return StartIdx + Len - 1; }
  789. /// \returns The first IRInstructionData.
  790. IRInstructionData *front() const { return FirstInst; }
  791. /// \returns The last IRInstructionData.
  792. IRInstructionData *back() const { return LastInst; }
  793. /// \returns The first Instruction.
  794. Instruction *frontInstruction() { return FirstInst->Inst; }
  795. /// \returns The last Instruction
  796. Instruction *backInstruction() { return LastInst->Inst; }
  797. /// \returns The BasicBlock the IRSimilarityCandidate starts in.
  798. BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
  799. /// \returns The BasicBlock the IRSimilarityCandidate ends in.
  800. BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
  801. /// \returns The Function that the IRSimilarityCandidate is located in.
  802. Function *getFunction() { return getStartBB()->getParent(); }
  803. /// Finds the positive number associated with \p V if it has been mapped.
  804. /// \param [in] V - the Value to find.
  805. /// \returns The positive number corresponding to the value.
  806. /// \returns std::nullopt if not present.
  807. std::optional<unsigned> getGVN(Value *V) {
  808. assert(V != nullptr && "Value is a nullptr?");
  809. DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
  810. if (VNIt == ValueToNumber.end())
  811. return std::nullopt;
  812. return VNIt->second;
  813. }
  814. /// Finds the Value associate with \p Num if it exists.
  815. /// \param [in] Num - the number to find.
  816. /// \returns The Value associated with the number.
  817. /// \returns std::nullopt if not present.
  818. std::optional<Value *> fromGVN(unsigned Num) {
  819. DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
  820. if (VNIt == NumberToValue.end())
  821. return std::nullopt;
  822. assert(VNIt->second != nullptr && "Found value is a nullptr!");
  823. return VNIt->second;
  824. }
  825. /// Find the canonical number from the global value number \p N stored in the
  826. /// candidate.
  827. ///
  828. /// \param N - The global value number to find the canonical number for.
  829. /// \returns An optional containing the value, and std::nullopt if it could
  830. /// not be found.
  831. std::optional<unsigned> getCanonicalNum(unsigned N) {
  832. DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(N);
  833. if (NCIt == NumberToCanonNum.end())
  834. return std::nullopt;
  835. return NCIt->second;
  836. }
  837. /// Find the global value number from the canonical number \p N stored in the
  838. /// candidate.
  839. ///
  840. /// \param N - The canonical number to find the global vlaue number for.
  841. /// \returns An optional containing the value, and std::nullopt if it could
  842. /// not be found.
  843. std::optional<unsigned> fromCanonicalNum(unsigned N) {
  844. DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(N);
  845. if (CNIt == CanonNumToNumber.end())
  846. return std::nullopt;
  847. return CNIt->second;
  848. }
  849. /// \param RHS -The IRSimilarityCandidate to compare against
  850. /// \returns true if the IRSimilarityCandidate is occurs after the
  851. /// IRSimilarityCandidate in the program.
  852. bool operator<(const IRSimilarityCandidate &RHS) const {
  853. return getStartIdx() > RHS.getStartIdx();
  854. }
  855. using iterator = IRInstructionDataList::iterator;
  856. iterator begin() const { return iterator(front()); }
  857. iterator end() const { return std::next(iterator(back())); }
  858. };
  859. typedef DenseMap<IRSimilarityCandidate *,
  860. DenseMap<unsigned, DenseSet<unsigned>>>
  861. CandidateGVNMapping;
  862. typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
  863. typedef std::vector<SimilarityGroup> SimilarityGroupList;
  864. /// This class puts all the pieces of the IRInstructionData,
  865. /// IRInstructionMapper, IRSimilarityCandidate together.
  866. ///
  867. /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
  868. /// and puts all the mapped instructions into a single long list of
  869. /// IRInstructionData.
  870. ///
  871. /// The list of unsigned integers is given to the Suffix Tree or similar data
  872. /// structure to find repeated subsequences. We construct an
  873. /// IRSimilarityCandidate for each instance of the subsequence. We compare them
  874. /// against one another since These repeated subsequences can have different
  875. /// structure. For each different kind of structure found, we create a
  876. /// similarity group.
  877. ///
  878. /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
  879. /// structurally similar to one another, while C is different we would have two
  880. /// SimilarityGroups:
  881. ///
  882. /// SimilarityGroup 1: SimilarityGroup 2
  883. /// A, B, D C
  884. ///
  885. /// A list of the different similarity groups is then returned after
  886. /// analyzing the module.
  887. class IRSimilarityIdentifier {
  888. public:
  889. IRSimilarityIdentifier(bool MatchBranches = true,
  890. bool MatchIndirectCalls = true,
  891. bool MatchCallsWithName = false,
  892. bool MatchIntrinsics = true,
  893. bool MatchMustTailCalls = true)
  894. : Mapper(&InstDataAllocator, &InstDataListAllocator),
  895. EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls),
  896. EnableMatchingCallsByName(MatchCallsWithName),
  897. EnableIntrinsics(MatchIntrinsics),
  898. EnableMustTailCalls(MatchMustTailCalls) {}
  899. private:
  900. /// Map the instructions in the module to unsigned integers, using mapping
  901. /// already present in the Mapper if possible.
  902. ///
  903. /// \param [in] M Module - To map to integers.
  904. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  905. /// \param [in,out] IntegerMapping - The vector to append integers to.
  906. void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
  907. std::vector<unsigned> &IntegerMapping);
  908. /// Map the instructions in the modules vector to unsigned integers, using
  909. /// mapping already present in the mapper if possible.
  910. ///
  911. /// \param [in] Modules - The list of modules to use to populate the mapper
  912. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  913. /// \param [in,out] IntegerMapping - The vector to append integers to.
  914. void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
  915. std::vector<IRInstructionData *> &InstrList,
  916. std::vector<unsigned> &IntegerMapping);
  917. /// Find the similarity candidates in \p InstrList and corresponding
  918. /// \p UnsignedVec
  919. ///
  920. /// \param [in,out] InstrList - The vector to append IRInstructionData to.
  921. /// \param [in,out] IntegerMapping - The vector to append integers to.
  922. /// candidates found in the program.
  923. void findCandidates(std::vector<IRInstructionData *> &InstrList,
  924. std::vector<unsigned> &IntegerMapping);
  925. public:
  926. // Find the IRSimilarityCandidates in the \p Modules and group by structural
  927. // similarity in a SimilarityGroup, each group is returned in a
  928. // SimilarityGroupList.
  929. //
  930. // \param [in] Modules - the modules to analyze.
  931. // \returns The groups of similarity ranges found in the modules.
  932. SimilarityGroupList &
  933. findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
  934. // Find the IRSimilarityCandidates in the given Module grouped by structural
  935. // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
  936. //
  937. // \param [in] M - the module to analyze.
  938. // \returns The groups of similarity ranges found in the module.
  939. SimilarityGroupList &findSimilarity(Module &M);
  940. // Clears \ref SimilarityCandidates if it is already filled by a previous run.
  941. void resetSimilarityCandidates() {
  942. // If we've already analyzed a Module or set of Modules, so we must clear
  943. // the SimilarityCandidates to make sure we do not have only old values
  944. // hanging around.
  945. if (SimilarityCandidates)
  946. SimilarityCandidates->clear();
  947. else
  948. SimilarityCandidates = SimilarityGroupList();
  949. }
  950. // \returns The groups of similarity ranges found in the most recently passed
  951. // set of modules.
  952. std::optional<SimilarityGroupList> &getSimilarity() {
  953. return SimilarityCandidates;
  954. }
  955. private:
  956. /// The allocator for IRInstructionData.
  957. SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
  958. /// The allocator for IRInstructionDataLists.
  959. SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
  960. /// Map Instructions to unsigned integers and wraps the Instruction in an
  961. /// instance of IRInstructionData.
  962. IRInstructionMapper Mapper;
  963. /// The flag variable that marks whether we should check branches for
  964. /// similarity, or only look within basic blocks.
  965. bool EnableBranches = true;
  966. /// The flag variable that marks whether we allow indirect calls to be checked
  967. /// for similarity, or exclude them as a legal instruction.
  968. bool EnableIndirectCalls = true;
  969. /// The flag variable that marks whether we allow calls to be marked as
  970. /// similar if they do not have the same name, only the same calling
  971. /// convention, attributes and type signature.
  972. bool EnableMatchingCallsByName = true;
  973. /// The flag variable that marks whether we should check intrinsics for
  974. /// similarity.
  975. bool EnableIntrinsics = true;
  976. // The flag variable that marks whether we should allow tailcalls
  977. // to be checked for similarity.
  978. bool EnableMustTailCalls = false;
  979. /// The SimilarityGroups found with the most recent run of \ref
  980. /// findSimilarity. std::nullopt if there is no recent run.
  981. std::optional<SimilarityGroupList> SimilarityCandidates;
  982. };
  983. } // end namespace IRSimilarity
  984. /// An analysis pass based on legacy pass manager that runs and returns
  985. /// IRSimilarityIdentifier run on the Module.
  986. class IRSimilarityIdentifierWrapperPass : public ModulePass {
  987. std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
  988. public:
  989. static char ID;
  990. IRSimilarityIdentifierWrapperPass();
  991. IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
  992. const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
  993. bool doInitialization(Module &M) override;
  994. bool doFinalization(Module &M) override;
  995. bool runOnModule(Module &M) override;
  996. void getAnalysisUsage(AnalysisUsage &AU) const override {
  997. AU.setPreservesAll();
  998. }
  999. };
  1000. /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
  1001. /// Module.
  1002. class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
  1003. public:
  1004. typedef IRSimilarity::IRSimilarityIdentifier Result;
  1005. Result run(Module &M, ModuleAnalysisManager &);
  1006. private:
  1007. friend AnalysisInfoMixin<IRSimilarityAnalysis>;
  1008. static AnalysisKey Key;
  1009. };
  1010. /// Printer pass that uses \c IRSimilarityAnalysis.
  1011. class IRSimilarityAnalysisPrinterPass
  1012. : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
  1013. raw_ostream &OS;
  1014. public:
  1015. explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
  1016. PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
  1017. };
  1018. } // end namespace llvm
  1019. #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
  1020. #ifdef __GNUC__
  1021. #pragma GCC diagnostic pop
  1022. #endif