OpenMPOpt.cpp 88 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459
  1. //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // OpenMP specific optimizations:
  10. //
  11. // - Deduplication of runtime calls, e.g., omp_get_thread_num.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #include "llvm/Transforms/IPO/OpenMPOpt.h"
  15. #include "llvm/ADT/EnumeratedArray.h"
  16. #include "llvm/ADT/Statistic.h"
  17. #include "llvm/Analysis/CallGraph.h"
  18. #include "llvm/Analysis/CallGraphSCCPass.h"
  19. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  20. #include "llvm/Analysis/ValueTracking.h"
  21. #include "llvm/Frontend/OpenMP/OMPConstants.h"
  22. #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  23. #include "llvm/InitializePasses.h"
  24. #include "llvm/Support/CommandLine.h"
  25. #include "llvm/Transforms/IPO.h"
  26. #include "llvm/Transforms/IPO/Attributor.h"
  27. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  28. #include "llvm/Transforms/Utils/CallGraphUpdater.h"
  29. #include "llvm/Transforms/Utils/CodeExtractor.h"
  30. using namespace llvm;
  31. using namespace omp;
  32. #define DEBUG_TYPE "openmp-opt"
// Kill switch: disables every transformation this pass performs.
static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::ZeroOrMore,
    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
    cl::init(false));

// Opt-in flag for the (still experimental) parallel region merging
// optimization; off by default, see mergeParallelRegions().
static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging", cl::ZeroOrMore,
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

// Testing aid: emit the initial ICV values as optimization remarks.
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);

// Testing aid: emit the detected OpenMP GPU kernels as optimization remarks.
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

// Guards the work-in-progress host-to-device memory transfer latency hiding,
// see hideMemTransfersLatency().
static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));
// Statistics collected by this pass; printed with -stats.
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");

#if !defined(NDEBUG)
// Prefix used in LLVM_DEBUG output of this pass.
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif
  68. namespace {
  69. struct AAICVTracker;
  70. /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
  71. /// Attributor runs.
struct OMPInformationCache : public InformationCache {
  /// Construct the cache for module \p M. Initializes the OpenMP-IR-Builder
  /// and then gathers all runtime function declarations and internal control
  /// variable (ICV) descriptions found in the module.
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
                      SmallPtrSetImpl<Kernel> &Kernels)
      : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
        Kernels(Kernels) {
    OMPBuilder.initialize();
    initializeRuntimeFunctions();
    initializeInternalControlVars();
  }

  /// Generic information that describes an internal control variable.
  struct InternalControlVarInfo {
    /// The kind, as described by InternalControlVar enum.
    InternalControlVar Kind;

    /// The name of the ICV.
    StringRef Name;

    /// Environment variable associated with this ICV.
    StringRef EnvVarName;

    /// Initial value kind.
    ICVInitValue InitKind;

    /// Initial value, or nullptr for ICV_IMPLEMENTATION_DEFINED init kinds.
    ConstantInt *InitValue;

    /// Setter RTL function associated with this ICV.
    RuntimeFunction Setter;

    /// Getter RTL function associated with this ICV.
    RuntimeFunction Getter;

    /// RTL Function corresponding to the override clause of this ICV
    RuntimeFunction Clause;
  };

  /// Generic information that describes a runtime function
  struct RuntimeFunctionInfo {

    /// The kind, as described by the RuntimeFunction enum.
    RuntimeFunction Kind;

    /// The name of the function.
    StringRef Name;

    /// Flag to indicate a variadic function.
    bool IsVarArg;

    /// The return type of the function.
    Type *ReturnType;

    /// The argument types of the function.
    SmallVector<Type *, 8> ArgumentTypes;

    /// The declaration if available.
    Function *Declaration = nullptr;

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear UsesMap for runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F. A nullptr key collects
    /// uses outside of any function (see collectUses).
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use and forget the use if the result is
    /// true. The callback will be fed the function in which the use was
    /// encountered as second argument.
    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    /// Run the callback \p CB on each use within the function \p F and forget
    /// the use if the result is true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      ToBeDeleted.clear();

      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);

      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }

      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices. Each removal is a
      // swap-with-back followed by a pop, so the order of remaining uses is
      // not preserved.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }

  private:
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
  };

  /// An OpenMP-IR-Builder instance
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;

  /// Helper to initialize all internal control variable information for those
  /// defined in OMPKinds.def. The macros below are expanded by the subsequent
  /// #include of OMPKinds.def, once per ICV described there.
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.Name = _Name;                                                          \
    ICV.Kind = Enum;                                                           \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    case ICV_LAST:                                                             \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType, and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    // TODO: We should output information to the user (under debug output
    //       and via remarks).

    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    // Compare argument types position by position.
    auto RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;

      ++RTFTyIt;
    }

    return true;
  }

  // Helper to collect all uses of the declaration in the UsesMap. Returns the
  // number of uses recorded; non-instruction users are filed under the
  // nullptr key.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;
    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    // TODO: We directly convert uses into proper calls and unknown uses.
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        // Only record uses inside the module slice we are operating on.
        if (ModuleSlice.count(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of a runtime function. Does not update
  // statistics as the declaration was already counted.
  void recollectUsesForFunction(RuntimeFunction RTF) {
    auto &RFI = RFIs[RTF];
    RFI.clearUsesMap();
    collectUses(RFI, /*CollectStats*/ false);
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
  }

  /// Helper to initialize all runtime function information for those defined
  /// in OpenMPKinds.def. The OMP_* macros are expanded by the #include of
  /// OMPKinds.def below, once per type/function described there.
  void initializeRuntimeFunctions() {
    // The module is recovered from any function in the slice; the slice is
    // never empty here.
    Module &M = *((*ModuleSlice.begin())->getParent());

    // Helper macros for handling __VA_ARGS__ in OMP_RTL
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // TODO: We should attach the attributes defined in OMPKinds.def.
  }

  /// Collection of known kernels (\see Kernel) in the module.
  SmallPtrSetImpl<Kernel> &Kernels;
};
  330. /// Used to map the values physically (in the IR) stored in an offload
  331. /// array, to a vector in memory.
  332. struct OffloadArray {
  333. /// Physical array (in the IR).
  334. AllocaInst *Array = nullptr;
  335. /// Mapped values.
  336. SmallVector<Value *, 8> StoredValues;
  337. /// Last stores made in the offload array.
  338. SmallVector<StoreInst *, 8> LastAccesses;
  339. OffloadArray() = default;
  340. /// Initializes the OffloadArray with the values stored in \p Array before
  341. /// instruction \p Before is reached. Returns false if the initialization
  342. /// fails.
  343. /// This MUST be used immediately after the construction of the object.
  344. bool initialize(AllocaInst &Array, Instruction &Before) {
  345. if (!Array.getAllocatedType()->isArrayTy())
  346. return false;
  347. if (!getValues(Array, Before))
  348. return false;
  349. this->Array = &Array;
  350. return true;
  351. }
  352. static const unsigned DeviceIDArgNum = 1;
  353. static const unsigned BasePtrsArgNum = 3;
  354. static const unsigned PtrsArgNum = 4;
  355. static const unsigned SizesArgNum = 5;
  356. private:
  357. /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  358. /// \p Array, leaving StoredValues with the values stored before the
  359. /// instruction \p Before is reached.
  360. bool getValues(AllocaInst &Array, Instruction &Before) {
  361. // Initialize container.
  362. const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
  363. StoredValues.assign(NumValues, nullptr);
  364. LastAccesses.assign(NumValues, nullptr);
  365. // TODO: This assumes the instruction \p Before is in the same
  366. // BasicBlock as Array. Make it general, for any control flow graph.
  367. BasicBlock *BB = Array.getParent();
  368. if (BB != Before.getParent())
  369. return false;
  370. const DataLayout &DL = Array.getModule()->getDataLayout();
  371. const unsigned int PointerSize = DL.getPointerSize();
  372. for (Instruction &I : *BB) {
  373. if (&I == &Before)
  374. break;
  375. if (!isa<StoreInst>(&I))
  376. continue;
  377. auto *S = cast<StoreInst>(&I);
  378. int64_t Offset = -1;
  379. auto *Dst =
  380. GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
  381. if (Dst == &Array) {
  382. int64_t Idx = Offset / PointerSize;
  383. StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
  384. LastAccesses[Idx] = S;
  385. }
  386. }
  387. return isFilled();
  388. }
  389. /// Returns true if all values in StoredValues and
  390. /// LastAccesses are not nullptrs.
  391. bool isFilled() {
  392. const unsigned NumValues = StoredValues.size();
  393. for (unsigned I = 0; I < NumValues; ++I) {
  394. if (!StoredValues[I] || !LastAccesses[I])
  395. return false;
  396. }
  397. return true;
  398. }
  399. };
  400. struct OpenMPOpt {
  /// Callback type to lazily obtain the OptimizationRemarkEmitter for a
  /// function.
  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  /// Construct the optimizer for the non-empty \p SCC. All referenced
  /// analysis structures are borrowed and must outlive this object.
  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
  408. /// Check if any remarks are enabled for openmp-opt
  409. bool remarksEnabled() {
  410. auto &Ctx = M.getContext();
  411. return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  412. }
  /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
  /// Returns true if the IR was changed.
  bool run() {
    if (SCC.empty())
      return false;

    bool Changed = false;

    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                      << " functions in a slice with "
                      << OMPInfoCache.ModuleSlice.size() << " functions\n");

    // Testing output requested via command line flags.
    if (PrintICVValues)
      printICVs();
    if (PrintOpenMPKernels)
      printKernels();

    Changed |= rewriteDeviceCodeStateMachine();

    Changed |= runAttributor();

    // Recollect uses, in case Attributor deleted any.
    OMPInfoCache.recollectUses();

    Changed |= deleteParallelRegions();
    if (HideMemoryTransferLatency)
      Changed |= hideMemTransfersLatency();
    if (remarksEnabled())
      analysisGlobalization();
    Changed |= deduplicateRuntimeCalls();
    if (EnableParallelRegionMerging) {
      if (mergeParallelRegions()) {
        // Merging may expose further deduplication opportunities.
        deduplicateRuntimeCalls();
        Changed = true;
      }
    }

    return Changed;
  }
  /// Print initial ICV values for testing.
  /// FIXME: This should be done from the Attributor once it is added.
  void printICVs() const {
    // Only a subset of all ICVs is printed here.
    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
                                 ICV_proc_bind};

    // Emit one remark per (function, ICV) pair.
    for (Function *F : OMPInfoCache.ModuleSlice) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                    << " Value: "
                    << (ICVInfo.InitValue
                            ? ICVInfo.InitValue->getValue().toString(10, true)
                            : "IMPLEMENTATION_DEFINED");
        };

        emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
      }
    }
  }
  462. /// Print OpenMP GPU kernels for testing.
  463. void printKernels() const {
  464. for (Function *F : SCC) {
  465. if (!OMPInfoCache.Kernels.count(F))
  466. continue;
  467. auto Remark = [&](OptimizationRemark OR) {
  468. return OR << "OpenMP GPU kernel "
  469. << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
  470. };
  471. emitRemarkOnFunction(F, "OpenMPGPU", Remark);
  472. }
  473. }
  474. /// Return the call if \p U is a callee use in a regular call. If \p RFI is
  475. /// given it has to be the callee or a nullptr is returned.
  476. static CallInst *getCallIfRegularCall(
  477. Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
  478. CallInst *CI = dyn_cast<CallInst>(U.getUser());
  479. if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
  480. (!RFI || CI->getCalledFunction() == RFI->Declaration))
  481. return CI;
  482. return nullptr;
  483. }
  484. /// Return the call if \p V is a regular call. If \p RFI is given it has to be
  485. /// the callee or a nullptr is returned.
  486. static CallInst *getCallIfRegularCall(
  487. Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
  488. CallInst *CI = dyn_cast<CallInst>(&V);
  489. if (CI && !CI->hasOperandBundles() &&
  490. (!RFI || CI->getCalledFunction() == RFI->Declaration))
  491. return CI;
  492. return nullptr;
  493. }
  494. private:
  495. /// Merge parallel regions when it is safe.
  496. bool mergeParallelRegions() {
  497. const unsigned CallbackCalleeOperand = 2;
  498. const unsigned CallbackFirstArgOperand = 3;
  499. using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
  500. // Check if there are any __kmpc_fork_call calls to merge.
  501. OMPInformationCache::RuntimeFunctionInfo &RFI =
  502. OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
  503. if (!RFI.Declaration)
  504. return false;
  505. // Unmergable calls that prevent merging a parallel region.
  506. OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
  507. OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
  508. OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
  509. };
  510. bool Changed = false;
  511. LoopInfo *LI = nullptr;
  512. DominatorTree *DT = nullptr;
  513. SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
  514. BasicBlock *StartBB = nullptr, *EndBB = nullptr;
  515. auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
  516. BasicBlock &ContinuationIP) {
  517. BasicBlock *CGStartBB = CodeGenIP.getBlock();
  518. BasicBlock *CGEndBB =
  519. SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
  520. assert(StartBB != nullptr && "StartBB should not be null");
  521. CGStartBB->getTerminator()->setSuccessor(0, StartBB);
  522. assert(EndBB != nullptr && "EndBB should not be null");
  523. EndBB->getTerminator()->setSuccessor(0, CGEndBB);
  524. };
  525. auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
  526. Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
  527. ReplacementValue = &Inner;
  528. return CodeGenIP;
  529. };
  530. auto FiniCB = [&](InsertPointTy CodeGenIP) {};
  531. /// Create a sequential execution region within a merged parallel region,
  532. /// encapsulated in a master construct with a barrier for synchronization.
  533. auto CreateSequentialRegion = [&](Function *OuterFn,
  534. BasicBlock *OuterPredBB,
  535. Instruction *SeqStartI,
  536. Instruction *SeqEndI) {
  537. // Isolate the instructions of the sequential region to a separate
  538. // block.
  539. BasicBlock *ParentBB = SeqStartI->getParent();
  540. BasicBlock *SeqEndBB =
  541. SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
  542. BasicBlock *SeqAfterBB =
  543. SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
  544. BasicBlock *SeqStartBB =
  545. SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
  546. assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
  547. "Expected a different CFG");
  548. const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
  549. ParentBB->getTerminator()->eraseFromParent();
  550. auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
  551. BasicBlock &ContinuationIP) {
  552. BasicBlock *CGStartBB = CodeGenIP.getBlock();
  553. BasicBlock *CGEndBB =
  554. SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
  555. assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
  556. CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
  557. assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
  558. SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
  559. };
  560. auto FiniCB = [&](InsertPointTy CodeGenIP) {};
  561. // Find outputs from the sequential region to outside users and
  562. // broadcast their values to them.
  563. for (Instruction &I : *SeqStartBB) {
  564. SmallPtrSet<Instruction *, 4> OutsideUsers;
  565. for (User *Usr : I.users()) {
  566. Instruction &UsrI = *cast<Instruction>(Usr);
  567. // Ignore outputs to LT intrinsics, code extraction for the merged
  568. // parallel region will fix them.
  569. if (UsrI.isLifetimeStartOrEnd())
  570. continue;
  571. if (UsrI.getParent() != SeqStartBB)
  572. OutsideUsers.insert(&UsrI);
  573. }
  574. if (OutsideUsers.empty())
  575. continue;
  576. // Emit an alloca in the outer region to store the broadcasted
  577. // value.
  578. const DataLayout &DL = M.getDataLayout();
  579. AllocaInst *AllocaI = new AllocaInst(
  580. I.getType(), DL.getAllocaAddrSpace(), nullptr,
  581. I.getName() + ".seq.output.alloc", &OuterFn->front().front());
  582. // Emit a store instruction in the sequential BB to update the
  583. // value.
  584. new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
  585. // Emit a load instruction and replace the use of the output value
  586. // with it.
  587. for (Instruction *UsrI : OutsideUsers) {
  588. LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
  589. I.getName() + ".seq.output.load", UsrI);
  590. UsrI->replaceUsesOfWith(&I, LoadI);
  591. }
  592. }
  593. OpenMPIRBuilder::LocationDescription Loc(
  594. InsertPointTy(ParentBB, ParentBB->end()), DL);
  595. InsertPointTy SeqAfterIP =
  596. OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
  597. OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
  598. BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
  599. LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
  600. << "\n");
  601. };
  602. // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
  603. // contained in BB and only separated by instructions that can be
  604. // redundantly executed in parallel. The block BB is split before the first
  605. // call (in MergableCIs) and after the last so the entire region we merge
  606. // into a single parallel region is contained in a single basic block
  607. // without any other instructions. We use the OpenMPIRBuilder to outline
  608. // that block and call the resulting function via __kmpc_fork_call.
  609. auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
  610. // TODO: Change the interface to allow single CIs expanded, e.g, to
  611. // include an outer loop.
  612. assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
  613. auto Remark = [&](OptimizationRemark OR) {
  614. OR << "Parallel region at "
  615. << ore::NV("OpenMPParallelMergeFront",
  616. MergableCIs.front()->getDebugLoc())
  617. << " merged with parallel regions at ";
  618. for (auto *CI : llvm::drop_begin(MergableCIs)) {
  619. OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
  620. if (CI != MergableCIs.back())
  621. OR << ", ";
  622. }
  623. return OR;
  624. };
  625. emitRemark<OptimizationRemark>(MergableCIs.front(),
  626. "OpenMPParallelRegionMerging", Remark);
  627. Function *OriginalFn = BB->getParent();
  628. LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
  629. << " parallel regions in " << OriginalFn->getName()
  630. << "\n");
  631. // Isolate the calls to merge in a separate block.
  632. EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
  633. BasicBlock *AfterBB =
  634. SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
  635. StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
  636. "omp.par.merged");
  637. assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
  638. const DebugLoc DL = BB->getTerminator()->getDebugLoc();
  639. BB->getTerminator()->eraseFromParent();
  640. // Create sequential regions for sequential instructions that are
  641. // in-between mergable parallel regions.
  642. for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
  643. It != End; ++It) {
  644. Instruction *ForkCI = *It;
  645. Instruction *NextForkCI = *(It + 1);
  646. // Continue if there are not in-between instructions.
  647. if (ForkCI->getNextNode() == NextForkCI)
  648. continue;
  649. CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
  650. NextForkCI->getPrevNode());
  651. }
  652. OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
  653. DL);
  654. IRBuilder<>::InsertPoint AllocaIP(
  655. &OriginalFn->getEntryBlock(),
  656. OriginalFn->getEntryBlock().getFirstInsertionPt());
  657. // Create the merged parallel region with default proc binding, to
  658. // avoid overriding binding settings, and without explicit cancellation.
  659. InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
  660. Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
  661. OMP_PROC_BIND_default, /* IsCancellable */ false);
  662. BranchInst::Create(AfterBB, AfterIP.getBlock());
  663. // Perform the actual outlining.
  664. OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
  665. Function *OutlinedFn = MergableCIs.front()->getCaller();
  666. // Replace the __kmpc_fork_call calls with direct calls to the outlined
  667. // callbacks.
  668. SmallVector<Value *, 8> Args;
  669. for (auto *CI : MergableCIs) {
  670. Value *Callee =
  671. CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
  672. FunctionType *FT =
  673. cast<FunctionType>(Callee->getType()->getPointerElementType());
  674. Args.clear();
  675. Args.push_back(OutlinedFn->getArg(0));
  676. Args.push_back(OutlinedFn->getArg(1));
  677. for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
  678. U < E; ++U)
  679. Args.push_back(CI->getArgOperand(U));
  680. CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
  681. if (CI->getDebugLoc())
  682. NewCI->setDebugLoc(CI->getDebugLoc());
  683. // Forward parameter attributes from the callback to the callee.
  684. for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
  685. U < E; ++U)
  686. for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
  687. NewCI->addParamAttr(
  688. U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
  689. // Emit an explicit barrier to replace the implicit fork-join barrier.
  690. if (CI != MergableCIs.back()) {
  691. // TODO: Remove barrier if the merged parallel region includes the
  692. // 'nowait' clause.
  693. OMPInfoCache.OMPBuilder.createBarrier(
  694. InsertPointTy(NewCI->getParent(),
  695. NewCI->getNextNode()->getIterator()),
  696. OMPD_parallel);
  697. }
  698. auto Remark = [&](OptimizationRemark OR) {
  699. return OR << "Parallel region at "
  700. << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
  701. << " merged with "
  702. << ore::NV("OpenMPParallelMergeFront",
  703. MergableCIs.front()->getDebugLoc());
  704. };
  705. if (CI != MergableCIs.front())
  706. emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
  707. Remark);
  708. CI->eraseFromParent();
  709. }
  710. assert(OutlinedFn != OriginalFn && "Outlining failed");
  711. CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
  712. CGUpdater.reanalyzeFunction(*OriginalFn);
  713. NumOpenMPParallelRegionsMerged += MergableCIs.size();
  714. return true;
  715. };
  716. // Helper function that identifes sequences of
  717. // __kmpc_fork_call uses in a basic block.
  718. auto DetectPRsCB = [&](Use &U, Function &F) {
  719. CallInst *CI = getCallIfRegularCall(U, &RFI);
  720. BB2PRMap[CI->getParent()].insert(CI);
  721. return false;
  722. };
  723. BB2PRMap.clear();
  724. RFI.foreachUse(SCC, DetectPRsCB);
  725. SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
  726. // Find mergable parallel regions within a basic block that are
  727. // safe to merge, that is any in-between instructions can safely
  728. // execute in parallel after merging.
  729. // TODO: support merging across basic-blocks.
  730. for (auto &It : BB2PRMap) {
  731. auto &CIs = It.getSecond();
  732. if (CIs.size() < 2)
  733. continue;
  734. BasicBlock *BB = It.getFirst();
  735. SmallVector<CallInst *, 4> MergableCIs;
  736. /// Returns true if the instruction is mergable, false otherwise.
  737. /// A terminator instruction is unmergable by definition since merging
  738. /// works within a BB. Instructions before the mergable region are
  739. /// mergable if they are not calls to OpenMP runtime functions that may
  740. /// set different execution parameters for subsequent parallel regions.
  741. /// Instructions in-between parallel regions are mergable if they are not
  742. /// calls to any non-intrinsic function since that may call a non-mergable
  743. /// OpenMP runtime function.
  744. auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
  745. // We do not merge across BBs, hence return false (unmergable) if the
  746. // instruction is a terminator.
  747. if (I.isTerminator())
  748. return false;
  749. if (!isa<CallInst>(&I))
  750. return true;
  751. CallInst *CI = cast<CallInst>(&I);
  752. if (IsBeforeMergableRegion) {
  753. Function *CalledFunction = CI->getCalledFunction();
  754. if (!CalledFunction)
  755. return false;
  756. // Return false (unmergable) if the call before the parallel
  757. // region calls an explicit affinity (proc_bind) or number of
  758. // threads (num_threads) compiler-generated function. Those settings
  759. // may be incompatible with following parallel regions.
  760. // TODO: ICV tracking to detect compatibility.
  761. for (const auto &RFI : UnmergableCallsInfo) {
  762. if (CalledFunction == RFI.Declaration)
  763. return false;
  764. }
  765. } else {
  766. // Return false (unmergable) if there is a call instruction
  767. // in-between parallel regions when it is not an intrinsic. It
  768. // may call an unmergable OpenMP runtime function in its callpath.
  769. // TODO: Keep track of possible OpenMP calls in the callpath.
  770. if (!isa<IntrinsicInst>(CI))
  771. return false;
  772. }
  773. return true;
  774. };
  775. // Find maximal number of parallel region CIs that are safe to merge.
  776. for (auto It = BB->begin(), End = BB->end(); It != End;) {
  777. Instruction &I = *It;
  778. ++It;
  779. if (CIs.count(&I)) {
  780. MergableCIs.push_back(cast<CallInst>(&I));
  781. continue;
  782. }
  783. // Continue expanding if the instruction is mergable.
  784. if (IsMergable(I, MergableCIs.empty()))
  785. continue;
  786. // Forward the instruction iterator to skip the next parallel region
  787. // since there is an unmergable instruction which can affect it.
  788. for (; It != End; ++It) {
  789. Instruction &SkipI = *It;
  790. if (CIs.count(&SkipI)) {
  791. LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
  792. << " due to " << I << "\n");
  793. ++It;
  794. break;
  795. }
  796. }
  797. // Store mergable regions found.
  798. if (MergableCIs.size() > 1) {
  799. MergableCIsVector.push_back(MergableCIs);
  800. LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
  801. << " parallel regions in block " << BB->getName()
  802. << " of function " << BB->getParent()->getName()
  803. << "\n";);
  804. }
  805. MergableCIs.clear();
  806. }
  807. if (!MergableCIsVector.empty()) {
  808. Changed = true;
  809. for (auto &MergableCIs : MergableCIsVector)
  810. Merge(MergableCIs, BB);
  811. }
  812. }
  813. if (Changed) {
  814. /// Re-collect use for fork calls, emitted barrier calls, and
  815. /// any emitted master/end_master calls.
  816. OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
  817. OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
  818. OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
  819. OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
  820. }
  821. return Changed;
  822. }
/// Try to delete parallel regions if possible.
///
/// A __kmpc_fork_call can be removed entirely when the outlined parallel
/// region it invokes is provably read-only and guaranteed to return, since
/// executing it then has no observable effect.
///
/// \returns true if at least one parallel region was deleted.
bool deleteParallelRegions() {
  // Position of the outlined region callee in __kmpc_fork_call's argument
  // list (after ident and thread count).
  const unsigned CallbackCalleeOperand = 2;

  OMPInformationCache::RuntimeFunctionInfo &RFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

  // No declaration of the fork call means there are no parallel regions.
  if (!RFI.Declaration)
    return false;

  bool Changed = false;
  auto DeleteCallCB = [&](Use &U, Function &) {
    CallInst *CI = getCallIfRegularCall(U);
    if (!CI)
      return false;
    // Resolve the outlined parallel-region body through pointer casts.
    auto *Fn = dyn_cast<Function>(
        CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
    if (!Fn)
      return false;
    // The region must not write memory ...
    if (!Fn->onlyReadsMemory())
      return false;
    // ... and must be known to terminate normally.
    if (!Fn->hasFnAttribute(Attribute::WillReturn))
      return false;

    LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
                      << CI->getCaller()->getName() << "\n");

    auto Remark = [&](OptimizationRemark OR) {
      return OR << "Parallel region in "
                << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
                << " deleted";
    };
    emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
                                   Remark);

    // Keep the call graph consistent before erasing the call itself.
    CGUpdater.removeCallSite(*CI);
    CI->eraseFromParent();
    Changed = true;
    ++NumOpenMPParallelRegionsDeleted;
    return true;
  };

  RFI.foreachUse(SCC, DeleteCallCB);

  return Changed;
}
  861. /// Try to eliminate runtime calls by reusing existing ones.
  862. bool deduplicateRuntimeCalls() {
  863. bool Changed = false;
  864. RuntimeFunction DeduplicableRuntimeCallIDs[] = {
  865. OMPRTL_omp_get_num_threads,
  866. OMPRTL_omp_in_parallel,
  867. OMPRTL_omp_get_cancellation,
  868. OMPRTL_omp_get_thread_limit,
  869. OMPRTL_omp_get_supported_active_levels,
  870. OMPRTL_omp_get_level,
  871. OMPRTL_omp_get_ancestor_thread_num,
  872. OMPRTL_omp_get_team_size,
  873. OMPRTL_omp_get_active_level,
  874. OMPRTL_omp_in_final,
  875. OMPRTL_omp_get_proc_bind,
  876. OMPRTL_omp_get_num_places,
  877. OMPRTL_omp_get_num_procs,
  878. OMPRTL_omp_get_place_num,
  879. OMPRTL_omp_get_partition_num_places,
  880. OMPRTL_omp_get_partition_place_nums};
  881. // Global-tid is handled separately.
  882. SmallSetVector<Value *, 16> GTIdArgs;
  883. collectGlobalThreadIdArguments(GTIdArgs);
  884. LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
  885. << " global thread ID arguments\n");
  886. for (Function *F : SCC) {
  887. for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
  888. Changed |= deduplicateRuntimeCalls(
  889. *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
  890. // __kmpc_global_thread_num is special as we can replace it with an
  891. // argument in enough cases to make it worth trying.
  892. Value *GTIdArg = nullptr;
  893. for (Argument &Arg : F->args())
  894. if (GTIdArgs.count(&Arg)) {
  895. GTIdArg = &Arg;
  896. break;
  897. }
  898. Changed |= deduplicateRuntimeCalls(
  899. *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
  900. }
  901. return Changed;
  902. }
  903. /// Tries to hide the latency of runtime calls that involve host to
  904. /// device memory transfers by splitting them into their "issue" and "wait"
  905. /// versions. The "issue" is moved upwards as much as possible. The "wait" is
  906. /// moved downards as much as possible. The "issue" issues the memory transfer
  907. /// asynchronously, returning a handle. The "wait" waits in the returned
  908. /// handle for the memory transfer to finish.
  909. bool hideMemTransfersLatency() {
  910. auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
  911. bool Changed = false;
  912. auto SplitMemTransfers = [&](Use &U, Function &Decl) {
  913. auto *RTCall = getCallIfRegularCall(U, &RFI);
  914. if (!RTCall)
  915. return false;
  916. OffloadArray OffloadArrays[3];
  917. if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
  918. return false;
  919. LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
  920. // TODO: Check if can be moved upwards.
  921. bool WasSplit = false;
  922. Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
  923. if (WaitMovementPoint)
  924. WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
  925. Changed |= WasSplit;
  926. return WasSplit;
  927. };
  928. RFI.foreachUse(SCC, SplitMemTransfers);
  929. return Changed;
  930. }
  931. void analysisGlobalization() {
  932. RuntimeFunction GlobalizationRuntimeIDs[] = {
  933. OMPRTL___kmpc_data_sharing_coalesced_push_stack,
  934. OMPRTL___kmpc_data_sharing_push_stack};
  935. for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
  936. auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
  937. auto CheckGlobalization = [&](Use &U, Function &Decl) {
  938. if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
  939. auto Remark = [&](OptimizationRemarkAnalysis ORA) {
  940. return ORA
  941. << "Found thread data sharing on the GPU. "
  942. << "Expect degraded performance due to data globalization.";
  943. };
  944. emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
  945. Remark);
  946. }
  947. return false;
  948. };
  949. RFI.foreachUse(SCC, CheckGlobalization);
  950. }
  951. }
/// Maps the values stored in the offload arrays passed as arguments to
/// \p RuntimeCall into the offload arrays in \p OAs.
///
/// \returns true if all three arrays (base pointers, pointers, sizes) could
/// be traced back to allocas and initialized. The sizes array may instead be
/// a global; then true is returned only if it is a constant (and it is left
/// unanalyzed).
bool getValuesInOffloadArrays(CallInst &RuntimeCall,
                              MutableArrayRef<OffloadArray> OAs) {
  assert(OAs.size() == 3 && "Need space for three offload arrays!");

  // A runtime call that involves memory offloading looks something like:
  // call void @__tgt_target_data_begin_mapper(arg0, arg1,
  //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
  // ...)
  // So, the idea is to access the allocas that allocate space for these
  // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
  // Therefore:
  // i8** %offload_baseptrs.
  Value *BasePtrsArg =
      RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
  // i8** %offload_ptrs.
  Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
  // i8** %offload_sizes.
  Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

  // Get values stored in **offload_baseptrs.
  auto *V = getUnderlyingObject(BasePtrsArg);
  if (!isa<AllocaInst>(V))
    return false;
  auto *BasePtrsArray = cast<AllocaInst>(V);
  if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
    return false;

  // Get values stored in **offload_ptrs.
  V = getUnderlyingObject(PtrsArg);
  if (!isa<AllocaInst>(V))
    return false;
  auto *PtrsArray = cast<AllocaInst>(V);
  if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
    return false;

  // Get values stored in **offload_sizes.
  V = getUnderlyingObject(SizesArg);
  // If it's a [constant] global array don't analyze it.
  if (isa<GlobalValue>(V))
    return isa<Constant>(V);
  if (!isa<AllocaInst>(V))
    return false;
  auto *SizesArray = cast<AllocaInst>(V);
  if (!OAs[2].initialize(*SizesArray, RuntimeCall))
    return false;

  return true;
}
  997. /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  998. /// For now this is a way to test that the function getValuesInOffloadArrays
  999. /// is working properly.
  1000. /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  1001. void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
  1002. assert(OAs.size() == 3 && "There are three offload arrays to debug!");
  1003. LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
  1004. std::string ValuesStr;
  1005. raw_string_ostream Printer(ValuesStr);
  1006. std::string Separator = " --- ";
  1007. for (auto *BP : OAs[0].StoredValues) {
  1008. BP->print(Printer);
  1009. Printer << Separator;
  1010. }
  1011. LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
  1012. ValuesStr.clear();
  1013. for (auto *P : OAs[1].StoredValues) {
  1014. P->print(Printer);
  1015. Printer << Separator;
  1016. }
  1017. LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
  1018. ValuesStr.clear();
  1019. for (auto *S : OAs[2].StoredValues) {
  1020. S->print(Printer);
  1021. Printer << Separator;
  1022. }
  1023. LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
  1024. }
  1025. /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
  1026. /// moved. Returns nullptr if the movement is not possible, or not worth it.
  1027. Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
  1028. // FIXME: This traverses only the BasicBlock where RuntimeCall is.
  1029. // Make it traverse the CFG.
  1030. Instruction *CurrentI = &RuntimeCall;
  1031. bool IsWorthIt = false;
  1032. while ((CurrentI = CurrentI->getNextNode())) {
  1033. // TODO: Once we detect the regions to be offloaded we should use the
  1034. // alias analysis manager to check if CurrentI may modify one of
  1035. // the offloaded regions.
  1036. if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
  1037. if (IsWorthIt)
  1038. return CurrentI;
  1039. return nullptr;
  1040. }
  1041. // FIXME: For now if we move it over anything without side effect
  1042. // is worth it.
  1043. IsWorthIt = true;
  1044. }
  1045. // Return end of BasicBlock.
  1046. return RuntimeCall.getParent()->getTerminator();
  1047. }
/// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
///
/// The issue call replaces \p RuntimeCall in place (same arguments plus an
/// async handle); the wait call is inserted before \p WaitMovementPoint so
/// the transfer can overlap the instructions in between.
bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                             Instruction &WaitMovementPoint) {
  // Create stack allocated handle (__tgt_async_info) at the beginning of the
  // function. Used for storing information of the async transfer, allowing to
  // wait on it later.
  auto &IRBuilder = OMPInfoCache.OMPBuilder;
  auto *F = RuntimeCall.getCaller();
  Instruction *FirstInst = &(F->getEntryBlock().front());
  AllocaInst *Handle = new AllocaInst(
      IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);

  // Add "issue" runtime call declaration:
  // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
  //   i8**, i8**, i64*, i64*)
  FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
      M, OMPRTL___tgt_target_data_begin_mapper_issue);

  // Change RuntimeCall call site for its asynchronous version.
  // The issue call forwards all original arguments and appends the handle.
  SmallVector<Value *, 16> Args;
  for (auto &Arg : RuntimeCall.args())
    Args.push_back(Arg.get());
  Args.push_back(Handle);

  // Insert the issue call right before the original, then drop the original
  // synchronous mapper call — it is fully replaced by the issue/wait pair.
  CallInst *IssueCallsite =
      CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
  RuntimeCall.eraseFromParent();

  // Add "wait" runtime call declaration:
  // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
  FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
      M, OMPRTL___tgt_target_data_begin_mapper_wait);

  // The wait takes the device id from the issue call and the async handle.
  Value *WaitParams[2] = {
      IssueCallsite->getArgOperand(
          OffloadArray::DeviceIDArgNum), // device_id.
      Handle                             // handle to wait on.
  };
  CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);

  return true;
}
  1084. static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
  1085. bool GlobalOnly, bool &SingleChoice) {
  1086. if (CurrentIdent == NextIdent)
  1087. return CurrentIdent;
  1088. // TODO: Figure out how to actually combine multiple debug locations. For
  1089. // now we just keep an existing one if there is a single choice.
  1090. if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
  1091. SingleChoice = !CurrentIdent;
  1092. return NextIdent;
  1093. }
  1094. return nullptr;
  1095. }
  1096. /// Return an `struct ident_t*` value that represents the ones used in the
  1097. /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  1098. /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  1099. /// return value we create one from scratch. We also do not yet combine
  1100. /// information, e.g., the source locations, see combinedIdentStruct.
  1101. Value *
  1102. getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
  1103. Function &F, bool GlobalOnly) {
  1104. bool SingleChoice = true;
  1105. Value *Ident = nullptr;
  1106. auto CombineIdentStruct = [&](Use &U, Function &Caller) {
  1107. CallInst *CI = getCallIfRegularCall(U, &RFI);
  1108. if (!CI || &F != &Caller)
  1109. return false;
  1110. Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
  1111. /* GlobalOnly */ true, SingleChoice);
  1112. return false;
  1113. };
  1114. RFI.foreachUse(SCC, CombineIdentStruct);
  1115. if (!Ident || !SingleChoice) {
  1116. // The IRBuilder uses the insertion block to get to the module, this is
  1117. // unfortunate but we work around it for now.
  1118. if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
  1119. OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
  1120. &F.getEntryBlock(), F.getEntryBlock().begin()));
  1121. // Create a fallback location if non was found.
  1122. // TODO: Use the debug locations of the calls instead.
  1123. Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
  1124. Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
  1125. }
  1126. return Ident;
  1127. }
/// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
/// \p ReplVal if given.
///
/// If no \p ReplVal is provided, the first movable call is hoisted to the
/// entry block and used as the replacement; all remaining calls are then
/// RAUW'ed with it and erased.
bool deduplicateRuntimeCalls(Function &F,
                             OMPInformationCache::RuntimeFunctionInfo &RFI,
                             Value *ReplVal = nullptr) {
  // Without at least two uses (counting ReplVal as one) there is nothing to
  // deduplicate.
  auto *UV = RFI.getUseVector(F);
  if (!UV || UV->size() + (ReplVal != nullptr) < 2)
    return false;

  LLVM_DEBUG(
      dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
             << (ReplVal ? " with an existing value\n" : "\n") << "\n");

  assert((!ReplVal || (isa<Argument>(ReplVal) &&
                       cast<Argument>(ReplVal)->getParent() == &F)) &&
         "Unexpected replacement value!");

  // TODO: Use dominance to find a good position instead.
  // A call can be hoisted to the entry block only if all of its arguments
  // are available there: none may be instructions, and a leading ident
  // argument (if present) is tolerated because it is rewritten below.
  auto CanBeMoved = [this](CallBase &CB) {
    unsigned NumArgs = CB.getNumArgOperands();
    if (NumArgs == 0)
      return true;
    if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      return false;
    for (unsigned u = 1; u < NumArgs; ++u)
      if (isa<Instruction>(CB.getArgOperand(u)))
        return false;
    return true;
  };

  // No replacement given: pick the first movable call, hoist it to the
  // entry block, and use it as the replacement for all others.
  if (!ReplVal) {
    for (Use *U : *UV)
      if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
        if (!CanBeMoved(*CI))
          continue;

        auto Remark = [&](OptimizationRemark OR) {
          auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
          return OR << "OpenMP runtime call "
                    << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
                    << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
        };
        emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);

        CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
        ReplVal = CI;
        break;
      }
    if (!ReplVal)
      return false;
  }

  // If we use a call as a replacement value we need to make sure the ident is
  // valid at the new location. For now we just pick a global one, either
  // existing and used by one of the calls, or created from scratch.
  if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
    if (CI->getNumArgOperands() > 0 &&
        CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
      Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                    /* GlobalOnly */ true);
      CI->setArgOperand(0, Ident);
    }
  }

  bool Changed = false;
  // Replace every other call to RFI in F with ReplVal and erase it,
  // keeping the call graph up to date.
  auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
    CallInst *CI = getCallIfRegularCall(U, &RFI);
    if (!CI || CI == ReplVal || &F != &Caller)
      return false;
    assert(CI->getCaller() == &F && "Unexpected call!");

    auto Remark = [&](OptimizationRemark OR) {
      return OR << "OpenMP runtime call "
                << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
    };
    emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);

    CGUpdater.removeCallSite(*CI);
    CI->replaceAllUsesWith(ReplVal);
    CI->eraseFromParent();
    ++NumOpenMPRuntimeCallsDeduplicated;
    Changed = true;
    return true;
  };
  RFI.foreachUse(SCC, ReplaceAndDeleteCB);

  return Changed;
}
/// Collect arguments that represent the global thread id in \p GTIdArgs.
void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
  // TODO: Below we basically perform a fixpoint iteration with a pessimistic
  //       initialization. We could define an AbstractAttribute instead and
  //       run the Attributor here once it can be run as an SCC pass.

  // Helper to check the argument \p ArgNo at all call sites of \p F for
  // a GTId. Requires local linkage so all call sites are visible; every
  // call site must pass either a known GTId argument or the result of a
  // __kmpc_global_thread_num call (self-recursion via \p RefCI is allowed).
  auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
    if (!F.hasLocalLinkage())
      return false;
    for (Use &U : F.uses()) {
      if (CallInst *CI = getCallIfRegularCall(U)) {
        Value *ArgOp = CI->getArgOperand(ArgNo);
        if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
            getCallIfRegularCall(
                *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
          continue;
      }
      // Non-call use, or a call passing an unknown value: not a GTId.
      return false;
    }
    return true;
  };

  // Helper to identify uses of a GTId as GTId arguments.
  auto AddUserArgs = [&](Value &GTId) {
    for (Use &U : GTId.uses())
      if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
        if (CI->isArgOperand(&U))
          if (Function *Callee = CI->getCalledFunction())
            if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
              GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
  };

  // The argument users of __kmpc_global_thread_num calls are GTIds.
  OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

  GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
    if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
      AddUserArgs(*CI);
    return false;
  });

  // Transitively search for more arguments by looking at the users of the
  // ones we know already. During the search the GTIdArgs vector is extended
  // so we cannot cache the size nor can we use a range based for.
  for (unsigned u = 0; u < GTIdArgs.size(); ++u)
    AddUserArgs(*GTIdArgs[u]);
}
/// Kernel (=GPU) optimizations and utility functions
///
///{{

/// Check if \p F is a kernel, hence entry point for target offloading.
bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }

/// Cache to remember the unique kernel for a function.
/// NOTE(review): presumably an unset Optional distinguishes "not yet
/// computed" from a computed null kernel — confirm in getUniqueKernelFor.
DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;

/// Find the unique kernel that will execute \p F, if any.
Kernel getUniqueKernelFor(Function &F);

/// Find the unique kernel that will execute \p I, if any.
/// Convenience overload: forwards to the Function overload above.
Kernel getUniqueKernelFor(Instruction &I) {
  return getUniqueKernelFor(*I.getFunction());
}

/// Rewrite the device (=GPU) code state machine create in non-SPMD mode in
/// the cases we can avoid taking the address of a function.
bool rewriteDeviceCodeStateMachine();

///
///}}
  1268. /// Emit a remark generically
  1269. ///
  1270. /// This template function can be used to generically emit a remark. The
  1271. /// RemarkKind should be one of the following:
  1272. /// - OptimizationRemark to indicate a successful optimization attempt
  1273. /// - OptimizationRemarkMissed to report a failed optimization attempt
  1274. /// - OptimizationRemarkAnalysis to provide additional information about an
  1275. /// optimization attempt
  1276. ///
  1277. /// The remark is built using a callback function provided by the caller that
  1278. /// takes a RemarkKind as input and returns a RemarkKind.
  1279. template <typename RemarkKind,
  1280. typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
  1281. void emitRemark(Instruction *Inst, StringRef RemarkName,
  1282. RemarkCallBack &&RemarkCB) const {
  1283. Function *F = Inst->getParent()->getParent();
  1284. auto &ORE = OREGetter(F);
  1285. ORE.emit(
  1286. [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
  1287. }
  1288. /// Emit a remark on a function. Since only OptimizationRemark is supporting
  1289. /// this, it can't be made generic.
  1290. void
  1291. emitRemarkOnFunction(Function *F, StringRef RemarkName,
  1292. function_ref<OptimizationRemark(OptimizationRemark &&)>
  1293. &&RemarkCB) const {
  1294. auto &ORE = OREGetter(F);
  1295. ORE.emit([&]() {
  1296. return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
  1297. });
  1298. }
/// The underlying module.
Module &M;

/// The SCC we are operating on.
SmallVectorImpl<Function *> &SCC;

/// Callback to update the call graph, the first argument is a removed call,
/// the second an optional replacement call.
CallGraphUpdater &CGUpdater;

/// Callback to get an OptimizationRemarkEmitter from a Function *.
OptimizationRemarkGetter OREGetter;

/// OpenMP-specific information cache. Also used for Attributor runs.
OMPInformationCache &OMPInfoCache;

/// Attributor instance.
Attributor &A;
  1312. /// Helper function to run Attributor on SCC.
  1313. bool runAttributor() {
  1314. if (SCC.empty())
  1315. return false;
  1316. registerAAs();
  1317. ChangeStatus Changed = A.run();
  1318. LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
  1319. << " functions, result: " << Changed << ".\n");
  1320. return Changed == ChangeStatus::CHANGED;
  1321. }
  1322. /// Populate the Attributor with abstract attribute opportunities in the
  1323. /// function.
  1324. void registerAAs() {
  1325. if (SCC.empty())
  1326. return;
  1327. // Create CallSite AA for all Getters.
  1328. for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
  1329. auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
  1330. auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
  1331. auto CreateAA = [&](Use &U, Function &Caller) {
  1332. CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
  1333. if (!CI)
  1334. return false;
  1335. auto &CB = cast<CallBase>(*CI);
  1336. IRPosition CBPos = IRPosition::callsite_function(CB);
  1337. A.getOrCreateAAFor<AAICVTracker>(CBPos);
  1338. return false;
  1339. };
  1340. GetterRFI.foreachUse(SCC, CreateAA);
  1341. }
  1342. }
  1343. };
/// Find the single kernel that can reach \p F, or nullptr if there is none or
/// more than one. Results are memoized in UniqueKernelMap.
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
  // Functions outside the module slice are not analyzed.
  if (!OMPInfoCache.ModuleSlice.count(&F))
    return nullptr;

  // Use a scope to keep the lifetime of the CachedKernel short.
  {
    Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    if (CachedKernel)
      return *CachedKernel;

    // TODO: We should use an AA to create an (optimistic and callback
    //       call-aware) call graph. For now we stick to simple patterns that
    //       are less powerful, basically the worst fixpoint.
    if (isKernel(F)) {
      // A kernel is trivially its own unique kernel.
      CachedKernel = Kernel(&F);
      return *CachedKernel;
    }

    CachedKernel = nullptr;
    if (!F.hasLocalLinkage()) {
      // Externally visible functions may be reached from unknown kernels.
      // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
      auto Remark = [&](OptimizationRemark OR) {
        return OR << "[OMP100] Potentially unknown OpenMP target region caller";
      };
      emitRemarkOnFunction(&F, "OMP100", Remark);

      return nullptr;
    }
  }

  // Classify a single use of F: returns the unique kernel of the user when
  // the use is benign (equality compare, direct call, or an argument to
  // __kmpc_kernel_prepare_parallel), nullptr otherwise.
  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      // Allow use in equality comparisons.
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      return nullptr;
    }
    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      // Allow direct calls.
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);
      // Allow the use in __kmpc_kernel_prepare_parallel calls.
      if (Function *Callee = CB->getCalledFunction())
        if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
          return getUniqueKernelFor(*CB);
      return nullptr;
    }
    // Disallow every other use.
    return nullptr;
  };

  // TODO: In the future we want to track more than just a unique kernel.
  // A nullptr element in the set represents "unknown/multiple kernels", so a
  // unique kernel exists only if the set ends up with exactly one element.
  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));
  });

  Kernel K = nullptr;
  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  // Cache the result.
  UniqueKernelMap[&F] = K;

  return K;
}
  1401. bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  1402. OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
  1403. OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
  1404. bool Changed = false;
  1405. if (!KernelPrepareParallelRFI)
  1406. return Changed;
  1407. for (Function *F : SCC) {
  1408. // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at
  1409. // all.
  1410. bool UnknownUse = false;
  1411. bool KernelPrepareUse = false;
  1412. unsigned NumDirectCalls = 0;
  1413. SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
  1414. OMPInformationCache::foreachUse(*F, [&](Use &U) {
  1415. if (auto *CB = dyn_cast<CallBase>(U.getUser()))
  1416. if (CB->isCallee(&U)) {
  1417. ++NumDirectCalls;
  1418. return;
  1419. }
  1420. if (isa<ICmpInst>(U.getUser())) {
  1421. ToBeReplacedStateMachineUses.push_back(&U);
  1422. return;
  1423. }
  1424. if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
  1425. *U.getUser(), &KernelPrepareParallelRFI)) {
  1426. KernelPrepareUse = true;
  1427. ToBeReplacedStateMachineUses.push_back(&U);
  1428. return;
  1429. }
  1430. UnknownUse = true;
  1431. });
  1432. // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
  1433. // use.
  1434. if (!KernelPrepareUse)
  1435. continue;
  1436. {
  1437. auto Remark = [&](OptimizationRemark OR) {
  1438. return OR << "Found a parallel region that is called in a target "
  1439. "region but not part of a combined target construct nor "
  1440. "nesed inside a target construct without intermediate "
  1441. "code. This can lead to excessive register usage for "
  1442. "unrelated target regions in the same translation unit "
  1443. "due to spurious call edges assumed by ptxas.";
  1444. };
  1445. emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
  1446. }
  1447. // If this ever hits, we should investigate.
  1448. // TODO: Checking the number of uses is not a necessary restriction and
  1449. // should be lifted.
  1450. if (UnknownUse || NumDirectCalls != 1 ||
  1451. ToBeReplacedStateMachineUses.size() != 2) {
  1452. {
  1453. auto Remark = [&](OptimizationRemark OR) {
  1454. return OR << "Parallel region is used in "
  1455. << (UnknownUse ? "unknown" : "unexpected")
  1456. << " ways; will not attempt to rewrite the state machine.";
  1457. };
  1458. emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
  1459. }
  1460. continue;
  1461. }
  1462. // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
  1463. // up if the function is not called from a unique kernel.
  1464. Kernel K = getUniqueKernelFor(*F);
  1465. if (!K) {
  1466. {
  1467. auto Remark = [&](OptimizationRemark OR) {
  1468. return OR << "Parallel region is not known to be called from a "
  1469. "unique single target region, maybe the surrounding "
  1470. "function has external linkage?; will not attempt to "
  1471. "rewrite the state machine use.";
  1472. };
  1473. emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
  1474. Remark);
  1475. }
  1476. continue;
  1477. }
  1478. // We now know F is a parallel body function called only from the kernel K.
  1479. // We also identified the state machine uses in which we replace the
  1480. // function pointer by a new global symbol for identification purposes. This
  1481. // ensures only direct calls to the function are left.
  1482. {
  1483. auto RemarkParalleRegion = [&](OptimizationRemark OR) {
  1484. return OR << "Specialize parallel region that is only reached from a "
  1485. "single target region to avoid spurious call edges and "
  1486. "excessive register usage in other target regions. "
  1487. "(parallel region ID: "
  1488. << ore::NV("OpenMPParallelRegion", F->getName())
  1489. << ", kernel ID: "
  1490. << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
  1491. };
  1492. emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
  1493. RemarkParalleRegion);
  1494. auto RemarkKernel = [&](OptimizationRemark OR) {
  1495. return OR << "Target region containing the parallel region that is "
  1496. "specialized. (parallel region ID: "
  1497. << ore::NV("OpenMPParallelRegion", F->getName())
  1498. << ", kernel ID: "
  1499. << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
  1500. };
  1501. emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
  1502. }
  1503. Module &M = *F->getParent();
  1504. Type *Int8Ty = Type::getInt8Ty(M.getContext());
  1505. auto *ID = new GlobalVariable(
  1506. M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
  1507. UndefValue::get(Int8Ty), F->getName() + ".ID");
  1508. for (Use *U : ToBeReplacedStateMachineUses)
  1509. U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
  1510. ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
  1511. Changed = true;
  1512. }
  1513. return Changed;
  1514. }
/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Give up immediately for positions that cannot be amended via IPO.
  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();
  }

  /// Returns true if value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if value is known to be tracked.
  /// NOTE(review): this returns getAssumed(), not getKnown(); confirm that is
  /// intentional before relying on "known" semantics here.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for specific \p ICV.
  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                                const Instruction *I,
                                                Attributor &A) const {
    return None;
  }

  /// Return an assumed unique ICV value if a single candidate is found. If
  /// there cannot be one, return a nullptr. If it is not clear yet, return the
  /// Optional::NoneType.
  virtual Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const = 0;

  // Currently only nthreads is being tracked.
  // This array will only grow with time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAICVTracker
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address).
  static const char ID;
};
/// Function-level ICV tracker: records, per ICV, the value it holds after
/// each instruction that may change it (setter calls and opaque calls).
struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerFunction"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point. A nullptr value
  // means "unknown after this instruction".
  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Collect all program points where each trackable ICV changes.
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus HasChanged = ChangeStatus::UNCHANGED;

    Function *F = getAnchorScope();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    for (InternalControlVar ICV : TrackableICVs) {
      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
      // A direct setter call pins the ICV to its first argument.
      auto TrackValues = [&](Use &U, Function &) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
        if (!CI)
          return false;

        // FIXME: handle setters with more than one argument.
        /// Track new value.
        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        return false;
      };

      // Any other call may change the ICV; record what we can deduce for it.
      auto CallCheck = [&](Instruction &I) {
        Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
        if (ReplVal.hasValue() &&
            ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

        return true;
      };

      // Track all changes of an ICV.
      SetterRFI.foreachUse(TrackValues, F);

      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                /* CheckBBLivenessOnly */ true);

      /// TODO: Figure out a way to avoid adding entry in
      /// ICVReplacementValuesMap
      // The entry instruction acts as an "unknown" sentinel so backward
      // searches terminate at function entry.
      Instruction *Entry = &F->getEntryBlock().front();
      if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
        ValuesMap.insert(std::make_pair(Entry, nullptr));
    }

    return HasChanged;
  }

  /// Helper to check if \p I is a call and get the value for it if it is
  /// unique. Returns None if the call does not affect the ICV, nullptr if it
  /// changes it to an unknown value.
  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
                                    InternalControlVar &ICV) const {
    const auto *CB = dyn_cast<CallBase>(I);
    // Calls promising "no_openmp" cannot touch any ICV.
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines"))
      return None;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    // Indirect call, assume ICV changes.
    if (CalledFunction == nullptr)
      return nullptr;
    // Getters only read the ICV.
    if (CalledFunction == GetterRFI.Declaration)
      return None;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(I))
        return ICVReplacementValuesMap[ICV].lookup(I);

      return nullptr;
    }

    // Since we don't know, assume it changes the ICV.
    if (CalledFunction->isDeclaration())
      return nullptr;

    // Ask the callee's AA what the ICV is after the call returns.
    const auto &ICVTrackingAA =
        A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));

    if (ICVTrackingAA.isAssumedTracked())
      return ICVTrackingAA.getUniqueReplacementValue(ICV);

    // If we don't know, assume it changes.
    return nullptr;
  }

  // We don't check unique value for a function, so return None.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return None;
  }

  /// Return the value with which \p I can be replaced for specific \p ICV.
  /// Walks backwards from \p I (across predecessors via a worklist) to the
  /// nearest ICV-changing point(s); conflicting values yield nullptr.
  Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                        const Instruction *I,
                                        Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallVector<const Instruction *, 16> Worklist;
    SmallPtrSet<const Instruction *, 16> Visited;
    Worklist.push_back(I);

    Optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      const Instruction *CurrInst = Worklist.pop_back_val();
      if (!Visited.insert(CurrInst).second)
        continue;

      const BasicBlock *CurrBB = CurrInst->getParent();

      // Go up and look for all potential setters/calls that might change the
      // ICV.
      while ((CurrInst = CurrInst->getPrevNode())) {
        if (ValuesMap.count(CurrInst)) {
          Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          // Unknown value, track new.
          if (!ReplVal.hasValue()) {
            ReplVal = NewReplVal;
            break;
          }

          // If we found a new value, we can't know the icv value anymore.
          if (NewReplVal.hasValue())
            if (ReplVal != NewReplVal)
              return nullptr;

          break;
        }

        Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
        if (!NewReplVal.hasValue())
          continue;

        // Unknown value, track new.
        if (!ReplVal.hasValue()) {
          ReplVal = NewReplVal;
          break;
        }

        // We found a different value; we can't know the icv value anymore.
        if (ReplVal != NewReplVal)
          return nullptr;
      }

      // If we are in the same BB and we have a value, we are done.
      if (CurrBB == I->getParent() && ReplVal.hasValue())
        return ReplVal;

      // Go through all predecessors and add terminators for analysis.
      for (const BasicBlock *Pred : predecessors(CurrBB))
        if (const Instruction *Terminator = Pred->getTerminator())
          Worklist.push_back(Terminator);
    }

    return ReplVal;
  }
};
/// Tracks, per ICV, the unique value the ICV holds at every return of the
/// function (if one exists).
struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerFunctionReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which \p I can be replaced for specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> UniqueICVValue;

      // All returns must agree on one ICV value for it to be "unique".
      auto CheckReturnInst = [&](Instruction &I) {
        Optional<Value *> NewReplVal =
            ICVTrackingAA.getReplacementValue(ICV, &I, A);

        // If we found a second ICV value there is no unique returned value.
        if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
          return false;

        UniqueICVValue = NewReplVal;

        return true;
      };

      // If the instruction scan could not complete, the value is unknown.
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     /* CheckBBLivenessOnly */ true))
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)
        continue;

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }
};
/// Call-site tracker for ICV getter calls: when the ICV value at the call is
/// known, the getter call itself is replaced by that value and deleted.
struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();

    // We only initialize this AA for getters, so we need to know which ICV it
    // gets.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    for (InternalControlVar ICV : TrackableICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;
        return;
      }
    }

    /// Unknown ICV.
    indicatePessimisticFixpoint();
  }

  /// Replace the getter call with the tracked value, if any.
  ChangeStatus manifest(Attributor &A) override {
    if (!ReplVal.hasValue() || !ReplVal.getValue())
      return ChangeStatus::UNCHANGED;

    A.changeValueAfterManifest(*getCtxI(), **ReplVal);
    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;
  }

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  // The ICV this getter call reads (set in initialize()).
  InternalControlVar AssociatedICV;
  // The value the ICV holds at this call site, if known.
  Optional<Value *> ReplVal;

  ChangeStatus updateImpl(Attributor &A) override {
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    Optional<Value *> NewReplVal =
        ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;
  }

  // Return the value with which associated value can be replaced for specific
  // \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ReplVal;
  }
};
/// Tracks the ICV values that hold after a call returns, by forwarding to the
/// callee's returned-position tracker.
struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerCallSiteReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which associated value can be replaced for specific
  /// \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::returned(*getAssociatedFunction()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> NewReplVal =
          ICVTrackingAA.getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)
        continue;

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
    }
    return Changed;
  }
};
  1845. } // namespace
  1846. const char AAICVTracker::ID = 0;
  1847. AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
  1848. Attributor &A) {
  1849. AAICVTracker *AA = nullptr;
  1850. switch (IRP.getPositionKind()) {
  1851. case IRPosition::IRP_INVALID:
  1852. case IRPosition::IRP_FLOAT:
  1853. case IRPosition::IRP_ARGUMENT:
  1854. case IRPosition::IRP_CALL_SITE_ARGUMENT:
  1855. llvm_unreachable("ICVTracker can only be created for function position!");
  1856. case IRPosition::IRP_RETURNED:
  1857. AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
  1858. break;
  1859. case IRPosition::IRP_CALL_SITE_RETURNED:
  1860. AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
  1861. break;
  1862. case IRPosition::IRP_CALL_SITE:
  1863. AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
  1864. break;
  1865. case IRPosition::IRP_FUNCTION:
  1866. AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
  1867. break;
  1868. }
  1869. return *AA;
  1870. }
  1871. PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
  1872. CGSCCAnalysisManager &AM,
  1873. LazyCallGraph &CG, CGSCCUpdateResult &UR) {
  1874. if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
  1875. return PreservedAnalyses::all();
  1876. if (DisableOpenMPOptimizations)
  1877. return PreservedAnalyses::all();
  1878. SmallVector<Function *, 16> SCC;
  1879. // If there are kernels in the module, we have to run on all SCC's.
  1880. bool SCCIsInteresting = !OMPInModule.getKernels().empty();
  1881. for (LazyCallGraph::Node &N : C) {
  1882. Function *Fn = &N.getFunction();
  1883. SCC.push_back(Fn);
  1884. // Do we already know that the SCC contains kernels,
  1885. // or that OpenMP functions are called from this SCC?
  1886. if (SCCIsInteresting)
  1887. continue;
  1888. // If not, let's check that.
  1889. SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
  1890. }
  1891. if (!SCCIsInteresting || SCC.empty())
  1892. return PreservedAnalyses::all();
  1893. FunctionAnalysisManager &FAM =
  1894. AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
  1895. AnalysisGetter AG(FAM);
  1896. auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
  1897. return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  1898. };
  1899. CallGraphUpdater CGUpdater;
  1900. CGUpdater.initialize(CG, C, AM, UR);
  1901. SetVector<Function *> Functions(SCC.begin(), SCC.end());
  1902. BumpPtrAllocator Allocator;
  1903. OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
  1904. /*CGSCC*/ Functions, OMPInModule.getKernels());
  1905. Attributor A(Functions, InfoCache, CGUpdater);
  1906. OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  1907. bool Changed = OMPOpt.run();
  1908. if (Changed)
  1909. return PreservedAnalyses::none();
  1910. return PreservedAnalyses::all();
  1911. }
namespace {

/// Legacy pass manager wrapper around the OpenMP optimizations.
struct OpenMPOptLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  OpenMPInModule OMPInModule;
  static char ID;

  OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
    initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool doInitialization(CallGraph &CG) override {
    // Disable the pass if there is no OpenMP (runtime call) in the module.
    containsOpenMP(CG.getModule(), OMPInModule);
    return false;
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    // Collect the (defined) SCC members; the SCC is interesting if there are
    // kernels in the module or OpenMP runtime calls in the SCC.
    SmallVector<Function *, 16> SCC;
    // If there are kernels in the module, we have to run on all SCC's.
    bool SCCIsInteresting = !OMPInModule.getKernels().empty();
    for (CallGraphNode *CGN : CGSCC) {
      Function *Fn = CGN->getFunction();
      if (!Fn || Fn->isDeclaration())
        continue;
      SCC.push_back(Fn);

      // Do we already know that the SCC contains kernels,
      // or that OpenMP functions are called from this SCC?
      if (SCCIsInteresting)
        continue;
      // If not, let's check that.
      SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
    }

    if (!SCCIsInteresting || SCC.empty())
      return false;

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);

    // Maintain a map of functions to avoid rebuilding the ORE.
    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    BumpPtrAllocator Allocator;
    OMPInformationCache InfoCache(
        *(Functions.back()->getParent()), AG, Allocator,
        /*CGSCC*/ Functions, OMPInModule.getKernels());

    Attributor A(Functions, InfoCache, CGUpdater);

    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run();
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};

} // end anonymous namespace
  1973. void OpenMPInModule::identifyKernels(Module &M) {
  1974. NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  1975. if (!MD)
  1976. return;
  1977. for (auto *Op : MD->operands()) {
  1978. if (Op->getNumOperands() < 2)
  1979. continue;
  1980. MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
  1981. if (!KindID || KindID->getString() != "kernel")
  1982. continue;
  1983. Function *KernelFn =
  1984. mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
  1985. if (!KernelFn)
  1986. continue;
  1987. ++NumOpenMPTargetRegionKernels;
  1988. Kernels.insert(KernelFn);
  1989. }
  1990. }
/// Determine (once) whether \p M contains OpenMP runtime calls, caching the
/// answer and the set of functions containing such calls in \p OMPInModule.
bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
  // Return the cached answer on repeated queries.
  if (OMPInModule.isKnown())
    return OMPInModule;

  // Remember every function that (transitively via an instruction) uses the
  // given runtime function declaration.
  auto RecordFunctionsContainingUsesOf = [&](Function *F) {
    for (User *U : F->users())
      if (auto *I = dyn_cast<Instruction>(U))
        OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
  };

  // MSVC doesn't like long if-else chains for some reason and instead just
  // issues an error. Work around it..
  // The OMP_RTL macro expands once per known OpenMP runtime function.
  do {
#define OMP_RTL(_Enum, _Name, ...)                                             \
  if (Function *F = M.getFunction(_Name)) {                                    \
    RecordFunctionsContainingUsesOf(F);                                        \
    OMPInModule = true;                                                        \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  } while (false);

  // Identify kernels once. TODO: We should split the OMPInformationCache into a
  // module and an SCC part. The kernel information, among other things, could
  // go into the module part.
  if (OMPInModule.isKnown() && OMPInModule) {
    OMPInModule.identifyKernels(M);
    return true;
  }

  return OMPInModule = false;
}
char OpenMPOptLegacyPass::ID = 0;

// Register the legacy pass ("openmpopt") and its analysis dependency.
INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
                    "OpenMP specific optimizations", false, false)

// Factory used by the legacy pass pipeline builder.
Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }