//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
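// Editorial illustration (not part of the upstream source): for a clause item
// written as `a[3][i]` or as the array section `a[0:n]`, the helper above
// strips the subscript/section expressions and returns the canonical
// ValueDecl of `a`; for a member reference such as `s.x` it returns the
// canonical declaration of the member `x`.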

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (max(decl_align,
  //                                              GlobalMemoryAlignment))
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
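// Editorial illustration (not part of the upstream source): assuming a warp
// size of 32 (so BufSize == 32), an `int i` escaping from a parallel region
// together with a `double d` in EscapedDeclsForTeams would produce roughly
//   struct _globalized_locals_ty {
//     int i[32] __attribute__((aligned(128)));
//     double d;
//   };
// i.e. per-lane array slots for parallel-region variables and single fields,
// at their natural alignment, for team-level variables.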

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }
  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }
  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }
  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }
  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }
  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
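// Editorial note (not part of the upstream source): a local "escapes" when
// other threads may need to see it, e.g. when its address is taken or when it
// is captured by reference by a nested parallel region:
//
//   #pragma omp target teams
//   {
//     int x = 0;        // escapes: referenced by the inner parallel region
//     #pragma omp parallel
//     use(&x);          // `use` is a hypothetical function
//   }
//
// Such variables are moved out of thread-private storage and into the
// globalized-locals record built above.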
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
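// Editorial illustration (not part of the upstream source): with a warp size
// of 32, LaneIDBits == 5 and LaneIDMask == 0x1f, so for thread id 70 the two
// helpers above compute warp id 70 >> 5 == 2 and lane id 70 & 0x1f == 6.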

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
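// Editorial illustration (not part of the upstream source): combined
// constructs such as `#pragma omp target parallel for` are compiled in SPMD
// mode; a bare `#pragma omp target` whose single child is a parallel region is
// also treated as SPMD via hasNestedSPMDDirective(); a `#pragma omp target`
// with a serial body falls back to generic (non-SPMD) mode.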

/// Check if the directive is loop based and either has no schedule clause at
/// all or has static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
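// Editorial illustration (not part of the upstream source): a loop with
// `schedule(static)` or with no schedule clause qualifies, while
// `schedule(dynamic)`, `schedule(guided)`, or an `ordered` clause makes
// hasStaticScheduling() return false.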

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
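// Editorial illustration (not part of the upstream source): a directive such
// as `#pragma omp target teams distribute parallel for schedule(static)` can
// use the lightweight (simplified) runtime, whereas changing the clause to
// `schedule(dynamic)` forces the full runtime even though the region still
// runs in SPMD mode.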

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
  IsInTargetMasterThreadRegion = IsSPMD;
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
}
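// Editorial note (a sketch of the expected lowering, not part of the upstream
// source): at the time of writing, OpenMPIRBuilder::createTargetInit() and
// createTargetDeinit() emit calls to the device runtime entry points
// __kmpc_target_init() and __kmpc_target_deinit(), which set up and tear down
// the generic-mode state machine (or the simplified SPMD state) for the
// kernel.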

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : RT(RT), EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd' depending on
// the target directive. This variable is picked up by the offload library to
// set up the device appropriately before kernel launch. If the execution mode
// is 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode = new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
                                              : OMP_TGT_EXEC_MODE_GENERIC),
      Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}
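// Editorial illustration (a sketch, not part of the upstream source): for a
// kernel named __omp_offloading_<...>_foo_l1 (name hypothetical) the emitted
// global looks roughly like
//   @__omp_offloading_<...>_foo_l1_exec_mode =
//       weak constant i8 <OMP_TGT_EXEC_MODE_SPMD or _GENERIC>
// and the offload runtime reads it before launching the kernel.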
void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                            llvm::Constant *Addr,
                                            uint64_t Size, int32_t,
                                            llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
  if (!Fn)
    return;

  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));

  // Add a function attribute for the kernel.
  Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
}

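// For reference (a sketch with a hypothetical kernel name; opaque-pointer IR
// syntax assumed), the metadata appended above is the usual NVVM kernel
// annotation:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @__omp_offloading_xy_foo_l10, !"kernel", i32 1}
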
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

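// Illustrative only: a combined construct such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < n; ++i)
//     a[i] += b[i];
//
// is typically lowered through emitSPMDKernel(), whereas a target region that
// runs sequential code before spawning parallelism, e.g.
//
//   #pragma omp target
//   {
//     setup();
//     #pragma omp parallel
//     work();
//   }
//
// takes the emitNonSPMDKernel() (generic) path. The actual decision is made
// by supportsSPMDExecutionMode() above.
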
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};

/// Special mode Undefined: the combination of non-SPMD mode and the simple
/// runtime.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace

unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
  switch (getExecutionMode()) {
  case EM_SPMD:
    if (requiresFullRuntime())
      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
  case EM_NonSPMD:
    assert(requiresFullRuntime() && "Expected full runtime.");
    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
  case EM_Unknown:
    return UndefinedMode;
  }
  llvm_unreachable("Unknown flags are requested.");
}

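// Summary of the reserved_2 encodings returned above (derived directly from
// the bit values in ModeFlagsTy):
//   EM_SPMD, full runtime       -> KMP_IDENT_SPMD_MODE                    (0x1)
//   EM_SPMD, simplified runtime -> KMP_IDENT_SPMD_MODE | SIMPLE_RT_MODE   (0x3)
//   EM_NonSPMD                  -> neither bit set                        (0x0)
//   EM_Unknown                  -> UndefinedMode                          (0x2)
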
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
      !CGM.getLangOpts().OMPHostIRFile.empty()) {
    OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                                "__omp_rtl_debug_kind");
    OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                                "__omp_rtl_assume_teams_oversubscription");
    OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                                "__omp_rtl_assume_threads_oversubscription");
  }
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Use actual memory size of the VLA object including the padding
    // for alignment purposes.
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);
    // Allocate space for this VLA object to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    VoidPtr->addRetAttr(
        llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
                             CGM.getContext().getTargetInfo().getNewAlign()));

    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
        std::pair<llvm::Value *, llvm::Value *>(
            {VoidPtr, CGF.getTypeSize(VD->getType())}));
    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}

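// A minimal sketch of what the prolog above emits for one globalized variable
// `x` of type T (names are illustrative, not the exact IR):
//
//   void *x_void = __kmpc_alloc_shared(sizeof(T)); // shared-stack slot
//   T *x_on_stack = (T *)x_void;                   // used for later accesses
//   *x_on_stack = x;                               // only for escaped params
//
// The matching epilog below releases each slot with __kmpc_free_shared.
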
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object.
    for (auto AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value.
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}

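// For reference, the Args array above matches the device runtime entry point,
// which has roughly this C signature (sketch; see the OpenMP device RTL):
//
//   void __kmpc_parallel_51(ident_t *ident, int32_t gtid, int32_t if_expr,
//                           int32_t num_threads, int32_t proc_bind,
//                           void *outlined_fn, void *wrapper_fn,
//                           void **args, int64_t nargs);
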
void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}

void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}

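// The control flow built above serializes the team through the critical body,
// one thread per iteration; conceptually (a sketch, not the emitted IR):
//
//   for (unsigned counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       /* body, still emitted via the base emitCriticalRegion, i.e. inside
//          __kmpc_critical / __kmpc_end_critical */;
//     __kmpc_syncwarp(mask); // reconverge before the next iteration
//   }
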
/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}

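// For reference, the two runtime shuffle helpers selected above have roughly
// these C signatures in the device runtime (sketch):
//
//   int32_t __kmpc_shuffle_int32(int32_t value, int16_t lane_delta,
//                                int16_t warp_size);
//   int64_t __kmpc_shuffle_int64(int64_t value, int16_t lane_delta,
//                                int16_t warp_size);
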
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          CGF.Int8Ty, PtrEnd.getPointer(),
          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
                                                  CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    Size = Size % IntSize;
  }
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element by element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance.
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     sync
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with weak linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

      // if (lane_id == 0)
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Reduce element = LocalReduceList[i]
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Address ElemPtr = Address(ElemPtrPtr, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1)
        ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      Address MediumPtr(MediumPtrVal, Align);
      // Casting to actual data type.
      // MediumPtr = (CopyType*)MediumPtrAddr;
      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);

      // elem = *elemptr
      //*MediumPtr = elem
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      // Store the source element value to the dest element address.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());

      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      //
      // Warp 0 copies reduce element from transfer medium.
      //
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Up to 32 threads in warp 0 are active.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // SrcMediumPtr = &medium[tid]
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      Address SrcMediumPtr(SrcMediumPtrVal, Align);
      // SrcMediumVal = *SrcMediumPtr;
      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);

      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1)
        TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);

      // *TargetElemPtr = SrcMediumVal;
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      if (NumIters > 1) {
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}

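// Note on the transfer medium used above: it is a single warp_size x i32
// array in __shared__ memory, so reduce elements wider than four bytes are
// staged through it in 4-, 2-, and 1-byte chunks by the TySize loop, with a
// barrier before and after each chunk.
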
/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a superset of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
///   that the contiguity assumption still holds.
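///
/// For illustration, a full-warp reduction (algorithm version 0) is driven
/// roughly as follows (sketch, names are illustrative):
///
///   for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
///     shuffle_and_reduce(reduce_data, lane_id, /*remote_lane_offset=*/offset,
///                        /*algo_version=*/0);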
  2203. static llvm::Function *emitShuffleAndReduceFunction(
  2204. CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
  2205. QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
  2206. ASTContext &C = CGM.getContext();
  2207. // Thread local Reduce list used to host the values of data to be reduced.
  2208. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2209. C.VoidPtrTy, ImplicitParamDecl::Other);
  2210. // Current lane id; could be logical.
  2211. ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
  2212. ImplicitParamDecl::Other);
  2213. // Offset of the remote source lane relative to the current lane.
  2214. ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2215. C.ShortTy, ImplicitParamDecl::Other);
  2216. // Algorithm version. This is expected to be known at compile time.
  2217. ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2218. C.ShortTy, ImplicitParamDecl::Other);
  2219. FunctionArgList Args;
  2220. Args.push_back(&ReduceListArg);
  2221. Args.push_back(&LaneIDArg);
  2222. Args.push_back(&RemoteLaneOffsetArg);
  2223. Args.push_back(&AlgoVerArg);
  2224. const CGFunctionInfo &CGFI =
  2225. CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  2226. auto *Fn = llvm::Function::Create(
  2227. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  2228. "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  2229. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  2230. Fn->setDoesNotRecurse();
  2231. CodeGenFunction CGF(CGM);
  2232. CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
  2233. CGBuilderTy &Bld = CGF.Builder;
  2234. Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  2235. Address LocalReduceList(
  2236. Bld.CreatePointerBitCastOrAddrSpaceCast(
  2237. CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
  2238. C.VoidPtrTy, SourceLocation()),
  2239. CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
  2240. CGF.getPointerAlign());
  2241. Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  2242. llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
  2243. AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
  2244. Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  2245. llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
  2246. AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
  2247. Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  2248. llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
  2249. AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
  2250. // Create a local thread-private variable to host the Reduce list
  2251. // from a remote lane.
  2252. Address RemoteReduceList =
  2253. CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
  2254. // This loop iterates through the list of reduce elements and copies,
  2255. // element by element, from a remote lane in the warp to RemoteReduceList,
  2256. // hosted on the thread's stack.
  2257. emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
  2258. LocalReduceList, RemoteReduceList,
  2259. {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
  2260. /*ScratchpadIndex=*/nullptr,
  2261. /*ScratchpadWidth=*/nullptr});
// The action to be performed on the Remote Reduce list depends on the
// algorithm version.
  2264. //
  2265. // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  2266. // LaneId % 2 == 0 && Offset > 0):
  2267. // do the reduction value aggregation
  2268. //
  2269. // The thread local variable Reduce list is mutated in place to host the
  2270. // reduced data, which is the aggregated value produced from local and
  2271. // remote lanes.
  2272. //
  2273. // Note that AlgoVer is expected to be a constant integer known at compile
  2274. // time.
// When AlgoVer==0, the first conjunction evaluates to true, making
// the entire predicate true at compile time.
// When AlgoVer==1, only the second part of the second conjunction has to be
// evaluated at runtime; the other conjunctions evaluate to false at
// compile time.
// When AlgoVer==2, only the second part of the third conjunction has to be
// evaluated at runtime; the other conjunctions evaluate to false at
// compile time.
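// For example (illustrative only): with AlgoVer==1 the predicate built below
// folds to (LaneId < RemoteLaneOffset) once the constant is propagated, and
// with AlgoVer==0 it folds to true.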
  2283. llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
  2284. llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  2285. llvm::Value *CondAlgo1 = Bld.CreateAnd(
  2286. Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
  2287. llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  2288. llvm::Value *CondAlgo2 = Bld.CreateAnd(
  2289. Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
  2290. CondAlgo2 = Bld.CreateAnd(
  2291. CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
  2292. llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  2293. CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
  2294. llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  2295. llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  2296. llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  2297. Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
  2298. CGF.EmitBlock(ThenBB);
  2299. // reduce_function(LocalReduceList, RemoteReduceList)
  2300. llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2301. LocalReduceList.getPointer(), CGF.VoidPtrTy);
  2302. llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2303. RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  2304. CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
  2305. CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  2306. Bld.CreateBr(MergeBB);
  2307. CGF.EmitBlock(ElseBB);
  2308. Bld.CreateBr(MergeBB);
  2309. CGF.EmitBlock(MergeBB);
  2310. // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  2311. // Reduce list.
  2312. Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  2313. llvm::Value *CondCopy = Bld.CreateAnd(
  2314. Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
  2315. llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  2316. llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  2317. llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  2318. Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
  2319. CGF.EmitBlock(CpyThenBB);
  2320. emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
  2321. RemoteReduceList, LocalReduceList);
  2322. Bld.CreateBr(CpyMergeBB);
  2323. CGF.EmitBlock(CpyElseBB);
  2324. Bld.CreateBr(CpyMergeBB);
  2325. CGF.EmitBlock(CpyMergeBB);
  2326. CGF.FinishFunction();
  2327. return Fn;
  2328. }
  2329. /// This function emits a helper that copies all the reduction variables from
  2330. /// the team into the provided global buffer for the reduction variables.
  2331. ///
  2332. /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
  2333. /// For all data entries D in reduce_data:
  2334. /// Copy local D to buffer.D[Idx]
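///
/// As an illustrative sketch (pseudocode; the GlobalBuffer/ReduceData type
/// names are hypothetical and the fields follow the foo/bar example used in
/// the design notes further below), with two reduction variables the emitted
/// helper is morally:
///
///   void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) {
///     GlobalBuffer *buf = (GlobalBuffer *)buffer;
///     ReduceData *rd = (ReduceData *)reduce_data;
///     buf->foo[Idx] = *rd->foo;   // scalar entries are loaded and stored
///     buf->bar[Idx] = *rd->bar;   // aggregates use an aggregate copy
///   }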
  2335. static llvm::Value *emitListToGlobalCopyFunction(
  2336. CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
  2337. QualType ReductionArrayTy, SourceLocation Loc,
  2338. const RecordDecl *TeamReductionRec,
  2339. const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
  2340. &VarFieldMap) {
  2341. ASTContext &C = CGM.getContext();
  2342. // Buffer: global reduction buffer.
  2343. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2344. C.VoidPtrTy, ImplicitParamDecl::Other);
  2345. // Idx: index of the buffer.
  2346. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
  2347. ImplicitParamDecl::Other);
  2348. // ReduceList: thread local Reduce list.
  2349. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2350. C.VoidPtrTy, ImplicitParamDecl::Other);
  2351. FunctionArgList Args;
  2352. Args.push_back(&BufferArg);
  2353. Args.push_back(&IdxArg);
  2354. Args.push_back(&ReduceListArg);
  2355. const CGFunctionInfo &CGFI =
  2356. CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  2357. auto *Fn = llvm::Function::Create(
  2358. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  2359. "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
  2360. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  2361. Fn->setDoesNotRecurse();
  2362. CodeGenFunction CGF(CGM);
  2363. CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
  2364. CGBuilderTy &Bld = CGF.Builder;
  2365. Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  2366. Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  2367. Address LocalReduceList(
  2368. Bld.CreatePointerBitCastOrAddrSpaceCast(
  2369. CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
  2370. C.VoidPtrTy, Loc),
  2371. CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
  2372. CGF.getPointerAlign());
  2373. QualType StaticTy = C.getRecordType(TeamReductionRec);
  2374. llvm::Type *LLVMReductionsBufferTy =
  2375. CGM.getTypes().ConvertTypeForMem(StaticTy);
  2376. llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2377. CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
  2378. LLVMReductionsBufferTy->getPointerTo());
  2379. llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
  2380. CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
  2381. /*Volatile=*/false, C.IntTy,
  2382. Loc)};
  2383. unsigned Idx = 0;
  2384. for (const Expr *Private : Privates) {
  2385. // Reduce element = LocalReduceList[i]
  2386. Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
  2387. llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
  2388. ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
  2389. // elemptr = ((CopyType*)(elemptrptr)) + I
  2390. ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2391. ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
  2392. Address ElemPtr =
  2393. Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
  2394. const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
  2395. // Global = Buffer.VD[Idx];
  2396. const FieldDecl *FD = VarFieldMap.lookup(VD);
  2397. LValue GlobLVal = CGF.EmitLValueForField(
  2398. CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
  2399. Address GlobAddr = GlobLVal.getAddress(CGF);
  2400. llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
  2401. GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
  2402. GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
  2403. switch (CGF.getEvaluationKind(Private->getType())) {
  2404. case TEK_Scalar: {
  2405. llvm::Value *V = CGF.EmitLoadOfScalar(
  2406. ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
  2407. LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
  2408. CGF.EmitStoreOfScalar(V, GlobLVal);
  2409. break;
  2410. }
  2411. case TEK_Complex: {
  2412. CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
  2413. CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
  2414. CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
  2415. break;
  2416. }
  2417. case TEK_Aggregate:
  2418. CGF.EmitAggregateCopy(GlobLVal,
  2419. CGF.MakeAddrLValue(ElemPtr, Private->getType()),
  2420. Private->getType(), AggValueSlot::DoesNotOverlap);
  2421. break;
  2422. }
  2423. ++Idx;
  2424. }
  2425. CGF.FinishFunction();
  2426. return Fn;
  2427. }
  2428. /// This function emits a helper that reduces all the reduction variables from
  2429. /// the team into the provided global buffer for the reduction variables.
  2430. ///
  2431. /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
  2432. /// void *GlobPtrs[];
  2433. /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
  2434. /// ...
  2435. /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
  2436. /// reduce_function(GlobPtrs, reduce_data);
  2437. static llvm::Value *emitListToGlobalReduceFunction(
  2438. CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
  2439. QualType ReductionArrayTy, SourceLocation Loc,
  2440. const RecordDecl *TeamReductionRec,
  2441. const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
  2442. &VarFieldMap,
  2443. llvm::Function *ReduceFn) {
  2444. ASTContext &C = CGM.getContext();
  2445. // Buffer: global reduction buffer.
  2446. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2447. C.VoidPtrTy, ImplicitParamDecl::Other);
  2448. // Idx: index of the buffer.
  2449. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
  2450. ImplicitParamDecl::Other);
  2451. // ReduceList: thread local Reduce list.
  2452. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2453. C.VoidPtrTy, ImplicitParamDecl::Other);
  2454. FunctionArgList Args;
  2455. Args.push_back(&BufferArg);
  2456. Args.push_back(&IdxArg);
  2457. Args.push_back(&ReduceListArg);
  2458. const CGFunctionInfo &CGFI =
  2459. CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  2460. auto *Fn = llvm::Function::Create(
  2461. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  2462. "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
  2463. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  2464. Fn->setDoesNotRecurse();
  2465. CodeGenFunction CGF(CGM);
  2466. CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
  2467. CGBuilderTy &Bld = CGF.Builder;
  2468. Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  2469. QualType StaticTy = C.getRecordType(TeamReductionRec);
  2470. llvm::Type *LLVMReductionsBufferTy =
  2471. CGM.getTypes().ConvertTypeForMem(StaticTy);
  2472. llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2473. CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
  2474. LLVMReductionsBufferTy->getPointerTo());
  2475. // 1. Build a list of reduction variables.
  2476. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  2477. Address ReductionList =
  2478. CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  2479. auto IPriv = Privates.begin();
  2480. llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
  2481. CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
  2482. /*Volatile=*/false, C.IntTy,
  2483. Loc)};
  2484. unsigned Idx = 0;
  2485. for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
  2486. Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  2487. // Global = Buffer.VD[Idx];
  2488. const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
  2489. const FieldDecl *FD = VarFieldMap.lookup(VD);
  2490. LValue GlobLVal = CGF.EmitLValueForField(
  2491. CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
  2492. Address GlobAddr = GlobLVal.getAddress(CGF);
  2493. llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
  2494. GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
  2495. llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
  2496. CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
  2497. if ((*IPriv)->getType()->isVariablyModifiedType()) {
  2498. // Store array size.
  2499. ++Idx;
  2500. Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  2501. llvm::Value *Size = CGF.Builder.CreateIntCast(
  2502. CGF.getVLASize(
  2503. CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
  2504. .NumElts,
  2505. CGF.SizeTy, /*isSigned=*/false);
  2506. CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
  2507. Elem);
  2508. }
  2509. }
  2510. // Call reduce_function(GlobalReduceList, ReduceList)
  2511. llvm::Value *GlobalReduceList =
  2512. CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  2513. Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  2514. llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
  2515. AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  2516. CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
  2517. CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
  2518. CGF.FinishFunction();
  2519. return Fn;
  2520. }
/// This function emits a helper that copies all the reduction variables from
/// the provided global buffer back into the thread-local reduce list.
///
/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
  2525. /// For all data entries D in reduce_data:
  2526. /// Copy buffer.D[Idx] to local D;
  2527. static llvm::Value *emitGlobalToListCopyFunction(
  2528. CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
  2529. QualType ReductionArrayTy, SourceLocation Loc,
  2530. const RecordDecl *TeamReductionRec,
  2531. const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
  2532. &VarFieldMap) {
  2533. ASTContext &C = CGM.getContext();
  2534. // Buffer: global reduction buffer.
  2535. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2536. C.VoidPtrTy, ImplicitParamDecl::Other);
  2537. // Idx: index of the buffer.
  2538. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
  2539. ImplicitParamDecl::Other);
  2540. // ReduceList: thread local Reduce list.
  2541. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2542. C.VoidPtrTy, ImplicitParamDecl::Other);
  2543. FunctionArgList Args;
  2544. Args.push_back(&BufferArg);
  2545. Args.push_back(&IdxArg);
  2546. Args.push_back(&ReduceListArg);
  2547. const CGFunctionInfo &CGFI =
  2548. CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  2549. auto *Fn = llvm::Function::Create(
  2550. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  2551. "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
  2552. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  2553. Fn->setDoesNotRecurse();
  2554. CodeGenFunction CGF(CGM);
  2555. CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
  2556. CGBuilderTy &Bld = CGF.Builder;
  2557. Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  2558. Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  2559. Address LocalReduceList(
  2560. Bld.CreatePointerBitCastOrAddrSpaceCast(
  2561. CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
  2562. C.VoidPtrTy, Loc),
  2563. CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
  2564. CGF.getPointerAlign());
  2565. QualType StaticTy = C.getRecordType(TeamReductionRec);
  2566. llvm::Type *LLVMReductionsBufferTy =
  2567. CGM.getTypes().ConvertTypeForMem(StaticTy);
  2568. llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2569. CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
  2570. LLVMReductionsBufferTy->getPointerTo());
  2571. llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
  2572. CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
  2573. /*Volatile=*/false, C.IntTy,
  2574. Loc)};
  2575. unsigned Idx = 0;
  2576. for (const Expr *Private : Privates) {
  2577. // Reduce element = LocalReduceList[i]
  2578. Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
  2579. llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
  2580. ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
  2581. // elemptr = ((CopyType*)(elemptrptr)) + I
  2582. ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2583. ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
  2584. Address ElemPtr =
  2585. Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
  2586. const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
  2587. // Global = Buffer.VD[Idx];
  2588. const FieldDecl *FD = VarFieldMap.lookup(VD);
  2589. LValue GlobLVal = CGF.EmitLValueForField(
  2590. CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
  2591. Address GlobAddr = GlobLVal.getAddress(CGF);
  2592. llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
  2593. GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
  2594. GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
  2595. switch (CGF.getEvaluationKind(Private->getType())) {
  2596. case TEK_Scalar: {
  2597. llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
  2598. CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
  2599. LValueBaseInfo(AlignmentSource::Type),
  2600. TBAAAccessInfo());
  2601. break;
  2602. }
  2603. case TEK_Complex: {
  2604. CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
  2605. CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
  2606. /*isInit=*/false);
  2607. break;
  2608. }
  2609. case TEK_Aggregate:
  2610. CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
  2611. GlobLVal, Private->getType(),
  2612. AggValueSlot::DoesNotOverlap);
  2613. break;
  2614. }
  2615. ++Idx;
  2616. }
  2617. CGF.FinishFunction();
  2618. return Fn;
  2619. }
/// This function emits a helper that reduces the reduction variables stored
/// in the provided global buffer into the thread-local reduce list.
  2622. ///
  2623. /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
  2624. /// void *GlobPtrs[];
  2625. /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
  2626. /// ...
  2627. /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
  2628. /// reduce_function(reduce_data, GlobPtrs);
  2629. static llvm::Value *emitGlobalToListReduceFunction(
  2630. CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
  2631. QualType ReductionArrayTy, SourceLocation Loc,
  2632. const RecordDecl *TeamReductionRec,
  2633. const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
  2634. &VarFieldMap,
  2635. llvm::Function *ReduceFn) {
  2636. ASTContext &C = CGM.getContext();
  2637. // Buffer: global reduction buffer.
  2638. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2639. C.VoidPtrTy, ImplicitParamDecl::Other);
  2640. // Idx: index of the buffer.
  2641. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
  2642. ImplicitParamDecl::Other);
  2643. // ReduceList: thread local Reduce list.
  2644. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
  2645. C.VoidPtrTy, ImplicitParamDecl::Other);
  2646. FunctionArgList Args;
  2647. Args.push_back(&BufferArg);
  2648. Args.push_back(&IdxArg);
  2649. Args.push_back(&ReduceListArg);
  2650. const CGFunctionInfo &CGFI =
  2651. CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  2652. auto *Fn = llvm::Function::Create(
  2653. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  2654. "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
  2655. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  2656. Fn->setDoesNotRecurse();
  2657. CodeGenFunction CGF(CGM);
  2658. CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
  2659. CGBuilderTy &Bld = CGF.Builder;
  2660. Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  2661. QualType StaticTy = C.getRecordType(TeamReductionRec);
  2662. llvm::Type *LLVMReductionsBufferTy =
  2663. CGM.getTypes().ConvertTypeForMem(StaticTy);
  2664. llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
  2665. CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
  2666. LLVMReductionsBufferTy->getPointerTo());
  2667. // 1. Build a list of reduction variables.
  2668. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  2669. Address ReductionList =
  2670. CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  2671. auto IPriv = Privates.begin();
  2672. llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
  2673. CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
  2674. /*Volatile=*/false, C.IntTy,
  2675. Loc)};
  2676. unsigned Idx = 0;
  2677. for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
  2678. Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  2679. // Global = Buffer.VD[Idx];
  2680. const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
  2681. const FieldDecl *FD = VarFieldMap.lookup(VD);
  2682. LValue GlobLVal = CGF.EmitLValueForField(
  2683. CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
  2684. Address GlobAddr = GlobLVal.getAddress(CGF);
  2685. llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
  2686. GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
  2687. llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
  2688. CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
  2689. if ((*IPriv)->getType()->isVariablyModifiedType()) {
  2690. // Store array size.
  2691. ++Idx;
  2692. Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  2693. llvm::Value *Size = CGF.Builder.CreateIntCast(
  2694. CGF.getVLASize(
  2695. CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
  2696. .NumElts,
  2697. CGF.SizeTy, /*isSigned=*/false);
  2698. CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
  2699. Elem);
  2700. }
  2701. }
  2702. // Call reduce_function(ReduceList, GlobalReduceList)
  2703. llvm::Value *GlobalReduceList =
  2704. CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  2705. Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  2706. llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
  2707. AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  2708. CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
  2709. CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
  2710. CGF.FinishFunction();
  2711. return Fn;
  2712. }
  2713. ///
  2714. /// Design of OpenMP reductions on the GPU
  2715. ///
  2716. /// Consider a typical OpenMP program with one or more reduction
  2717. /// clauses:
  2718. ///
  2719. /// float foo;
  2720. /// double bar;
  2721. /// #pragma omp target teams distribute parallel for \
  2722. /// reduction(+:foo) reduction(*:bar)
  2723. /// for (int i = 0; i < N; i++) {
  2724. /// foo += A[i]; bar *= B[i];
  2725. /// }
  2726. ///
  2727. /// where 'foo' and 'bar' are reduced across all OpenMP threads in
  2728. /// all teams. In our OpenMP implementation on the NVPTX device an
  2729. /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
  2730. /// within a team are mapped to CUDA threads within a threadblock.
  2731. /// Our goal is to efficiently aggregate values across all OpenMP
  2732. /// threads such that:
  2733. ///
  2734. /// - the compiler and runtime are logically concise, and
  2735. /// - the reduction is performed efficiently in a hierarchical
  2736. /// manner as follows: within OpenMP threads in the same warp,
  2737. /// across warps in a threadblock, and finally across teams on
  2738. /// the NVPTX device.
  2739. ///
  2740. /// Introduction to Decoupling
  2741. ///
  2742. /// We would like to decouple the compiler and the runtime so that the
  2743. /// latter is ignorant of the reduction variables (number, data types)
  2744. /// and the reduction operators. This allows a simpler interface
  2745. /// and implementation while still attaining good performance.
  2746. ///
  2747. /// Pseudocode for the aforementioned OpenMP program generated by the
  2748. /// compiler is as follows:
  2749. ///
  2750. /// 1. Create private copies of reduction variables on each OpenMP
  2751. /// thread: 'foo_private', 'bar_private'
  2752. /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
  2753. /// to it and writes the result in 'foo_private' and 'bar_private'
  2754. /// respectively.
  2755. /// 3. Call the OpenMP runtime on the GPU to reduce within a team
  2756. /// and store the result on the team master:
  2757. ///
  2758. /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
  2759. /// reduceData, shuffleReduceFn, interWarpCpyFn)
  2760. ///
  2761. /// where:
  2762. /// struct ReduceData {
  2763. /// double *foo;
  2764. /// double *bar;
  2765. /// } reduceData
  2766. /// reduceData.foo = &foo_private
  2767. /// reduceData.bar = &bar_private
  2768. ///
  2769. /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
  2770. /// auxiliary functions generated by the compiler that operate on
/// variables of type 'ReduceData'. They aid the runtime in performing
/// algorithmic steps in a data-agnostic manner.
  2773. ///
  2774. /// 'shuffleReduceFn' is a pointer to a function that reduces data
  2775. /// of type 'ReduceData' across two OpenMP threads (lanes) in the
  2776. /// same warp. It takes the following arguments as input:
  2777. ///
  2778. /// a. variable of type 'ReduceData' on the calling lane,
  2779. /// b. its lane_id,
  2780. /// c. an offset relative to the current lane_id to generate a
  2781. /// remote_lane_id. The remote lane contains the second
  2782. /// variable of type 'ReduceData' that is to be reduced.
  2783. /// d. an algorithm version parameter determining which reduction
  2784. /// algorithm to use.
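///
/// (In the helpers emitted by this file, that corresponds roughly to the
/// signature
///   void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                        int16_t remote_lane_offset, int16_t algo_version);
/// see emitShuffleAndReduceFunction above.)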
  2785. ///
  2786. /// 'shuffleReduceFn' retrieves data from the remote lane using
  2787. /// efficient GPU shuffle intrinsics and reduces, using the
  2788. /// algorithm specified by the 4th parameter, the two operands
  2789. /// element-wise. The result is written to the first operand.
  2790. ///
  2791. /// Different reduction algorithms are implemented in different
  2792. /// runtime functions, all calling 'shuffleReduceFn' to perform
  2793. /// the essential reduction step. Therefore, based on the 4th
  2794. /// parameter, this function behaves slightly differently to
  2795. /// cooperate with the runtime to ensure correctness under
  2796. /// different circumstances.
  2797. ///
  2798. /// 'InterWarpCpyFn' is a pointer to a function that transfers
  2799. /// reduced variables across warps. It tunnels, through CUDA
  2800. /// shared memory, the thread-private data of type 'ReduceData'
  2801. /// from lane 0 of each warp to a lane in the first warp.
  2802. /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
  2803. /// The last team writes the global reduced value to memory.
  2804. ///
  2805. /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
  2806. /// reduceData, shuffleReduceFn, interWarpCpyFn,
  2807. /// scratchpadCopyFn, loadAndReduceFn)
  2808. ///
  2809. /// 'scratchpadCopyFn' is a helper that stores reduced
  2810. /// data from the team master to a scratchpad array in
  2811. /// global memory.
  2812. ///
  2813. /// 'loadAndReduceFn' is a helper that loads data from
  2814. /// the scratchpad array and reduces it with the input
  2815. /// operand.
  2816. ///
  2817. /// These compiler generated functions hide address
  2818. /// calculation and alignment information from the runtime.
  2819. /// 5. if ret == 1:
  2820. /// The team master of the last team stores the reduced
  2821. /// result to the globals in memory.
  2822. /// foo += reduceData.foo; bar *= reduceData.bar
  2823. ///
  2824. ///
  2825. /// Warp Reduction Algorithms
  2826. ///
  2827. /// On the warp level, we have three algorithms implemented in the
  2828. /// OpenMP runtime depending on the number of active lanes:
  2829. ///
  2830. /// Full Warp Reduction
  2831. ///
  2832. /// The reduce algorithm within a warp where all lanes are active
  2833. /// is implemented in the runtime as follows:
  2834. ///
  2835. /// full_warp_reduce(void *reduce_data,
  2836. /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  2837. /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
  2838. /// ShuffleReduceFn(reduce_data, 0, offset, 0);
  2839. /// }
  2840. ///
  2841. /// The algorithm completes in log(2, WARPSIZE) steps.
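///
/// For example, with WARPSIZE == 32 the loop above performs five
/// shuffle-and-reduce steps, using the offsets 16, 8, 4, 2 and 1.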
  2842. ///
/// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id is
/// not used by this algorithm; we thus save instructions by not retrieving it
/// from the corresponding special registers. The 4th parameter, which
  2846. /// represents the version of the algorithm being used, is set to 0 to
  2847. /// signify full warp reduction.
  2848. ///
  2849. /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  2850. ///
  2851. /// #reduce_elem refers to an element in the local lane's data structure
  2852. /// #remote_elem is retrieved from a remote lane
  2853. /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  2854. /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
  2855. ///
  2856. /// Contiguous Partial Warp Reduction
  2857. ///
  2858. /// This reduce algorithm is used within a warp where only the first
  2859. /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
  2860. /// number of OpenMP threads in a parallel region is not a multiple of
  2861. /// WARPSIZE. The algorithm is implemented in the runtime as follows:
  2862. ///
  2863. /// void
  2864. /// contiguous_partial_reduce(void *reduce_data,
  2865. /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
  2866. /// int size, int lane_id) {
  2867. /// int curr_size;
  2868. /// int offset;
  2869. /// curr_size = size;
/// offset = curr_size/2;
/// while (offset>0) {
  2872. /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
  2873. /// curr_size = (curr_size+1)/2;
  2874. /// offset = curr_size/2;
  2875. /// }
  2876. /// }
  2877. ///
  2878. /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  2879. ///
  2880. /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  2881. /// if (lane_id < offset)
  2882. /// reduce_elem = reduce_elem REDUCE_OP remote_elem
  2883. /// else
  2884. /// reduce_elem = remote_elem
  2885. ///
  2886. /// This algorithm assumes that the data to be reduced are located in a
  2887. /// contiguous subset of lanes starting from the first. When there is
  2888. /// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
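///
/// As a worked example of the algorithm above, consider 5 active lanes:
///   step 1: offset = 2; lanes 0 and 1 reduce with lanes 2 and 3, and lane 2
///           copies lane 4's data (the odd element); 3 partial values remain.
///   step 2: offset = 1; lane 0 reduces with lane 1, and lane 1 copies lane
///           2's data; 2 partial values remain.
///   step 3: offset = 1; lane 0 reduces with lane 1, leaving the result in
///           lane 0.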
  2890. ///
  2891. /// Dispersed Partial Warp Reduction
  2892. ///
/// This algorithm is used within a warp when a discontiguous subset of
/// lanes is active. It is used to implement the reduction operation
  2895. /// across lanes in an OpenMP simd region or in a nested parallel region.
  2896. ///
  2897. /// void
  2898. /// dispersed_partial_reduce(void *reduce_data,
  2899. /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  2900. /// int size, remote_id;
  2901. /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
  2902. /// do {
  2903. /// remote_id = next_active_lane_id_right_after_me();
/// # the above function returns 0 if no active lane
  2905. /// # is present right after the current lane.
  2906. /// size = number_of_active_lanes_in_this_warp();
  2907. /// logical_lane_id /= 2;
  2908. /// ShuffleReduceFn(reduce_data, logical_lane_id,
  2909. /// remote_id-1-threadIdx.x, 2);
  2910. /// } while (logical_lane_id % 2 == 0 && size > 1);
  2911. /// }
  2912. ///
  2913. /// There is no assumption made about the initial state of the reduction.
  2914. /// Any number of lanes (>=1) could be active at any position. The reduction
  2915. /// result is returned in the first active lane.
  2916. ///
  2917. /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  2918. ///
  2919. /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  2920. /// if (lane_id % 2 == 0 && offset > 0)
  2921. /// reduce_elem = reduce_elem REDUCE_OP remote_elem
  2922. /// else
  2923. /// reduce_elem = remote_elem
  2924. ///
  2925. ///
  2926. /// Intra-Team Reduction
  2927. ///
  2928. /// This function, as implemented in the runtime call
  2929. /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
  2930. /// threads in a team. It first reduces within a warp using the
  2931. /// aforementioned algorithms. We then proceed to gather all such
  2932. /// reduced values at the first warp.
  2933. ///
  2934. /// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each warp master (the zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
  2937. /// a mathematical sense) the problem of reduction across warp masters in
  2938. /// a block to the problem of warp reduction.
  2939. ///
  2940. ///
  2941. /// Inter-Team Reduction
  2942. ///
  2943. /// Once a team has reduced its data to a single value, it is stored in
  2944. /// a global scratchpad array. Since each team has a distinct slot, this
  2945. /// can be done without locking.
  2946. ///
  2947. /// The last team to write to the scratchpad array proceeds to reduce the
  2948. /// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
  2950. /// the k'th worker reduces every k'th element.
  2951. ///
  2952. /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
  2953. /// reduce across workers and compute a globally reduced value.
  2954. ///
  2955. void CGOpenMPRuntimeGPU::emitReduction(
  2956. CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
  2957. ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
  2958. ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  2959. if (!CGF.HaveInsertPoint())
  2960. return;
  2961. bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  2962. #ifndef NDEBUG
  2963. bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  2964. #endif
  2965. if (Options.SimpleReduction) {
  2966. assert(!TeamsReduction && !ParallelReduction &&
  2967. "Invalid reduction selection in emitReduction.");
  2968. CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
  2969. ReductionOps, Options);
  2970. return;
  2971. }
  2972. assert((TeamsReduction || ParallelReduction) &&
  2973. "Invalid reduction selection in emitReduction.");
  2974. // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  2975. // RedList, shuffle_reduce_func, interwarp_copy_func);
  2976. // or
  2977. // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  2978. llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
  2979. llvm::Value *ThreadId = getThreadID(CGF, Loc);
  2980. llvm::Value *Res;
  2981. ASTContext &C = CGM.getContext();
  2982. // 1. Build a list of reduction variables.
  2983. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  2984. auto Size = RHSExprs.size();
  2985. for (const Expr *E : Privates) {
  2986. if (E->getType()->isVariablyModifiedType())
  2987. // Reserve place for array size.
  2988. ++Size;
  2989. }
  2990. llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  2991. QualType ReductionArrayTy =
  2992. C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
  2993. /*IndexTypeQuals=*/0);
  2994. Address ReductionList =
  2995. CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  2996. auto IPriv = Privates.begin();
  2997. unsigned Idx = 0;
  2998. for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
  2999. Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  3000. CGF.Builder.CreateStore(
  3001. CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
  3002. CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
  3003. Elem);
  3004. if ((*IPriv)->getType()->isVariablyModifiedType()) {
  3005. // Store array size.
  3006. ++Idx;
  3007. Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
  3008. llvm::Value *Size = CGF.Builder.CreateIntCast(
  3009. CGF.getVLASize(
  3010. CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
  3011. .NumElts,
  3012. CGF.SizeTy, /*isSigned=*/false);
  3013. CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
  3014. Elem);
  3015. }
  3016. }
  3017. llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
  3018. ReductionList.getPointer(), CGF.VoidPtrTy);
  3019. llvm::Function *ReductionFn = emitReductionFunction(
  3020. Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
  3021. LHSExprs, RHSExprs, ReductionOps);
  3022. llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  3023. llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
  3024. CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  3025. llvm::Value *InterWarpCopyFn =
  3026. emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
  3027. if (ParallelReduction) {
  3028. llvm::Value *Args[] = {RTLoc,
  3029. ThreadId,
  3030. CGF.Builder.getInt32(RHSExprs.size()),
  3031. ReductionArrayTySize,
  3032. RL,
  3033. ShuffleAndReduceFn,
  3034. InterWarpCopyFn};
  3035. Res = CGF.EmitRuntimeCall(
  3036. OMPBuilder.getOrCreateRuntimeFunction(
  3037. CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
  3038. Args);
  3039. } else {
  3040. assert(TeamsReduction && "expected teams reduction.");
  3041. llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
  3042. llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
  3043. int Cnt = 0;
  3044. for (const Expr *DRE : Privates) {
  3045. PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
  3046. ++Cnt;
  3047. }
  3048. const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
  3049. CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
  3050. C.getLangOpts().OpenMPCUDAReductionBufNum);
  3051. TeamsReductions.push_back(TeamReductionRec);
  3052. if (!KernelTeamsReductionPtr) {
  3053. KernelTeamsReductionPtr = new llvm::GlobalVariable(
  3054. CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
  3055. llvm::GlobalValue::InternalLinkage, nullptr,
  3056. "_openmp_teams_reductions_buffer_$_$ptr");
  3057. }
  3058. llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
  3059. Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
  3060. /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
  3061. llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
  3062. CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
  3063. llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
  3064. CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
  3065. ReductionFn);
  3066. llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
  3067. CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
  3068. llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
  3069. CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
  3070. ReductionFn);
  3071. llvm::Value *Args[] = {
  3072. RTLoc,
  3073. ThreadId,
  3074. GlobalBufferPtr,
  3075. CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
  3076. RL,
  3077. ShuffleAndReduceFn,
  3078. InterWarpCopyFn,
  3079. GlobalToBufferCpyFn,
  3080. GlobalToBufferRedFn,
  3081. BufferToGlobalCpyFn,
  3082. BufferToGlobalRedFn};
  3083. Res = CGF.EmitRuntimeCall(
  3084. OMPBuilder.getOrCreateRuntimeFunction(
  3085. CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
  3086. Args);
  3087. }
  3088. // 5. Build if (res == 1)
  3089. llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
  3090. llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
  3091. llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
  3092. Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
  3093. CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
  3094. // 6. Build then branch: where we have reduced values in the master
  3095. // thread in each team.
  3096. // __kmpc_end_reduce{_nowait}(<gtid>);
  3097. // break;
  3098. CGF.EmitBlock(ThenBB);
  3099. // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  3100. auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
  3101. this](CodeGenFunction &CGF, PrePostActionTy &Action) {
  3102. auto IPriv = Privates.begin();
  3103. auto ILHS = LHSExprs.begin();
  3104. auto IRHS = RHSExprs.begin();
  3105. for (const Expr *E : ReductionOps) {
  3106. emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
  3107. cast<DeclRefExpr>(*IRHS));
  3108. ++IPriv;
  3109. ++ILHS;
  3110. ++IRHS;
  3111. }
  3112. };
  3113. llvm::Value *EndArgs[] = {ThreadId};
  3114. RegionCodeGenTy RCG(CodeGen);
  3115. NVPTXActionTy Action(
  3116. nullptr, llvm::None,
  3117. OMPBuilder.getOrCreateRuntimeFunction(
  3118. CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
  3119. EndArgs);
  3120. RCG.setAction(Action);
  3121. RCG(CGF);
  3122. // There is no need to emit line number for unconditional branch.
  3123. (void)ApplyDebugLocation::CreateEmpty(CGF);
  3124. CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
  3125. }
  3126. const VarDecl *
  3127. CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
  3128. const VarDecl *NativeParam) const {
  3129. if (!NativeParam->getType()->isReferenceType())
  3130. return NativeParam;
  3131. QualType ArgType = NativeParam->getType();
  3132. QualifierCollector QC;
  3133. const Type *NonQualTy = QC.strip(ArgType);
  3134. QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  3135. if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
  3136. if (Attr->getCaptureKind() == OMPC_map) {
  3137. PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
  3138. LangAS::opencl_global);
  3139. }
  3140. }
  3141. ArgType = CGM.getContext().getPointerType(PointeeTy);
  3142. QC.addRestrict();
  3143. enum { NVPTX_local_addr = 5 };
  3144. QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  3145. ArgType = QC.apply(CGM.getContext(), ArgType);
  3146. if (isa<ImplicitParamDecl>(NativeParam))
  3147. return ImplicitParamDecl::Create(
  3148. CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
  3149. NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  3150. return ParmVarDecl::Create(
  3151. CGM.getContext(),
  3152. const_cast<DeclContext *>(NativeParam->getDeclContext()),
  3153. NativeParam->getBeginLoc(), NativeParam->getLocation(),
  3154. NativeParam->getIdentifier(), ArgType,
  3155. /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
  3156. }
  3157. Address
  3158. CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
  3159. const VarDecl *NativeParam,
  3160. const VarDecl *TargetParam) const {
  3161. assert(NativeParam != TargetParam &&
  3162. NativeParam->getType()->isReferenceType() &&
  3163. "Native arg must not be the same as target arg.");
  3164. Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  3165. QualType NativeParamType = NativeParam->getType();
  3166. QualifierCollector QC;
  3167. const Type *NonQualTy = QC.strip(NativeParamType);
  3168. QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  3169. unsigned NativePointeeAddrSpace =
  3170. CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  3171. QualType TargetTy = TargetParam->getType();
  3172. llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
  3173. LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  3174. // First cast to generic.
  3175. TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
  3176. TargetAddr, llvm::PointerType::getWithSamePointeeType(
  3177. cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
  3178. // Cast from generic to native address space.
  3179. TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
  3180. TargetAddr, llvm::PointerType::getWithSamePointeeType(
  3181. cast<llvm::PointerType>(TargetAddr->getType()),
  3182. NativePointeeAddrSpace));
  3183. Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  3184. CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
  3185. NativeParamType);
  3186. return NativeParamAddr;
  3187. }
  3188. void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
  3189. CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
  3190. ArrayRef<llvm::Value *> Args) const {
  3191. SmallVector<llvm::Value *, 4> TargetArgs;
  3192. TargetArgs.reserve(Args.size());
  3193. auto *FnType = OutlinedFn.getFunctionType();
  3194. for (unsigned I = 0, E = Args.size(); I < E; ++I) {
  3195. if (FnType->isVarArg() && FnType->getNumParams() <= I) {
  3196. TargetArgs.append(std::next(Args.begin(), I), Args.end());
  3197. break;
  3198. }
  3199. llvm::Type *TargetType = FnType->getParamType(I);
  3200. llvm::Value *NativeArg = Args[I];
  3201. if (!TargetType->isPointerTy()) {
  3202. TargetArgs.emplace_back(NativeArg);
  3203. continue;
  3204. }
  3205. llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
  3206. NativeArg, llvm::PointerType::getWithSamePointeeType(
  3207. cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
  3208. TargetArgs.emplace_back(
  3209. CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  3210. }
  3211. CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
  3212. }
  3213. /// Emit function which wraps the outline parallel region
  3214. /// and controls the arguments which are passed to this function.
  3215. /// The wrapper ensures that the outlined function is called
  3216. /// with the correct arguments when data is shared.
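/// Roughly, the emitted wrapper behaves like the following pseudocode (names
/// are illustrative; the real code below also forwards loop bounds for
/// loop-bound-sharing directives and casts captures passed by copy):
///
///   void outlined_parallel_fn_wrapper(uint16_t parallel_level, uint32_t tid) {
///     uint32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_parallel_fn(&tid, &zero, shared_args[0], ...,
///                          shared_args[n-1]);
///   }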
  3217. llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
  3218. llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  3219. ASTContext &Ctx = CGM.getContext();
  3220. const auto &CS = *D.getCapturedStmt(OMPD_parallel);
  3221. // Create a function that takes as argument the source thread.
  3222. FunctionArgList WrapperArgs;
  3223. QualType Int16QTy =
  3224. Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  3225. QualType Int32QTy =
  3226. Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  3227. ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
  3228. /*Id=*/nullptr, Int16QTy,
  3229. ImplicitParamDecl::Other);
  3230. ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
  3231. /*Id=*/nullptr, Int32QTy,
  3232. ImplicitParamDecl::Other);
  3233. WrapperArgs.emplace_back(&ParallelLevelArg);
  3234. WrapperArgs.emplace_back(&WrapperArg);
  3235. const CGFunctionInfo &CGFI =
  3236. CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
  3237. auto *Fn = llvm::Function::Create(
  3238. CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
  3239. Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
// Ensure we do not inline the function. This is trivially true for the ones
// passed to __kmpc_fork_call, but the ones called in serialized regions
// could be inlined. This is not perfect, but it is closer to the invariant
// we want, namely, that every data environment starts with a new function.
  3244. // TODO: We should pass the if condition to the runtime function and do the
  3245. // handling there. Much cleaner code.
  3246. Fn->addFnAttr(llvm::Attribute::NoInline);
  3247. CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  3248. Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  3249. Fn->setDoesNotRecurse();
  3250. CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  3251. CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
  3252. D.getBeginLoc(), D.getBeginLoc());
  3253. const auto *RD = CS.getCapturedRecordDecl();
  3254. auto CurField = RD->field_begin();
  3255. Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
  3256. /*Name=*/".zero.addr");
  3257. CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  3258. // Get the array of arguments.
  3259. SmallVector<llvm::Value *, 8> Args;
  3260. Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  3261. Args.emplace_back(ZeroAddr.getPointer());
  3262. CGBuilderTy &Bld = CGF.Builder;
  3263. auto CI = CS.capture_begin();
  3264. // Use global memory for data sharing.
  3265. // Handle passing of global args to workers.
  3266. Address GlobalArgs =
  3267. CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  3268. llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  3269. llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  3270. CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
  3271. CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
  3272. DataSharingArgs);
  3273. // Retrieve the shared variables from the list of references returned
  3274. // by the runtime. Pass the variables to the outlined function.
  3275. Address SharedArgListAddress = Address::invalid();
  3276. if (CS.capture_size() > 0 ||
  3277. isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
  3278. SharedArgListAddress = CGF.EmitLoadOfPointer(
  3279. GlobalArgs, CGF.getContext()
  3280. .getPointerType(CGF.getContext().getPointerType(
  3281. CGF.getContext().VoidPtrTy))
  3282. .castAs<PointerType>());
  3283. }
  3284. unsigned Idx = 0;
  3285. if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
  3286. Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
  3287. Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
  3288. Src, CGF.SizeTy->getPointerTo());
  3289. llvm::Value *LB = CGF.EmitLoadOfScalar(
  3290. TypedAddress,
  3291. /*Volatile=*/false,
  3292. CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
  3293. cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
  3294. Args.emplace_back(LB);
  3295. ++Idx;
  3296. Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
  3297. TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
  3298. Src, CGF.SizeTy->getPointerTo());
  3299. llvm::Value *UB = CGF.EmitLoadOfScalar(
  3300. TypedAddress,
  3301. /*Volatile=*/false,
  3302. CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
  3303. cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
  3304. Args.emplace_back(UB);
  3305. ++Idx;
  3306. }
  3307. if (CS.capture_size() > 0) {
  3308. ASTContext &CGFContext = CGF.getContext();
  3309. for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
  3310. QualType ElemTy = CurField->getType();
  3311. Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
  3312. Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
  3313. Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
  3314. llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
  3315. /*Volatile=*/false,
  3316. CGFContext.getPointerType(ElemTy),
  3317. CI->getLocation());
  3318. if (CI->capturesVariableByCopy() &&
  3319. !CI->getCapturedVar()->getType()->isAnyPointerType()) {
  3320. Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
  3321. CI->getLocation());
  3322. }
  3323. Args.emplace_back(Arg);
  3324. }
  3325. }
  3326. emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  3327. CGF.FinishFunction();
  3328. return Fn;
  3329. }
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(std::make_pair(VD, MappedVarData()));
  }
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      Data.insert(std::make_pair(VD, MappedVarData()));
    }
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
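
/// Return the device-side address to use for a local variable: a global in
/// the address space requested by an explicit 'omp allocate' allocator, the
/// globalized address recorded for a variable that escapes its declaration
/// context, or Address::invalid() when the default local allocation applies.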
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        Align);
  }

  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
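
/// Drop the globalization bookkeeping for the finished function before
/// delegating to the base runtime.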
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
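
/// In SPMD mode the default dist_schedule is static with a chunk equal to
/// the number of threads in the block; otherwise fall back to the generic
/// host default.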
void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
      CGF, S, ScheduleKind, Chunk);
}
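
/// The default worksharing schedule on the device is static with chunk size
/// one.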
void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}
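
/// Fix up by-reference lambda captures inside target regions so that the
/// lambda's capture fields point at the device-side copies of 'this' and of
/// the captured variables.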
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         "Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}
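
/// Map the allocator named by an 'omp allocate' attribute on a global
/// variable to a device language address space; returns false when the
/// variable has no allocate attribute.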
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}

/// Get current CudaArch and ignore any unknown values.
static CudaArch getCudaArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      CudaArch Arch = StringToCudaArch(Feature.getKey());
      if (Arch != CudaArch::UNKNOWN)
        return Arch;
    }
  }
  return CudaArch::UNKNOWN;
}

/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1013:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::GFX1035:
      case CudaArch::Generic:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
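
/// Finalize module-level state: if any teams reductions were emitted, build a
/// union of all recorded reduction record types and use it to size and
/// zero-initialize the shared teams-reduction buffer pointed to by
/// KernelTeamsReductionPtr.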
void CGOpenMPRuntimeGPU::clear() {
  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}
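
/// Return the number of threads in the current block by calling (and, if
/// needed, declaring) __kmpc_get_hardware_num_threads_in_block.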
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    F = llvm::Function::Create(
        llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
  }
  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
}
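
/// Return the id of the current thread within its block via
/// __kmpc_get_hardware_thread_id_in_block.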
llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
      Args);
}
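
/// Return the warp/wavefront size via __kmpc_get_warp_size.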
llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
                             Args);
}