12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989 |
- //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This provides a generalized class for OpenMP runtime code generation
- // specialized by GPU targets NVPTX and AMDGCN.
- //
- //===----------------------------------------------------------------------===//
- #include "CGOpenMPRuntimeGPU.h"
- #include "CodeGenFunction.h"
- #include "clang/AST/Attr.h"
- #include "clang/AST/DeclOpenMP.h"
- #include "clang/AST/StmtOpenMP.h"
- #include "clang/AST/StmtVisitor.h"
- #include "clang/Basic/Cuda.h"
- #include "llvm/ADT/SmallPtrSet.h"
- #include "llvm/Frontend/OpenMP/OMPGridValues.h"
- #include "llvm/Support/MathExtras.h"
- using namespace clang;
- using namespace CodeGen;
- using namespace llvm::omp;
- namespace {
- /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
- class NVPTXActionTy final : public PrePostActionTy {
- llvm::FunctionCallee EnterCallee = nullptr;
- ArrayRef<llvm::Value *> EnterArgs;
- llvm::FunctionCallee ExitCallee = nullptr;
- ArrayRef<llvm::Value *> ExitArgs;
- bool Conditional = false;
- llvm::BasicBlock *ContBlock = nullptr;
- public:
- NVPTXActionTy(llvm::FunctionCallee EnterCallee,
- ArrayRef<llvm::Value *> EnterArgs,
- llvm::FunctionCallee ExitCallee,
- ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
- : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
- ExitArgs(ExitArgs), Conditional(Conditional) {}
- void Enter(CodeGenFunction &CGF) override {
- llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
- if (Conditional) {
- llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
- auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
- ContBlock = CGF.createBasicBlock("omp_if.end");
- // Generate the branch (If-stmt)
- CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
- CGF.EmitBlock(ThenBlock);
- }
- }
- void Done(CodeGenFunction &CGF) {
- // Emit the rest of blocks/branches
- CGF.EmitBranch(ContBlock);
- CGF.EmitBlock(ContBlock, true);
- }
- void Exit(CodeGenFunction &CGF) override {
- CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
- }
- };
- /// A class to track the execution mode when codegening directives within
- /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
- /// to the target region and used by containing directives such as 'parallel'
- /// to emit optimized code.
- class ExecutionRuntimeModesRAII {
- private:
- CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
- CGOpenMPRuntimeGPU::EM_Unknown;
- CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
- bool SavedRuntimeMode = false;
- bool *RuntimeMode = nullptr;
- public:
- /// Constructor for Non-SPMD mode.
- ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
- : ExecMode(ExecMode) {
- SavedExecMode = ExecMode;
- ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
- }
- /// Constructor for SPMD mode.
- ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
- bool &RuntimeMode, bool FullRuntimeMode)
- : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
- SavedExecMode = ExecMode;
- SavedRuntimeMode = RuntimeMode;
- ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
- RuntimeMode = FullRuntimeMode;
- }
- ~ExecutionRuntimeModesRAII() {
- ExecMode = SavedExecMode;
- if (RuntimeMode)
- *RuntimeMode = SavedRuntimeMode;
- }
- };
- /// GPU Configuration: This information can be derived from cuda registers,
- /// however, providing compile time constants helps generate more efficient
- /// code. For all practical purposes this is fine because the configuration
- /// is the same for all known NVPTX architectures.
- enum MachineConfiguration : unsigned {
- /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
- /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
- /// Global memory alignment for performance.
- GlobalMemoryAlignment = 128,
- /// Maximal size of the shared memory buffer.
- SharedMemorySize = 128,
- };
- static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
- RefExpr = RefExpr->IgnoreParens();
- if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
- const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
- while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
- Base = TempASE->getBase()->IgnoreParenImpCasts();
- RefExpr = Base;
- } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
- const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
- while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
- Base = TempOASE->getBase()->IgnoreParenImpCasts();
- while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
- Base = TempASE->getBase()->IgnoreParenImpCasts();
- RefExpr = Base;
- }
- RefExpr = RefExpr->IgnoreParenImpCasts();
- if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
- return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
- const auto *ME = cast<MemberExpr>(RefExpr);
- return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
- }
- static RecordDecl *buildRecordForGlobalizedVars(
- ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
- ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &MappedDeclsFields, int BufSize) {
- using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
- if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
- return nullptr;
- SmallVector<VarsDataTy, 4> GlobalizedVars;
- for (const ValueDecl *D : EscapedDecls)
- GlobalizedVars.emplace_back(
- CharUnits::fromQuantity(std::max(
- C.getDeclAlign(D).getQuantity(),
- static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
- D);
- for (const ValueDecl *D : EscapedDeclsForTeams)
- GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
- llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
- return L.first > R.first;
- });
- // Build struct _globalized_locals_ty {
- // /* globalized vars */[WarSize] align (max(decl_align,
- // GlobalMemoryAlignment))
- // /* globalized vars */ for EscapedDeclsForTeams
- // };
- RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
- GlobalizedRD->startDefinition();
- llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
- EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
- for (const auto &Pair : GlobalizedVars) {
- const ValueDecl *VD = Pair.second;
- QualType Type = VD->getType();
- if (Type->isLValueReferenceType())
- Type = C.getPointerType(Type.getNonReferenceType());
- else
- Type = Type.getNonReferenceType();
- SourceLocation Loc = VD->getLocation();
- FieldDecl *Field;
- if (SingleEscaped.count(VD)) {
- Field = FieldDecl::Create(
- C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
- C.getTrivialTypeSourceInfo(Type, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- if (VD->hasAttrs()) {
- for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
- E(VD->getAttrs().end());
- I != E; ++I)
- Field->addAttr(*I);
- }
- } else {
- llvm::APInt ArraySize(32, BufSize);
- Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
- 0);
- Field = FieldDecl::Create(
- C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
- C.getTrivialTypeSourceInfo(Type, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
- static_cast<CharUnits::QuantityType>(
- GlobalMemoryAlignment)));
- Field->addAttr(AlignedAttr::CreateImplicit(
- C, /*IsAlignmentExpr=*/true,
- IntegerLiteral::Create(C, Align,
- C.getIntTypeForBitwidth(32, /*Signed=*/0),
- SourceLocation()),
- {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
- }
- GlobalizedRD->addDecl(Field);
- MappedDeclsFields.try_emplace(VD, Field);
- }
- GlobalizedRD->completeDefinition();
- return GlobalizedRD;
- }
- /// Get the list of variables that can escape their declaration context.
- class CheckVarsEscapingDeclContext final
- : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
- CodeGenFunction &CGF;
- llvm::SetVector<const ValueDecl *> EscapedDecls;
- llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
- llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
- RecordDecl *GlobalizedRD = nullptr;
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
- bool AllEscaped = false;
- bool IsForCombinedParallelRegion = false;
- void markAsEscaped(const ValueDecl *VD) {
- // Do not globalize declare target variables.
- if (!isa<VarDecl>(VD) ||
- OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
- return;
- VD = cast<ValueDecl>(VD->getCanonicalDecl());
- // Use user-specified allocation.
- if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
- return;
- // Variables captured by value must be globalized.
- if (auto *CSI = CGF.CapturedStmtInfo) {
- if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
- // Check if need to capture the variable that was already captured by
- // value in the outer region.
- if (!IsForCombinedParallelRegion) {
- if (!FD->hasAttrs())
- return;
- const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
- if (!Attr)
- return;
- if (((Attr->getCaptureKind() != OMPC_map) &&
- !isOpenMPPrivate(Attr->getCaptureKind())) ||
- ((Attr->getCaptureKind() == OMPC_map) &&
- !FD->getType()->isAnyPointerType()))
- return;
- }
- if (!FD->getType()->isReferenceType()) {
- assert(!VD->getType()->isVariablyModifiedType() &&
- "Parameter captured by value with variably modified type");
- EscapedParameters.insert(VD);
- } else if (!IsForCombinedParallelRegion) {
- return;
- }
- }
- }
- if ((!CGF.CapturedStmtInfo ||
- (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
- VD->getType()->isReferenceType())
- // Do not globalize variables with reference type.
- return;
- if (VD->getType()->isVariablyModifiedType())
- EscapedVariableLengthDecls.insert(VD);
- else
- EscapedDecls.insert(VD);
- }
- void VisitValueDecl(const ValueDecl *VD) {
- if (VD->getType()->isLValueReferenceType())
- markAsEscaped(VD);
- if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
- if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
- const bool SavedAllEscaped = AllEscaped;
- AllEscaped = VD->getType()->isLValueReferenceType();
- Visit(VarD->getInit());
- AllEscaped = SavedAllEscaped;
- }
- }
- }
- void VisitOpenMPCapturedStmt(const CapturedStmt *S,
- ArrayRef<OMPClause *> Clauses,
- bool IsCombinedParallelRegion) {
- if (!S)
- return;
- for (const CapturedStmt::Capture &C : S->captures()) {
- if (C.capturesVariable() && !C.capturesVariableByCopy()) {
- const ValueDecl *VD = C.getCapturedVar();
- bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
- if (IsCombinedParallelRegion) {
- // Check if the variable is privatized in the combined construct and
- // those private copies must be shared in the inner parallel
- // directive.
- IsForCombinedParallelRegion = false;
- for (const OMPClause *C : Clauses) {
- if (!isOpenMPPrivate(C->getClauseKind()) ||
- C->getClauseKind() == OMPC_reduction ||
- C->getClauseKind() == OMPC_linear ||
- C->getClauseKind() == OMPC_private)
- continue;
- ArrayRef<const Expr *> Vars;
- if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
- Vars = PC->getVarRefs();
- else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
- Vars = PC->getVarRefs();
- else
- llvm_unreachable("Unexpected clause.");
- for (const auto *E : Vars) {
- const Decl *D =
- cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
- if (D == VD->getCanonicalDecl()) {
- IsForCombinedParallelRegion = true;
- break;
- }
- }
- if (IsForCombinedParallelRegion)
- break;
- }
- }
- markAsEscaped(VD);
- if (isa<OMPCapturedExprDecl>(VD))
- VisitValueDecl(VD);
- IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
- }
- }
- }
- void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
- assert(!GlobalizedRD &&
- "Record for globalized variables is built already.");
- ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (IsInTTDRegion)
- EscapedDeclsForTeams = EscapedDecls.getArrayRef();
- else
- EscapedDeclsForParallel = EscapedDecls.getArrayRef();
- GlobalizedRD = ::buildRecordForGlobalizedVars(
- CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
- MappedDeclsFields, WarpSize);
- }
- public:
- CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
- ArrayRef<const ValueDecl *> TeamsReductions)
- : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
- }
- virtual ~CheckVarsEscapingDeclContext() = default;
- void VisitDeclStmt(const DeclStmt *S) {
- if (!S)
- return;
- for (const Decl *D : S->decls())
- if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
- VisitValueDecl(VD);
- }
- void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
- if (!D)
- return;
- if (!D->hasAssociatedStmt())
- return;
- if (const auto *S =
- dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
- // Do not analyze directives that do not actually require capturing,
- // like `omp for` or `omp simd` directives.
- llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
- getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
- if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
- VisitStmt(S->getCapturedStmt());
- return;
- }
- VisitOpenMPCapturedStmt(
- S, D->clauses(),
- CaptureRegions.back() == OMPD_parallel &&
- isOpenMPDistributeDirective(D->getDirectiveKind()));
- }
- }
- void VisitCapturedStmt(const CapturedStmt *S) {
- if (!S)
- return;
- for (const CapturedStmt::Capture &C : S->captures()) {
- if (C.capturesVariable() && !C.capturesVariableByCopy()) {
- const ValueDecl *VD = C.getCapturedVar();
- markAsEscaped(VD);
- if (isa<OMPCapturedExprDecl>(VD))
- VisitValueDecl(VD);
- }
- }
- }
- void VisitLambdaExpr(const LambdaExpr *E) {
- if (!E)
- return;
- for (const LambdaCapture &C : E->captures()) {
- if (C.capturesVariable()) {
- if (C.getCaptureKind() == LCK_ByRef) {
- const ValueDecl *VD = C.getCapturedVar();
- markAsEscaped(VD);
- if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
- VisitValueDecl(VD);
- }
- }
- }
- }
- void VisitBlockExpr(const BlockExpr *E) {
- if (!E)
- return;
- for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
- if (C.isByRef()) {
- const VarDecl *VD = C.getVariable();
- markAsEscaped(VD);
- if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
- VisitValueDecl(VD);
- }
- }
- }
- void VisitCallExpr(const CallExpr *E) {
- if (!E)
- return;
- for (const Expr *Arg : E->arguments()) {
- if (!Arg)
- continue;
- if (Arg->isLValue()) {
- const bool SavedAllEscaped = AllEscaped;
- AllEscaped = true;
- Visit(Arg);
- AllEscaped = SavedAllEscaped;
- } else {
- Visit(Arg);
- }
- }
- Visit(E->getCallee());
- }
- void VisitDeclRefExpr(const DeclRefExpr *E) {
- if (!E)
- return;
- const ValueDecl *VD = E->getDecl();
- if (AllEscaped)
- markAsEscaped(VD);
- if (isa<OMPCapturedExprDecl>(VD))
- VisitValueDecl(VD);
- else if (const auto *VarD = dyn_cast<VarDecl>(VD))
- if (VarD->isInitCapture())
- VisitValueDecl(VD);
- }
- void VisitUnaryOperator(const UnaryOperator *E) {
- if (!E)
- return;
- if (E->getOpcode() == UO_AddrOf) {
- const bool SavedAllEscaped = AllEscaped;
- AllEscaped = true;
- Visit(E->getSubExpr());
- AllEscaped = SavedAllEscaped;
- } else {
- Visit(E->getSubExpr());
- }
- }
- void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
- if (!E)
- return;
- if (E->getCastKind() == CK_ArrayToPointerDecay) {
- const bool SavedAllEscaped = AllEscaped;
- AllEscaped = true;
- Visit(E->getSubExpr());
- AllEscaped = SavedAllEscaped;
- } else {
- Visit(E->getSubExpr());
- }
- }
- void VisitExpr(const Expr *E) {
- if (!E)
- return;
- bool SavedAllEscaped = AllEscaped;
- if (!E->isLValue())
- AllEscaped = false;
- for (const Stmt *Child : E->children())
- if (Child)
- Visit(Child);
- AllEscaped = SavedAllEscaped;
- }
- void VisitStmt(const Stmt *S) {
- if (!S)
- return;
- for (const Stmt *Child : S->children())
- if (Child)
- Visit(Child);
- }
- /// Returns the record that handles all the escaped local variables and used
- /// instead of their original storage.
- const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
- if (!GlobalizedRD)
- buildRecordForGlobalizedVars(IsInTTDRegion);
- return GlobalizedRD;
- }
- /// Returns the field in the globalized record for the escaped variable.
- const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
- assert(GlobalizedRD &&
- "Record for globalized variables must be generated already.");
- auto I = MappedDeclsFields.find(VD);
- if (I == MappedDeclsFields.end())
- return nullptr;
- return I->getSecond();
- }
- /// Returns the list of the escaped local variables/parameters.
- ArrayRef<const ValueDecl *> getEscapedDecls() const {
- return EscapedDecls.getArrayRef();
- }
- /// Checks if the escaped local variable is actually a parameter passed by
- /// value.
- const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
- return EscapedParameters;
- }
- /// Returns the list of the escaped variables with the variably modified
- /// types.
- ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
- return EscapedVariableLengthDecls.getArrayRef();
- }
- };
- } // anonymous namespace
- /// Get the id of the warp in the block.
- /// We assume that the warp size is 32, which is always the case
- /// on the NVPTX device, to generate more efficient code.
- static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
- }
- /// Get the id of the current lane in the Warp.
- /// We assume that the warp size is 32, which is always the case
- /// on the NVPTX device, to generate more efficient code.
- static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
- }
- CGOpenMPRuntimeGPU::ExecutionMode
- CGOpenMPRuntimeGPU::getExecutionMode() const {
- return CurrentExecutionMode;
- }
- static CGOpenMPRuntimeGPU::DataSharingMode
- getDataSharingMode(CodeGenModule &CGM) {
- return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
- : CGOpenMPRuntimeGPU::Generic;
- }
- /// Check for inner (nested) SPMD construct, if any
- static bool hasNestedSPMDDirective(ASTContext &Ctx,
- const OMPExecutableDirective &D) {
- const auto *CS = D.getInnermostCapturedStmt();
- const auto *Body =
- CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
- const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NestedDir =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
- switch (D.getDirectiveKind()) {
- case OMPD_target:
- if (isOpenMPParallelDirective(DKind))
- return true;
- if (DKind == OMPD_teams) {
- Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true);
- if (!Body)
- return false;
- ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NND =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- DKind = NND->getDirectiveKind();
- if (isOpenMPParallelDirective(DKind))
- return true;
- }
- }
- return false;
- case OMPD_target_teams:
- return isOpenMPParallelDirective(DKind);
- case OMPD_target_simd:
- case OMPD_target_parallel:
- case OMPD_target_parallel_for:
- case OMPD_target_parallel_for_simd:
- case OMPD_target_teams_distribute:
- case OMPD_target_teams_distribute_simd:
- case OMPD_target_teams_distribute_parallel_for:
- case OMPD_target_teams_distribute_parallel_for_simd:
- case OMPD_parallel:
- case OMPD_for:
- case OMPD_parallel_for:
- case OMPD_parallel_master:
- case OMPD_parallel_sections:
- case OMPD_for_simd:
- case OMPD_parallel_for_simd:
- case OMPD_cancel:
- case OMPD_cancellation_point:
- case OMPD_ordered:
- case OMPD_threadprivate:
- case OMPD_allocate:
- case OMPD_task:
- case OMPD_simd:
- case OMPD_sections:
- case OMPD_section:
- case OMPD_single:
- case OMPD_master:
- case OMPD_critical:
- case OMPD_taskyield:
- case OMPD_barrier:
- case OMPD_taskwait:
- case OMPD_taskgroup:
- case OMPD_atomic:
- case OMPD_flush:
- case OMPD_depobj:
- case OMPD_scan:
- case OMPD_teams:
- case OMPD_target_data:
- case OMPD_target_exit_data:
- case OMPD_target_enter_data:
- case OMPD_distribute:
- case OMPD_distribute_simd:
- case OMPD_distribute_parallel_for:
- case OMPD_distribute_parallel_for_simd:
- case OMPD_teams_distribute:
- case OMPD_teams_distribute_simd:
- case OMPD_teams_distribute_parallel_for:
- case OMPD_teams_distribute_parallel_for_simd:
- case OMPD_target_update:
- case OMPD_declare_simd:
- case OMPD_declare_variant:
- case OMPD_begin_declare_variant:
- case OMPD_end_declare_variant:
- case OMPD_declare_target:
- case OMPD_end_declare_target:
- case OMPD_declare_reduction:
- case OMPD_declare_mapper:
- case OMPD_taskloop:
- case OMPD_taskloop_simd:
- case OMPD_master_taskloop:
- case OMPD_master_taskloop_simd:
- case OMPD_parallel_master_taskloop:
- case OMPD_parallel_master_taskloop_simd:
- case OMPD_requires:
- case OMPD_unknown:
- default:
- llvm_unreachable("Unexpected directive.");
- }
- }
- return false;
- }
- static bool supportsSPMDExecutionMode(ASTContext &Ctx,
- const OMPExecutableDirective &D) {
- OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
- switch (DirectiveKind) {
- case OMPD_target:
- case OMPD_target_teams:
- return hasNestedSPMDDirective(Ctx, D);
- case OMPD_target_parallel:
- case OMPD_target_parallel_for:
- case OMPD_target_parallel_for_simd:
- case OMPD_target_teams_distribute_parallel_for:
- case OMPD_target_teams_distribute_parallel_for_simd:
- case OMPD_target_simd:
- case OMPD_target_teams_distribute_simd:
- return true;
- case OMPD_target_teams_distribute:
- return false;
- case OMPD_parallel:
- case OMPD_for:
- case OMPD_parallel_for:
- case OMPD_parallel_master:
- case OMPD_parallel_sections:
- case OMPD_for_simd:
- case OMPD_parallel_for_simd:
- case OMPD_cancel:
- case OMPD_cancellation_point:
- case OMPD_ordered:
- case OMPD_threadprivate:
- case OMPD_allocate:
- case OMPD_task:
- case OMPD_simd:
- case OMPD_sections:
- case OMPD_section:
- case OMPD_single:
- case OMPD_master:
- case OMPD_critical:
- case OMPD_taskyield:
- case OMPD_barrier:
- case OMPD_taskwait:
- case OMPD_taskgroup:
- case OMPD_atomic:
- case OMPD_flush:
- case OMPD_depobj:
- case OMPD_scan:
- case OMPD_teams:
- case OMPD_target_data:
- case OMPD_target_exit_data:
- case OMPD_target_enter_data:
- case OMPD_distribute:
- case OMPD_distribute_simd:
- case OMPD_distribute_parallel_for:
- case OMPD_distribute_parallel_for_simd:
- case OMPD_teams_distribute:
- case OMPD_teams_distribute_simd:
- case OMPD_teams_distribute_parallel_for:
- case OMPD_teams_distribute_parallel_for_simd:
- case OMPD_target_update:
- case OMPD_declare_simd:
- case OMPD_declare_variant:
- case OMPD_begin_declare_variant:
- case OMPD_end_declare_variant:
- case OMPD_declare_target:
- case OMPD_end_declare_target:
- case OMPD_declare_reduction:
- case OMPD_declare_mapper:
- case OMPD_taskloop:
- case OMPD_taskloop_simd:
- case OMPD_master_taskloop:
- case OMPD_master_taskloop_simd:
- case OMPD_parallel_master_taskloop:
- case OMPD_parallel_master_taskloop_simd:
- case OMPD_requires:
- case OMPD_unknown:
- default:
- break;
- }
- llvm_unreachable(
- "Unknown programming model for OpenMP directive on NVPTX target.");
- }
- /// Check if the directive is loops based and has schedule clause at all or has
- /// static scheduling.
- static bool hasStaticScheduling(const OMPExecutableDirective &D) {
- assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
- isOpenMPLoopDirective(D.getDirectiveKind()) &&
- "Expected loop-based directive.");
- return !D.hasClausesOfKind<OMPOrderedClause>() &&
- (!D.hasClausesOfKind<OMPScheduleClause>() ||
- llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
- [](const OMPScheduleClause *C) {
- return C->getScheduleKind() == OMPC_SCHEDULE_static;
- }));
- }
- /// Check for inner (nested) lightweight runtime construct, if any
- static bool hasNestedLightweightDirective(ASTContext &Ctx,
- const OMPExecutableDirective &D) {
- assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
- const auto *CS = D.getInnermostCapturedStmt();
- const auto *Body =
- CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
- const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NestedDir =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
- switch (D.getDirectiveKind()) {
- case OMPD_target:
- if (isOpenMPParallelDirective(DKind) &&
- isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
- hasStaticScheduling(*NestedDir))
- return true;
- if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
- return true;
- if (DKind == OMPD_parallel) {
- Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true);
- if (!Body)
- return false;
- ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NND =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- DKind = NND->getDirectiveKind();
- if (isOpenMPWorksharingDirective(DKind) &&
- isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
- return true;
- }
- } else if (DKind == OMPD_teams) {
- Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true);
- if (!Body)
- return false;
- ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NND =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- DKind = NND->getDirectiveKind();
- if (isOpenMPParallelDirective(DKind) &&
- isOpenMPWorksharingDirective(DKind) &&
- isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
- return true;
- if (DKind == OMPD_parallel) {
- Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true);
- if (!Body)
- return false;
- ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NND =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- DKind = NND->getDirectiveKind();
- if (isOpenMPWorksharingDirective(DKind) &&
- isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
- return true;
- }
- }
- }
- }
- return false;
- case OMPD_target_teams:
- if (isOpenMPParallelDirective(DKind) &&
- isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
- hasStaticScheduling(*NestedDir))
- return true;
- if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
- return true;
- if (DKind == OMPD_parallel) {
- Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true);
- if (!Body)
- return false;
- ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
- if (const auto *NND =
- dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
- DKind = NND->getDirectiveKind();
- if (isOpenMPWorksharingDirective(DKind) &&
- isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
- return true;
- }
- }
- return false;
- case OMPD_target_parallel:
- if (DKind == OMPD_simd)
- return true;
- return isOpenMPWorksharingDirective(DKind) &&
- isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
- case OMPD_target_teams_distribute:
- case OMPD_target_simd:
- case OMPD_target_parallel_for:
- case OMPD_target_parallel_for_simd:
- case OMPD_target_teams_distribute_simd:
- case OMPD_target_teams_distribute_parallel_for:
- case OMPD_target_teams_distribute_parallel_for_simd:
- case OMPD_parallel:
- case OMPD_for:
- case OMPD_parallel_for:
- case OMPD_parallel_master:
- case OMPD_parallel_sections:
- case OMPD_for_simd:
- case OMPD_parallel_for_simd:
- case OMPD_cancel:
- case OMPD_cancellation_point:
- case OMPD_ordered:
- case OMPD_threadprivate:
- case OMPD_allocate:
- case OMPD_task:
- case OMPD_simd:
- case OMPD_sections:
- case OMPD_section:
- case OMPD_single:
- case OMPD_master:
- case OMPD_critical:
- case OMPD_taskyield:
- case OMPD_barrier:
- case OMPD_taskwait:
- case OMPD_taskgroup:
- case OMPD_atomic:
- case OMPD_flush:
- case OMPD_depobj:
- case OMPD_scan:
- case OMPD_teams:
- case OMPD_target_data:
- case OMPD_target_exit_data:
- case OMPD_target_enter_data:
- case OMPD_distribute:
- case OMPD_distribute_simd:
- case OMPD_distribute_parallel_for:
- case OMPD_distribute_parallel_for_simd:
- case OMPD_teams_distribute:
- case OMPD_teams_distribute_simd:
- case OMPD_teams_distribute_parallel_for:
- case OMPD_teams_distribute_parallel_for_simd:
- case OMPD_target_update:
- case OMPD_declare_simd:
- case OMPD_declare_variant:
- case OMPD_begin_declare_variant:
- case OMPD_end_declare_variant:
- case OMPD_declare_target:
- case OMPD_end_declare_target:
- case OMPD_declare_reduction:
- case OMPD_declare_mapper:
- case OMPD_taskloop:
- case OMPD_taskloop_simd:
- case OMPD_master_taskloop:
- case OMPD_master_taskloop_simd:
- case OMPD_parallel_master_taskloop:
- case OMPD_parallel_master_taskloop_simd:
- case OMPD_requires:
- case OMPD_unknown:
- default:
- llvm_unreachable("Unexpected directive.");
- }
- }
- return false;
- }
- /// Checks if the construct supports lightweight runtime. It must be SPMD
- /// construct + inner loop-based construct with static scheduling.
- static bool supportsLightweightRuntime(ASTContext &Ctx,
- const OMPExecutableDirective &D) {
- if (!supportsSPMDExecutionMode(Ctx, D))
- return false;
- OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
- switch (DirectiveKind) {
- case OMPD_target:
- case OMPD_target_teams:
- case OMPD_target_parallel:
- return hasNestedLightweightDirective(Ctx, D);
- case OMPD_target_parallel_for:
- case OMPD_target_parallel_for_simd:
- case OMPD_target_teams_distribute_parallel_for:
- case OMPD_target_teams_distribute_parallel_for_simd:
- // (Last|First)-privates must be shared in parallel region.
- return hasStaticScheduling(D);
- case OMPD_target_simd:
- case OMPD_target_teams_distribute_simd:
- return true;
- case OMPD_target_teams_distribute:
- return false;
- case OMPD_parallel:
- case OMPD_for:
- case OMPD_parallel_for:
- case OMPD_parallel_master:
- case OMPD_parallel_sections:
- case OMPD_for_simd:
- case OMPD_parallel_for_simd:
- case OMPD_cancel:
- case OMPD_cancellation_point:
- case OMPD_ordered:
- case OMPD_threadprivate:
- case OMPD_allocate:
- case OMPD_task:
- case OMPD_simd:
- case OMPD_sections:
- case OMPD_section:
- case OMPD_single:
- case OMPD_master:
- case OMPD_critical:
- case OMPD_taskyield:
- case OMPD_barrier:
- case OMPD_taskwait:
- case OMPD_taskgroup:
- case OMPD_atomic:
- case OMPD_flush:
- case OMPD_depobj:
- case OMPD_scan:
- case OMPD_teams:
- case OMPD_target_data:
- case OMPD_target_exit_data:
- case OMPD_target_enter_data:
- case OMPD_distribute:
- case OMPD_distribute_simd:
- case OMPD_distribute_parallel_for:
- case OMPD_distribute_parallel_for_simd:
- case OMPD_teams_distribute:
- case OMPD_teams_distribute_simd:
- case OMPD_teams_distribute_parallel_for:
- case OMPD_teams_distribute_parallel_for_simd:
- case OMPD_target_update:
- case OMPD_declare_simd:
- case OMPD_declare_variant:
- case OMPD_begin_declare_variant:
- case OMPD_end_declare_variant:
- case OMPD_declare_target:
- case OMPD_end_declare_target:
- case OMPD_declare_reduction:
- case OMPD_declare_mapper:
- case OMPD_taskloop:
- case OMPD_taskloop_simd:
- case OMPD_master_taskloop:
- case OMPD_master_taskloop_simd:
- case OMPD_parallel_master_taskloop:
- case OMPD_parallel_master_taskloop_simd:
- case OMPD_requires:
- case OMPD_unknown:
- default:
- break;
- }
- llvm_unreachable(
- "Unknown programming model for OpenMP directive on NVPTX target.");
- }
- void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
- StringRef ParentName,
- llvm::Function *&OutlinedFn,
- llvm::Constant *&OutlinedFnID,
- bool IsOffloadEntry,
- const RegionCodeGenTy &CodeGen) {
- ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
- EntryFunctionState EST;
- WrapperFunctionsMap.clear();
- // Emit target region as a standalone region.
- class NVPTXPrePostActionTy : public PrePostActionTy {
- CGOpenMPRuntimeGPU::EntryFunctionState &EST;
- public:
- NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
- : EST(EST) {}
- void Enter(CodeGenFunction &CGF) override {
- auto &RT =
- static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
- // Skip target region initialization.
- RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
- }
- void Exit(CodeGenFunction &CGF) override {
- auto &RT =
- static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- RT.clearLocThreadIdInsertPt(CGF);
- RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
- }
- } Action(EST);
- CodeGen.setAction(Action);
- IsInTTDRegion = true;
- emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
- IsOffloadEntry, CodeGen);
- IsInTTDRegion = false;
- }
- void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
- EntryFunctionState &EST, bool IsSPMD) {
- CGBuilderTy &Bld = CGF.Builder;
- Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
- IsInTargetMasterThreadRegion = IsSPMD;
- if (!IsSPMD)
- emitGenericVarsProlog(CGF, EST.Loc);
- }
- void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
- EntryFunctionState &EST,
- bool IsSPMD) {
- if (!IsSPMD)
- emitGenericVarsEpilog(CGF);
- CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
- }
- void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
- StringRef ParentName,
- llvm::Function *&OutlinedFn,
- llvm::Constant *&OutlinedFnID,
- bool IsOffloadEntry,
- const RegionCodeGenTy &CodeGen) {
- ExecutionRuntimeModesRAII ModeRAII(
- CurrentExecutionMode, RequiresFullRuntime,
- CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
- !supportsLightweightRuntime(CGM.getContext(), D));
- EntryFunctionState EST;
- // Emit target region as a standalone region.
- class NVPTXPrePostActionTy : public PrePostActionTy {
- CGOpenMPRuntimeGPU &RT;
- CGOpenMPRuntimeGPU::EntryFunctionState &EST;
- public:
- NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
- CGOpenMPRuntimeGPU::EntryFunctionState &EST)
- : RT(RT), EST(EST) {}
- void Enter(CodeGenFunction &CGF) override {
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
- // Skip target region initialization.
- RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
- }
- void Exit(CodeGenFunction &CGF) override {
- RT.clearLocThreadIdInsertPt(CGF);
- RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
- }
- } Action(*this, EST);
- CodeGen.setAction(Action);
- IsInTTDRegion = true;
- emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
- IsOffloadEntry, CodeGen);
- IsInTTDRegion = false;
- }
- // Create a unique global variable to indicate the execution mode of this target
- // region. The execution mode is either 'generic', or 'spmd' depending on the
- // target directive. This variable is picked up by the offload library to setup
- // the device appropriately before kernel launch. If the execution mode is
- // 'generic', the runtime reserves one warp for the master, otherwise, all
- // warps participate in parallel work.
- static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
- bool Mode) {
- auto *GVMode = new llvm::GlobalVariable(
- CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage,
- llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
- : OMP_TGT_EXEC_MODE_GENERIC),
- Twine(Name, "_exec_mode"));
- CGM.addCompilerUsedGlobal(GVMode);
- }
- void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
- llvm::Constant *Addr,
- uint64_t Size, int32_t,
- llvm::GlobalValue::LinkageTypes) {
- // TODO: Add support for global variables on the device after declare target
- // support.
- llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
- if (!Fn)
- return;
- llvm::Module &M = CGM.getModule();
- llvm::LLVMContext &Ctx = CGM.getLLVMContext();
- // Get "nvvm.annotations" metadata node.
- llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- llvm::Metadata *MDVals[] = {
- llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
- llvm::ConstantAsMetadata::get(
- llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
- // Append metadata to nvvm.annotations.
- MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
- // Add a function attribute for the kernel.
- Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
- }
- void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
- const OMPExecutableDirective &D, StringRef ParentName,
- llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
- bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
- if (!IsOffloadEntry) // Nothing to do.
- return;
- assert(!ParentName.empty() && "Invalid target region parent name!");
- bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
- if (Mode)
- emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
- else
- emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
- setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
- }
- namespace {
- LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
- /// Enum for accesseing the reserved_2 field of the ident_t struct.
- enum ModeFlagsTy : unsigned {
- /// Bit set to 1 when in SPMD mode.
- KMP_IDENT_SPMD_MODE = 0x01,
- /// Bit set to 1 when a simplified runtime is used.
- KMP_IDENT_SIMPLE_RT_MODE = 0x02,
- LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
- };
- /// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
- static const ModeFlagsTy UndefinedMode =
- (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
- } // anonymous namespace
- unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
- switch (getExecutionMode()) {
- case EM_SPMD:
- if (requiresFullRuntime())
- return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
- return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
- case EM_NonSPMD:
- assert(requiresFullRuntime() && "Expected full runtime.");
- return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
- case EM_Unknown:
- return UndefinedMode;
- }
- llvm_unreachable("Unknown flags are requested.");
- }
- CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
- : CGOpenMPRuntime(CGM, "_", "$") {
- if (!CGM.getLangOpts().OpenMPIsDevice)
- llvm_unreachable("OpenMP can only handle device code.");
- llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
- if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
- !CGM.getLangOpts().OMPHostIRFile.empty()) {
- OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
- "__omp_rtl_debug_kind");
- OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
- "__omp_rtl_assume_teams_oversubscription");
- OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
- "__omp_rtl_assume_threads_oversubscription");
- }
- }
- void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
- ProcBindKind ProcBind,
- SourceLocation Loc) {
- // Do nothing in case of SPMD mode and L0 parallel.
- if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
- return;
- CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
- }
- void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
- llvm::Value *NumThreads,
- SourceLocation Loc) {
- // Nothing to do.
- }
- void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
- const Expr *NumTeams,
- const Expr *ThreadLimit,
- SourceLocation Loc) {}
- llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
- const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
- OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
- // Emit target region as a standalone region.
- class NVPTXPrePostActionTy : public PrePostActionTy {
- bool &IsInParallelRegion;
- bool PrevIsInParallelRegion;
- public:
- NVPTXPrePostActionTy(bool &IsInParallelRegion)
- : IsInParallelRegion(IsInParallelRegion) {}
- void Enter(CodeGenFunction &CGF) override {
- PrevIsInParallelRegion = IsInParallelRegion;
- IsInParallelRegion = true;
- }
- void Exit(CodeGenFunction &CGF) override {
- IsInParallelRegion = PrevIsInParallelRegion;
- }
- } Action(IsInParallelRegion);
- CodeGen.setAction(Action);
- bool PrevIsInTTDRegion = IsInTTDRegion;
- IsInTTDRegion = false;
- bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
- IsInTargetMasterThreadRegion = false;
- auto *OutlinedFun =
- cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
- D, ThreadIDVar, InnermostKind, CodeGen));
- IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
- IsInTTDRegion = PrevIsInTTDRegion;
- if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
- !IsInParallelRegion) {
- llvm::Function *WrapperFun =
- createParallelDataSharingWrapper(OutlinedFun, D);
- WrapperFunctionsMap[OutlinedFun] = WrapperFun;
- }
- return OutlinedFun;
- }
- /// Get list of lastprivate variables from the teams distribute ... or
- /// teams {distribute ...} directives.
- static void
- getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
- llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
- assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
- "expected teams directive.");
- const OMPExecutableDirective *Dir = &D;
- if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
- if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
- Ctx,
- D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
- /*IgnoreCaptured=*/true))) {
- Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
- if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
- Dir = nullptr;
- }
- }
- if (!Dir)
- return;
- for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
- for (const Expr *E : C->getVarRefs())
- Vars.push_back(getPrivateItem(E));
- }
- }
- /// Get list of reduction variables from the teams ... directives.
- static void
- getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
- llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
- assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
- "expected teams directive.");
- for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
- for (const Expr *E : C->privates())
- Vars.push_back(getPrivateItem(E));
- }
- }
- llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
- const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
- OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
- SourceLocation Loc = D.getBeginLoc();
- const RecordDecl *GlobalizedRD = nullptr;
- llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
- unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
- // Globalize team reductions variable unconditionally in all modes.
- if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
- getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
- if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
- getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
- if (!LastPrivatesReductions.empty()) {
- GlobalizedRD = ::buildRecordForGlobalizedVars(
- CGM.getContext(), llvm::None, LastPrivatesReductions,
- MappedDeclsFields, WarpSize);
- }
- } else if (!LastPrivatesReductions.empty()) {
- assert(!TeamAndReductions.first &&
- "Previous team declaration is not expected.");
- TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
- std::swap(TeamAndReductions.second, LastPrivatesReductions);
- }
- // Emit target region as a standalone region.
- class NVPTXPrePostActionTy : public PrePostActionTy {
- SourceLocation &Loc;
- const RecordDecl *GlobalizedRD;
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &MappedDeclsFields;
- public:
- NVPTXPrePostActionTy(
- SourceLocation &Loc, const RecordDecl *GlobalizedRD,
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &MappedDeclsFields)
- : Loc(Loc), GlobalizedRD(GlobalizedRD),
- MappedDeclsFields(MappedDeclsFields) {}
- void Enter(CodeGenFunction &CGF) override {
- auto &Rt =
- static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- if (GlobalizedRD) {
- auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
- I->getSecond().MappedParams =
- std::make_unique<CodeGenFunction::OMPMapVars>();
- DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
- for (const auto &Pair : MappedDeclsFields) {
- assert(Pair.getFirst()->isCanonicalDecl() &&
- "Expected canonical declaration");
- Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
- }
- }
- Rt.emitGenericVarsProlog(CGF, Loc);
- }
- void Exit(CodeGenFunction &CGF) override {
- static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericVarsEpilog(CGF);
- }
- } Action(Loc, GlobalizedRD, MappedDeclsFields);
- CodeGen.setAction(Action);
- llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
- D, ThreadIDVar, InnermostKind, CodeGen);
- return OutlinedFun;
- }
- void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
- SourceLocation Loc,
- bool WithSPMDCheck) {
- if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
- getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
- return;
- CGBuilderTy &Bld = CGF.Builder;
- const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
- if (I == FunctionGlobalizedDecls.end())
- return;
- for (auto &Rec : I->getSecond().LocalVarData) {
- const auto *VD = cast<VarDecl>(Rec.first);
- bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
- QualType VarTy = VD->getType();
- // Get the local allocation of a firstprivate variable before sharing
- llvm::Value *ParValue;
- if (EscapedParam) {
- LValue ParLVal =
- CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
- ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
- }
- // Allocate space for the variable to be globalized
- llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
- // FIXME: We should use the variables actual alignment as an argument.
- VoidPtr->addRetAttr(llvm::Attribute::get(
- CGM.getLLVMContext(), llvm::Attribute::Alignment,
- CGM.getContext().getTargetInfo().getNewAlign() / 8));
- // Cast the void pointer and get the address of the globalized variable.
- llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
- llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
- LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
- Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
- Rec.second.GlobalizedVal = VoidPtr;
- // Assign the local allocation to the newly globalized location.
- if (EscapedParam) {
- CGF.EmitStoreOfScalar(ParValue, VarAddr);
- I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
- }
- if (auto *DI = CGF.getDebugInfo())
- VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
- }
- for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
- // Use actual memory size of the VLA object including the padding
- // for alignment purposes.
- llvm::Value *Size = CGF.getTypeSize(VD->getType());
- CharUnits Align = CGM.getContext().getDeclAlign(VD);
- Size = Bld.CreateNUWAdd(
- Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
- llvm::Value *AlignVal =
- llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
- Size = Bld.CreateUDiv(Size, AlignVal);
- Size = Bld.CreateNUWMul(Size, AlignVal);
- // Allocate space for this VLA object to be globalized.
- llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
- llvm::CallBase *VoidPtr =
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_alloc_shared),
- AllocArgs, VD->getName());
- VoidPtr->addRetAttr(
- llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
- CGM.getContext().getTargetInfo().getNewAlign()));
- I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
- std::pair<llvm::Value *, llvm::Value *>(
- {VoidPtr, CGF.getTypeSize(VD->getType())}));
- LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
- CGM.getContext().getDeclAlign(VD),
- AlignmentSource::Decl);
- I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
- Base.getAddress(CGF));
- }
- I->getSecond().MappedParams->apply(CGF);
- }
- void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
- bool WithSPMDCheck) {
- if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
- getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
- return;
- const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
- if (I != FunctionGlobalizedDecls.end()) {
- // Deallocate the memory for each globalized VLA object
- for (auto AddrSizePair :
- llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- {AddrSizePair.first, AddrSizePair.second});
- }
- // Deallocate the memory for each globalized value
- for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
- const auto *VD = cast<VarDecl>(Rec.first);
- I->getSecond().MappedParams->restore(CGF);
- llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
- CGF.getTypeSize(VD->getType())};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_free_shared),
- FreeArgs);
- }
- }
- }
- void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
- const OMPExecutableDirective &D,
- SourceLocation Loc,
- llvm::Function *OutlinedFn,
- ArrayRef<llvm::Value *> CapturedVars) {
- if (!CGF.HaveInsertPoint())
- return;
- Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
- /*Name=*/".zero.addr");
- CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
- llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
- OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
- OutlinedFnArgs.push_back(ZeroAddr.getPointer());
- OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
- emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
- }
- void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
- SourceLocation Loc,
- llvm::Function *OutlinedFn,
- ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond,
- llvm::Value *NumThreads) {
- if (!CGF.HaveInsertPoint())
- return;
- auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
- NumThreads](CodeGenFunction &CGF,
- PrePostActionTy &Action) {
- CGBuilderTy &Bld = CGF.Builder;
- llvm::Value *NumThreadsVal = NumThreads;
- llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
- llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
- if (WFn)
- ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
- llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
- // Create a private scope that will globalize the arguments
- // passed from the outside of the target region.
- // TODO: Is that needed?
- CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
- Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
- llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
- "captured_vars_addrs");
- // There's something to share.
- if (!CapturedVars.empty()) {
- // Prepare for parallel region. Indicate the outlined function.
- ASTContext &Ctx = CGF.getContext();
- unsigned Idx = 0;
- for (llvm::Value *V : CapturedVars) {
- Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
- llvm::Value *PtrV;
- if (V->getType()->isIntegerTy())
- PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
- else
- PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
- CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
- Ctx.getPointerType(Ctx.VoidPtrTy));
- ++Idx;
- }
- }
- llvm::Value *IfCondVal = nullptr;
- if (IfCond)
- IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
- /* isSigned */ false);
- else
- IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
- if (!NumThreadsVal)
- NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
- else
- NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
- assert(IfCondVal && "Expected a value");
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *Args[] = {
- RTLoc,
- getThreadID(CGF, Loc),
- IfCondVal,
- NumThreadsVal,
- llvm::ConstantInt::get(CGF.Int32Ty, -1),
- FnPtr,
- ID,
- Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
- CGF.VoidPtrPtrTy),
- llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_parallel_51),
- Args);
- };
- RegionCodeGenTy RCG(ParallelGen);
- RCG(CGF);
- }
- void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
- // Always emit simple barriers!
- if (!CGF.HaveInsertPoint())
- return;
- // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
- // This function does not use parameters, so we can emit just default values.
- llvm::Value *Args[] = {
- llvm::ConstantPointerNull::get(
- cast<llvm::PointerType>(getIdentTyPointerTy())),
- llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
- Args);
- }
- void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
- SourceLocation Loc,
- OpenMPDirectiveKind Kind, bool,
- bool) {
- // Always emit simple barriers!
- if (!CGF.HaveInsertPoint())
- return;
- // Build call __kmpc_cancel_barrier(loc, thread_id);
- unsigned Flags = getDefaultFlagsForBarriers(Kind);
- llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
- getThreadID(CGF, Loc)};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_barrier),
- Args);
- }
- void CGOpenMPRuntimeGPU::emitCriticalRegion(
- CodeGenFunction &CGF, StringRef CriticalName,
- const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
- const Expr *Hint) {
- llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
- llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
- llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the mask of active threads in the warp.
- llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
- // Fetch team-local id of the thread.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // Get the width of the team.
- llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
- // Initialize the counter variable for the loop.
- QualType Int32Ty =
- CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
- Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
- LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
- /*isInit=*/true);
- // Block checks if loop counter exceeds upper bound.
- CGF.EmitBlock(LoopBB);
- llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
- llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
- CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
- // Block tests which single thread should execute region, and which threads
- // should go straight to synchronisation point.
- CGF.EmitBlock(TestBB);
- CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
- llvm::Value *CmpThreadToCounter =
- CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
- CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
- // Block emits the body of the critical region.
- CGF.EmitBlock(BodyBB);
- // Output the critical statement.
- CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
- Hint);
- // After the body surrounded by the critical region, the single executing
- // thread will jump to the synchronisation point.
- // Block waits for all threads in current team to finish then increments the
- // counter variable and returns to the loop.
- CGF.EmitBlock(SyncBB);
- // Reconverge active threads in the warp.
- (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_syncwarp),
- Mask);
- llvm::Value *IncCounterVal =
- CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
- CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
- CGF.EmitBranch(LoopBB);
- // Block that is reached when all threads in the team complete the region.
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
- }
- /// Cast value to the specified type.
- static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
- QualType ValTy, QualType CastTy,
- SourceLocation Loc) {
- assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
- "Cast type must sized.");
- assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
- "Val type must sized.");
- llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
- if (ValTy == CastTy)
- return Val;
- if (CGF.getContext().getTypeSizeInChars(ValTy) ==
- CGF.getContext().getTypeSizeInChars(CastTy))
- return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
- if (CastTy->isIntegerType() && ValTy->isIntegerType())
- return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
- CastTy->hasSignedIntegerRepresentation());
- Address CastItem = CGF.CreateMemTemp(CastTy);
- Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
- CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- }
- /// This function creates calls to one of two shuffle functions to copy
- /// variables between lanes in a warp.
- static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
- }
- static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
- ElemPtr =
- Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getAlignment());
- ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEnd.getPointer(),
- Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
- CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
- }
- namespace {
- enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
- // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
- ThreadToScratchpad,
- // ScratchpadToThread: Copy from a scratchpad array in global memory
- // containing team-reduced data to a thread's stack.
- ScratchpadToThread,
- };
- } // namespace
- struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
- };
- /// Emit instructions to copy a Reduce list, which contains partially
- /// aggregated values, in the specified direction.
- static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
- llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
- llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- unsigned Size = Privates.size();
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- // Increment the src or dest pointer to the scratchpad, for each
- // new element.
- bool IncrScratchpadSrc = false;
- bool IncrScratchpadDest = false;
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr,
- C.getPointerType(Private->getType())->castAs<PointerType>());
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr,
- C.getPointerType(Private->getType())->castAs<PointerType>());
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr,
- C.getPointerType(Private->getType())->castAs<PointerType>());
- break;
- }
- case ThreadToScratchpad: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr,
- C.getPointerType(Private->getType())->castAs<PointerType>());
- // Step 1.2: Get the address for dest element:
- // address = base + index * ElementSizeInChars.
- llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
- llvm::Value *CurrentOffset =
- Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
- llvm::Value *ScratchPadElemAbsolutePtrVal =
- Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
- ScratchPadElemAbsolutePtrVal =
- Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
- DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
- C.getTypeAlignInChars(Private->getType()));
- IncrScratchpadDest = true;
- break;
- }
- case ScratchpadToThread: {
- // Step 1.1: Get the address for the src element in the scratchpad.
- // address = base + index * ElementSizeInChars.
- llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
- llvm::Value *CurrentOffset =
- Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
- llvm::Value *ScratchPadElemAbsolutePtrVal =
- Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
- ScratchPadElemAbsolutePtrVal =
- Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
- SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
- C.getTypeAlignInChars(Private->getType()));
- IncrScratchpadSrc = true;
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- UpdateDestListPtr = true;
- break;
- }
- }
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = Bld.CreateElementBitCast(
- SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
- SrcElementAddr.getElementType());
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.getPointer(), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false,
- C.VoidPtrTy);
- }
- // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
- // address of the next element in scratchpad memory, unless we're currently
- // processing the last one. Memory alignment is also taken care of here.
- if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
- llvm::Value *ScratchpadBasePtr =
- IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
- llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
- ScratchpadBasePtr = Bld.CreateNUWAdd(
- ScratchpadBasePtr,
- Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
- // Take care of global memory alignment for performance
- ScratchpadBasePtr = Bld.CreateNUWSub(
- ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
- ScratchpadBasePtr = Bld.CreateUDiv(
- ScratchpadBasePtr,
- llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
- ScratchpadBasePtr = Bld.CreateNUWAdd(
- ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
- ScratchpadBasePtr = Bld.CreateNUWMul(
- ScratchpadBasePtr,
- llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
- if (IncrScratchpadDest)
- DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
- else /* IncrScratchpadSrc = true */
- SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
- }
- ++Idx;
- }
- }
- /// This function emits a helper that gathers Reduce lists from the first
- /// lane of every active warp to lanes in the first warp.
- ///
- /// void inter_warp_copy_func(void* reduce_data, num_warps)
- /// shared smem[warp_size];
- /// For all data entries D in reduce_data:
- /// sync
- /// If (I am the first lane in each warp)
- /// Copy my local D to smem[warp_id]
- /// sync
- /// if (I am the first warp)
- /// Copy smem[thread_id] to my local D
- static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
- CGF.getPointerAlign());
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr = Address(ElemPtrPtr, Align);
- ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
- Address MediumPtr(MediumPtrVal, Align);
- // Casting to actual data type.
- // MediumPtr = (CopyType*)MediumPtrAddr;
- MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
- // elem = *elemptr
- //*MediumPtr = elem
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, CType, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Bld.CreateBr(MergeBB);
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
- CGF.EmitBlock(MergeBB);
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- //
- // Warp 0 copies reduce element from transfer medium.
- //
- llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
- // Up to 32 threads in warp 0 are active.
- llvm::Value *IsActiveThread =
- Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
- Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
- CGF.EmitBlock(W0ThenBB);
- // SrcMediumPtr = &medium[tid]
- llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
- Address SrcMediumPtr(SrcMediumPtrVal, Align);
- // SrcMediumVal = *SrcMediumPtr;
- SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
- // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
- Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
- TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
- Address TargetElemPtr = Address(TargetElemPtrVal, Align);
- TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
- if (NumIters > 1)
- TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
- // *TargetElemPtr = SrcMediumVal;
- llvm::Value *SrcMediumValue =
- CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
- CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- CType);
- Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0ElseBB);
- Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0MergeBB);
- if (NumIters > 1) {
- Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
- CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
- CGF.EmitBranch(PrecondBB);
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB);
- }
- RealTySize %= TySize;
- }
- ++Idx;
- }
- CGF.FinishFunction();
- return Fn;
- }
- /// Emit a helper that reduces data across two OpenMP threads (lanes)
- /// in the same warp. It uses shuffle instructions to copy over data from
- /// a remote lane's stack. The reduction algorithm performed is specified
- /// by the fourth parameter.
- ///
- /// Algorithm Versions.
- /// Full Warp Reduce (argument value 0):
- /// This algorithm assumes that all 32 lanes are active and gathers
- /// data from these 32 lanes, producing a single resultant value.
- /// Contiguous Partial Warp Reduce (argument value 1):
- /// This algorithm assumes that only a *contiguous* subset of lanes
- /// are active. This happens for the last warp in a parallel region
- /// when the user specified num_threads is not an integer multiple of
- /// 32. This contiguous subset always starts with the zeroth lane.
- /// Partial Warp Reduce (argument value 2):
- /// This algorithm gathers data from any number of lanes at any position.
- /// All reduced values are stored in the lowest possible lane. The set
- /// of problems every algorithm addresses is a super set of those
- /// addressable by algorithms with a lower version number. Overhead
- /// increases as algorithm version increases.
- ///
- /// Terminology
- /// Reduce element:
- /// Reduce element refers to the individual data field with primitive
- /// data types to be combined and reduced across threads.
- /// Reduce list:
- /// Reduce list refers to a collection of local, thread-private
- /// reduce elements.
- /// Remote Reduce list:
- /// Remote Reduce list refers to a collection of remote (relative to
- /// the current thread) reduce elements.
- ///
- /// We distinguish between three states of threads that are important to
- /// the implementation of this function.
- /// Alive threads:
- /// Threads in a warp executing the SIMT instruction, as distinguished from
- /// threads that are inactive due to divergent control flow.
- /// Active threads:
- /// The minimal set of threads that has to be alive upon entry to this
- /// function. The computation is correct iff active threads are alive.
- /// Some threads are alive but they are not active because they do not
- /// contribute to the computation in any useful manner. Turning them off
- /// may introduce control flow overheads without any tangible benefits.
- /// Effective threads:
- /// In order to comply with the argument requirements of the shuffle
- /// function, we must keep all lanes holding data alive. But at most
- /// half of them perform value aggregation; we refer to this half of
- /// threads as effective. The other half is simply handing off their
- /// data.
- ///
- /// Procedure
- /// Value shuffle:
- /// In this step active threads transfer data from higher lane positions
- /// in the warp to lower lane positions, creating Remote Reduce list.
- /// Value aggregation:
- /// In this step, effective threads combine their thread local Reduce list
- /// with Remote Reduce list and store the result in the thread local
- /// Reduce list.
- /// Value copy:
- /// In this step, we deal with the assumption made by algorithm 2
- /// (i.e. contiguity assumption). When we have an odd number of lanes
- /// active, say 2k+1, only k threads will be effective and therefore k
- /// new values will be produced. However, the Reduce list owned by the
- /// (2k+1)th thread is ignored in the value aggregation. Therefore
- /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
- /// that the contiguity assumption still holds.
- static llvm::Function *emitShuffleAndReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- // Thread local Reduce list used to host the values of data to be reduced.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // Current lane id; could be logical.
- ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
- ImplicitParamDecl::Other);
- // Offset of the remote source lane relative to the current lane.
- ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamDecl::Other);
- // Algorithm version. This is expected to be known at compile time.
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&LaneIDArg);
- Args.push_back(&RemoteLaneOffsetArg);
- Args.push_back(&AlgoVerArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, SourceLocation()),
- CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
- CGF.getPointerAlign());
- Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
- llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
- AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
- Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
- llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
- AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
- Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
- llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
- AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
- // Create a local thread-private variable to host the Reduce list
- // from a remote lane.
- Address RemoteReduceList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
- // This loop iterates through the list of reduce elements and copies,
- // element by element, from a remote lane in the warp to RemoteReduceList,
- // hosted on the thread's stack.
- emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
- LocalReduceList, RemoteReduceList,
- {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
- /*ScratchpadIndex=*/nullptr,
- /*ScratchpadWidth=*/nullptr});
- // The actions to be performed on the Remote Reduce list is dependent
- // on the algorithm version.
- //
- // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
- // LaneId % 2 == 0 && Offset > 0):
- // do the reduction value aggregation
- //
- // The thread local variable Reduce list is mutated in place to host the
- // reduced data, which is the aggregated value produced from local and
- // remote lanes.
- //
- // Note that AlgoVer is expected to be a constant integer known at compile
- // time.
- // When AlgoVer==0, the first conjunction evaluates to true, making
- // the entire predicate true during compile time.
- // When AlgoVer==1, the second conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- // When AlgoVer==2, the third conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
- llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondAlgo1 = Bld.CreateAnd(
- Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
- llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
- llvm::Value *CondAlgo2 = Bld.CreateAnd(
- Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
- CondAlgo2 = Bld.CreateAnd(
- CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
- llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
- CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
- // reduce_function(LocalReduceList, RemoteReduceList)
- llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- LocalReduceList.getPointer(), CGF.VoidPtrTy);
- llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- RemoteReduceList.getPointer(), CGF.VoidPtrTy);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
- Bld.CreateBr(MergeBB);
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
- CGF.EmitBlock(MergeBB);
- // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
- // Reduce list.
- Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondCopy = Bld.CreateAnd(
- Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
- llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
- CGF.EmitBlock(CpyThenBB);
- emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
- RemoteReduceList, LocalReduceList);
- Bld.CreateBr(CpyMergeBB);
- CGF.EmitBlock(CpyElseBB);
- Bld.CreateBr(CpyMergeBB);
- CGF.EmitBlock(CpyMergeBB);
- CGF.FinishFunction();
- return Fn;
- }
- /// This function emits a helper that copies all the reduction variables from
- /// the team into the provided global buffer for the reduction variables.
- ///
- /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
- /// For all data entries D in reduce_data:
- /// Copy local D to buffer.D[Idx]
- static llvm::Value *emitListToGlobalCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamDecl::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
- CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
- CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
- GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
- GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- CGF.EmitStoreOfScalar(V, GlobLVal);
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
- CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(GlobLVal,
- CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
- CGF.FinishFunction();
- return Fn;
- }
- /// This function emits a helper that reduces all the reduction variables from
- /// the team into the provided global buffer for the reduction variables.
- ///
- /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
- /// void *GlobPtrs[];
- /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
- /// ...
- /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
- /// reduce_function(GlobPtrs, reduce_data);
- static llvm::Value *emitListToGlobalReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamDecl::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
- CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
- GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
- llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
- CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
- // Call reduce_function(GlobalReduceList, ReduceList)
- llvm::Value *GlobalReduceList =
- CGF.EmitCastToVoidPtr(ReductionList.getPointer());
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
- CGF.FinishFunction();
- return Fn;
- }
- /// This function emits a helper that copies all the reduction variables from
- /// the team into the provided global buffer for the reduction variables.
- ///
- /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
- /// For all data entries D in reduce_data:
- /// Copy buffer.D[Idx] to local D;
- static llvm::Value *emitGlobalToListCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamDecl::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
- CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
- CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
- GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
- GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
- CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
- CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- GlobLVal, Private->getType(),
- AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
- CGF.FinishFunction();
- return Fn;
- }
- /// This function emits a helper that reduces all the reduction variables from
- /// the team into the provided global buffer for the reduction variables.
- ///
- /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
- /// void *GlobPtrs[];
- /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
- /// ...
- /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
- /// reduce_function(reduce_data, GlobPtrs);
- static llvm::Value *emitGlobalToListReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamDecl::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamDecl::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
- CGBuilderTy &Bld = CGF.Builder;
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
- CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
- GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
- llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
- CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
- // Call reduce_function(ReduceList, GlobalReduceList)
- llvm::Value *GlobalReduceList =
- CGF.EmitCastToVoidPtr(ReductionList.getPointer());
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
- CGF.FinishFunction();
- return Fn;
- }
- ///
- /// Design of OpenMP reductions on the GPU
- ///
- /// Consider a typical OpenMP program with one or more reduction
- /// clauses:
- ///
- /// float foo;
- /// double bar;
- /// #pragma omp target teams distribute parallel for \
- /// reduction(+:foo) reduction(*:bar)
- /// for (int i = 0; i < N; i++) {
- /// foo += A[i]; bar *= B[i];
- /// }
- ///
- /// where 'foo' and 'bar' are reduced across all OpenMP threads in
- /// all teams. In our OpenMP implementation on the NVPTX device an
- /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
- /// within a team are mapped to CUDA threads within a threadblock.
- /// Our goal is to efficiently aggregate values across all OpenMP
- /// threads such that:
- ///
- /// - the compiler and runtime are logically concise, and
- /// - the reduction is performed efficiently in a hierarchical
- /// manner as follows: within OpenMP threads in the same warp,
- /// across warps in a threadblock, and finally across teams on
- /// the NVPTX device.
- ///
- /// Introduction to Decoupling
- ///
- /// We would like to decouple the compiler and the runtime so that the
- /// latter is ignorant of the reduction variables (number, data types)
- /// and the reduction operators. This allows a simpler interface
- /// and implementation while still attaining good performance.
- ///
- /// Pseudocode for the aforementioned OpenMP program generated by the
- /// compiler is as follows:
- ///
- /// 1. Create private copies of reduction variables on each OpenMP
- /// thread: 'foo_private', 'bar_private'
- /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
- /// to it and writes the result in 'foo_private' and 'bar_private'
- /// respectively.
- /// 3. Call the OpenMP runtime on the GPU to reduce within a team
- /// and store the result on the team master:
- ///
- /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
- /// reduceData, shuffleReduceFn, interWarpCpyFn)
- ///
- /// where:
- /// struct ReduceData {
- /// double *foo;
- /// double *bar;
- /// } reduceData
- /// reduceData.foo = &foo_private
- /// reduceData.bar = &bar_private
- ///
- /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
- /// auxiliary functions generated by the compiler that operate on
- /// variables of type 'ReduceData'. They aid the runtime perform
- /// algorithmic steps in a data agnostic manner.
- ///
- /// 'shuffleReduceFn' is a pointer to a function that reduces data
- /// of type 'ReduceData' across two OpenMP threads (lanes) in the
- /// same warp. It takes the following arguments as input:
- ///
- /// a. variable of type 'ReduceData' on the calling lane,
- /// b. its lane_id,
- /// c. an offset relative to the current lane_id to generate a
- /// remote_lane_id. The remote lane contains the second
- /// variable of type 'ReduceData' that is to be reduced.
- /// d. an algorithm version parameter determining which reduction
- /// algorithm to use.
- ///
- /// 'shuffleReduceFn' retrieves data from the remote lane using
- /// efficient GPU shuffle intrinsics and reduces, using the
- /// algorithm specified by the 4th parameter, the two operands
- /// element-wise. The result is written to the first operand.
- ///
- /// Different reduction algorithms are implemented in different
- /// runtime functions, all calling 'shuffleReduceFn' to perform
- /// the essential reduction step. Therefore, based on the 4th
- /// parameter, this function behaves slightly differently to
- /// cooperate with the runtime to ensure correctness under
- /// different circumstances.
- ///
- /// 'InterWarpCpyFn' is a pointer to a function that transfers
- /// reduced variables across warps. It tunnels, through CUDA
- /// shared memory, the thread-private data of type 'ReduceData'
- /// from lane 0 of each warp to a lane in the first warp.
- /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
- /// The last team writes the global reduced value to memory.
- ///
- /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
- /// reduceData, shuffleReduceFn, interWarpCpyFn,
- /// scratchpadCopyFn, loadAndReduceFn)
- ///
- /// 'scratchpadCopyFn' is a helper that stores reduced
- /// data from the team master to a scratchpad array in
- /// global memory.
- ///
- /// 'loadAndReduceFn' is a helper that loads data from
- /// the scratchpad array and reduces it with the input
- /// operand.
- ///
- /// These compiler generated functions hide address
- /// calculation and alignment information from the runtime.
- /// 5. if ret == 1:
- /// The team master of the last team stores the reduced
- /// result to the globals in memory.
- /// foo += reduceData.foo; bar *= reduceData.bar
- ///
- ///
- /// Warp Reduction Algorithms
- ///
- /// On the warp level, we have three algorithms implemented in the
- /// OpenMP runtime depending on the number of active lanes:
- ///
- /// Full Warp Reduction
- ///
- /// The reduce algorithm within a warp where all lanes are active
- /// is implemented in the runtime as follows:
- ///
- /// full_warp_reduce(void *reduce_data,
- /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
- /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
- /// ShuffleReduceFn(reduce_data, 0, offset, 0);
- /// }
- ///
- /// The algorithm completes in log(2, WARPSIZE) steps.
- ///
- /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
- /// not used therefore we save instructions by not retrieving lane_id
- /// from the corresponding special registers. The 4th parameter, which
- /// represents the version of the algorithm being used, is set to 0 to
- /// signify full warp reduction.
- ///
- /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
- ///
- /// #reduce_elem refers to an element in the local lane's data structure
- /// #remote_elem is retrieved from a remote lane
- /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
- /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
- ///
- /// Contiguous Partial Warp Reduction
- ///
- /// This reduce algorithm is used within a warp where only the first
- /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
- /// number of OpenMP threads in a parallel region is not a multiple of
- /// WARPSIZE. The algorithm is implemented in the runtime as follows:
- ///
- /// void
- /// contiguous_partial_reduce(void *reduce_data,
- /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
- /// int size, int lane_id) {
- /// int curr_size;
- /// int offset;
- /// curr_size = size;
- /// mask = curr_size/2;
- /// while (offset>0) {
- /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
- /// curr_size = (curr_size+1)/2;
- /// offset = curr_size/2;
- /// }
- /// }
- ///
- /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
- ///
- /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
- /// if (lane_id < offset)
- /// reduce_elem = reduce_elem REDUCE_OP remote_elem
- /// else
- /// reduce_elem = remote_elem
- ///
- /// This algorithm assumes that the data to be reduced are located in a
- /// contiguous subset of lanes starting from the first. When there is
- /// an odd number of active lanes, the data in the last lane is not
- /// aggregated with any other lane's dat but is instead copied over.
- ///
- /// Dispersed Partial Warp Reduction
- ///
- /// This algorithm is used within a warp when any discontiguous subset of
- /// lanes are active. It is used to implement the reduction operation
- /// across lanes in an OpenMP simd region or in a nested parallel region.
- ///
- /// void
- /// dispersed_partial_reduce(void *reduce_data,
- /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
- /// int size, remote_id;
- /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
- /// do {
- /// remote_id = next_active_lane_id_right_after_me();
- /// # the above function returns 0 of no active lane
- /// # is present right after the current lane.
- /// size = number_of_active_lanes_in_this_warp();
- /// logical_lane_id /= 2;
- /// ShuffleReduceFn(reduce_data, logical_lane_id,
- /// remote_id-1-threadIdx.x, 2);
- /// } while (logical_lane_id % 2 == 0 && size > 1);
- /// }
- ///
- /// There is no assumption made about the initial state of the reduction.
- /// Any number of lanes (>=1) could be active at any position. The reduction
- /// result is returned in the first active lane.
- ///
- /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
- ///
- /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
- /// if (lane_id % 2 == 0 && offset > 0)
- /// reduce_elem = reduce_elem REDUCE_OP remote_elem
- /// else
- /// reduce_elem = remote_elem
- ///
- ///
- /// Intra-Team Reduction
- ///
- /// This function, as implemented in the runtime call
- /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
- /// threads in a team. It first reduces within a warp using the
- /// aforementioned algorithms. We then proceed to gather all such
- /// reduced values at the first warp.
- ///
- /// The runtime makes use of the function 'InterWarpCpyFn', which copies
- /// data from each of the "warp master" (zeroth lane of each warp, where
- /// warp-reduced data is held) to the zeroth warp. This step reduces (in
- /// a mathematical sense) the problem of reduction across warp masters in
- /// a block to the problem of warp reduction.
- ///
- ///
- /// Inter-Team Reduction
- ///
- /// Once a team has reduced its data to a single value, it is stored in
- /// a global scratchpad array. Since each team has a distinct slot, this
- /// can be done without locking.
- ///
- /// The last team to write to the scratchpad array proceeds to reduce the
- /// scratchpad array. One or more workers in the last team use the helper
- /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
- /// the k'th worker reduces every k'th element.
- ///
- /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
- /// reduce across workers and compute a globally reduced value.
- ///
- void CGOpenMPRuntimeGPU::emitReduction(
- CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
- ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
- ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
- if (!CGF.HaveInsertPoint())
- return;
- bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
- #ifndef NDEBUG
- bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
- #endif
- if (Options.SimpleReduction) {
- assert(!TeamsReduction && !ParallelReduction &&
- "Invalid reduction selection in emitReduction.");
- CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
- ReductionOps, Options);
- return;
- }
- assert((TeamsReduction || ParallelReduction) &&
- "Invalid reduction selection in emitReduction.");
- // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
- // RedList, shuffle_reduce_func, interwarp_copy_func);
- // or
- // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *ThreadId = getThreadID(CGF, Loc);
- llvm::Value *Res;
- ASTContext &C = CGM.getContext();
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- auto Size = RHSExprs.size();
- for (const Expr *E : Privates) {
- if (E->getType()->isVariablyModifiedType())
- // Reserve place for array size.
- ++Size;
- }
- llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
- QualType ReductionArrayTy =
- C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- unsigned Idx = 0;
- for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- CGF.Builder.CreateStore(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
- Elem);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
- llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReductionList.getPointer(), CGF.VoidPtrTy);
- llvm::Function *ReductionFn = emitReductionFunction(
- Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
- LHSExprs, RHSExprs, ReductionOps);
- llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
- llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
- CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
- llvm::Value *InterWarpCopyFn =
- emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
- if (ParallelReduction) {
- llvm::Value *Args[] = {RTLoc,
- ThreadId,
- CGF.Builder.getInt32(RHSExprs.size()),
- ReductionArrayTySize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn};
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
- Args);
- } else {
- assert(TeamsReduction && "expected teams reduction.");
- llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
- llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
- int Cnt = 0;
- for (const Expr *DRE : Privates) {
- PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
- ++Cnt;
- }
- const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
- CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
- C.getLangOpts().OpenMPCUDAReductionBufNum);
- TeamsReductions.push_back(TeamReductionRec);
- if (!KernelTeamsReductionPtr) {
- KernelTeamsReductionPtr = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::InternalLinkage, nullptr,
- "_openmp_teams_reductions_buffer_$_$ptr");
- }
- llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
- Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
- /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
- llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
- llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
- llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *Args[] = {
- RTLoc,
- ThreadId,
- GlobalBufferPtr,
- CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn,
- GlobalToBufferCpyFn,
- GlobalToBufferRedFn,
- BufferToGlobalCpyFn,
- BufferToGlobalRedFn};
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
- Args);
- }
- // 5. Build if (res == 1)
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
- llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
- Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
- CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
- // 6. Build then branch: where we have reduced values in the master
- // thread in each team.
- // __kmpc_end_reduce{_nowait}(<gtid>);
- // break;
- CGF.EmitBlock(ThenBB);
- // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
- auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
- this](CodeGenFunction &CGF, PrePostActionTy &Action) {
- auto IPriv = Privates.begin();
- auto ILHS = LHSExprs.begin();
- auto IRHS = RHSExprs.begin();
- for (const Expr *E : ReductionOps) {
- emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
- cast<DeclRefExpr>(*IRHS));
- ++IPriv;
- ++ILHS;
- ++IRHS;
- }
- };
- llvm::Value *EndArgs[] = {ThreadId};
- RegionCodeGenTy RCG(CodeGen);
- NVPTXActionTy Action(
- nullptr, llvm::None,
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
- EndArgs);
- RCG.setAction(Action);
- RCG(CGF);
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
- }
- const VarDecl *
- CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
- const VarDecl *NativeParam) const {
- if (!NativeParam->getType()->isReferenceType())
- return NativeParam;
- QualType ArgType = NativeParam->getType();
- QualifierCollector QC;
- const Type *NonQualTy = QC.strip(ArgType);
- QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
- if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
- if (Attr->getCaptureKind() == OMPC_map) {
- PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
- LangAS::opencl_global);
- }
- }
- ArgType = CGM.getContext().getPointerType(PointeeTy);
- QC.addRestrict();
- enum { NVPTX_local_addr = 5 };
- QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
- ArgType = QC.apply(CGM.getContext(), ArgType);
- if (isa<ImplicitParamDecl>(NativeParam))
- return ImplicitParamDecl::Create(
- CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
- NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
- return ParmVarDecl::Create(
- CGM.getContext(),
- const_cast<DeclContext *>(NativeParam->getDeclContext()),
- NativeParam->getBeginLoc(), NativeParam->getLocation(),
- NativeParam->getIdentifier(), ArgType,
- /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
- }
- Address
- CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
- const VarDecl *NativeParam,
- const VarDecl *TargetParam) const {
- assert(NativeParam != TargetParam &&
- NativeParam->getType()->isReferenceType() &&
- "Native arg must not be the same as target arg.");
- Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
- QualType NativeParamType = NativeParam->getType();
- QualifierCollector QC;
- const Type *NonQualTy = QC.strip(NativeParamType);
- QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
- unsigned NativePointeeAddrSpace =
- CGF.getContext().getTargetAddressSpace(NativePointeeTy);
- QualType TargetTy = TargetParam->getType();
- llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
- LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
- // First cast to generic.
- TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- TargetAddr, llvm::PointerType::getWithSamePointeeType(
- cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
- // Cast from generic to native address space.
- TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- TargetAddr, llvm::PointerType::getWithSamePointeeType(
- cast<llvm::PointerType>(TargetAddr->getType()),
- NativePointeeAddrSpace));
- Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
- CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
- NativeParamType);
- return NativeParamAddr;
- }
- void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
- CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
- ArrayRef<llvm::Value *> Args) const {
- SmallVector<llvm::Value *, 4> TargetArgs;
- TargetArgs.reserve(Args.size());
- auto *FnType = OutlinedFn.getFunctionType();
- for (unsigned I = 0, E = Args.size(); I < E; ++I) {
- if (FnType->isVarArg() && FnType->getNumParams() <= I) {
- TargetArgs.append(std::next(Args.begin(), I), Args.end());
- break;
- }
- llvm::Type *TargetType = FnType->getParamType(I);
- llvm::Value *NativeArg = Args[I];
- if (!TargetType->isPointerTy()) {
- TargetArgs.emplace_back(NativeArg);
- continue;
- }
- llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- NativeArg, llvm::PointerType::getWithSamePointeeType(
- cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
- TargetArgs.emplace_back(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
- }
- CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
- }
- /// Emit function which wraps the outline parallel region
- /// and controls the arguments which are passed to this function.
- /// The wrapper ensures that the outlined function is called
- /// with the correct arguments when data is shared.
- llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
- llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
- ASTContext &Ctx = CGM.getContext();
- const auto &CS = *D.getCapturedStmt(OMPD_parallel);
- // Create a function that takes as argument the source thread.
- FunctionArgList WrapperArgs;
- QualType Int16QTy =
- Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
- QualType Int32QTy =
- Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
- ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
- /*Id=*/nullptr, Int16QTy,
- ImplicitParamDecl::Other);
- ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
- /*Id=*/nullptr, Int32QTy,
- ImplicitParamDecl::Other);
- WrapperArgs.emplace_back(&ParallelLevelArg);
- WrapperArgs.emplace_back(&WrapperArg);
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
- // Ensure we do not inline the function. This is trivially true for the ones
- // passed to __kmpc_fork_call but the ones calles in serialized regions
- // could be inlined. This is not a perfect but it is closer to the invariant
- // we want, namely, every data environment starts with a new function.
- // TODO: We should pass the if condition to the runtime function and do the
- // handling there. Much cleaner code.
- Fn->addFnAttr(llvm::Attribute::NoInline);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
- CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
- D.getBeginLoc(), D.getBeginLoc());
- const auto *RD = CS.getCapturedRecordDecl();
- auto CurField = RD->field_begin();
- Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
- /*Name=*/".zero.addr");
- CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
- // Get the array of arguments.
- SmallVector<llvm::Value *, 8> Args;
- Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
- Args.emplace_back(ZeroAddr.getPointer());
- CGBuilderTy &Bld = CGF.Builder;
- auto CI = CS.capture_begin();
- // Use global memory for data sharing.
- // Handle passing of global args to workers.
- Address GlobalArgs =
- CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
- llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
- llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
- DataSharingArgs);
- // Retrieve the shared variables from the list of references returned
- // by the runtime. Pass the variables to the outlined function.
- Address SharedArgListAddress = Address::invalid();
- if (CS.capture_size() > 0 ||
- isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
- SharedArgListAddress = CGF.EmitLoadOfPointer(
- GlobalArgs, CGF.getContext()
- .getPointerType(CGF.getContext().getPointerType(
- CGF.getContext().VoidPtrTy))
- .castAs<PointerType>());
- }
- unsigned Idx = 0;
- if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
- Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
- Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Src, CGF.SizeTy->getPointerTo());
- llvm::Value *LB = CGF.EmitLoadOfScalar(
- TypedAddress,
- /*Volatile=*/false,
- CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
- cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
- Args.emplace_back(LB);
- ++Idx;
- Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
- TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Src, CGF.SizeTy->getPointerTo());
- llvm::Value *UB = CGF.EmitLoadOfScalar(
- TypedAddress,
- /*Volatile=*/false,
- CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
- cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
- Args.emplace_back(UB);
- ++Idx;
- }
- if (CS.capture_size() > 0) {
- ASTContext &CGFContext = CGF.getContext();
- for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
- QualType ElemTy = CurField->getType();
- Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
- Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
- llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
- /*Volatile=*/false,
- CGFContext.getPointerType(ElemTy),
- CI->getLocation());
- if (CI->capturesVariableByCopy() &&
- !CI->getCapturedVar()->getType()->isAnyPointerType()) {
- Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
- CI->getLocation());
- }
- Args.emplace_back(Arg);
- }
- }
- emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
- CGF.FinishFunction();
- return Fn;
- }
- void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
- const Decl *D) {
- if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
- return;
- assert(D && "Expected function or captured|block decl.");
- assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
- "Function is registered already.");
- assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
- "Team is set but not processed.");
- const Stmt *Body = nullptr;
- bool NeedToDelayGlobalization = false;
- if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
- Body = FD->getBody();
- } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
- Body = BD->getBody();
- } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
- Body = CD->getBody();
- NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
- if (NeedToDelayGlobalization &&
- getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
- return;
- }
- if (!Body)
- return;
- CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
- VarChecker.Visit(Body);
- const RecordDecl *GlobalizedVarsRecord =
- VarChecker.getGlobalizedRecord(IsInTTDRegion);
- TeamAndReductions.first = nullptr;
- TeamAndReductions.second.clear();
- ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
- VarChecker.getEscapedVariableLengthDecls();
- if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
- return;
- auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
- I->getSecond().MappedParams =
- std::make_unique<CodeGenFunction::OMPMapVars>();
- I->getSecond().EscapedParameters.insert(
- VarChecker.getEscapedParameters().begin(),
- VarChecker.getEscapedParameters().end());
- I->getSecond().EscapedVariableLengthDecls.append(
- EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
- DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
- for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
- assert(VD->isCanonicalDecl() && "Expected canonical declaration");
- Data.insert(std::make_pair(VD, MappedVarData()));
- }
- if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
- CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
- VarChecker.Visit(Body);
- I->getSecond().SecondaryLocalVarData.emplace();
- DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
- for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
- assert(VD->isCanonicalDecl() && "Expected canonical declaration");
- Data.insert(std::make_pair(VD, MappedVarData()));
- }
- }
- if (!NeedToDelayGlobalization) {
- emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
- struct GlobalizationScope final : EHScopeStack::Cleanup {
- GlobalizationScope() = default;
- void Emit(CodeGenFunction &CGF, Flags flags) override {
- static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
- }
- };
- CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
- }
- }
- Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
- const VarDecl *VD) {
- if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
- const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
- auto AS = LangAS::Default;
- switch (A->getAllocatorType()) {
- // Use the default allocator here as by default local vars are
- // threadlocal.
- case OMPAllocateDeclAttr::OMPNullMemAlloc:
- case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
- case OMPAllocateDeclAttr::OMPThreadMemAlloc:
- case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
- case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
- // Follow the user decision - use default allocation.
- return Address::invalid();
- case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
- // TODO: implement aupport for user-defined allocators.
- return Address::invalid();
- case OMPAllocateDeclAttr::OMPConstMemAlloc:
- AS = LangAS::cuda_constant;
- break;
- case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
- AS = LangAS::cuda_shared;
- break;
- case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
- case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
- break;
- }
- llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), VarTy, /*isConstant=*/false,
- llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
- VD->getName(),
- /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
- CGM.getContext().getTargetAddressSpace(AS));
- CharUnits Align = CGM.getContext().getDeclAlign(VD);
- GV->setAlignment(Align.getAsAlign());
- return Address(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
- VD->getType().getAddressSpace()))),
- Align);
- }
- if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
- return Address::invalid();
- VD = VD->getCanonicalDecl();
- auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
- if (I == FunctionGlobalizedDecls.end())
- return Address::invalid();
- auto VDI = I->getSecond().LocalVarData.find(VD);
- if (VDI != I->getSecond().LocalVarData.end())
- return VDI->second.PrivateAddr;
- if (VD->hasAttrs()) {
- for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
- E(VD->attr_end());
- IT != E; ++IT) {
- auto VDI = I->getSecond().LocalVarData.find(
- cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
- ->getCanonicalDecl());
- if (VDI != I->getSecond().LocalVarData.end())
- return VDI->second.PrivateAddr;
- }
- }
- return Address::invalid();
- }
- void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
- FunctionGlobalizedDecls.erase(CGF.CurFn);
- CGOpenMPRuntime::functionFinished(CGF);
- }
- void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
- CodeGenFunction &CGF, const OMPLoopDirective &S,
- OpenMPDistScheduleClauseKind &ScheduleKind,
- llvm::Value *&Chunk) const {
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
- ScheduleKind = OMPC_DIST_SCHEDULE_static;
- Chunk = CGF.EmitScalarConversion(
- RT.getGPUNumThreads(CGF),
- CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
- S.getIterationVariable()->getType(), S.getBeginLoc());
- return;
- }
- CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
- CGF, S, ScheduleKind, Chunk);
- }
- void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
- CodeGenFunction &CGF, const OMPLoopDirective &S,
- OpenMPScheduleClauseKind &ScheduleKind,
- const Expr *&ChunkExpr) const {
- ScheduleKind = OMPC_SCHEDULE_static;
- // Chunk size is 1 in this case.
- llvm::APInt ChunkSize(32, 1);
- ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
- CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
- SourceLocation());
- }
- void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
- CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
- assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
- " Expected target-based directive.");
- const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
- for (const CapturedStmt::Capture &C : CS->captures()) {
- // Capture variables captured by reference in lambdas for target-based
- // directives.
- if (!C.capturesVariable())
- continue;
- const VarDecl *VD = C.getCapturedVar();
- const auto *RD = VD->getType()
- .getCanonicalType()
- .getNonReferenceType()
- ->getAsCXXRecordDecl();
- if (!RD || !RD->isLambda())
- continue;
- Address VDAddr = CGF.GetAddrOfLocalVar(VD);
- LValue VDLVal;
- if (VD->getType().getCanonicalType()->isReferenceType())
- VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
- else
- VDLVal = CGF.MakeAddrLValue(
- VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
- llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
- FieldDecl *ThisCapture = nullptr;
- RD->getCaptureFields(Captures, ThisCapture);
- if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
- LValue ThisLVal =
- CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
- llvm::Value *CXXThis = CGF.LoadCXXThis();
- CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
- }
- for (const LambdaCapture &LC : RD->captures()) {
- if (LC.getCaptureKind() != LCK_ByRef)
- continue;
- const VarDecl *VD = LC.getCapturedVar();
- if (!CS->capturesVariable(VD))
- continue;
- auto It = Captures.find(VD);
- assert(It != Captures.end() && "Found lambda capture without field.");
- LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
- Address VDAddr = CGF.GetAddrOfLocalVar(VD);
- if (VD->getType().getCanonicalType()->isReferenceType())
- VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
- VD->getType().getCanonicalType())
- .getAddress(CGF);
- CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
- }
- }
- }
- bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
- LangAS &AS) {
- if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
- return false;
- const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
- switch(A->getAllocatorType()) {
- case OMPAllocateDeclAttr::OMPNullMemAlloc:
- case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
- // Not supported, fallback to the default mem space.
- case OMPAllocateDeclAttr::OMPThreadMemAlloc:
- case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
- case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
- case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
- case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
- AS = LangAS::Default;
- return true;
- case OMPAllocateDeclAttr::OMPConstMemAlloc:
- AS = LangAS::cuda_constant;
- return true;
- case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
- AS = LangAS::cuda_shared;
- return true;
- case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
- llvm_unreachable("Expected predefined allocator for the variables with the "
- "static storage.");
- }
- return false;
- }
- // Get current CudaArch and ignore any unknown values
- static CudaArch getCudaArch(CodeGenModule &CGM) {
- if (!CGM.getTarget().hasFeature("ptx"))
- return CudaArch::UNKNOWN;
- for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
- if (Feature.getValue()) {
- CudaArch Arch = StringToCudaArch(Feature.getKey());
- if (Arch != CudaArch::UNKNOWN)
- return Arch;
- }
- }
- return CudaArch::UNKNOWN;
- }
- /// Check to see if target architecture supports unified addressing which is
- /// a restriction for OpenMP requires clause "unified_shared_memory".
- void CGOpenMPRuntimeGPU::processRequiresDirective(
- const OMPRequiresDecl *D) {
- for (const OMPClause *Clause : D->clauselists()) {
- if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
- CudaArch Arch = getCudaArch(CGM);
- switch (Arch) {
- case CudaArch::SM_20:
- case CudaArch::SM_21:
- case CudaArch::SM_30:
- case CudaArch::SM_32:
- case CudaArch::SM_35:
- case CudaArch::SM_37:
- case CudaArch::SM_50:
- case CudaArch::SM_52:
- case CudaArch::SM_53: {
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- Out << "Target architecture " << CudaArchToString(Arch)
- << " does not support unified addressing";
- CGM.Error(Clause->getBeginLoc(), Out.str());
- return;
- }
- case CudaArch::SM_60:
- case CudaArch::SM_61:
- case CudaArch::SM_62:
- case CudaArch::SM_70:
- case CudaArch::SM_72:
- case CudaArch::SM_75:
- case CudaArch::SM_80:
- case CudaArch::SM_86:
- case CudaArch::GFX600:
- case CudaArch::GFX601:
- case CudaArch::GFX602:
- case CudaArch::GFX700:
- case CudaArch::GFX701:
- case CudaArch::GFX702:
- case CudaArch::GFX703:
- case CudaArch::GFX704:
- case CudaArch::GFX705:
- case CudaArch::GFX801:
- case CudaArch::GFX802:
- case CudaArch::GFX803:
- case CudaArch::GFX805:
- case CudaArch::GFX810:
- case CudaArch::GFX900:
- case CudaArch::GFX902:
- case CudaArch::GFX904:
- case CudaArch::GFX906:
- case CudaArch::GFX908:
- case CudaArch::GFX909:
- case CudaArch::GFX90a:
- case CudaArch::GFX90c:
- case CudaArch::GFX1010:
- case CudaArch::GFX1011:
- case CudaArch::GFX1012:
- case CudaArch::GFX1013:
- case CudaArch::GFX1030:
- case CudaArch::GFX1031:
- case CudaArch::GFX1032:
- case CudaArch::GFX1033:
- case CudaArch::GFX1034:
- case CudaArch::GFX1035:
- case CudaArch::Generic:
- case CudaArch::UNUSED:
- case CudaArch::UNKNOWN:
- break;
- case CudaArch::LAST:
- llvm_unreachable("Unexpected Cuda arch.");
- }
- }
- }
- CGOpenMPRuntime::processRequiresDirective(D);
- }
- void CGOpenMPRuntimeGPU::clear() {
- if (!TeamsReductions.empty()) {
- ASTContext &C = CGM.getContext();
- RecordDecl *StaticRD = C.buildImplicitRecord(
- "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
- StaticRD->startDefinition();
- for (const RecordDecl *TeamReductionRec : TeamsReductions) {
- QualType RecTy = C.getRecordType(TeamReductionRec);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
- C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
- Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
- }
- StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- // different size are reported as erroneous).
- // Restore CommonLinkage as soon as nvlink is fixed.
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMReductionsBufferTy,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
- llvm::Constant::getNullValue(LLVMReductionsBufferTy),
- "_openmp_teams_reductions_buffer_$_");
- KernelTeamsReductionPtr->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
- }
- CGOpenMPRuntime::clear();
- }
- llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- llvm::Module *M = &CGF.CGM.getModule();
- const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
- llvm::Function *F = M->getFunction(LocSize);
- if (!F) {
- F = llvm::Function::Create(
- llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
- llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
- }
- return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
- }
- llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
- ArrayRef<llvm::Value *> Args{};
- return CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
- Args);
- }
- llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
- ArrayRef<llvm::Value *> Args{};
- return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_get_warp_size),
- Args);
- }
|