- //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // OpenMP specific optimizations:
- //
- // - Deduplication of runtime calls, e.g., omp_get_thread_num.
- // - Replacing globalized device memory with stack memory.
- // - Replacing globalized device memory with shared memory.
- // - Parallel region merging.
- // - Transforming generic-mode device kernels to SPMD mode.
- // - Specializing the state machine for generic-mode device kernels.
- //
- //===----------------------------------------------------------------------===//
- #include "llvm/Transforms/IPO/OpenMPOpt.h"
- #include "llvm/ADT/EnumeratedArray.h"
- #include "llvm/ADT/PostOrderIterator.h"
- #include "llvm/ADT/SetVector.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/Statistic.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/Analysis/CallGraph.h"
- #include "llvm/Analysis/CallGraphSCCPass.h"
- #include "llvm/Analysis/MemoryLocation.h"
- #include "llvm/Analysis/OptimizationRemarkEmitter.h"
- #include "llvm/Analysis/ValueTracking.h"
- #include "llvm/Frontend/OpenMP/OMPConstants.h"
- #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
- #include "llvm/IR/Assumptions.h"
- #include "llvm/IR/BasicBlock.h"
- #include "llvm/IR/Constants.h"
- #include "llvm/IR/DiagnosticInfo.h"
- #include "llvm/IR/GlobalValue.h"
- #include "llvm/IR/GlobalVariable.h"
- #include "llvm/IR/Instruction.h"
- #include "llvm/IR/Instructions.h"
- #include "llvm/IR/IntrinsicInst.h"
- #include "llvm/IR/IntrinsicsAMDGPU.h"
- #include "llvm/IR/IntrinsicsNVPTX.h"
- #include "llvm/IR/LLVMContext.h"
- #include "llvm/InitializePasses.h"
- #include "llvm/Support/CommandLine.h"
- #include "llvm/Support/Debug.h"
- #include "llvm/Transforms/IPO/Attributor.h"
- #include "llvm/Transforms/Utils/BasicBlockUtils.h"
- #include "llvm/Transforms/Utils/CallGraphUpdater.h"
- #include <algorithm>
- #include <optional>
- #include <string>
- using namespace llvm;
- using namespace omp;
- #define DEBUG_TYPE "openmp-opt"
- static cl::opt<bool> DisableOpenMPOptimizations(
- "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> EnableParallelRegionMerging(
- "openmp-opt-enable-merging",
- cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
- cl::init(false));
- static cl::opt<bool>
- DisableInternalization("openmp-opt-disable-internalization",
- cl::desc("Disable function internalization."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",
- cl::init(false), cl::Hidden);
- static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
- cl::Hidden);
- static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
- cl::init(false), cl::Hidden);
- static cl::opt<bool> HideMemoryTransferLatency(
- "openmp-hide-memory-transfer-latency",
- cl::desc("[WIP] Tries to hide the latency of host to device memory"
- " transfers"),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> DisableOpenMPOptDeglobalization(
- "openmp-opt-disable-deglobalization",
- cl::desc("Disable OpenMP optimizations involving deglobalization."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> DisableOpenMPOptSPMDization(
- "openmp-opt-disable-spmdization",
- cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> DisableOpenMPOptFolding(
- "openmp-opt-disable-folding",
- cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
- cl::init(false));
- static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
- "openmp-opt-disable-state-machine-rewrite",
- cl::desc("Disable OpenMP optimizations that replace the state machine."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> DisableOpenMPOptBarrierElimination(
- "openmp-opt-disable-barrier-elimination",
- cl::desc("Disable OpenMP optimizations that eliminate barriers."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> PrintModuleAfterOptimizations(
- "openmp-opt-print-module-after",
- cl::desc("Print the current module after OpenMP optimizations."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> PrintModuleBeforeOptimizations(
- "openmp-opt-print-module-before",
- cl::desc("Print the current module before OpenMP optimizations."),
- cl::Hidden, cl::init(false));
- static cl::opt<bool> AlwaysInlineDeviceFunctions(
- "openmp-opt-inline-device",
- cl::desc("Inline all applicible functions on the device."), cl::Hidden,
- cl::init(false));
- static cl::opt<bool>
- EnableVerboseRemarks("openmp-opt-verbose-remarks",
- cl::desc("Enables more verbose remarks."), cl::Hidden,
- cl::init(false));
- static cl::opt<unsigned>
- SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
- cl::desc("Maximal number of attributor iterations."),
- cl::init(256));
- static cl::opt<unsigned>
- SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
- cl::desc("Maximum amount of shared memory to use."),
- cl::init(std::numeric_limits<unsigned>::max()));
- STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
- "Number of OpenMP runtime calls deduplicated");
- STATISTIC(NumOpenMPParallelRegionsDeleted,
- "Number of OpenMP parallel regions deleted");
- STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
- "Number of OpenMP runtime functions identified");
- STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
- "Number of OpenMP runtime function uses identified");
- STATISTIC(NumOpenMPTargetRegionKernels,
- "Number of OpenMP target region entry points (=kernels) identified");
- STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
- "Number of OpenMP target region entry points (=kernels) executed in "
- "SPMD-mode instead of generic-mode");
- STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
- "Number of OpenMP target region entry points (=kernels) executed in "
- "generic-mode without a state machines");
- STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
- "Number of OpenMP target region entry points (=kernels) executed in "
- "generic-mode with customized state machines with fallback");
- STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
- "Number of OpenMP target region entry points (=kernels) executed in "
- "generic-mode with customized state machines without fallback");
- STATISTIC(
- NumOpenMPParallelRegionsReplacedInGPUStateMachine,
- "Number of OpenMP parallel regions replaced with ID in GPU state machines");
- STATISTIC(NumOpenMPParallelRegionsMerged,
- "Number of OpenMP parallel regions merged");
- STATISTIC(NumBytesMovedToSharedMemory,
- "Amount of memory pushed to shared memory");
- STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");
- #if !defined(NDEBUG)
- static constexpr auto TAG = "[" DEBUG_TYPE "]";
- #endif
- namespace {
- struct AAHeapToShared;
- struct AAICVTracker;
- /// OpenMP-specific information. For now, stores RFIs and ICVs; also needed
- /// for Attributor runs.
- struct OMPInformationCache : public InformationCache {
- OMPInformationCache(Module &M, AnalysisGetter &AG,
- BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
- KernelSet &Kernels, bool OpenMPPostLink)
- : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
- Kernels(Kernels), OpenMPPostLink(OpenMPPostLink) {
- OMPBuilder.initialize();
- initializeRuntimeFunctions(M);
- initializeInternalControlVars();
- }
- /// Generic information that describes an internal control variable.
- struct InternalControlVarInfo {
- /// The kind, as described by InternalControlVar enum.
- InternalControlVar Kind;
- /// The name of the ICV.
- StringRef Name;
- /// Environment variable associated with this ICV.
- StringRef EnvVarName;
- /// Initial value kind.
- ICVInitValue InitKind;
- /// Initial value.
- ConstantInt *InitValue;
- /// Setter RTL function associated with this ICV.
- RuntimeFunction Setter;
- /// Getter RTL function associated with this ICV.
- RuntimeFunction Getter;
- /// RTL function corresponding to the override clause of this ICV.
- RuntimeFunction Clause;
- };
- /// Generic information that describes a runtime function.
- struct RuntimeFunctionInfo {
- /// The kind, as described by the RuntimeFunction enum.
- RuntimeFunction Kind;
- /// The name of the function.
- StringRef Name;
- /// Flag to indicate a variadic function.
- bool IsVarArg;
- /// The return type of the function.
- Type *ReturnType;
- /// The argument types of the function.
- SmallVector<Type *, 8> ArgumentTypes;
- /// The declaration if available.
- Function *Declaration = nullptr;
- /// Uses of this runtime function per function containing the use.
- using UseVector = SmallVector<Use *, 16>;
- /// Clear UsesMap for runtime function.
- void clearUsesMap() { UsesMap.clear(); }
- /// Boolean conversion that is true if the runtime function was found.
- operator bool() const { return Declaration; }
- /// Return the vector of uses in function \p F.
- UseVector &getOrCreateUseVector(Function *F) {
- std::shared_ptr<UseVector> &UV = UsesMap[F];
- if (!UV)
- UV = std::make_shared<UseVector>();
- return *UV;
- }
- /// Return the vector of uses in function \p F or `nullptr` if there are
- /// none.
- const UseVector *getUseVector(Function &F) const {
- auto I = UsesMap.find(&F);
- if (I != UsesMap.end())
- return I->second.get();
- return nullptr;
- }
- /// Return how many functions contain uses of this runtime function.
- size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
- /// Return the number of arguments (or the minimal number for variadic
- /// functions).
- size_t getNumArgs() const { return ArgumentTypes.size(); }
- /// Run the callback \p CB on each use and forget the use if the result is
- /// true. The callback will be fed the function in which the use was
- /// encountered as second argument.
- void foreachUse(SmallVectorImpl<Function *> &SCC,
- function_ref<bool(Use &, Function &)> CB) {
- for (Function *F : SCC)
- foreachUse(CB, F);
- }
- /// Run the callback \p CB on each use within the function \p F and forget
- /// the use if the result is true.
- void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
- SmallVector<unsigned, 8> ToBeDeleted;
- ToBeDeleted.clear();
- unsigned Idx = 0;
- UseVector &UV = getOrCreateUseVector(F);
- for (Use *U : UV) {
- if (CB(*U, *F))
- ToBeDeleted.push_back(Idx);
- ++Idx;
- }
- // Remove the to-be-deleted indices in reverse order (largest first) by
- // swapping with the back, so smaller pending indices are never disturbed.
- while (!ToBeDeleted.empty()) {
- unsigned Idx = ToBeDeleted.pop_back_val();
- UV[Idx] = UV.back();
- UV.pop_back();
- }
- }
- private:
- /// Map from functions to all uses of this runtime function contained in
- /// them.
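- /// The vectors are heap-allocated (shared_ptr) so that references returned
- /// by getOrCreateUseVector stay valid even if the map rehashes.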
- DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
- public:
- /// Iterators for the uses of this runtime function.
- decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
- decltype(UsesMap)::iterator end() { return UsesMap.end(); }
- };
- /// An OpenMP-IR-Builder instance.
- OpenMPIRBuilder OMPBuilder;
- /// Map from runtime function kind to the runtime function description.
- EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
- RuntimeFunction::OMPRTL___last>
- RFIs;
- /// Map from function declarations/definitions to their runtime enum type.
- DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
- /// Map from ICV kind to the ICV description.
- EnumeratedArray<InternalControlVarInfo, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVs;
- /// Helper to initialize all internal control variable information for those
- /// defined in OMPKinds.def.
- void initializeInternalControlVars() {
- #define ICV_RT_SET(_Name, RTL) \
- { \
- auto &ICV = ICVs[_Name]; \
- ICV.Setter = RTL; \
- }
- #define ICV_RT_GET(Name, RTL) \
- { \
- auto &ICV = ICVs[Name]; \
- ICV.Getter = RTL; \
- }
- #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
- { \
- auto &ICV = ICVs[Enum]; \
- ICV.Name = _Name; \
- ICV.Kind = Enum; \
- ICV.InitKind = Init; \
- ICV.EnvVarName = _EnvVarName; \
- switch (ICV.InitKind) { \
- case ICV_IMPLEMENTATION_DEFINED: \
- ICV.InitValue = nullptr; \
- break; \
- case ICV_ZERO: \
- ICV.InitValue = ConstantInt::get( \
- Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
- break; \
- case ICV_FALSE: \
- ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
- break; \
- case ICV_LAST: \
- break; \
- } \
- }
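- // Including OMPKinds.def with the macros above expands one initializer
- // block per ICV definition (X-macro pattern).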
- #include "llvm/Frontend/OpenMP/OMPKinds.def"
- }
- /// Returns true if the function declaration \p F matches the runtime
- /// function types, that is, return type \p RTFRetType and argument types
- /// \p RTFArgTypes.
- static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
- SmallVector<Type *, 8> &RTFArgTypes) {
- // TODO: We should output information to the user (under debug output
- // and via remarks).
- if (!F)
- return false;
- if (F->getReturnType() != RTFRetType)
- return false;
- if (F->arg_size() != RTFArgTypes.size())
- return false;
- auto *RTFTyIt = RTFArgTypes.begin();
- for (Argument &Arg : F->args()) {
- if (Arg.getType() != *RTFTyIt)
- return false;
- ++RTFTyIt;
- }
- return true;
- }
- // Helper to collect all uses of the declaration in the UsesMap.
- unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
- unsigned NumUses = 0;
- if (!RFI.Declaration)
- return NumUses;
- OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
- if (CollectStats) {
- NumOpenMPRuntimeFunctionsIdentified += 1;
- NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
- }
- // TODO: We directly convert uses into proper calls and unknown uses.
- for (Use &U : RFI.Declaration->uses()) {
- if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
- if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) {
- RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
- ++NumUses;
- }
- } else {
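- // Non-instruction uses (e.g., in constant expressions or global
- // initializers) are collected under the nullptr key.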
- RFI.getOrCreateUseVector(nullptr).push_back(&U);
- ++NumUses;
- }
- }
- return NumUses;
- }
- // Helper function to recollect uses of a runtime function.
- void recollectUsesForFunction(RuntimeFunction RTF) {
- auto &RFI = RFIs[RTF];
- RFI.clearUsesMap();
- collectUses(RFI, /*CollectStats*/ false);
- }
- // Helper function to recollect uses of all runtime functions.
- void recollectUses() {
- for (int Idx = 0; Idx < RFIs.size(); ++Idx)
- recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
- }
- // Helper function to inherit the calling convention of the function callee.
- void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
- if (Function *Fn = dyn_cast<Function>(Callee.getCallee()))
- CI->setCallingConv(Fn->getCallingConv());
- }
- // Helper function to determine if it's legal to create a call to the runtime
- // functions.
- bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {
- // We can always emit calls if we haven't yet linked in the runtime.
- if (!OpenMPPostLink)
- return true;
- // Once the runtime has been linked in, we cannot emit calls to any
- // undefined functions.
- for (RuntimeFunction Fn : Fns) {
- RuntimeFunctionInfo &RFI = RFIs[Fn];
- if (RFI.Declaration && RFI.Declaration->isDeclaration())
- return false;
- }
- return true;
- }
- /// Helper to initialize all runtime function information for those defined
- /// in OMPKinds.def.
- void initializeRuntimeFunctions(Module &M) {
- // Helper macros for handling __VA_ARGS__ in OMP_RTL
- #define OMP_TYPE(VarName, ...) \
- Type *VarName = OMPBuilder.VarName; \
- (void)VarName;
- #define OMP_ARRAY_TYPE(VarName, ...) \
- ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
- (void)VarName##Ty; \
- PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
- (void)VarName##PtrTy;
- #define OMP_FUNCTION_TYPE(VarName, ...) \
- FunctionType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
- #define OMP_STRUCT_TYPE(VarName, ...) \
- StructType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
- #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
- { \
- SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
- Function *F = M.getFunction(_Name); \
- RTLFunctions.insert(F); \
- if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
- RuntimeFunctionIDMap[F] = _Enum; \
- auto &RFI = RFIs[_Enum]; \
- RFI.Kind = _Enum; \
- RFI.Name = _Name; \
- RFI.IsVarArg = _IsVarArg; \
- RFI.ReturnType = OMPBuilder._ReturnType; \
- RFI.ArgumentTypes = std::move(ArgsTypes); \
- RFI.Declaration = F; \
- unsigned NumUses = collectUses(RFI); \
- (void)NumUses; \
- LLVM_DEBUG({ \
- dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
- << " found\n"; \
- if (RFI.Declaration) \
- dbgs() << TAG << "-> got " << NumUses << " uses in " \
- << RFI.getNumFunctionsWithUses() \
- << " different functions.\n"; \
- }); \
- } \
- }
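- // X-macro expansion: each OMP_RTL entry in OMPKinds.def instantiates the
- // block above, populating the RuntimeFunctionInfo when a declaration with
- // matching types exists in the module.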
- #include "llvm/Frontend/OpenMP/OMPKinds.def"
- // Remove the `noinline` attribute from `__kmpc`, `ompx::` and `omp_`
- // functions, except if `optnone` is present.
- if (isOpenMPDevice(M)) {
- for (Function &F : M) {
- for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
- if (F.hasFnAttribute(Attribute::NoInline) &&
- F.getName().startswith(Prefix) &&
- !F.hasFnAttribute(Attribute::OptimizeNone))
- F.removeFnAttr(Attribute::NoInline);
- }
- }
- // TODO: We should attach the attributes defined in OMPKinds.def.
- }
- /// Collection of known kernels (\see Kernel) in the module.
- KernelSet &Kernels;
- /// Collection of known OpenMP runtime functions.
- DenseSet<const Function *> RTLFunctions;
- /// Indicates if we have already linked in the OpenMP device library.
- bool OpenMPPostLink = false;
- };
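- /// A BooleanState paired with a SetVector of elements. If InsertInvalidates
- /// is true, inserting any element moves the boolean state to a pessimistic
- /// fixpoint.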
- template <typename Ty, bool InsertInvalidates = true>
- struct BooleanStateWithSetVector : public BooleanState {
- bool contains(const Ty &Elem) const { return Set.contains(Elem); }
- bool insert(const Ty &Elem) {
- if (InsertInvalidates)
- BooleanState::indicatePessimisticFixpoint();
- return Set.insert(Elem);
- }
- const Ty &operator[](int Idx) const { return Set[Idx]; }
- bool operator==(const BooleanStateWithSetVector &RHS) const {
- return BooleanState::operator==(RHS) && Set == RHS.Set;
- }
- bool operator!=(const BooleanStateWithSetVector &RHS) const {
- return !(*this == RHS);
- }
- bool empty() const { return Set.empty(); }
- size_t size() const { return Set.size(); }
- /// "Clamp" this state with \p RHS.
- BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
- BooleanState::operator^=(RHS);
- Set.insert(RHS.Set.begin(), RHS.Set.end());
- return *this;
- }
- private:
- /// A set to keep track of elements.
- SetVector<Ty> Set;
- public:
- typename decltype(Set)::iterator begin() { return Set.begin(); }
- typename decltype(Set)::iterator end() { return Set.end(); }
- typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
- typename decltype(Set)::const_iterator end() const { return Set.end(); }
- };
- template <typename Ty, bool InsertInvalidates = true>
- using BooleanStateWithPtrSetVector =
- BooleanStateWithSetVector<Ty *, InsertInvalidates>;
- struct KernelInfoState : AbstractState {
- /// Flag to track if we reached a fixpoint.
- bool IsAtFixpoint = false;
- /// The parallel regions (identified by the outlined parallel functions) that
- /// can be reached from the associated function.
- BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
- ReachedKnownParallelRegions;
- /// State to track which parallel regions we might reach.
- BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
- /// State to track if we are in SPMD-mode, assumed or known, and why we
- /// decided we cannot be. If it is assumed, then RequiresFullRuntime should
- /// also be false.
- BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
- /// The __kmpc_target_init call in this kernel, if any. If we find more than
- /// one we abort as the kernel is malformed.
- CallBase *KernelInitCB = nullptr;
- /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
- /// one we abort as the kernel is malformed.
- CallBase *KernelDeinitCB = nullptr;
- /// Flag to indicate if the associated function is a kernel entry.
- bool IsKernelEntry = false;
- /// State to track what kernel entries can reach the associated function.
- BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
- /// State to indicate if we can track the parallel level of the associated
- /// function. We will give up tracking if we encounter an unknown caller or
- /// the caller is __kmpc_parallel_51.
- BooleanStateWithSetVector<uint8_t> ParallelLevels;
- /// Flag that indicates if the kernel has nested parallelism.
- bool NestedParallelism = false;
- /// Abstract State interface
- ///{
- KernelInfoState() = default;
- KernelInfoState(bool BestState) {
- if (!BestState)
- indicatePessimisticFixpoint();
- }
- /// See AbstractState::isValidState(...)
- bool isValidState() const override { return true; }
- /// See AbstractState::isAtFixpoint(...)
- bool isAtFixpoint() const override { return IsAtFixpoint; }
- /// See AbstractState::indicatePessimisticFixpoint(...)
- ChangeStatus indicatePessimisticFixpoint() override {
- IsAtFixpoint = true;
- ParallelLevels.indicatePessimisticFixpoint();
- ReachingKernelEntries.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- ReachedKnownParallelRegions.indicatePessimisticFixpoint();
- ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
- return ChangeStatus::CHANGED;
- }
- /// See AbstractState::indicateOptimisticFixpoint(...)
- ChangeStatus indicateOptimisticFixpoint() override {
- IsAtFixpoint = true;
- ParallelLevels.indicateOptimisticFixpoint();
- ReachingKernelEntries.indicateOptimisticFixpoint();
- SPMDCompatibilityTracker.indicateOptimisticFixpoint();
- ReachedKnownParallelRegions.indicateOptimisticFixpoint();
- ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
- /// Return the assumed state
- KernelInfoState &getAssumed() { return *this; }
- const KernelInfoState &getAssumed() const { return *this; }
- bool operator==(const KernelInfoState &RHS) const {
- if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
- return false;
- if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
- return false;
- if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
- return false;
- if (ReachingKernelEntries != RHS.ReachingKernelEntries)
- return false;
- if (ParallelLevels != RHS.ParallelLevels)
- return false;
- return true;
- }
- /// Returns true if this kernel may contain any OpenMP parallel regions.
- bool mayContainParallelRegion() {
- return !ReachedKnownParallelRegions.empty() ||
- !ReachedUnknownParallelRegions.empty();
- }
- /// Return empty set as the best state of potential values.
- static KernelInfoState getBestState() { return KernelInfoState(true); }
- static KernelInfoState getBestState(KernelInfoState &KIS) {
- return getBestState();
- }
- /// Return full set as the worst state of potential values.
- static KernelInfoState getWorstState() { return KernelInfoState(false); }
- /// "Clamp" this state with \p KIS.
- KernelInfoState operator^=(const KernelInfoState &KIS) {
- // Do not merge two different _init and _deinit call sites.
- if (KIS.KernelInitCB) {
- if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
- llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
- "assumptions.");
- KernelInitCB = KIS.KernelInitCB;
- }
- if (KIS.KernelDeinitCB) {
- if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
- llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
- "assumptions.");
- KernelDeinitCB = KIS.KernelDeinitCB;
- }
- SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
- ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
- ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
- NestedParallelism |= KIS.NestedParallelism;
- return *this;
- }
- KernelInfoState operator&=(const KernelInfoState &KIS) {
- return (*this ^= KIS);
- }
- ///}
- };
- /// Used to map the values physically stored in an offload array (in the
- /// IR) to a vector in memory.
- struct OffloadArray {
- /// Physical array (in the IR).
- AllocaInst *Array = nullptr;
- /// Mapped values.
- SmallVector<Value *, 8> StoredValues;
- /// Last stores made in the offload array.
- SmallVector<StoreInst *, 8> LastAccesses;
- OffloadArray() = default;
- /// Initializes the OffloadArray with the values stored in \p Array before
- /// instruction \p Before is reached. Returns false if the initialization
- /// fails.
- /// This MUST be used immediately after the construction of the object.
- bool initialize(AllocaInst &Array, Instruction &Before) {
- if (!Array.getAllocatedType()->isArrayTy())
- return false;
- if (!getValues(Array, Before))
- return false;
- this->Array = &Array;
- return true;
- }
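- /// Argument positions of the offload arrays in a __tgt_target_data_* call
- /// (layout assumed from the OpenMP offloading runtime).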
- static const unsigned DeviceIDArgNum = 1;
- static const unsigned BasePtrsArgNum = 3;
- static const unsigned PtrsArgNum = 4;
- static const unsigned SizesArgNum = 5;
- private:
- /// Traverses the BasicBlock where \p Array is, collecting the stores made to
- /// \p Array, leaving StoredValues with the values stored before the
- /// instruction \p Before is reached.
- bool getValues(AllocaInst &Array, Instruction &Before) {
- // Initialize container.
- const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
- StoredValues.assign(NumValues, nullptr);
- LastAccesses.assign(NumValues, nullptr);
- // TODO: This assumes the instruction \p Before is in the same
- // BasicBlock as Array. Make it general, for any control flow graph.
- BasicBlock *BB = Array.getParent();
- if (BB != Before.getParent())
- return false;
- const DataLayout &DL = Array.getModule()->getDataLayout();
- const unsigned int PointerSize = DL.getPointerSize();
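- // Each element of the offload array is pointer-sized, so a constant store
- // offset divided by the pointer size yields the element index.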
- for (Instruction &I : *BB) {
- if (&I == &Before)
- break;
- if (!isa<StoreInst>(&I))
- continue;
- auto *S = cast<StoreInst>(&I);
- int64_t Offset = -1;
- auto *Dst =
- GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
- if (Dst == &Array) {
- int64_t Idx = Offset / PointerSize;
- StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
- LastAccesses[Idx] = S;
- }
- }
- return isFilled();
- }
- /// Returns true if all values in StoredValues and
- /// LastAccesses are not nullptrs.
- bool isFilled() {
- const unsigned NumValues = StoredValues.size();
- for (unsigned I = 0; I < NumValues; ++I) {
- if (!StoredValues[I] || !LastAccesses[I])
- return false;
- }
- return true;
- }
- };
- struct OpenMPOpt {
- using OptimizationRemarkGetter =
- function_ref<OptimizationRemarkEmitter &(Function *)>;
- OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
- OptimizationRemarkGetter OREGetter,
- OMPInformationCache &OMPInfoCache, Attributor &A)
- : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
- OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
- /// Check if any remarks are enabled for openmp-opt
- bool remarksEnabled() {
- auto &Ctx = M.getContext();
- return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
- }
- /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
- bool run(bool IsModulePass) {
- if (SCC.empty())
- return false;
- bool Changed = false;
- LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
- << " functions in a slice with "
- << OMPInfoCache.ModuleSlice.size() << " functions\n");
- if (IsModulePass) {
- Changed |= runAttributor(IsModulePass);
- // Recollect uses, in case Attributor deleted any.
- OMPInfoCache.recollectUses();
- // TODO: This should be folded into buildCustomStateMachine.
- Changed |= rewriteDeviceCodeStateMachine();
- if (remarksEnabled())
- analysisGlobalization();
- } else {
- if (PrintICVValues)
- printICVs();
- if (PrintOpenMPKernels)
- printKernels();
- Changed |= runAttributor(IsModulePass);
- // Recollect uses, in case Attributor deleted any.
- OMPInfoCache.recollectUses();
- Changed |= deleteParallelRegions();
- if (HideMemoryTransferLatency)
- Changed |= hideMemTransfersLatency();
- Changed |= deduplicateRuntimeCalls();
- if (EnableParallelRegionMerging) {
- if (mergeParallelRegions()) {
- deduplicateRuntimeCalls();
- Changed = true;
- }
- }
- }
- return Changed;
- }
- /// Print initial ICV values for testing.
- /// FIXME: This should be done from the Attributor once it is added.
- void printICVs() const {
- InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
- ICV_proc_bind};
- for (Function *F : SCC) {
- for (auto ICV : ICVs) {
- auto ICVInfo = OMPInfoCache.ICVs[ICV];
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
- << " Value: "
- << (ICVInfo.InitValue
- ? toString(ICVInfo.InitValue->getValue(), 10, true)
- : "IMPLEMENTATION_DEFINED");
- };
- emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
- }
- }
- }
- /// Print OpenMP GPU kernels for testing.
- void printKernels() const {
- for (Function *F : SCC) {
- if (!OMPInfoCache.Kernels.count(F))
- continue;
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "OpenMP GPU kernel "
- << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
- };
- emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
- }
- }
- /// Return the call if \p U is a callee use in a regular call. If \p RFI is
- /// given, it has to be the callee; otherwise nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(U.getUser());
- if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
- (!RFI ||
- (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
- return CI;
- return nullptr;
- }
- /// Return the call if \p V is a regular call. If \p RFI is given, it has to
- /// be the callee; otherwise nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(&V);
- if (CI && !CI->hasOperandBundles() &&
- (!RFI ||
- (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
- return CI;
- return nullptr;
- }
- private:
- /// Merge parallel regions when it is safe.
- bool mergeParallelRegions() {
- const unsigned CallbackCalleeOperand = 2;
- const unsigned CallbackFirstArgOperand = 3;
- using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
- // Check if there are any __kmpc_fork_call calls to merge.
- OMPInformationCache::RuntimeFunctionInfo &RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
- if (!RFI.Declaration)
- return false;
- // Unmergable calls that prevent merging a parallel region.
- OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
- OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
- OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
- };
- bool Changed = false;
- LoopInfo *LI = nullptr;
- DominatorTree *DT = nullptr;
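- // Map from a basic block to the mergable __kmpc_fork_call instructions it
- // contains.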
- SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
- BasicBlock *StartBB = nullptr, *EndBB = nullptr;
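- // The body callback does not emit new code; it splices the pre-built
- // region between StartBB and EndBB into the parallel region created by
- // the OpenMPIRBuilder.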
- auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
- BasicBlock *CGStartBB = CodeGenIP.getBlock();
- BasicBlock *CGEndBB =
- SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
- assert(StartBB != nullptr && "StartBB should not be null");
- CGStartBB->getTerminator()->setSuccessor(0, StartBB);
- assert(EndBB != nullptr && "EndBB should not be null");
- EndBB->getTerminator()->setSuccessor(0, CGEndBB);
- };
- auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
- Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
- ReplacementValue = &Inner;
- return CodeGenIP;
- };
- auto FiniCB = [&](InsertPointTy CodeGenIP) {};
- /// Create a sequential execution region within a merged parallel region,
- /// encapsulated in a master construct with a barrier for synchronization.
- auto CreateSequentialRegion = [&](Function *OuterFn,
- BasicBlock *OuterPredBB,
- Instruction *SeqStartI,
- Instruction *SeqEndI) {
- // Isolate the instructions of the sequential region to a separate
- // block.
- BasicBlock *ParentBB = SeqStartI->getParent();
- BasicBlock *SeqEndBB =
- SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
- BasicBlock *SeqAfterBB =
- SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
- BasicBlock *SeqStartBB =
- SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
- assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
- "Expected a different CFG");
- const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
- ParentBB->getTerminator()->eraseFromParent();
- auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
- BasicBlock *CGStartBB = CodeGenIP.getBlock();
- BasicBlock *CGEndBB =
- SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
- assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
- CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
- assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
- SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
- };
- auto FiniCB = [&](InsertPointTy CodeGenIP) {};
- // Find outputs from the sequential region to outside users and
- // broadcast their values to them.
- for (Instruction &I : *SeqStartBB) {
- SmallPtrSet<Instruction *, 4> OutsideUsers;
- for (User *Usr : I.users()) {
- Instruction &UsrI = *cast<Instruction>(Usr);
- // Ignore outputs to lifetime intrinsics; code extraction for the
- // merged parallel region will fix them.
- if (UsrI.isLifetimeStartOrEnd())
- continue;
- if (UsrI.getParent() != SeqStartBB)
- OutsideUsers.insert(&UsrI);
- }
- if (OutsideUsers.empty())
- continue;
- // Emit an alloca in the outer region to store the broadcasted
- // value.
- const DataLayout &DL = M.getDataLayout();
- AllocaInst *AllocaI = new AllocaInst(
- I.getType(), DL.getAllocaAddrSpace(), nullptr,
- I.getName() + ".seq.output.alloc", &OuterFn->front().front());
- // Emit a store instruction in the sequential BB to update the
- // value.
- new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
- // Emit a load instruction and replace the use of the output value
- // with it.
- for (Instruction *UsrI : OutsideUsers) {
- LoadInst *LoadI = new LoadInst(
- I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
- UsrI->replaceUsesOfWith(&I, LoadI);
- }
- }
- OpenMPIRBuilder::LocationDescription Loc(
- InsertPointTy(ParentBB, ParentBB->end()), DL);
- InsertPointTy SeqAfterIP =
- OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
- OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
- BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
- LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
- << "\n");
- };
- // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
- // contained in BB and only separated by instructions that can be
- // redundantly executed in parallel. The block BB is split before the first
- // call (in MergableCIs) and after the last so the entire region we merge
- // into a single parallel region is contained in a single basic block
- // without any other instructions. We use the OpenMPIRBuilder to outline
- // that block and call the resulting function via __kmpc_fork_call.
- auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
- BasicBlock *BB) {
- // TODO: Change the interface to allow single CIs expanded, e.g., to
- // include an outer loop.
- assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
- auto Remark = [&](OptimizationRemark OR) {
- OR << "Parallel region merged with parallel region"
- << (MergableCIs.size() > 2 ? "s" : "") << " at ";
- for (auto *CI : llvm::drop_begin(MergableCIs)) {
- OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
- if (CI != MergableCIs.back())
- OR << ", ";
- }
- return OR << ".";
- };
- emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);
- Function *OriginalFn = BB->getParent();
- LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
- << " parallel regions in " << OriginalFn->getName()
- << "\n");
- // Isolate the calls to merge in a separate block.
- EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
- BasicBlock *AfterBB =
- SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
- StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
- "omp.par.merged");
- assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
- const DebugLoc DL = BB->getTerminator()->getDebugLoc();
- BB->getTerminator()->eraseFromParent();
- // Create sequential regions for sequential instructions that are
- // in-between mergable parallel regions.
- for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
- It != End; ++It) {
- Instruction *ForkCI = *It;
- Instruction *NextForkCI = *(It + 1);
- // Continue if there are no in-between instructions.
- if (ForkCI->getNextNode() == NextForkCI)
- continue;
- CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
- NextForkCI->getPrevNode());
- }
- OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
- DL);
- IRBuilder<>::InsertPoint AllocaIP(
- &OriginalFn->getEntryBlock(),
- OriginalFn->getEntryBlock().getFirstInsertionPt());
- // Create the merged parallel region with default proc binding, to
- // avoid overriding binding settings, and without explicit cancellation.
- InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
- Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
- OMP_PROC_BIND_default, /* IsCancellable */ false);
- BranchInst::Create(AfterBB, AfterIP.getBlock());
- // Perform the actual outlining.
- OMPInfoCache.OMPBuilder.finalize(OriginalFn);
- Function *OutlinedFn = MergableCIs.front()->getCaller();
- // Replace the __kmpc_fork_call calls with direct calls to the outlined
- // callbacks.
- SmallVector<Value *, 8> Args;
- for (auto *CI : MergableCIs) {
- Value *Callee = CI->getArgOperand(CallbackCalleeOperand);
- FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
- Args.clear();
- Args.push_back(OutlinedFn->getArg(0));
- Args.push_back(OutlinedFn->getArg(1));
- for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
- ++U)
- Args.push_back(CI->getArgOperand(U));
- CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
- if (CI->getDebugLoc())
- NewCI->setDebugLoc(CI->getDebugLoc());
- // Forward parameter attributes from the callback to the callee.
- for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
- ++U)
- for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
- NewCI->addParamAttr(
- U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
- // Emit an explicit barrier to replace the implicit fork-join barrier.
- if (CI != MergableCIs.back()) {
- // TODO: Remove barrier if the merged parallel region includes the
- // 'nowait' clause.
- OMPInfoCache.OMPBuilder.createBarrier(
- InsertPointTy(NewCI->getParent(),
- NewCI->getNextNode()->getIterator()),
- OMPD_parallel);
- }
- CI->eraseFromParent();
- }
- assert(OutlinedFn != OriginalFn && "Outlining failed");
- CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
- CGUpdater.reanalyzeFunction(*OriginalFn);
- NumOpenMPParallelRegionsMerged += MergableCIs.size();
- return true;
- };
- // Helper function that identifies sequences of __kmpc_fork_call uses in a
- // basic block.
- auto DetectPRsCB = [&](Use &U, Function &F) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- BB2PRMap[CI->getParent()].insert(CI);
- return false;
- };
- BB2PRMap.clear();
- RFI.foreachUse(SCC, DetectPRsCB);
- SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
- // Find mergable parallel regions within a basic block that are
- // safe to merge, that is, any in-between instructions can safely
- // execute in parallel after merging.
- // TODO: support merging across basic-blocks.
- for (auto &It : BB2PRMap) {
- auto &CIs = It.getSecond();
- if (CIs.size() < 2)
- continue;
- BasicBlock *BB = It.getFirst();
- SmallVector<CallInst *, 4> MergableCIs;
- /// Returns true if the instruction is mergable, false otherwise.
- /// A terminator instruction is unmergable by definition since merging
- /// works within a BB. Instructions before the mergable region are
- /// mergable if they are not calls to OpenMP runtime functions that may
- /// set different execution parameters for subsequent parallel regions.
- /// Instructions in-between parallel regions are mergable if they are not
- /// calls to any non-intrinsic function since that may call a non-mergable
- /// OpenMP runtime function.
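- /// For example (schematic): a preceding call to a compiler-generated
- /// num_threads setter makes the following region unmergable, while plain
- /// arithmetic or an intrinsic call in-between two regions keeps them
- /// mergable.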
- auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
- // We do not merge across BBs, hence return false (unmergable) if the
- // instruction is a terminator.
- if (I.isTerminator())
- return false;
- if (!isa<CallInst>(&I))
- return true;
- CallInst *CI = cast<CallInst>(&I);
- if (IsBeforeMergableRegion) {
- Function *CalledFunction = CI->getCalledFunction();
- if (!CalledFunction)
- return false;
- // Return false (unmergable) if the call before the parallel
- // region calls an explicit affinity (proc_bind) or number of
- // threads (num_threads) compiler-generated function. Those settings
- // may be incompatible with following parallel regions.
- // TODO: ICV tracking to detect compatibility.
- for (const auto &RFI : UnmergableCallsInfo) {
- if (CalledFunction == RFI.Declaration)
- return false;
- }
- } else {
- // Return false (unmergable) if there is a call instruction
- // in-between parallel regions when it is not an intrinsic. It
- // may call an unmergable OpenMP runtime function in its callpath.
- // TODO: Keep track of possible OpenMP calls in the callpath.
- if (!isa<IntrinsicInst>(CI))
- return false;
- }
- return true;
- };
- // Find maximal number of parallel region CIs that are safe to merge.
- for (auto It = BB->begin(), End = BB->end(); It != End;) {
- Instruction &I = *It;
- ++It;
- if (CIs.count(&I)) {
- MergableCIs.push_back(cast<CallInst>(&I));
- continue;
- }
- // Continue expanding if the instruction is mergable.
- if (IsMergable(I, MergableCIs.empty()))
- continue;
- // Forward the instruction iterator to skip the next parallel region
- // since there is an unmergable instruction which can affect it.
- for (; It != End; ++It) {
- Instruction &SkipI = *It;
- if (CIs.count(&SkipI)) {
- LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
- << " due to " << I << "\n");
- ++It;
- break;
- }
- }
- // Store mergable regions found.
- if (MergableCIs.size() > 1) {
- MergableCIsVector.push_back(MergableCIs);
- LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
- << " parallel regions in block " << BB->getName()
- << " of function " << BB->getParent()->getName()
- << "\n";);
- }
- MergableCIs.clear();
- }
- if (!MergableCIsVector.empty()) {
- Changed = true;
- for (auto &MergableCIs : MergableCIsVector)
- Merge(MergableCIs, BB);
- MergableCIsVector.clear();
- }
- }
- if (Changed) {
- /// Re-collect uses for fork calls, emitted barrier calls, and
- /// any emitted master/end_master calls.
- OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
- OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
- OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
- OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
- }
- return Changed;
- }
- /// Try to delete parallel regions if possible.
- bool deleteParallelRegions() {
- const unsigned CallbackCalleeOperand = 2;
- OMPInformationCache::RuntimeFunctionInfo &RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
- if (!RFI.Declaration)
- return false;
- bool Changed = false;
- auto DeleteCallCB = [&](Use &U, Function &) {
- CallInst *CI = getCallIfRegularCall(U);
- if (!CI)
- return false;
- auto *Fn = dyn_cast<Function>(
- CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
- if (!Fn)
- return false;
- if (!Fn->onlyReadsMemory())
- return false;
- if (!Fn->hasFnAttribute(Attribute::WillReturn))
- return false;
- LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
- << CI->getCaller()->getName() << "\n");
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Removing parallel region with no side-effects.";
- };
- emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
- CGUpdater.removeCallSite(*CI);
- CI->eraseFromParent();
- Changed = true;
- ++NumOpenMPParallelRegionsDeleted;
- return true;
- };
- RFI.foreachUse(SCC, DeleteCallCB);
- return Changed;
- }
- /// Try to eliminate runtime calls by reusing existing ones.
- bool deduplicateRuntimeCalls() {
- bool Changed = false;
- RuntimeFunction DeduplicableRuntimeCallIDs[] = {
- OMPRTL_omp_get_num_threads,
- OMPRTL_omp_in_parallel,
- OMPRTL_omp_get_cancellation,
- OMPRTL_omp_get_thread_limit,
- OMPRTL_omp_get_supported_active_levels,
- OMPRTL_omp_get_level,
- OMPRTL_omp_get_ancestor_thread_num,
- OMPRTL_omp_get_team_size,
- OMPRTL_omp_get_active_level,
- OMPRTL_omp_in_final,
- OMPRTL_omp_get_proc_bind,
- OMPRTL_omp_get_num_places,
- OMPRTL_omp_get_num_procs,
- OMPRTL_omp_get_place_num,
- OMPRTL_omp_get_partition_num_places,
- OMPRTL_omp_get_partition_place_nums};
- // Global-tid is handled separately.
- SmallSetVector<Value *, 16> GTIdArgs;
- collectGlobalThreadIdArguments(GTIdArgs);
- LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
- << " global thread ID arguments\n");
- for (Function *F : SCC) {
- for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
- Changed |= deduplicateRuntimeCalls(
- *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
- // __kmpc_global_thread_num is special as we can replace it with an
- // argument in enough cases to make it worth trying.
- Value *GTIdArg = nullptr;
- for (Argument &Arg : F->args())
- if (GTIdArgs.count(&Arg)) {
- GTIdArg = &Arg;
- break;
- }
- Changed |= deduplicateRuntimeCalls(
- *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
- }
- return Changed;
- }
- /// Tries to hide the latency of runtime calls that involve host to
- /// device memory transfers by splitting them into their "issue" and "wait"
- /// versions. The "issue" is moved upwards as much as possible. The "wait" is
- /// moved downwards as much as possible. The "issue" starts the memory
- /// transfer asynchronously, returning a handle. The "wait" waits on the
- /// returned handle for the memory transfer to finish.
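- /// For illustration (schematic IR, names are illustrative):
- ///   call void @__tgt_target_data_begin_mapper(...)
- ///   ; code that does not touch the mapped buffers
- /// becomes
- ///   %handle = alloca %struct.__tgt_async_info
- ///   call void @__tgt_target_data_begin_mapper_issue(..., ptr %handle)
- ///   ; code that does not touch the mapped buffers
- ///   call void @__tgt_target_data_begin_mapper_wait(i64 %device_id, ptr %handle)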
- bool hideMemTransfersLatency() {
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
- bool Changed = false;
- auto SplitMemTransfers = [&](Use &U, Function &Decl) {
- auto *RTCall = getCallIfRegularCall(U, &RFI);
- if (!RTCall)
- return false;
- OffloadArray OffloadArrays[3];
- if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
- return false;
- LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
- // TODO: Check if can be moved upwards.
- bool WasSplit = false;
- Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
- if (WaitMovementPoint)
- WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
- Changed |= WasSplit;
- return WasSplit;
- };
- if (OMPInfoCache.runtimeFnsAvailable(
- {OMPRTL___tgt_target_data_begin_mapper_issue,
- OMPRTL___tgt_target_data_begin_mapper_wait}))
- RFI.foreachUse(SCC, SplitMemTransfers);
- return Changed;
- }
- void analysisGlobalization() {
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
- auto CheckGlobalization = [&](Use &U, Function &Decl) {
- if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
- auto Remark = [&](OptimizationRemarkMissed ORM) {
- return ORM
- << "Found thread data sharing on the GPU. "
- << "Expect degraded performance due to data globalization.";
- };
- emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
- }
- return false;
- };
- RFI.foreachUse(SCC, CheckGlobalization);
- }
- /// Maps the values stored in the offload arrays passed as arguments to
- /// \p RuntimeCall into the offload arrays in \p OAs.
- bool getValuesInOffloadArrays(CallInst &RuntimeCall,
- MutableArrayRef<OffloadArray> OAs) {
- assert(OAs.size() == 3 && "Need space for three offload arrays!");
- // A runtime call that involves memory offloading looks something like:
- // call void @__tgt_target_data_begin_mapper(arg0, arg1,
- // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
- // ...)
- // So, the idea is to access the allocas that allocate space for these
- // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
- // Therefore:
- // i8** %offload_baseptrs.
- Value *BasePtrsArg =
- RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
- // i8** %offload_ptrs.
- Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
- // i8** %offload_sizes.
- Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
- // Get values stored in **offload_baseptrs.
- auto *V = getUnderlyingObject(BasePtrsArg);
- if (!isa<AllocaInst>(V))
- return false;
- auto *BasePtrsArray = cast<AllocaInst>(V);
- if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
- return false;
- // Get values stored in **offload_ptrs.
- V = getUnderlyingObject(PtrsArg);
- if (!isa<AllocaInst>(V))
- return false;
- auto *PtrsArray = cast<AllocaInst>(V);
- if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
- return false;
- // Get values stored in **offload_sizes.
- V = getUnderlyingObject(SizesArg);
- // If it's a [constant] global array don't analyze it.
- if (isa<GlobalValue>(V))
- return isa<Constant>(V);
- if (!isa<AllocaInst>(V))
- return false;
- auto *SizesArray = cast<AllocaInst>(V);
- if (!OAs[2].initialize(*SizesArray, RuntimeCall))
- return false;
- return true;
- }
- /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
- /// For now this is a way to test that the function getValuesInOffloadArrays
- /// is working properly.
- /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
- void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
- assert(OAs.size() == 3 && "There are three offload arrays to debug!");
- LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
- std::string ValuesStr;
- raw_string_ostream Printer(ValuesStr);
- std::string Separator = " --- ";
- for (auto *BP : OAs[0].StoredValues) {
- BP->print(Printer);
- Printer << Separator;
- }
- LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
- ValuesStr.clear();
- for (auto *P : OAs[1].StoredValues) {
- P->print(Printer);
- Printer << Separator;
- }
- LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
- ValuesStr.clear();
- for (auto *S : OAs[2].StoredValues) {
- S->print(Printer);
- Printer << Separator;
- }
- LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
- }
- /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
- /// moved. Returns nullptr if the movement is not possible, or not worth it.
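- /// For example (schematic): the "wait" can be pushed past side-effect-free
- /// instructions but must stop at the first instruction that may read or
- /// write memory, since it might touch the mapped buffers.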
- Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
- // FIXME: This traverses only the BasicBlock where RuntimeCall is.
- // Make it traverse the CFG.
- Instruction *CurrentI = &RuntimeCall;
- bool IsWorthIt = false;
- while ((CurrentI = CurrentI->getNextNode())) {
- // TODO: Once we detect the regions to be offloaded we should use the
- // alias analysis manager to check if CurrentI may modify one of
- // the offloaded regions.
- if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
- if (IsWorthIt)
- return CurrentI;
- return nullptr;
- }
- // FIXME: For now if we move it over anything without side effect
- // is worth it.
- IsWorthIt = true;
- }
- // Return end of BasicBlock.
- return RuntimeCall.getParent()->getTerminator();
- }
- /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
- bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
- Instruction &WaitMovementPoint) {
- // Create a stack-allocated handle (__tgt_async_info) at the beginning of
- // the function. It stores information about the async transfer so we can
- // wait on it later.
- auto &IRBuilder = OMPInfoCache.OMPBuilder;
- Function *F = RuntimeCall.getCaller();
- BasicBlock &Entry = F->getEntryBlock();
- IRBuilder.Builder.SetInsertPoint(&Entry,
- Entry.getFirstNonPHIOrDbgOrAlloca());
- Value *Handle = IRBuilder.Builder.CreateAlloca(
- IRBuilder.AsyncInfo, /*ArraySize=*/nullptr, "handle");
- Handle =
- IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr);
- // Add "issue" runtime call declaration:
- // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
- // i8**, i8**, i64*, i64*)
- FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___tgt_target_data_begin_mapper_issue);
- // Change RuntimeCall call site for its asynchronous version.
- SmallVector<Value *, 16> Args;
- for (auto &Arg : RuntimeCall.args())
- Args.push_back(Arg.get());
- Args.push_back(Handle);
- CallInst *IssueCallsite =
- CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
- OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
- RuntimeCall.eraseFromParent();
- // Add "wait" runtime call declaration:
- // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
- FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___tgt_target_data_begin_mapper_wait);
- Value *WaitParams[2] = {
- IssueCallsite->getArgOperand(
- OffloadArray::DeviceIDArgNum), // device_id.
- Handle // handle to wait on.
- };
- CallInst *WaitCallsite = CallInst::Create(
- WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
- OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
- return true;
- }
- static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
- bool GlobalOnly, bool &SingleChoice) {
- if (CurrentIdent == NextIdent)
- return CurrentIdent;
- // TODO: Figure out how to actually combine multiple debug locations. For
- // now we just keep an existing one if there is a single choice.
- if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
- SingleChoice = !CurrentIdent;
- return NextIdent;
- }
- return nullptr;
- }
- /// Return a `struct ident_t*` value that represents the ones used in the
- /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
- /// return a local `struct ident_t*`. For now, if we cannot find a suitable
- /// return value we create one from scratch. We also do not yet combine
- /// information, e.g., the source locations, see combinedIdentStruct.
- Value *
- getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
- Function &F, bool GlobalOnly) {
- bool SingleChoice = true;
- Value *Ident = nullptr;
- auto CombineIdentStruct = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || &F != &Caller)
- return false;
- Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
- /* GlobalOnly */ true, SingleChoice);
- return false;
- };
- RFI.foreachUse(SCC, CombineIdentStruct);
- if (!Ident || !SingleChoice) {
- // The IRBuilder uses the insertion block to get to the module, this is
- // unfortunate but we work around it for now.
- if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
- OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
- &F.getEntryBlock(), F.getEntryBlock().begin()));
- // Create a fallback location if none was found.
- // TODO: Use the debug locations of the calls instead.
- uint32_t SrcLocStrSize;
- Constant *Loc =
- OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
- Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
- }
- return Ident;
- }
- /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
- /// \p ReplVal if given.
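- /// For example (schematic): multiple identical omp_get_level() calls in \p F
- /// are reduced to one; a movable call is hoisted to the function entry (or
- /// right after __kmpc_target_init in a kernel) and the remaining calls are
- /// replaced by its result.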
- bool deduplicateRuntimeCalls(Function &F,
- OMPInformationCache::RuntimeFunctionInfo &RFI,
- Value *ReplVal = nullptr) {
- auto *UV = RFI.getUseVector(F);
- if (!UV || UV->size() + (ReplVal != nullptr) < 2)
- return false;
- LLVM_DEBUG(
- dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
- << (ReplVal ? " with an existing value" : "") << "\n");
- assert((!ReplVal || (isa<Argument>(ReplVal) &&
- cast<Argument>(ReplVal)->getParent() == &F)) &&
- "Unexpected replacement value!");
- // TODO: Use dominance to find a good position instead.
- auto CanBeMoved = [this](CallBase &CB) {
- unsigned NumArgs = CB.arg_size();
- if (NumArgs == 0)
- return true;
- if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
- return false;
- for (unsigned U = 1; U < NumArgs; ++U)
- if (isa<Instruction>(CB.getArgOperand(U)))
- return false;
- return true;
- };
- if (!ReplVal) {
- for (Use *U : *UV)
- if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
- if (!CanBeMoved(*CI))
- continue;
- // If the function is a kernel, dedup will move
- // the runtime call right after the kernel init callsite. Otherwise,
- // it will move it to the beginning of the caller function.
- if (isKernel(F)) {
- auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- auto *KernelInitUV = KernelInitRFI.getUseVector(F);
- if (KernelInitUV->empty())
- continue;
- assert(KernelInitUV->size() == 1 &&
- "Expected a single __kmpc_target_init in kernel\n");
- CallInst *KernelInitCI =
- getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI);
- assert(KernelInitCI &&
- "Expected a call to __kmpc_target_init in kernel\n");
- CI->moveAfter(KernelInitCI);
- } else
- CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
- ReplVal = CI;
- break;
- }
- if (!ReplVal)
- return false;
- }
- // If we use a call as a replacement value we need to make sure the ident is
- // valid at the new location. For now we just pick a global one, either
- // existing and used by one of the calls, or created from scratch.
- if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
- if (!CI->arg_empty() &&
- CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
- Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
- /* GlobalOnly */ true);
- CI->setArgOperand(0, Ident);
- }
- }
- bool Changed = false;
- auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || CI == ReplVal || &F != &Caller)
- return false;
- assert(CI->getCaller() == &F && "Unexpected call!");
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP runtime call "
- << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
- };
- if (CI->getDebugLoc())
- emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
- else
- emitRemark<OptimizationRemark>(&F, "OMP170", Remark);
- CGUpdater.removeCallSite(*CI);
- CI->replaceAllUsesWith(ReplVal);
- CI->eraseFromParent();
- ++NumOpenMPRuntimeCallsDeduplicated;
- Changed = true;
- return true;
- };
- RFI.foreachUse(SCC, ReplaceAndDeleteCB);
- return Changed;
- }
- /// Collect arguments that represent the global thread id in \p GTIdArgs.
- void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
- // TODO: Below we basically perform a fixpoint iteration with a pessimistic
- // initialization. We could define an AbstractAttribute instead and
- // run the Attributor here once it can be run as an SCC pass.
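- // For example (schematic): if %gtid = call i32 @__kmpc_global_thread_num(...)
- // is passed as argument 0 at every call site of a local function @helper,
- // then argument 0 of @helper is itself treated as a global thread ID.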
- // Helper to check the argument \p ArgNo at all call sites of \p F for
- // a GTId.
- auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
- if (!F.hasLocalLinkage())
- return false;
- for (Use &U : F.uses()) {
- if (CallInst *CI = getCallIfRegularCall(U)) {
- Value *ArgOp = CI->getArgOperand(ArgNo);
- if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
- getCallIfRegularCall(
- *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
- continue;
- }
- return false;
- }
- return true;
- };
- // Helper to identify uses of a GTId as GTId arguments.
- auto AddUserArgs = [&](Value &GTId) {
- for (Use &U : GTId.uses())
- if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
- if (CI->isArgOperand(&U))
- if (Function *Callee = CI->getCalledFunction())
- if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
- GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
- };
- // The argument users of __kmpc_global_thread_num calls are GTIds.
- OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
- GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
- if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
- AddUserArgs(*CI);
- return false;
- });
- // Transitively search for more arguments by looking at the users of the
- // ones we know already. During the search the GTIdArgs vector is extended
- // so we cannot cache the size nor can we use a range based for.
- for (unsigned U = 0; U < GTIdArgs.size(); ++U)
- AddUserArgs(*GTIdArgs[U]);
- }
- /// Kernel (=GPU) optimizations and utility functions
- ///
- ///{{
- /// Check if \p F is a kernel, hence entry point for target offloading.
- bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
- /// Cache to remember the unique kernel for a function.
- DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
- /// Find the unique kernel that will execute \p F, if any.
- Kernel getUniqueKernelFor(Function &F);
- /// Find the unique kernel that will execute \p I, if any.
- Kernel getUniqueKernelFor(Instruction &I) {
- return getUniqueKernelFor(*I.getFunction());
- }
- /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
- /// the cases we can avoid taking the address of a function.
- bool rewriteDeviceCodeStateMachine();
- ///
- ///}}
- /// Emit a remark generically
- ///
- /// This template function can be used to generically emit a remark. The
- /// RemarkKind should be one of the following:
- /// - OptimizationRemark to indicate a successful optimization attempt
- /// - OptimizationRemarkMissed to report a failed optimization attempt
- /// - OptimizationRemarkAnalysis to provide additional information about an
- /// optimization attempt
- ///
- /// The remark is built using a callback function provided by the caller that
- /// takes a RemarkKind as input and returns a RemarkKind.
- template <typename RemarkKind, typename RemarkCallBack>
- void emitRemark(Instruction *I, StringRef RemarkName,
- RemarkCallBack &&RemarkCB) const {
- Function *F = I->getParent()->getParent();
- auto &ORE = OREGetter(F);
- if (RemarkName.startswith("OMP"))
- ORE.emit([&]() {
- return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
- << " [" << RemarkName << "]";
- });
- else
- ORE.emit(
- [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
- }
- /// Emit a remark on a function.
- template <typename RemarkKind, typename RemarkCallBack>
- void emitRemark(Function *F, StringRef RemarkName,
- RemarkCallBack &&RemarkCB) const {
- auto &ORE = OREGetter(F);
- if (RemarkName.startswith("OMP"))
- ORE.emit([&]() {
- return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
- << " [" << RemarkName << "]";
- });
- else
- ORE.emit(
- [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
- }
- /// The underlying module.
- Module &M;
- /// The SCC we are operating on.
- SmallVectorImpl<Function *> &SCC;
- /// Callback to update the call graph, the first argument is a removed call,
- /// the second an optional replacement call.
- CallGraphUpdater &CGUpdater;
- /// Callback to get an OptimizationRemarkEmitter from a Function *
- OptimizationRemarkGetter OREGetter;
- /// OpenMP-specific information cache. Also Used for Attributor runs.
- OMPInformationCache &OMPInfoCache;
- /// Attributor instance.
- Attributor &A;
- /// Helper function to run Attributor on SCC.
- bool runAttributor(bool IsModulePass) {
- if (SCC.empty())
- return false;
- registerAAs(IsModulePass);
- ChangeStatus Changed = A.run();
- LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
- << " functions, result: " << Changed << ".\n");
- return Changed == ChangeStatus::CHANGED;
- }
- void registerFoldRuntimeCall(RuntimeFunction RF);
- /// Populate the Attributor with abstract attribute opportunities in the
- /// functions.
- void registerAAs(bool IsModulePass);
- public:
- /// Callback to register AAs for live functions, including internal functions
- /// marked live during the traversal.
- static void registerAAsForFunction(Attributor &A, const Function &F);
- };
- Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
- if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F))
- return nullptr;
- // Use a scope to keep the lifetime of the CachedKernel short.
- {
- std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
- if (CachedKernel)
- return *CachedKernel;
- // TODO: We should use an AA to create an (optimistic and callback
- // call-aware) call graph. For now we stick to simple patterns that
- // are less powerful, basically the worst fixpoint.
- if (isKernel(F)) {
- CachedKernel = Kernel(&F);
- return *CachedKernel;
- }
- CachedKernel = nullptr;
- if (!F.hasLocalLinkage()) {
- // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "Potentially unknown OpenMP target region caller.";
- };
- emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);
- return nullptr;
- }
- }
- auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
- if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
- // Allow use in equality comparisons.
- if (Cmp->isEquality())
- return getUniqueKernelFor(*Cmp);
- return nullptr;
- }
- if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
- // Allow direct calls.
- if (CB->isCallee(&U))
- return getUniqueKernelFor(*CB);
- OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
- // Allow the use in __kmpc_parallel_51 calls.
- if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
- return getUniqueKernelFor(*CB);
- return nullptr;
- }
- // Disallow every other use.
- return nullptr;
- };
- // TODO: In the future we want to track more than just a unique kernel.
- SmallPtrSet<Kernel, 2> PotentialKernels;
- OMPInformationCache::foreachUse(F, [&](const Use &U) {
- PotentialKernels.insert(GetUniqueKernelForUse(U));
- });
- Kernel K = nullptr;
- if (PotentialKernels.size() == 1)
- K = *PotentialKernels.begin();
- // Cache the result.
- UniqueKernelMap[&F] = K;
- return K;
- }
- bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
- OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
- bool Changed = false;
- if (!KernelParallelRFI)
- return Changed;
- // If we have disabled state machine changes, exit
- if (DisableOpenMPOptStateMachineRewrite)
- return Changed;
- for (Function *F : SCC) {
- // Check if the function is a use in a __kmpc_parallel_51 call at
- // all.
- bool UnknownUse = false;
- bool KernelParallelUse = false;
- unsigned NumDirectCalls = 0;
- SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
- OMPInformationCache::foreachUse(*F, [&](Use &U) {
- if (auto *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- ++NumDirectCalls;
- return;
- }
- if (isa<ICmpInst>(U.getUser())) {
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- // Find wrapper functions that represent parallel kernels.
- CallInst *CI =
- OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
- const unsigned int WrapperFunctionArgNo = 6;
- if (!KernelParallelUse && CI &&
- CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
- KernelParallelUse = true;
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- UnknownUse = true;
- });
- // Do not emit a remark if we haven't seen a __kmpc_parallel_51
- // use.
- if (!KernelParallelUse)
- continue;
- // If this ever hits, we should investigate.
- // TODO: Checking the number of uses is not a necessary restriction and
- // should be lifted.
- if (UnknownUse || NumDirectCalls != 1 ||
- ToBeReplacedStateMachineUses.size() > 2) {
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "Parallel region is used in "
- << (UnknownUse ? "unknown" : "unexpected")
- << " ways. Will not attempt to rewrite the state machine.";
- };
- emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
- continue;
- }
- // Even if we have __kmpc_parallel_51 calls, we (for now) give
- // up if the function is not called from a unique kernel.
- Kernel K = getUniqueKernelFor(*F);
- if (!K) {
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "Parallel region is not called from a unique kernel. "
- "Will not attempt to rewrite the state machine.";
- };
- emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
- continue;
- }
- // We now know F is a parallel body function called only from the kernel K.
- // We also identified the state machine uses in which we replace the
- // function pointer by a new global symbol for identification purposes. This
- // ensures only direct calls to the function are left.
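- // For illustration (schematic IR, names are illustrative): the use of
- //   ptr @par_body
- // in the __kmpc_parallel_51 call and in equality comparisons is replaced by
- //   @par_body.ID
- // while direct calls to @par_body remain untouched.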
- Module &M = *F->getParent();
- Type *Int8Ty = Type::getInt8Ty(M.getContext());
- auto *ID = new GlobalVariable(
- M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
- UndefValue::get(Int8Ty), F->getName() + ".ID");
- for (Use *U : ToBeReplacedStateMachineUses)
- U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- ID, U->get()->getType()));
- ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
- Changed = true;
- }
- return Changed;
- }
- /// Abstract Attribute for tracking ICV values.
- struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- void initialize(Attributor &A) override {
- Function *F = getAnchorScope();
- if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
- /// Returns true if value is assumed to be tracked.
- bool isAssumedTracked() const { return getAssumed(); }
- /// Returns true if value is known to be tracked.
- bool isKnownTracked() const { return getAssumed(); }
- /// Create an abstract attribute view for the position \p IRP.
- static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
- /// Return the value with which \p I can be replaced for specific \p ICV.
- virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,
- const Instruction *I,
- Attributor &A) const {
- return std::nullopt;
- }
- /// Return an assumed unique ICV value if a single candidate is found. If
- /// there cannot be one, return nullptr. If it is not clear yet, return
- /// std::nullopt.
- virtual std::optional<Value *>
- getUniqueReplacementValue(InternalControlVar ICV) const = 0;
- // Currently only nthreads is being tracked.
- // This array will only grow over time.
- InternalControlVar TrackableICVs[1] = {ICV_nthreads};
- /// See AbstractAttribute::getName()
- const std::string getName() const override { return "AAICVTracker"; }
- /// See AbstractAttribute::getIdAddr()
- const char *getIdAddr() const override { return &ID; }
- /// This function should return true if the type of the \p AA is AAICVTracker
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
- static const char ID;
- };
- struct AAICVTrackerFunction : public AAICVTracker {
- AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
- // FIXME: come up with better string.
- const std::string getAsStr() const override { return "ICVTrackerFunction"; }
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
- /// We don't manifest anything for this AA.
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- // Map of ICV to their values at specific program point.
- EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVReplacementValuesMap;
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function *F = getAnchorScope();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- for (InternalControlVar ICV : TrackableICVs) {
- auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
- auto &ValuesMap = ICVReplacementValuesMap[ICV];
- auto TrackValues = [&](Use &U, Function &) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
- if (!CI)
- return false;
- // FIXME: Handle setters with more than one argument.
- /// Track new value.
- if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
- HasChanged = ChangeStatus::CHANGED;
- return false;
- };
- auto CallCheck = [&](Instruction &I) {
- std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
- if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
- HasChanged = ChangeStatus::CHANGED;
- return true;
- };
- // Track all changes of an ICV.
- SetterRFI.foreachUse(TrackValues, F);
- bool UsedAssumedInformation = false;
- A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
- UsedAssumedInformation,
- /* CheckBBLivenessOnly */ true);
- /// TODO: Figure out a way to avoid adding an entry in
- /// ICVReplacementValuesMap.
- Instruction *Entry = &F->getEntryBlock().front();
- if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
- ValuesMap.insert(std::make_pair(Entry, nullptr));
- }
- return HasChanged;
- }
- /// Helper to check if \p I is a call and get the value for it if it is
- /// unique.
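- /// For example (schematic, for ICV_nthreads): a call to the tracked setter
- /// omp_set_num_threads(4) yields the value 4, an indirect call yields
- /// nullptr (the ICV may have changed), and a call to the getter yields
- /// std::nullopt (no change).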
- std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
- InternalControlVar &ICV) const {
- const auto *CB = dyn_cast<CallBase>(&I);
- if (!CB || CB->hasFnAttr("no_openmp") ||
- CB->hasFnAttr("no_openmp_routines"))
- return std::nullopt;
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
- auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
- Function *CalledFunction = CB->getCalledFunction();
- // Indirect call, assume ICV changes.
- if (CalledFunction == nullptr)
- return nullptr;
- if (CalledFunction == GetterRFI.Declaration)
- return std::nullopt;
- if (CalledFunction == SetterRFI.Declaration) {
- if (ICVReplacementValuesMap[ICV].count(&I))
- return ICVReplacementValuesMap[ICV].lookup(&I);
- return nullptr;
- }
- // Since we don't know, assume it changes the ICV.
- if (CalledFunction->isDeclaration())
- return nullptr;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
- *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
- if (ICVTrackingAA.isAssumedTracked()) {
- std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
- if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I),
- OMPInfoCache)))
- return URV;
- }
- // If we don't know, assume it changes.
- return nullptr;
- }
- // We don't check unique value for a function, so return std::nullopt.
- std::optional<Value *>
- getUniqueReplacementValue(InternalControlVar ICV) const override {
- return std::nullopt;
- }
- /// Return the value with which \p I can be replaced for specific \p ICV.
- std::optional<Value *> getReplacementValue(InternalControlVar ICV,
- const Instruction *I,
- Attributor &A) const override {
- const auto &ValuesMap = ICVReplacementValuesMap[ICV];
- if (ValuesMap.count(I))
- return ValuesMap.lookup(I);
- SmallVector<const Instruction *, 16> Worklist;
- SmallPtrSet<const Instruction *, 16> Visited;
- Worklist.push_back(I);
- std::optional<Value *> ReplVal;
- while (!Worklist.empty()) {
- const Instruction *CurrInst = Worklist.pop_back_val();
- if (!Visited.insert(CurrInst).second)
- continue;
- const BasicBlock *CurrBB = CurrInst->getParent();
- // Go up and look for all potential setters/calls that might change the
- // ICV.
- while ((CurrInst = CurrInst->getPrevNode())) {
- if (ValuesMap.count(CurrInst)) {
- std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
- // Unknown value, track new.
- if (!ReplVal) {
- ReplVal = NewReplVal;
- break;
- }
- // If we found a new value, we can't know the ICV value anymore.
- if (NewReplVal)
- if (ReplVal != NewReplVal)
- return nullptr;
- break;
- }
- std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
- if (!NewReplVal)
- continue;
- // Unknown value, track new.
- if (!ReplVal) {
- ReplVal = NewReplVal;
- break;
- }
- // We found a new value, we can't know the ICV value anymore.
- if (ReplVal != NewReplVal)
- return nullptr;
- }
- // If we are in the same BB and we have a value, we are done.
- if (CurrBB == I->getParent() && ReplVal)
- return ReplVal;
- // Go through all predecessors and add terminators for analysis.
- for (const BasicBlock *Pred : predecessors(CurrBB))
- if (const Instruction *Terminator = Pred->getTerminator())
- Worklist.push_back(Terminator);
- }
- return ReplVal;
- }
- };
- struct AAICVTrackerFunctionReturned : AAICVTracker {
- AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
- // FIXME: come up with better string.
- const std::string getAsStr() const override {
- return "ICVTrackerFunctionReturned";
- }
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
- /// We don't manifest anything for this AA.
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- // Map of ICV to their values at specific program point.
- EnumeratedArray<std::optional<Value *>, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVReplacementValuesMap;
- /// Return the value with which \p I can be replaced for specific \p ICV.
- std::optional<Value *>
- getUniqueReplacementValue(InternalControlVar ICV) const override {
- return ICVReplacementValuesMap[ICV];
- }
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
- *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!ICVTrackingAA.isAssumedTracked())
- return indicatePessimisticFixpoint();
- for (InternalControlVar ICV : TrackableICVs) {
- std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
- std::optional<Value *> UniqueICVValue;
- auto CheckReturnInst = [&](Instruction &I) {
- std::optional<Value *> NewReplVal =
- ICVTrackingAA.getReplacementValue(ICV, &I, A);
- // If we found a second ICV value there is no unique returned value.
- if (UniqueICVValue && UniqueICVValue != NewReplVal)
- return false;
- UniqueICVValue = NewReplVal;
- return true;
- };
- bool UsedAssumedInformation = false;
- if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
- UsedAssumedInformation,
- /* CheckBBLivenessOnly */ true))
- UniqueICVValue = nullptr;
- if (UniqueICVValue == ReplVal)
- continue;
- ReplVal = UniqueICVValue;
- Changed = ChangeStatus::CHANGED;
- }
- return Changed;
- }
- };
- struct AAICVTrackerCallSite : AAICVTracker {
- AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
- void initialize(Attributor &A) override {
- Function *F = getAnchorScope();
- if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- // We only initialize this AA for getters, so we need to know which ICV it
- // gets.
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- for (InternalControlVar ICV : TrackableICVs) {
- auto ICVInfo = OMPInfoCache.ICVs[ICV];
- auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
- if (Getter.Declaration == getAssociatedFunction()) {
- AssociatedICV = ICVInfo.Kind;
- return;
- }
- }
- /// Unknown ICV.
- indicatePessimisticFixpoint();
- }
- ChangeStatus manifest(Attributor &A) override {
- if (!ReplVal || !*ReplVal)
- return ChangeStatus::UNCHANGED;
- A.changeAfterManifest(IRPosition::inst(*getCtxI()), **ReplVal);
- A.deleteAfterManifest(*getCtxI());
- return ChangeStatus::CHANGED;
- }
- // FIXME: come up with better string.
- const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
- InternalControlVar AssociatedICV;
- std::optional<Value *> ReplVal;
- ChangeStatus updateImpl(Attributor &A) override {
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
- *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- // We don't have any information, so we assume it changes the ICV.
- if (!ICVTrackingAA.isAssumedTracked())
- return indicatePessimisticFixpoint();
- std::optional<Value *> NewReplVal =
- ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
- if (ReplVal == NewReplVal)
- return ChangeStatus::UNCHANGED;
- ReplVal = NewReplVal;
- return ChangeStatus::CHANGED;
- }
- // Return the value with which the associated value can be replaced for a
- // specific \p ICV.
- std::optional<Value *>
- getUniqueReplacementValue(InternalControlVar ICV) const override {
- return ReplVal;
- }
- };
- struct AAICVTrackerCallSiteReturned : AAICVTracker {
- AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
- // FIXME: come up with better string.
- const std::string getAsStr() const override {
- return "ICVTrackerCallSiteReturned";
- }
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
- /// We don't manifest anything for this AA.
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- // Map of ICV to their values at specific program point.
- EnumeratedArray<std::optional<Value *>, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVReplacementValuesMap;
- /// Return the value with which the associated value can be replaced for a
- /// specific \p ICV.
- std::optional<Value *>
- getUniqueReplacementValue(InternalControlVar ICV) const override {
- return ICVReplacementValuesMap[ICV];
- }
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
- *this, IRPosition::returned(*getAssociatedFunction()),
- DepClassTy::REQUIRED);
- // We don't have any information, so we assume it changes the ICV.
- if (!ICVTrackingAA.isAssumedTracked())
- return indicatePessimisticFixpoint();
- for (InternalControlVar ICV : TrackableICVs) {
- std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
- std::optional<Value *> NewReplVal =
- ICVTrackingAA.getUniqueReplacementValue(ICV);
- if (ReplVal == NewReplVal)
- continue;
- ReplVal = NewReplVal;
- Changed = ChangeStatus::CHANGED;
- }
- return Changed;
- }
- };
- struct AAExecutionDomainFunction : public AAExecutionDomain {
- AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
- : AAExecutionDomain(IRP, A) {}
- ~AAExecutionDomainFunction() {
- delete RPOT;
- }
- void initialize(Attributor &A) override {
- if (getAnchorScope()->isDeclaration()) {
- indicatePessimisticFixpoint();
- return;
- }
- RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope());
- }
- const std::string getAsStr() const override {
- unsigned TotalBlocks = 0, InitialThreadBlocks = 0;
- for (auto &It : BEDMap) {
- TotalBlocks++;
- InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
- }
- return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
- std::to_string(TotalBlocks) + " executed by initial thread only";
- }
- /// See AbstractAttribute::trackStatistics().
- void trackStatistics() const override {}
- ChangeStatus manifest(Attributor &A) override {
- LLVM_DEBUG({
- for (const BasicBlock &BB : *getAnchorScope()) {
- if (!isExecutedByInitialThreadOnly(BB))
- continue;
- dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
- << BB.getName() << " is executed by a single thread.\n";
- }
- });
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (DisableOpenMPOptBarrierElimination)
- return Changed;
- SmallPtrSet<CallBase *, 16> DeletedBarriers;
- auto HandleAlignedBarrier = [&](CallBase *CB) {
- const ExecutionDomainTy &ED = CEDMap[CB];
- if (!ED.IsReachedFromAlignedBarrierOnly ||
- ED.EncounteredNonLocalSideEffect)
- return;
- // We can remove this barrier, if it is one, or all aligned barriers
- // reaching the kernel end. In the latter case we can transitively work
- // our way back until we find a barrier that guards a side-effect if we
- // are dealing with the kernel end here.
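- // For example (schematic): if the kernel end is reached only through
- // barriers B2 and B3 and both were already deleted, the barriers that
- // immediately precede B2 and B3 become candidates as well, transitively.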
- if (CB) {
- DeletedBarriers.insert(CB);
- A.deleteAfterManifest(*CB);
- ++NumBarriersEliminated;
- Changed = ChangeStatus::CHANGED;
- } else if (!ED.AlignedBarriers.empty()) {
- NumBarriersEliminated += ED.AlignedBarriers.size();
- Changed = ChangeStatus::CHANGED;
- SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),
- ED.AlignedBarriers.end());
- SmallSetVector<CallBase *, 16> Visited;
- while (!Worklist.empty()) {
- CallBase *LastCB = Worklist.pop_back_val();
- if (!Visited.insert(LastCB))
- continue;
- if (!DeletedBarriers.count(LastCB)) {
- A.deleteAfterManifest(*LastCB);
- continue;
- }
- // The final aligned barrier (LastCB) reaching the kernel end was
- // removed already. This means we can go one step further and remove
- // the barriers encountered last before (LastCB).
- const ExecutionDomainTy &LastED = CEDMap[LastCB];
- Worklist.append(LastED.AlignedBarriers.begin(),
- LastED.AlignedBarriers.end());
- }
- }
- // If we actually eliminated a barrier we need to eliminate the associated
- // llvm.assumes as well to avoid creating UB.
- if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
- for (auto *AssumeCB : ED.EncounteredAssumes)
- A.deleteAfterManifest(*AssumeCB);
- };
- for (auto *CB : AlignedBarriers)
- HandleAlignedBarrier(CB);
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- // Handle the "kernel end barrier" for kernels too.
- if (OMPInfoCache.Kernels.count(getAnchorScope()))
- HandleAlignedBarrier(nullptr);
- return Changed;
- }
- /// Merge barrier and assumption information from \p PredED into the successor
- /// \p ED.
- void
- mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
- const ExecutionDomainTy &PredED);
- /// Merge all information from \p PredED into the successor \p ED. If
- /// \p InitialEdgeOnly is set, only the initial edge will enter the block
- /// represented by \p ED from this predecessor.
- void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
- const ExecutionDomainTy &PredED,
- bool InitialEdgeOnly = false);
- /// Accumulate information for the entry block in \p EntryBBED.
- void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED);
- /// See AbstractAttribute::updateImpl.
- ChangeStatus updateImpl(Attributor &A) override;
- /// Query interface, see AAExecutionDomain
- ///{
- bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
- if (!isValidState())
- return false;
- return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
- }
- bool isExecutedInAlignedRegion(Attributor &A,
- const Instruction &I) const override {
- assert(I.getFunction() == getAnchorScope() &&
- "Instruction is out of scope!");
- if (!isValidState())
- return false;
- const Instruction *CurI;
- // Check forward until a call or the block end is reached.
- CurI = &I;
- do {
- auto *CB = dyn_cast<CallBase>(CurI);
- if (!CB)
- continue;
- if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) {
- break;
- }
- const auto &It = CEDMap.find(CB);
- if (It == CEDMap.end())
- continue;
- if (!It->getSecond().IsReachingAlignedBarrierOnly)
- return false;
- break;
- } while ((CurI = CurI->getNextNonDebugInstruction()));
- if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
- return false;
- // Check backward until a call or the block beginning is reached.
- CurI = &I;
- do {
- auto *CB = dyn_cast<CallBase>(CurI);
- if (!CB)
- continue;
- if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) {
- break;
- }
- const auto &It = CEDMap.find(CB);
- if (It == CEDMap.end())
- continue;
- if (!AA::isNoSyncInst(A, *CB, *this)) {
- if (It->getSecond().IsReachedFromAlignedBarrierOnly) {
- break;
- }
- return false;
- }
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- return false;
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
- if (!EDAA.getState().isValidState())
- return false;
- if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly)
- return false;
- break;
- } while ((CurI = CurI->getPrevNonDebugInstruction()));
- if (!CurI &&
- !llvm::all_of(
- predecessors(I.getParent()), [&](const BasicBlock *PredBB) {
- return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
- })) {
- return false;
- }
- // On neither traversal did we find anything but aligned barriers.
- return true;
- }
- ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
- assert(isValidState() &&
- "No request should be made against an invalid state!");
- return BEDMap.lookup(&BB);
- }
- ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override {
- assert(isValidState() &&
- "No request should be made against an invalid state!");
- return CEDMap.lookup(&CB);
- }
- ExecutionDomainTy getFunctionExecutionDomain() const override {
- assert(isValidState() &&
- "No request should be made against an invalid state!");
- return BEDMap.lookup(nullptr);
- }
- ///}
- // Check if the edge into the successor block contains a condition that only
- // lets the main thread execute it.
- static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
- BasicBlock &SuccessorBB) {
- if (!Edge || !Edge->isConditional())
- return false;
- if (Edge->getSuccessor(0) != &SuccessorBB)
- return false;
- auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
- if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
- return false;
- ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
- if (!C)
- return false;
- // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
- if (C->isAllOnesValue()) {
- auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
- if (!CB)
- return false;
- const int InitModeArgNo = 1;
- auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
- return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
- }
- if (C->isZero()) {
- // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
- if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
- if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
- return true;
- // Match: 0 == llvm.amdgcn.workitem.id.x()
- if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
- if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
- return true;
- }
- return false;
- };
- /// Mapping containing information per block.
- DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
- DenseMap<const CallBase *, ExecutionDomainTy> CEDMap;
- SmallSetVector<CallBase *, 16> AlignedBarriers;
- ReversePostOrderTraversal<Function *> *RPOT = nullptr;
- };
- void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
- Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
- for (auto *EA : PredED.EncounteredAssumes)
- ED.addAssumeInst(A, *EA);
- for (auto *AB : PredED.AlignedBarriers)
- ED.addAlignedBarrier(A, *AB);
- }
- void AAExecutionDomainFunction::mergeInPredecessor(
- Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
- bool InitialEdgeOnly) {
- ED.IsExecutedByInitialThreadOnly =
- InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
- ED.IsExecutedByInitialThreadOnly);
- ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly &&
- PredED.IsReachedFromAlignedBarrierOnly;
- ED.EncounteredNonLocalSideEffect =
- ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect;
- if (ED.IsReachedFromAlignedBarrierOnly)
- mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
- else
- ED.clearAssumeInstAndAlignedBarriers();
- }
- void AAExecutionDomainFunction::handleEntryBB(Attributor &A,
- ExecutionDomainTy &EntryBBED) {
- SmallVector<ExecutionDomainTy> PredExecDomains;
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
- DepClassTy::OPTIONAL);
- if (!EDAA.getState().isValidState())
- return false;
- PredExecDomains.emplace_back(
- EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
- return true;
- };
- bool AllCallSitesKnown;
- if (A.checkForAllCallSites(PredForCallSite, *this,
- /* RequiresAllCallSites */ true,
- AllCallSitesKnown)) {
- for (const auto &PredED : PredExecDomains)
- mergeInPredecessor(A, EntryBBED, PredED);
- } else {
- // We could not find all predecessors, so this is either a kernel or a
- // function with external linkage (or with some other weird uses).
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- if (OMPInfoCache.Kernels.count(getAnchorScope())) {
- EntryBBED.IsExecutedByInitialThreadOnly = false;
- EntryBBED.IsReachedFromAlignedBarrierOnly = true;
- EntryBBED.EncounteredNonLocalSideEffect = false;
- } else {
- EntryBBED.IsExecutedByInitialThreadOnly = false;
- EntryBBED.IsReachedFromAlignedBarrierOnly = false;
- EntryBBED.EncounteredNonLocalSideEffect = true;
- }
- }
- auto &FnED = BEDMap[nullptr];
- FnED.IsReachingAlignedBarrierOnly &=
- EntryBBED.IsReachedFromAlignedBarrierOnly;
- }
- ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
- bool Changed = false;
- // Helper to deal with an aligned barrier encountered during the forward
- // traversal. \p CB is the aligned barrier, \p ED is the execution domain when
- // it was encountered.
- auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) {
- if (CB)
- Changed |= AlignedBarriers.insert(CB);
- // First, update the barrier ED kept in the separate CEDMap.
- auto &CallED = CEDMap[CB];
- mergeInPredecessor(A, CallED, ED);
- // Next adjust the ED we use for the traversal.
- ED.EncounteredNonLocalSideEffect = false;
- ED.IsReachedFromAlignedBarrierOnly = true;
- // Aligned barrier collection has to come last.
- ED.clearAssumeInstAndAlignedBarriers();
- if (CB)
- ED.addAlignedBarrier(A, *CB);
- };
- auto &LivenessAA =
- A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);
- // Set \p R to \p V and report true if that changed \p R.
- auto SetAndRecord = [&](bool &R, bool V) {
- bool Eq = (R == V);
- R = V;
- return !Eq;
- };
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- Function *F = getAnchorScope();
- BasicBlock &EntryBB = F->getEntryBlock();
- bool IsKernel = OMPInfoCache.Kernels.count(F);
- SmallVector<Instruction *> SyncInstWorklist;
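- // Forward pass over all blocks in reverse post order; this way most
- // predecessors have been processed before a block is visited.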
- for (auto &RIt : *RPOT) {
- BasicBlock &BB = *RIt;
- bool IsEntryBB = &BB == &EntryBB;
- // TODO: We use local reasoning since no divergence analysis is available.
- // With one, we could allow uniform branches here.
- bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
- ExecutionDomainTy ED;
- // Propagate "incoming edges" into information about this block.
- if (IsEntryBB) {
- handleEntryBB(A, ED);
- } else {
- // For live non-entry blocks we only propagate
- // information via live edges.
- if (LivenessAA.isAssumedDead(&BB))
- continue;
- for (auto *PredBB : predecessors(&BB)) {
- if (LivenessAA.isEdgeDead(PredBB, &BB))
- continue;
- bool InitialEdgeOnly = isInitialThreadOnlyEdge(
- A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
- mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
- }
- }
- // Now we traverse the block, accumulate effects in ED and attach
- // information to calls.
- for (Instruction &I : BB) {
- bool UsedAssumedInformation;
- if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation,
- /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,
- /* CheckForDeadStore */ true))
- continue;
- // Assumes and "assume-like" intrinsics (dbg, lifetime, ...) are handled
- // first; the former are collected, the latter are ignored.
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) {
- ED.addAssumeInst(A, *AI);
- continue;
- }
- // TODO: Should we also collect and delete lifetime markers?
- if (II->isAssumeLikeIntrinsic())
- continue;
- }
- auto *CB = dyn_cast<CallBase>(&I);
- bool IsNoSync = AA::isNoSyncInst(A, I, *this);
- bool IsAlignedBarrier =
- !IsNoSync && CB &&
- AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);
- AlignedBarrierLastInBlock &= IsNoSync;
- // Next we check for calls. Aligned barriers are handled explicitly;
- // everything else is kept for the backward traversal and will also
- // affect our state.
- if (CB) {
- if (IsAlignedBarrier) {
- HandleAlignedBarrier(CB, ED);
- AlignedBarrierLastInBlock = true;
- continue;
- }
- // Check the pointer(s) of a memory intrinsic explicitly.
- if (isa<MemIntrinsic>(&I)) {
- if (!ED.EncounteredNonLocalSideEffect &&
- AA::isPotentiallyAffectedByBarrier(A, I, *this))
- ED.EncounteredNonLocalSideEffect = true;
- if (!IsNoSync) {
- ED.IsReachedFromAlignedBarrierOnly = false;
- SyncInstWorklist.push_back(&I);
- }
- continue;
- }
- // Record how we entered the call, then accumulate the effect of the
- // call in ED for potential use by the callee.
- auto &CallED = CEDMap[CB];
- mergeInPredecessor(A, CallED, ED);
- // If we have a sync-definition we can check if it starts/ends in an
- // aligned barrier. If we are unsure we assume any sync breaks
- // alignment.
- Function *Callee = CB->getCalledFunction();
- if (!IsNoSync && Callee && !Callee->isDeclaration()) {
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
- if (EDAA.getState().isValidState()) {
- const auto &CalleeED = EDAA.getFunctionExecutionDomain();
- ED.IsReachedFromAlignedBarrierOnly =
- CallED.IsReachedFromAlignedBarrierOnly =
- CalleeED.IsReachedFromAlignedBarrierOnly;
- AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
- if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
- ED.EncounteredNonLocalSideEffect |=
- CalleeED.EncounteredNonLocalSideEffect;
- else
- ED.EncounteredNonLocalSideEffect =
- CalleeED.EncounteredNonLocalSideEffect;
- if (!CalleeED.IsReachingAlignedBarrierOnly)
- SyncInstWorklist.push_back(&I);
- if (CalleeED.IsReachedFromAlignedBarrierOnly)
- mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
- continue;
- }
- }
- if (!IsNoSync)
- ED.IsReachedFromAlignedBarrierOnly =
- CallED.IsReachedFromAlignedBarrierOnly = false;
- AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
- ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory();
- if (!IsNoSync)
- SyncInstWorklist.push_back(&I);
- }
- if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
- continue;
- // If we have a callee we try to use fine-grained information to
- // determine local side-effects.
- if (CB) {
- const auto &MemAA = A.getAAFor<AAMemoryLocation>(
- *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL);
- auto AccessPred = [&](const Instruction *I, const Value *Ptr,
- AAMemoryLocation::AccessKind,
- AAMemoryLocation::MemoryLocationsKind) {
- return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I);
- };
- if (MemAA.getState().isValidState() &&
- MemAA.checkForAllAccessesToMemoryKind(
- AccessPred, AAMemoryLocation::ALL_LOCATIONS))
- continue;
- }
- if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I))
- continue;
- if (auto *LI = dyn_cast<LoadInst>(&I))
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- continue;
- if (!ED.EncounteredNonLocalSideEffect &&
- AA::isPotentiallyAffectedByBarrier(A, I, *this))
- ED.EncounteredNonLocalSideEffect = true;
- }
- if (!isa<UnreachableInst>(BB.getTerminator()) &&
- !BB.getTerminator()->getNumSuccessors()) {
- auto &FnED = BEDMap[nullptr];
- mergeInPredecessor(A, FnED, ED);
- if (IsKernel)
- HandleAlignedBarrier(nullptr, ED);
- }
- ExecutionDomainTy &StoredED = BEDMap[&BB];
- ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly;
- // Check if we computed anything different as part of the forward
- // traversal. We do not take assumptions and aligned barriers into account
- // as they do not influence the state we iterate. Backward traversal values
- // are handled later on.
- if (ED.IsExecutedByInitialThreadOnly !=
- StoredED.IsExecutedByInitialThreadOnly ||
- ED.IsReachedFromAlignedBarrierOnly !=
- StoredED.IsReachedFromAlignedBarrierOnly ||
- ED.EncounteredNonLocalSideEffect !=
- StoredED.EncounteredNonLocalSideEffect)
- Changed = true;
- // Update the state with the new value.
- StoredED = std::move(ED);
- }
- // Propagate (non-aligned) sync instruction effects backwards until the
- // entry is hit or an aligned barrier is found.
- SmallSetVector<BasicBlock *, 16> Visited;
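- // Each sync instruction invalidates the "is reaching aligned barrier only"
- // property for every call and block that can reach it backwards without
- // crossing an aligned barrier first.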
- while (!SyncInstWorklist.empty()) {
- Instruction *SyncInst = SyncInstWorklist.pop_back_val();
- Instruction *CurInst = SyncInst;
- bool HitAlignedBarrier = false;
- while ((CurInst = CurInst->getPrevNode())) {
- auto *CB = dyn_cast<CallBase>(CurInst);
- if (!CB)
- continue;
- auto &CallED = CEDMap[CB];
- if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false))
- Changed = true;
- HitAlignedBarrier = AlignedBarriers.count(CB);
- if (HitAlignedBarrier)
- break;
- }
- if (HitAlignedBarrier)
- continue;
- BasicBlock *SyncBB = SyncInst->getParent();
- for (auto *PredBB : predecessors(SyncBB)) {
- if (LivenessAA.isEdgeDead(PredBB, SyncBB))
- continue;
- if (!Visited.insert(PredBB))
- continue;
- SyncInstWorklist.push_back(PredBB->getTerminator());
- auto &PredED = BEDMap[PredBB];
- if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false))
- Changed = true;
- }
- if (SyncBB != &EntryBB)
- continue;
- auto &FnED = BEDMap[nullptr];
- if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false))
- Changed = true;
- }
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
- /// Try to replace memory allocation calls that are executed by a single
- /// thread with a static buffer of shared memory.
- struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- /// Create an abstract attribute view for the position \p IRP.
- static AAHeapToShared &createForPosition(const IRPosition &IRP,
- Attributor &A);
- /// Returns true if HeapToShared conversion is assumed to be possible.
- virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
- /// Returns true if HeapToShared conversion is assumed and the CB is a
- /// callsite to a free operation to be removed.
- virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
- /// See AbstractAttribute::getName().
- const std::string getName() const override { return "AAHeapToShared"; }
- /// See AbstractAttribute::getIdAddr().
- const char *getIdAddr() const override { return &ID; }
- /// This function should return true if the type of the \p AA is
- /// AAHeapToShared.
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
- /// Unique ID (due to the unique address)
- static const char ID;
- };
- struct AAHeapToSharedFunction : public AAHeapToShared {
- AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
- : AAHeapToShared(IRP, A) {}
- const std::string getAsStr() const override {
- return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
- " malloc calls eligible.";
- }
- /// See AbstractAttribute::trackStatistics().
- void trackStatistics() const override {}
- /// This function finds free calls that will be removed by the
- /// HeapToShared transformation.
- void findPotentialRemovedFreeCalls(Attributor &A) {
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
- PotentialRemovedFreeCalls.clear();
- // Collect the free calls that use the found malloc calls.
- for (CallBase *CB : MallocCalls) {
- SmallVector<CallBase *, 4> FreeCalls;
- for (auto *U : CB->users()) {
- CallBase *C = dyn_cast<CallBase>(U);
- if (C && C->getCalledFunction() == FreeRFI.Declaration)
- FreeCalls.push_back(C);
- }
- if (FreeCalls.size() != 1)
- continue;
- PotentialRemovedFreeCalls.insert(FreeCalls.front());
- }
- }
- void initialize(Attributor &A) override {
- if (DisableOpenMPOptDeglobalization) {
- indicatePessimisticFixpoint();
- return;
- }
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
- if (!RFI.Declaration)
- return;
- Attributor::SimplifictionCallbackTy SCB =
- [](const IRPosition &, const AbstractAttribute *,
- bool &) -> std::optional<Value *> { return nullptr; };
- Function *F = getAnchorScope();
- for (User *U : RFI.Declaration->users())
- if (CallBase *CB = dyn_cast<CallBase>(U)) {
- if (CB->getFunction() != F)
- continue;
- MallocCalls.insert(CB);
- A.registerSimplificationCallback(IRPosition::callsite_returned(*CB),
- SCB);
- }
- findPotentialRemovedFreeCalls(A);
- }
- bool isAssumedHeapToShared(CallBase &CB) const override {
- return isValidState() && MallocCalls.count(&CB);
- }
- bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
- return isValidState() && PotentialRemovedFreeCalls.count(&CB);
- }
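- // Sketch of the rewrite performed in manifest below; the IR names are
- // hypothetical:
- //   %p = call i8* @__kmpc_alloc_shared(i64 16)
- //   ...
- //   call void @__kmpc_free_shared(i8* %p, i64 16)
- // becomes
- //   @p_shared = internal addrspace(3) global [16 x i8] undef
- // and all uses of %p are replaced by a pointer cast of @p_shared while
- // both runtime calls are deleted.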
- ChangeStatus manifest(Attributor &A) override {
- if (MallocCalls.empty())
- return ChangeStatus::UNCHANGED;
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
- Function *F = getAnchorScope();
- auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
- DepClassTy::OPTIONAL);
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- for (CallBase *CB : MallocCalls) {
- // Skip replacing this if HeapToStack has already claimed it.
- if (HS && HS->isAssumedHeapToStack(*CB))
- continue;
- // Find the unique free call to remove it.
- SmallVector<CallBase *, 4> FreeCalls;
- for (auto *U : CB->users()) {
- CallBase *C = dyn_cast<CallBase>(U);
- if (C && C->getCalledFunction() == FreeCall.Declaration)
- FreeCalls.push_back(C);
- }
- if (FreeCalls.size() != 1)
- continue;
- auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
- if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
- LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
- << " with shared memory."
- << " Shared memory usage is limited to "
- << SharedMemoryLimit << " bytes\n");
- continue;
- }
- LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
- << " with " << AllocSize->getZExtValue()
- << " bytes of shared memory\n");
- // Create a new shared memory buffer of the same size as the allocation
- // and replace all the uses of the original allocation with it.
- Module *M = CB->getModule();
- Type *Int8Ty = Type::getInt8Ty(M->getContext());
- Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
- auto *SharedMem = new GlobalVariable(
- *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
- UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,
- GlobalValue::NotThreadLocal,
- static_cast<unsigned>(AddressSpace::Shared));
- auto *NewBuffer =
- ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Replaced globalized variable with "
- << ore::NV("SharedMemory", AllocSize->getZExtValue())
- << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
- << "of shared memory.";
- };
- A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
- MaybeAlign Alignment = CB->getRetAlign();
- assert(Alignment &&
- "HeapToShared on allocation without alignment attribute");
- SharedMem->setAlignment(MaybeAlign(Alignment));
- A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer);
- A.deleteAfterManifest(*CB);
- A.deleteAfterManifest(*FreeCalls.front());
- SharedMemoryUsed += AllocSize->getZExtValue();
- NumBytesMovedToSharedMemory = SharedMemoryUsed;
- Changed = ChangeStatus::CHANGED;
- }
- return Changed;
- }
- ChangeStatus updateImpl(Attributor &A) override {
- if (MallocCalls.empty())
- return indicatePessimisticFixpoint();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
- if (!RFI.Declaration)
- return ChangeStatus::UNCHANGED;
- Function *F = getAnchorScope();
- auto NumMallocCalls = MallocCalls.size();
- // Only consider malloc calls executed by a single thread and with a
- // constant allocation size.
- for (User *U : RFI.Declaration->users()) {
- if (CallBase *CB = dyn_cast<CallBase>(U)) {
- if (CB->getCaller() != F)
- continue;
- if (!MallocCalls.count(CB))
- continue;
- if (!isa<ConstantInt>(CB->getArgOperand(0))) {
- MallocCalls.remove(CB);
- continue;
- }
- const auto &ED = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*F), DepClassTy::REQUIRED);
- if (!ED.isExecutedByInitialThreadOnly(*CB))
- MallocCalls.remove(CB);
- }
- }
- findPotentialRemovedFreeCalls(A);
- if (NumMallocCalls != MallocCalls.size())
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
- /// Collection of all malloc calls in a function.
- SmallSetVector<CallBase *, 4> MallocCalls;
- /// Collection of potentially removed free calls in a function.
- SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
- /// The total amount of shared memory that has been used for HeapToShared.
- unsigned SharedMemoryUsed = 0;
- };
- struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
- using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
- AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- /// Statistics are tracked as part of manifest for now.
- void trackStatistics() const override {}
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- if (!isValidState())
- return "<invalid>";
- return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
- : "generic") +
- std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
- : "") +
- std::string(" #PRs: ") +
- (ReachedKnownParallelRegions.isValidState()
- ? std::to_string(ReachedKnownParallelRegions.size())
- : "<invalid>") +
- ", #Unknown PRs: " +
- (ReachedUnknownParallelRegions.isValidState()
- ? std::to_string(ReachedUnknownParallelRegions.size())
- : "<invalid>") +
- ", #Reaching Kernels: " +
- (ReachingKernelEntries.isValidState()
- ? std::to_string(ReachingKernelEntries.size())
- : "<invalid>") +
- ", #ParLevels: " +
- (ParallelLevels.isValidState()
- ? std::to_string(ParallelLevels.size())
- : "<invalid>");
- }
- /// Create an abstract attribute view for the position \p IRP.
- static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
- /// See AbstractAttribute::getName()
- const std::string getName() const override { return "AAKernelInfo"; }
- /// See AbstractAttribute::getIdAddr()
- const char *getIdAddr() const override { return &ID; }
- /// This function should return true if the type of the \p AA is AAKernelInfo
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
- static const char ID;
- };
- /// The function kernel info abstract attribute; basically, what can we say
- /// about a function with regard to the KernelInfoState.
- struct AAKernelInfoFunction : AAKernelInfo {
- AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
- : AAKernelInfo(IRP, A) {}
- SmallPtrSet<Instruction *, 4> GuardedInstructions;
- SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
- return GuardedInstructions;
- }
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // This is a high-level transform that might change the constant arguments
- // of the init and deinit calls. We need to tell the Attributor about this
- // to avoid other parts using the current constant value for simplification.
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- Function *Fn = getAnchorScope();
- OMPInformationCache::RuntimeFunctionInfo &InitRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
- // For kernels we perform more initialization work; first we find the init
- // and deinit calls.
- auto StoreCallBase = [](Use &U,
- OMPInformationCache::RuntimeFunctionInfo &RFI,
- CallBase *&Storage) {
- CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
- assert(CB &&
- "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
- assert(!Storage &&
- "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
- Storage = CB;
- return false;
- };
- InitRFI.foreachUse(
- [&](Use &U, Function &) {
- StoreCallBase(U, InitRFI, KernelInitCB);
- return false;
- },
- Fn);
- DeinitRFI.foreachUse(
- [&](Use &U, Function &) {
- StoreCallBase(U, DeinitRFI, KernelDeinitCB);
- return false;
- },
- Fn);
- // Ignore kernels without initializers such as global constructors.
- if (!KernelInitCB || !KernelDeinitCB)
- return;
- // Add this kernel to its own reaching-kernel set and set IsKernelEntry.
- ReachingKernelEntries.insert(Fn);
- IsKernelEntry = true;
- // For kernels we might need to initialize/finalize the IsSPMD state and
- // we need to register a simplification callback so that the Attributor
- // knows the constant arguments to __kmpc_target_init and
- // __kmpc_target_deinit might actually change.
- Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
- [&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> std::optional<Value *> {
- // IRP represents the "use generic state machine" argument of an
- // __kmpc_target_init call. We will answer this one with the internal
- // state. As long as we are not in an invalid state, we will create a
- // custom state machine so the value should be an `i1 false`. If we are
- // in an invalid state, we won't change the value that is in the IR.
- if (!ReachedKnownParallelRegions.isValidState())
- return nullptr;
- // If we have disabled state machine rewrites, don't make a custom one.
- if (DisableOpenMPOptStateMachineRewrite)
- return nullptr;
- if (AA)
- A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
- UsedAssumedInformation = !isAtFixpoint();
- auto *FalseVal =
- ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
- return FalseVal;
- };
- Attributor::SimplifictionCallbackTy ModeSimplifyCB =
- [&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> std::optional<Value *> {
- // IRP represents the "SPMDCompatibilityTracker" argument of an
- // __kmpc_target_init or __kmpc_target_deinit call. We will answer this
- // one with the internal state.
- if (!SPMDCompatibilityTracker.isValidState())
- return nullptr;
- if (!SPMDCompatibilityTracker.isAtFixpoint()) {
- if (AA)
- A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
- UsedAssumedInformation = true;
- } else {
- UsedAssumedInformation = false;
- }
- auto *Val = ConstantInt::getSigned(
- IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
- SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
- : OMP_TGT_EXEC_MODE_GENERIC);
- return Val;
- };
- constexpr const int InitModeArgNo = 1;
- constexpr const int DeinitModeArgNo = 1;
- constexpr const int InitUseStateMachineArgNo = 2;
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
- StateMachineSimplifyCB);
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
- ModeSimplifyCB);
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
- ModeSimplifyCB);
- // Check if we know we are in SPMD-mode already.
- ConstantInt *ModeArg =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
- if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
- SPMDCompatibilityTracker.indicateOptimisticFixpoint();
- // This is a generic region but SPMDization is disabled so stop tracking.
- else if (DisableOpenMPOptSPMDization)
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- // Register virtual uses of functions we might need to preserve.
- auto RegisterVirtualUse = [&](RuntimeFunction RFKind,
- Attributor::VirtualUseCallbackTy &CB) {
- if (!OMPInfoCache.RFIs[RFKind].Declaration)
- return;
- A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
- };
- // Add a dependence to ensure updates if the state changes.
- auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
- const AbstractAttribute *QueryingAA) {
- if (QueryingAA) {
- A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
- }
- return true;
- };
- Attributor::VirtualUseCallbackTy CustomStateMachineUseCB =
- [&](Attributor &A, const AbstractAttribute *QueryingAA) {
- // Whenever we create a custom state machine we will insert calls to
- // __kmpc_get_hardware_num_threads_in_block,
- // __kmpc_get_warp_size,
- // __kmpc_barrier_simple_generic,
- // __kmpc_kernel_parallel, and
- // __kmpc_kernel_end_parallel.
- // Not needed if we are on track for SPMDzation.
- if (SPMDCompatibilityTracker.isValidState())
- return AddDependence(A, this, QueryingAA);
- // Not needed if we can't rewrite due to an invalid state.
- if (!ReachedKnownParallelRegions.isValidState())
- return AddDependence(A, this, QueryingAA);
- return false;
- };
- // Not needed before the device runtime has been merged in, i.e., while
- // __kmpc_target_init is still only a declaration.
- if (!KernelInitCB->getCalledFunction()->isDeclaration()) {
- RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
- CustomStateMachineUseCB);
- RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
- RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
- CustomStateMachineUseCB);
- RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
- CustomStateMachineUseCB);
- RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
- CustomStateMachineUseCB);
- }
- // If we do not perform SPMDzation we do not need the virtual uses below.
- if (SPMDCompatibilityTracker.isAtFixpoint())
- return;
- Attributor::VirtualUseCallbackTy HWThreadIdUseCB =
- [&](Attributor &A, const AbstractAttribute *QueryingAA) {
- // Whenever we perform SPMDzation we will insert
- // __kmpc_get_hardware_thread_id_in_block calls.
- if (!SPMDCompatibilityTracker.isValidState())
- return AddDependence(A, this, QueryingAA);
- return false;
- };
- RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
- HWThreadIdUseCB);
- Attributor::VirtualUseCallbackTy SPMDBarrierUseCB =
- [&](Attributor &A, const AbstractAttribute *QueryingAA) {
- // Whenever we perform SPMDzation with guarding we will insert
- // __kmpc_simple_barrier_spmd calls. If SPMDzation failed, there is
- // nothing to guard, or there are no parallel regions, we don't need
- // the calls.
- if (!SPMDCompatibilityTracker.isValidState())
- return AddDependence(A, this, QueryingAA);
- if (SPMDCompatibilityTracker.empty())
- return AddDependence(A, this, QueryingAA);
- if (!mayContainParallelRegion())
- return AddDependence(A, this, QueryingAA);
- return false;
- };
- RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
- }
- /// Sanitize the string \p S such that it is a suitable global symbol name.
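- /// For example, "a<b>::c" becomes "a.b...c".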
- static std::string sanitizeForGlobalName(std::string S) {
- std::replace_if(
- S.begin(), S.end(),
- [](const char C) {
- return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
- (C >= '0' && C <= '9') || C == '_');
- },
- '.');
- return S;
- }
- /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
- /// finished now.
- ChangeStatus manifest(Attributor &A) override {
- // If we are not looking at a kernel with __kmpc_target_init and
- // __kmpc_target_deinit calls we cannot actually manifest the information.
- if (!KernelInitCB || !KernelDeinitCB)
- return ChangeStatus::UNCHANGED;
- // Insert the nested parallelism global variable.
- Function *Kernel = getAnchorScope();
- Module &M = *Kernel->getParent();
- Type *Int8Ty = Type::getInt8Ty(M.getContext());
- new GlobalVariable(M, Int8Ty, /* isConstant */ true,
- GlobalValue::WeakAnyLinkage,
- ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
- Kernel->getName() + "_nested_parallelism");
- // If we can, we change the execution mode to SPMD mode; otherwise we build
- // a custom state machine.
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (!changeToSPMDMode(A, Changed)) {
- if (!KernelInitCB->getCalledFunction()->isDeclaration())
- return buildCustomStateMachine(A);
- }
- return Changed;
- }
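- /// Guard all SPMD-incompatible instructions so only the main thread
- /// executes them; values escaping a guarded region are broadcast to the
- /// other threads through shared memory and barriers (see the CFG sketch in
- /// CreateGuardedRegion below).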
- void insertInstructionGuardsHelper(Attributor &A) {
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto CreateGuardedRegion = [&](Instruction *RegionStartI,
- Instruction *RegionEndI) {
- LoopInfo *LI = nullptr;
- DominatorTree *DT = nullptr;
- MemorySSAUpdater *MSU = nullptr;
- using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
- BasicBlock *ParentBB = RegionStartI->getParent();
- Function *Fn = ParentBB->getParent();
- Module &M = *Fn->getParent();
- // Create all the blocks and logic.
- // ParentBB:
- // goto RegionCheckTidBB
- // RegionCheckTidBB:
- // Tid = __kmpc_hardware_thread_id()
- // if (Tid != 0)
- // goto RegionBarrierBB
- // RegionStartBB:
- // <execute instructions guarded>
- // goto RegionEndBB
- // RegionEndBB:
- // <store escaping values to shared mem>
- // goto RegionBarrierBB
- // RegionBarrierBB:
- // __kmpc_simple_barrier_spmd()
- // // second barrier is omitted if lacking escaping values.
- // <load escaping values from shared mem>
- // __kmpc_simple_barrier_spmd()
- // goto RegionExitBB
- // RegionExitBB:
- // <execute rest of instructions>
- BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
- DT, LI, MSU, "region.guarded.end");
- BasicBlock *RegionBarrierBB =
- SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
- MSU, "region.barrier");
- BasicBlock *RegionExitBB =
- SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
- DT, LI, MSU, "region.exit");
- BasicBlock *RegionStartBB =
- SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
- assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
- "Expected a different CFG");
- BasicBlock *RegionCheckTidBB = SplitBlock(
- ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
- // Register basic blocks with the Attributor.
- A.registerManifestAddedBasicBlock(*RegionEndBB);
- A.registerManifestAddedBasicBlock(*RegionBarrierBB);
- A.registerManifestAddedBasicBlock(*RegionExitBB);
- A.registerManifestAddedBasicBlock(*RegionStartBB);
- A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
- bool HasBroadcastValues = false;
- // Find escaping outputs from the guarded region to outside users and
- // broadcast their values to them.
- for (Instruction &I : *RegionStartBB) {
- SmallPtrSet<Instruction *, 4> OutsideUsers;
- for (User *Usr : I.users()) {
- Instruction &UsrI = *cast<Instruction>(Usr);
- if (UsrI.getParent() != RegionStartBB)
- OutsideUsers.insert(&UsrI);
- }
- if (OutsideUsers.empty())
- continue;
- HasBroadcastValues = true;
- // Emit a global variable in shared memory to store the broadcasted
- // value.
- auto *SharedMem = new GlobalVariable(
- M, I.getType(), /* IsConstant */ false,
- GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
- sanitizeForGlobalName(
- (I.getName() + ".guarded.output.alloc").str()),
- nullptr, GlobalValue::NotThreadLocal,
- static_cast<unsigned>(AddressSpace::Shared));
- // Emit a store instruction to update the value.
- new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
- LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
- I.getName() + ".guarded.output.load",
- RegionBarrierBB->getTerminator());
- // Emit a load instruction and replace uses of the output value.
- for (Instruction *UsrI : OutsideUsers)
- UsrI->replaceUsesOfWith(&I, LoadI);
- }
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- // Go to tid check BB in ParentBB.
- const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
- ParentBB->getTerminator()->eraseFromParent();
- OpenMPIRBuilder::LocationDescription Loc(
- InsertPointTy(ParentBB, ParentBB->end()), DL);
- OMPInfoCache.OMPBuilder.updateToLocation(Loc);
- uint32_t SrcLocStrSize;
- auto *SrcLocStr =
- OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- Value *Ident =
- OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
- BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
- // Add check for Tid in RegionCheckTidBB
- RegionCheckTidBB->getTerminator()->eraseFromParent();
- OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
- InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
- OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
- FunctionCallee HardwareTidFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
- CallInst *Tid =
- OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
- Tid->setDebugLoc(DL);
- OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
- Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
- OMPInfoCache.OMPBuilder.Builder
- .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
- ->setDebugLoc(DL);
- // First barrier for synchronization, ensures main thread has updated
- // values.
- FunctionCallee BarrierFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_barrier_simple_spmd);
- OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
- RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
- CallInst *Barrier =
- OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
- Barrier->setDebugLoc(DL);
- OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
- // Second barrier ensures workers have read broadcast values.
- if (HasBroadcastValues) {
- CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "",
- RegionBarrierBB->getTerminator());
- Barrier->setDebugLoc(DL);
- OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
- }
- };
- auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
- SmallPtrSet<BasicBlock *, 8> Visited;
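- // First move to-be-guarded side-effect instructions without users next to
- // the following guarded effect in their block, so that the guarded regions
- // formed below become contiguous.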
- for (Instruction *GuardedI : SPMDCompatibilityTracker) {
- BasicBlock *BB = GuardedI->getParent();
- if (!Visited.insert(BB).second)
- continue;
- SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
- Instruction *LastEffect = nullptr;
- BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
- while (++IP != IPEnd) {
- if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
- continue;
- Instruction *I = &*IP;
- if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
- continue;
- if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
- LastEffect = nullptr;
- continue;
- }
- if (LastEffect)
- Reorders.push_back({I, LastEffect});
- LastEffect = &*IP;
- }
- for (auto &Reorder : Reorders)
- Reorder.first->moveBefore(Reorder.second);
- }
- SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
- for (Instruction *GuardedI : SPMDCompatibilityTracker) {
- BasicBlock *BB = GuardedI->getParent();
- auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
- IRPosition::function(*GuardedI->getFunction()), nullptr,
- DepClassTy::NONE);
- assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
- auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
- // Continue if instruction is already guarded.
- if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
- continue;
- Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
- for (Instruction &I : *BB) {
- // If instruction I needs to be guarded, update the guarded region
- // bounds.
- if (SPMDCompatibilityTracker.contains(&I)) {
- CalleeAAFunction.getGuardedInstructions().insert(&I);
- if (GuardedRegionStart)
- GuardedRegionEnd = &I;
- else
- GuardedRegionStart = GuardedRegionEnd = &I;
- continue;
- }
- // Instruction I does not need guarding; store any region found and
- // reset the bounds.
- if (GuardedRegionStart) {
- GuardedRegions.push_back(
- std::make_pair(GuardedRegionStart, GuardedRegionEnd));
- GuardedRegionStart = nullptr;
- GuardedRegionEnd = nullptr;
- }
- }
- }
- for (auto &GR : GuardedRegions)
- CreateGuardedRegion(GR.first, GR.second);
- }
- void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
- // Only allow 1 thread per workgroup to continue executing the user code.
- //
- // InitCB = __kmpc_target_init(...)
- // ThreadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
- // if (ThreadIdInBlock != 0) return;
- // UserCode:
- // // user code
- //
- auto &Ctx = getAnchorValue().getContext();
- Function *Kernel = getAssociatedFunction();
- assert(Kernel && "Expected an associated function!");
- // Create block for user code to branch to from initial block.
- BasicBlock *InitBB = KernelInitCB->getParent();
- BasicBlock *UserCodeBB = InitBB->splitBasicBlock(
- KernelInitCB->getNextNode(), "main.thread.user_code");
- BasicBlock *ReturnBB =
- BasicBlock::Create(Ctx, "exit.threads", Kernel, UserCodeBB);
- // Register blocks with attributor:
- A.registerManifestAddedBasicBlock(*InitBB);
- A.registerManifestAddedBasicBlock(*UserCodeBB);
- A.registerManifestAddedBasicBlock(*ReturnBB);
- // Debug location:
- const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
- ReturnInst::Create(Ctx, ReturnBB)->setDebugLoc(DLoc);
- InitBB->getTerminator()->eraseFromParent();
- // Prepare call to OMPRTL___kmpc_get_hardware_thread_id_in_block.
- Module &M = *Kernel->getParent();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- FunctionCallee ThreadIdInBlockFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
- // Get thread ID in block.
- CallInst *ThreadIdInBlock =
- CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
- OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
- ThreadIdInBlock->setDebugLoc(DLoc);
- // Eliminate all threads in the block with ID not equal to 0:
- Instruction *IsMainThread =
- ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
- ConstantInt::get(ThreadIdInBlock->getType(), 0),
- "thread.is_main", InitBB);
- IsMainThread->setDebugLoc(DLoc);
- BranchInst::Create(ReturnBB, UserCodeBB, IsMainThread, InitBB);
- }
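- /// Try to change the kernel to SPMD mode. Returns true if the kernel is
- /// known to be, or was just transformed to be, executed in SPMD mode;
- /// otherwise the caller may fall back to building a custom state machine.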
- bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- // We cannot change to SPMD mode if the runtime functions aren't available.
- if (!OMPInfoCache.runtimeFnsAvailable(
- {OMPRTL___kmpc_get_hardware_thread_id_in_block,
- OMPRTL___kmpc_barrier_simple_spmd}))
- return false;
- if (!SPMDCompatibilityTracker.isAssumed()) {
- for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
- if (!NonCompatibleI)
- continue;
- // Skip diagnostics on calls to known OpenMP runtime functions for now.
- if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
- if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
- continue;
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- ORA << "Value has potential side effects preventing SPMD-mode "
- "execution";
- if (isa<CallBase>(NonCompatibleI)) {
- ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
- "the called function to override";
- }
- return ORA << ".";
- };
- A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
- Remark);
- LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
- << *NonCompatibleI << "\n");
- }
- return false;
- }
- // Get the actual kernel; it could be the caller of the anchor scope if we
- // have a debug wrapper.
- Function *Kernel = getAnchorScope();
- if (Kernel->hasLocalLinkage()) {
- assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");
- auto *CB = cast<CallBase>(Kernel->user_back());
- Kernel = CB->getCaller();
- }
- assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
- // Check if the kernel is already in SPMD mode, if so, return success.
- GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
- (Kernel->getName() + "_exec_mode").str());
- assert(ExecMode && "Kernel without exec mode?");
- assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
- // Set the global exec mode flag to indicate SPMD-Generic mode.
- assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
- "ExecMode is not an integer!");
- const int8_t ExecModeVal =
- cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
- if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
- return true;
- // We will now unconditionally modify the IR, indicate a change.
- Changed = ChangeStatus::CHANGED;
- // Do not use instruction guards when no parallel region is present inside
- // the target region.
- if (mayContainParallelRegion())
- insertInstructionGuardsHelper(A);
- else
- forceSingleThreadPerWorkgroupHelper(A);
- // Adjust the global exec mode flag that tells the runtime what mode this
- // kernel is executed in.
- assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
- "Initially non-SPMD kernel has SPMD exec mode!");
- ExecMode->setInitializer(
- ConstantInt::get(ExecMode->getInitializer()->getType(),
- ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
- // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
- const int InitModeArgNo = 1;
- const int DeinitModeArgNo = 1;
- const int InitUseStateMachineArgNo = 2;
- auto &Ctx = getAnchorValue().getContext();
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitModeArgNo),
- *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
- OMP_TGT_EXEC_MODE_SPMD));
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
- *ConstantInt::getBool(Ctx, false));
- A.changeUseAfterManifest(
- KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
- *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
- OMP_TGT_EXEC_MODE_SPMD));
- ++NumOpenMPTargetRegionKernelsSPMD;
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Transformed generic-mode kernel to SPMD-mode.";
- };
- A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
- return true;
- }
- ChangeStatus buildCustomStateMachine(Attributor &A) {
- // If we have disabled state machine rewrites, don't make a custom one
- if (DisableOpenMPOptStateMachineRewrite)
- return ChangeStatus::UNCHANGED;
- // Don't rewrite the state machine if we are not in a valid state.
- if (!ReachedKnownParallelRegions.isValidState())
- return ChangeStatus::UNCHANGED;
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- if (!OMPInfoCache.runtimeFnsAvailable(
- {OMPRTL___kmpc_get_hardware_num_threads_in_block,
- OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
- OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
- return ChangeStatus::UNCHANGED;
- const int InitModeArgNo = 1;
- const int InitUseStateMachineArgNo = 2;
- // Check if the current configuration is non-SPMD mode with the generic
- // state machine.
- // If we already have SPMD mode or a custom state machine we do not need to
- // go any further. If it is anything but a constant something is weird and
- // we give up.
- ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
- KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
- ConstantInt *Mode =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
- // If we are stuck with generic mode, try to create a custom device (=GPU)
- // state machine which is specialized for the parallel regions that are
- // reachable by the kernel.
- if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
- (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
- return ChangeStatus::UNCHANGED;
- // If not SPMD mode, indicate we use a custom state machine now.
- auto &Ctx = getAnchorValue().getContext();
- auto *FalseVal = ConstantInt::getBool(Ctx, false);
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
- // If we don't actually need a state machine we are done here. This can
- // happen if there simply are no parallel regions. In the resulting kernel
- // all worker threads will simply exit right away, leaving the main thread
- // to do the work alone.
- if (!mayContainParallelRegion()) {
- ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Removing unused state machine from generic-mode kernel.";
- };
- A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);
- return ChangeStatus::CHANGED;
- }
- // Keep track in the statistics of our new shiny custom state machine.
- if (ReachedUnknownParallelRegions.empty()) {
- ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Rewriting generic-mode kernel with a customized state "
- "machine.";
- };
- A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
- } else {
- ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
- auto Remark = [&](OptimizationRemarkAnalysis OR) {
- return OR << "Generic-mode kernel is executed with a customized state "
- "machine that requires a fallback.";
- };
- A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);
- // Tell the user why we ended up with a fallback.
- for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
- if (!UnknownParallelRegionCB)
- continue;
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- return ORA << "Call may contain unknown parallel regions. Use "
- << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
- "override.";
- };
- A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
- "OMP133", Remark);
- }
- }
- // Create all the blocks:
- //
- // InitCB = __kmpc_target_init(...)
- // BlockHwSize =
- // __kmpc_get_hardware_num_threads_in_block();
- // WarpSize = __kmpc_get_warp_size();
- // BlockSize = BlockHwSize - WarpSize;
- // IsWorkerCheckBB: bool IsWorker = InitCB != -1;
- // if (IsWorker) {
- // if (InitCB >= BlockSize) return;
- // SMBeginBB: __kmpc_barrier_simple_generic(...);
- // void *WorkFn;
- // bool Active = __kmpc_kernel_parallel(&WorkFn);
- // if (!WorkFn) return;
- // SMIsActiveCheckBB: if (Active) {
- // SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
- // ParFn0(...);
- // SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
- // ParFn1(...);
- // ...
- // SMIfCascadeCurrentBB: else
- // ((WorkFnTy*)WorkFn)(...);
- // SMEndParallelBB: __kmpc_kernel_end_parallel(...);
- // }
- // SMDoneBB: __kmpc_barrier_simple_generic(...);
- // goto SMBeginBB;
- // }
- // UserCodeEntryBB: // user code
- // __kmpc_target_deinit(...)
- //
- Function *Kernel = getAssociatedFunction();
- assert(Kernel && "Expected an associated function!");
- BasicBlock *InitBB = KernelInitCB->getParent();
- BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
- KernelInitCB->getNextNode(), "thread.user_code.check");
- BasicBlock *IsWorkerCheckBB =
- BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineBeginBB = BasicBlock::Create(
- Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
- Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
- Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineIfCascadeCurrentBB =
- BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
- Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineEndParallelBB =
- BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
- Kernel, UserCodeEntryBB);
- BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
- Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
- A.registerManifestAddedBasicBlock(*InitBB);
- A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
- A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
- A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
- A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
- A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
- A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
- A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
- A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
- const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
- ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
- InitBB->getTerminator()->eraseFromParent();
- Instruction *IsWorker =
- ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
- ConstantInt::get(KernelInitCB->getType(), -1),
- "thread.is_worker", InitBB);
- IsWorker->setDebugLoc(DLoc);
- BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);
- Module &M = *Kernel->getParent();
- FunctionCallee BlockHwSizeFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
- FunctionCallee WarpSizeFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_get_warp_size);
- CallInst *BlockHwSize =
- CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB);
- OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
- BlockHwSize->setDebugLoc(DLoc);
- CallInst *WarpSize =
- CallInst::Create(WarpSizeFn, "warp.size", IsWorkerCheckBB);
- OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
- WarpSize->setDebugLoc(DLoc);
- Instruction *BlockSize = BinaryOperator::CreateSub(
- BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);
- BlockSize->setDebugLoc(DLoc);
- Instruction *IsMainOrWorker = ICmpInst::Create(
- ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize,
- "thread.is_main_or_worker", IsWorkerCheckBB);
- IsMainOrWorker->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB,
- IsMainOrWorker, IsWorkerCheckBB);
- // Create local storage for the work function pointer.
- const DataLayout &DL = M.getDataLayout();
- Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
- Instruction *WorkFnAI =
- new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
- "worker.work_fn.addr", &Kernel->getEntryBlock().front());
- WorkFnAI->setDebugLoc(DLoc);
- OMPInfoCache.OMPBuilder.updateToLocation(
- OpenMPIRBuilder::LocationDescription(
- IRBuilder<>::InsertPoint(StateMachineBeginBB,
- StateMachineBeginBB->end()),
- DLoc));
- Value *Ident = KernelInitCB->getArgOperand(0);
- Value *GTid = KernelInitCB;
- FunctionCallee BarrierFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_barrier_simple_generic);
- CallInst *Barrier =
- CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);
- OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
- Barrier->setDebugLoc(DLoc);
- if (WorkFnAI->getType()->getPointerAddressSpace() !=
- (unsigned int)AddressSpace::Generic) {
- WorkFnAI = new AddrSpaceCastInst(
- WorkFnAI,
- PointerType::getWithSamePointeeType(
- cast<PointerType>(WorkFnAI->getType()),
- (unsigned int)AddressSpace::Generic),
- WorkFnAI->getName() + ".generic", StateMachineBeginBB);
- WorkFnAI->setDebugLoc(DLoc);
- }
- FunctionCallee KernelParallelFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_kernel_parallel);
- CallInst *IsActiveWorker = CallInst::Create(
- KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
- OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
- IsActiveWorker->setDebugLoc(DLoc);
- Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
- StateMachineBeginBB);
- WorkFn->setDebugLoc(DLoc);
- FunctionType *ParallelRegionFnTy = FunctionType::get(
- Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
- false);
- Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
- WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
- StateMachineBeginBB);
- Instruction *IsDone =
- ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
- Constant::getNullValue(VoidPtrTy), "worker.is_done",
- StateMachineBeginBB);
- IsDone->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
- IsDone, StateMachineBeginBB)
- ->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineIfCascadeCurrentBB,
- StateMachineDoneBarrierBB, IsActiveWorker,
- StateMachineIsActiveCheckBB)
- ->setDebugLoc(DLoc);
- Value *ZeroArg =
- Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
- // Now that we have most of the CFG skeleton it is time for the if-cascade
- // that checks the function pointer we got from the runtime against the
- // parallel regions we expect, if there are any.
- for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
- auto *ParallelRegion = ReachedKnownParallelRegions[I];
- BasicBlock *PRExecuteBB = BasicBlock::Create(
- Ctx, "worker_state_machine.parallel_region.execute", Kernel,
- StateMachineEndParallelBB);
- CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
- ->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
- ->setDebugLoc(DLoc);
- BasicBlock *PRNextBB =
- BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
- Kernel, StateMachineEndParallelBB);
- // Check if we need to compare the pointer at all or if we can just
- // call the parallel region function.
- Value *IsPR;
- if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
- Instruction *CmpI = ICmpInst::Create(
- ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
- "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
- CmpI->setDebugLoc(DLoc);
- IsPR = CmpI;
- } else {
- IsPR = ConstantInt::getTrue(Ctx);
- }
- BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
- StateMachineIfCascadeCurrentBB)
- ->setDebugLoc(DLoc);
- StateMachineIfCascadeCurrentBB = PRNextBB;
- }
- // At the end of the if-cascade we place the indirect function pointer call
- // in case we might need it, that is if there can be parallel regions we
- // have not handled in the if-cascade above.
- if (!ReachedUnknownParallelRegions.empty()) {
- StateMachineIfCascadeCurrentBB->setName(
- "worker_state_machine.parallel_region.fallback.execute");
- CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
- StateMachineIfCascadeCurrentBB)
- ->setDebugLoc(DLoc);
- }
- BranchInst::Create(StateMachineEndParallelBB,
- StateMachineIfCascadeCurrentBB)
- ->setDebugLoc(DLoc);
- FunctionCallee EndParallelFn =
- OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_kernel_end_parallel);
- CallInst *EndParallel =
- CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);
- OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
- EndParallel->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
- ->setDebugLoc(DLoc);
- CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
- ->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
- ->setDebugLoc(DLoc);
- return ChangeStatus::CHANGED;
- }
- /// Fixpoint iteration update function. Will be called every time a dependence
- /// changed its state (and in the beginning).
- ChangeStatus updateImpl(Attributor &A) override {
- KernelInfoState StateBefore = getState();
- // Callback to check a read/write instruction.
- auto CheckRWInst = [&](Instruction &I) {
- // We handle calls later.
- if (isa<CallBase>(I))
- return true;
- // We only care about write effects.
- if (!I.mayWriteToMemory())
- return true;
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
- *this, IRPosition::value(*SI->getPointerOperand()),
- DepClassTy::OPTIONAL);
- auto &HS = A.getAAFor<AAHeapToStack>(
- *this, IRPosition::function(*I.getFunction()),
- DepClassTy::OPTIONAL);
- if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) {
- if (AA::isAssumedThreadLocalObject(A, Obj, *this))
- return true;
- // Check for AAHeapToStack moved objects which must not be
- // guarded.
- auto *CB = dyn_cast<CallBase>(&Obj);
- return CB && HS.isAssumedHeapToStack(*CB);
- }))
- return true;
- }
- // Insert instruction that needs guarding.
- SPMDCompatibilityTracker.insert(&I);
- return true;
- };
- bool UsedAssumedInformationInCheckRWInst = false;
- if (!SPMDCompatibilityTracker.isAtFixpoint())
- if (!A.checkForAllReadWriteInstructions(
- CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- bool UsedAssumedInformationFromReachingKernels = false;
- if (!IsKernelEntry) {
- updateParallelLevels(A);
- bool AllReachingKernelsKnown = true;
- updateReachingKernelEntries(A, AllReachingKernelsKnown);
- UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
- if (!SPMDCompatibilityTracker.empty()) {
- if (!ParallelLevels.isValidState())
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- else if (!ReachingKernelEntries.isValidState())
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- else {
- // Check if all reaching kernels agree on the mode, as we otherwise cannot
- // guard instructions. We might not be sure about the mode, so we cannot
- // fix the internal spmd-zation state either.
- int SPMD = 0, Generic = 0;
- for (auto *Kernel : ReachingKernelEntries) {
- auto &CBAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
- if (CBAA.SPMDCompatibilityTracker.isValidState() &&
- CBAA.SPMDCompatibilityTracker.isAssumed())
- ++SPMD;
- else
- ++Generic;
- if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
- UsedAssumedInformationFromReachingKernels = true;
- }
- if (SPMD != 0 && Generic != 0)
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- }
- }
- }
- // Callback to check a call instruction.
- bool AllParallelRegionStatesWereFixed = true;
- bool AllSPMDStatesWereFixed = true;
- auto CheckCallInst = [&](Instruction &I) {
- auto &CB = cast<CallBase>(I);
- auto &CBAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
- getState() ^= CBAA.getState();
- AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
- AllParallelRegionStatesWereFixed &=
- CBAA.ReachedKnownParallelRegions.isAtFixpoint();
- AllParallelRegionStatesWereFixed &=
- CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
- return true;
- };
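- // Note: `^=` is the join operator on KernelInfoState; the callback above
- // merges each call site's state (reached parallel regions, SPMD tracker
- // contents, etc.) into the state of this function.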
- bool UsedAssumedInformationInCheckCallInst = false;
- if (!A.checkForAllCallLikeInstructions(
- CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
- LLVM_DEBUG(dbgs() << TAG
- << "Failed to visit all call-like instructions!\n";);
- return indicatePessimisticFixpoint();
- }
- // If we haven't used any assumed information for the reached parallel
- // region states we can fix it.
- if (!UsedAssumedInformationInCheckCallInst &&
- AllParallelRegionStatesWereFixed) {
- ReachedKnownParallelRegions.indicateOptimisticFixpoint();
- ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
- }
- // If we haven't used any assumed information for the SPMD state we can fix
- // it.
- if (!UsedAssumedInformationInCheckRWInst &&
- !UsedAssumedInformationInCheckCallInst &&
- !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
- SPMDCompatibilityTracker.indicateOptimisticFixpoint();
- return StateBefore == getState() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
- private:
- /// Update info regarding reaching kernels.
- void updateReachingKernelEntries(Attributor &A,
- bool &AllReachingKernelsKnown) {
- auto PredCallSite = [&](AbstractCallSite ACS) {
- Function *Caller = ACS.getInstruction()->getFunction();
- assert(Caller && "Caller is nullptr");
- auto &CAA = A.getOrCreateAAFor<AAKernelInfo>(
- IRPosition::function(*Caller), this, DepClassTy::REQUIRED);
- if (CAA.ReachingKernelEntries.isValidState()) {
- ReachingKernelEntries ^= CAA.ReachingKernelEntries;
- return true;
- }
- // We lost track of the caller of the associated function; any kernel
- // could reach it now.
- ReachingKernelEntries.indicatePessimisticFixpoint();
- return true;
- };
- if (!A.checkForAllCallSites(PredCallSite, *this,
- true /* RequireAllCallSites */,
- AllReachingKernelsKnown))
- ReachingKernelEntries.indicatePessimisticFixpoint();
- }
- /// Update info regarding parallel levels.
- void updateParallelLevels(Attributor &A) {
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
- auto PredCallSite = [&](AbstractCallSite ACS) {
- Function *Caller = ACS.getInstruction()->getFunction();
- assert(Caller && "Caller is nullptr");
- auto &CAA =
- A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
- if (CAA.ParallelLevels.isValidState()) {
- // Any function called by `__kmpc_parallel_51` will not be folded because
- // the runtime updates the parallel level inside it. Modeling that
- // precisely would tie this analysis to the runtime implementation, and
- // any future change to the implementation could silently invalidate the
- // analysis. As a consequence, we are conservative here.
- if (Caller == Parallel51RFI.Declaration) {
- ParallelLevels.indicatePessimisticFixpoint();
- return true;
- }
- ParallelLevels ^= CAA.ParallelLevels;
- return true;
- }
- // We lost track of the caller of the associated function; any kernel
- // could reach it now.
- ParallelLevels.indicatePessimisticFixpoint();
- return true;
- };
- bool AllCallSitesKnown = true;
- if (!A.checkForAllCallSites(PredCallSite, *this,
- true /* RequireAllCallSites */,
- AllCallSitesKnown))
- ParallelLevels.indicatePessimisticFixpoint();
- }
- };
- /// The call site kernel info abstract attribute; basically, what can we say
- /// about a call site with regard to the KernelInfoState. For now this simply
- /// forwards the information from the callee.
- struct AAKernelInfoCallSite : AAKernelInfo {
- AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
- : AAKernelInfo(IRP, A) {}
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAKernelInfo::initialize(A);
- CallBase &CB = cast<CallBase>(getAssociatedValue());
- Function *Callee = getAssociatedFunction();
- auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
- *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
- // Check for SPMD-mode assumptions.
- if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) {
- SPMDCompatibilityTracker.indicateOptimisticFixpoint();
- indicateOptimisticFixpoint();
- }
- // First weed out calls we do not care about, that is readonly/readnone
- // calls, intrinsics, and "no_openmp" calls. None of these can reach a
- // parallel region or anything else we are looking for.
- if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
- indicateOptimisticFixpoint();
- return;
- }
- // Next we check if we know the callee. If it is a known OpenMP function
- // we handle it explicitly in the switch below. If it is not, we will use
- // an AAKernelInfo object on the callee to gather information and merge
- // that into the current state. The latter happens in updateImpl.
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
- if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
- // An unknown callee or a declaration is not analyzable; we give up.
- if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
- // Unknown callees might contain parallel regions, except if they have
- // an appropriate assumption attached.
- if (!(AssumptionAA.hasAssumption("omp_no_openmp") ||
- AssumptionAA.hasAssumption("omp_no_parallelism")))
- ReachedUnknownParallelRegions.insert(&CB);
- // If SPMDCompatibilityTracker is not fixed, we need to give up on the
- // idea we can run something unknown in SPMD-mode.
- if (!SPMDCompatibilityTracker.isAtFixpoint()) {
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.insert(&CB);
- }
- // We have updated the state for this unknown call properly; there won't
- // be any change, so we indicate a fixpoint.
- indicateOptimisticFixpoint();
- }
- // If the callee is known and can be used in IPO, we will update the state
- // based on the callee state in updateImpl.
- return;
- }
- const unsigned WrapperFunctionArgNo = 6;
- RuntimeFunction RF = It->getSecond();
- switch (RF) {
- // All the functions we know are compatible with SPMD mode.
- case OMPRTL___kmpc_is_spmd_exec_mode:
- case OMPRTL___kmpc_distribute_static_fini:
- case OMPRTL___kmpc_for_static_fini:
- case OMPRTL___kmpc_global_thread_num:
- case OMPRTL___kmpc_get_hardware_num_threads_in_block:
- case OMPRTL___kmpc_get_hardware_num_blocks:
- case OMPRTL___kmpc_single:
- case OMPRTL___kmpc_end_single:
- case OMPRTL___kmpc_master:
- case OMPRTL___kmpc_end_master:
- case OMPRTL___kmpc_barrier:
- case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
- case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
- case OMPRTL___kmpc_nvptx_end_reduce_nowait:
- break;
- case OMPRTL___kmpc_distribute_static_init_4:
- case OMPRTL___kmpc_distribute_static_init_4u:
- case OMPRTL___kmpc_distribute_static_init_8:
- case OMPRTL___kmpc_distribute_static_init_8u:
- case OMPRTL___kmpc_for_static_init_4:
- case OMPRTL___kmpc_for_static_init_4u:
- case OMPRTL___kmpc_for_static_init_8:
- case OMPRTL___kmpc_for_static_init_8u: {
- // Check the schedule and allow static schedule in SPMD mode.
- unsigned ScheduleArgOpNo = 2;
- auto *ScheduleTypeCI =
- dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
- unsigned ScheduleTypeVal =
- ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
- switch (OMPScheduleType(ScheduleTypeVal)) {
- case OMPScheduleType::UnorderedStatic:
- case OMPScheduleType::UnorderedStaticChunked:
- case OMPScheduleType::OrderedDistribute:
- case OMPScheduleType::OrderedDistributeChunked:
- break;
- default:
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.insert(&CB);
- break;
- };
- } break;
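- // For illustration (numeric values hedged, see OMPScheduleType): a call
- // such as
- //   call void @__kmpc_for_static_init_4(ptr %ident, i32 %tid, i32 34, ...)
- // uses unordered static scheduling and stays SPMD-compatible, whereas a
- // dynamic schedule value takes the default path above and is recorded in
- // SPMDCompatibilityTracker.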
- case OMPRTL___kmpc_target_init:
- KernelInitCB = &CB;
- break;
- case OMPRTL___kmpc_target_deinit:
- KernelDeinitCB = &CB;
- break;
- case OMPRTL___kmpc_parallel_51:
- if (auto *ParallelRegion = dyn_cast<Function>(
- CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
- ReachedKnownParallelRegions.insert(ParallelRegion);
- // Check for nested parallelism.
- auto &FnAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
- NestedParallelism |= !FnAA.getState().isValidState() ||
- !FnAA.ReachedKnownParallelRegions.empty() ||
- !FnAA.ReachedUnknownParallelRegions.empty();
- break;
- }
- // The condition above should usually get the parallel region function
- // pointer and record it. In the off chance it doesn't, we assume the
- // worst.
- ReachedUnknownParallelRegions.insert(&CB);
- break;
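- // The expected call shape for the case above (a sketch; operand names are
- // illustrative), with the wrapper function at operand index 6 as encoded
- // by WrapperFunctionArgNo:
- //
- //   call void @__kmpc_parallel_51(ptr %ident, i32 %gtid, i32 %if_expr,
- //                                 i32 %num_threads, i32 %proc_bind,
- //                                 ptr @outlined, ptr @outlined.wrapper,
- //                                 ptr %args, i64 %nargs)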
- case OMPRTL___kmpc_omp_task:
- // We do not look into tasks right now, just give up.
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.insert(&CB);
- ReachedUnknownParallelRegions.insert(&CB);
- break;
- case OMPRTL___kmpc_alloc_shared:
- case OMPRTL___kmpc_free_shared:
- // Return without setting a fixpoint, to be resolved in updateImpl.
- return;
- default:
- // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
- // generally. However, they do not hide parallel regions.
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.insert(&CB);
- break;
- }
- // All other OpenMP runtime calls will not reach parallel regions so they
- // can be safely ignored for now. Since it is a known OpenMP runtime call we
- // have now modeled all effects and there is no need for any update.
- indicateOptimisticFixpoint();
- }
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information; then it makes sense to
- // specialize attributes for call site arguments instead of redirecting
- // requests to the callee argument.
- Function *F = getAssociatedFunction();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
- // If F is not a runtime function, propagate the AAKernelInfo of the callee.
- if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
- if (getState() == FnAA.getState())
- return ChangeStatus::UNCHANGED;
- getState() = FnAA.getState();
- return ChangeStatus::CHANGED;
- }
- // F is a runtime function that allocates or frees memory, check
- // AAHeapToStack and AAHeapToShared.
- KernelInfoState StateBefore = getState();
- assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||
- It->getSecond() == OMPRTL___kmpc_free_shared) &&
- "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
- CallBase &CB = cast<CallBase>(getAssociatedValue());
- auto &HeapToStackAA = A.getAAFor<AAHeapToStack>(
- *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
- auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>(
- *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
- RuntimeFunction RF = It->getSecond();
- switch (RF) {
- // If neither HeapToStack nor HeapToShared assume the call is removed,
- // assume SPMD incompatibility.
- case OMPRTL___kmpc_alloc_shared:
- if (!HeapToStackAA.isAssumedHeapToStack(CB) &&
- !HeapToSharedAA.isAssumedHeapToShared(CB))
- SPMDCompatibilityTracker.insert(&CB);
- break;
- case OMPRTL___kmpc_free_shared:
- if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) &&
- !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB))
- SPMDCompatibilityTracker.insert(&CB);
- break;
- default:
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- SPMDCompatibilityTracker.insert(&CB);
- }
- return StateBefore == getState() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
- };
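- /// Abstract attribute used to fold OpenMP runtime calls, e.g.,
- /// __kmpc_is_spmd_exec_mode, into constants when the analysis can prove a
- /// unique result across all reaching kernels.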
- struct AAFoldRuntimeCall
- : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- /// Statistics are tracked as part of manifest for now.
- void trackStatistics() const override {}
- /// Create an abstract attribute view for the position \p IRP.
- static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
- Attributor &A);
- /// See AbstractAttribute::getName()
- const std::string getName() const override { return "AAFoldRuntimeCall"; }
- /// See AbstractAttribute::getIdAddr()
- const char *getIdAddr() const override { return &ID; }
- /// This function should return true if the type of the \p AA is
- /// AAFoldRuntimeCall
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
- static const char ID;
- };
- struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
- AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAFoldRuntimeCall(IRP, A) {}
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- if (!isValidState())
- return "<invalid>";
- std::string Str("simplified value: ");
- if (!SimplifiedValue)
- return Str + std::string("none");
- if (!*SimplifiedValue)
- return Str + std::string("nullptr");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
- return Str + std::to_string(CI->getSExtValue());
- return Str + std::string("unknown");
- }
- void initialize(Attributor &A) override {
- if (DisableOpenMPOptFolding)
- indicatePessimisticFixpoint();
- Function *Callee = getAssociatedFunction();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
- assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
- "Expected a known OpenMP runtime function");
- RFKind = It->getSecond();
- CallBase &CB = cast<CallBase>(getAssociatedValue());
- A.registerSimplificationCallback(
- IRPosition::callsite_returned(CB),
- [&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> std::optional<Value *> {
- assert((isValidState() ||
- (SimplifiedValue && *SimplifiedValue == nullptr)) &&
- "Unexpected invalid state!");
- if (!isAtFixpoint()) {
- UsedAssumedInformation = true;
- if (AA)
- A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
- }
- return SimplifiedValue;
- });
- }
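- // In effect: whenever the Attributor asks for a simplified value of this
- // call site it receives SimplifiedValue. Until we reach a fixpoint the
- // answer is flagged as assumed and an optional dependence is recorded, so
- // the querying AA is revisited if SimplifiedValue changes later.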
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- switch (RFKind) {
- case OMPRTL___kmpc_is_spmd_exec_mode:
- Changed |= foldIsSPMDExecMode(A);
- break;
- case OMPRTL___kmpc_parallel_level:
- Changed |= foldParallelLevel(A);
- break;
- case OMPRTL___kmpc_get_hardware_num_threads_in_block:
- Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
- break;
- case OMPRTL___kmpc_get_hardware_num_blocks:
- Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
- break;
- default:
- llvm_unreachable("Unhandled OpenMP runtime function!");
- }
- return Changed;
- }
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (SimplifiedValue && *SimplifiedValue) {
- Instruction &I = *getCtxI();
- A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue);
- A.deleteAfterManifest(I);
- CallBase *CB = dyn_cast<CallBase>(&I);
- auto Remark = [&](OptimizationRemark OR) {
- if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
- return OR << "Replacing OpenMP runtime call "
- << CB->getCalledFunction()->getName() << " with "
- << ore::NV("FoldedValue", C->getZExtValue()) << ".";
- return OR << "Replacing OpenMP runtime call "
- << CB->getCalledFunction()->getName() << ".";
- };
- if (CB && EnableVerboseRemarks)
- A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
- LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
- << **SimplifiedValue << "\n");
- Changed = ChangeStatus::CHANGED;
- }
- return Changed;
- }
- ChangeStatus indicatePessimisticFixpoint() override {
- SimplifiedValue = nullptr;
- return AAFoldRuntimeCall::indicatePessimisticFixpoint();
- }
- private:
- /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
- ChangeStatus foldIsSPMDExecMode(Attributor &A) {
- std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
- unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
- unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
- return indicatePessimisticFixpoint();
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
- DepClassTy::REQUIRED);
- if (!AA.isValidState()) {
- SimplifiedValue = nullptr;
- return indicatePessimisticFixpoint();
- }
- if (AA.SPMDCompatibilityTracker.isAssumed()) {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
- ++KnownSPMDCount;
- else
- ++AssumedSPMDCount;
- } else {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
- ++KnownNonSPMDCount;
- else
- ++AssumedNonSPMDCount;
- }
- }
- if ((AssumedSPMDCount + KnownSPMDCount) &&
- (AssumedNonSPMDCount + KnownNonSPMDCount))
- return indicatePessimisticFixpoint();
- auto &Ctx = getAnchorValue().getContext();
- if (KnownSPMDCount || AssumedSPMDCount) {
- assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
- "Expected only SPMD kernels!");
- // All reaching kernels are in SPMD mode. Fold all calls to
- // __kmpc_is_spmd_exec_mode to 1.
- SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
- } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
- assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
- "Expected only non-SPMD kernels!");
- // All reaching kernels are in non-SPMD mode. Fold all calls to
- // __kmpc_is_spmd_exec_mode to 0.
- SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
- } else {
- // The set of reaching kernels is empty; we cannot tell whether the
- // associated call site can be folded. At this point, SimplifiedValue
- // must be none.
- assert(!SimplifiedValue && "SimplifiedValue should be none");
- }
- return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
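- // Hedged example of the fold above: if every kernel reaching this call
- // site is (assumed) SPMD, then
- //   %mode = call i8 @__kmpc_is_spmd_exec_mode()
- // simplifies to `i8 1`; if all reaching kernels are generic, it simplifies
- // to `i8 0`.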
- /// Fold __kmpc_parallel_level into a constant if possible.
- ChangeStatus foldParallelLevel(Attributor &A) {
- std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ParallelLevels.isValidState())
- return indicatePessimisticFixpoint();
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
- return indicatePessimisticFixpoint();
- if (CallerKernelInfoAA.ReachingKernelEntries.empty()) {
- assert(!SimplifiedValue &&
- "SimplifiedValue should keep none at this point");
- return ChangeStatus::UNCHANGED;
- }
- unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
- unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
- DepClassTy::REQUIRED);
- if (!AA.SPMDCompatibilityTracker.isValidState())
- return indicatePessimisticFixpoint();
- if (AA.SPMDCompatibilityTracker.isAssumed()) {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
- ++KnownSPMDCount;
- else
- ++AssumedSPMDCount;
- } else {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
- ++KnownNonSPMDCount;
- else
- ++AssumedNonSPMDCount;
- }
- }
- if ((AssumedSPMDCount + KnownSPMDCount) &&
- (AssumedNonSPMDCount + KnownNonSPMDCount))
- return indicatePessimisticFixpoint();
- auto &Ctx = getAnchorValue().getContext();
- // If the caller can only be reached by SPMD kernel entries, the parallel
- // level is 1. Similarly, if the caller can only be reached by non-SPMD
- // kernel entries, it is 0.
- if (AssumedSPMDCount || KnownSPMDCount) {
- assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
- "Expected only SPMD kernels!");
- SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
- } else {
- assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
- "Expected only non-SPMD kernels!");
- SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
- }
- return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
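- // Hedged example: in a function reached only from SPMD kernels,
- //   %lvl = call i8 @__kmpc_parallel_level(...)
- // folds to `i8 1` (the implicit parallel region of an SPMD kernel); if it
- // is reached only from generic-mode kernels, it folds to `i8 0`.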
- ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
- // Specialize only if all reaching kernels agree on the attribute's
- // constant value.
- int32_t CurrentAttrValue = -1;
- std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
- return indicatePessimisticFixpoint();
- // Iterate over the kernels that reach this function
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
- if (NextAttrVal == -1 ||
- (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
- return indicatePessimisticFixpoint();
- CurrentAttrValue = NextAttrVal;
- }
- if (CurrentAttrValue != -1) {
- auto &Ctx = getAnchorValue().getContext();
- SimplifiedValue =
- ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
- }
- return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
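- // Hedged example: if every reaching kernel carries the function attribute
- //   "omp_target_thread_limit"="128"
- // then a call to __kmpc_get_hardware_num_threads_in_block folds to
- // `i32 128`; a missing or disagreeing value hits the pessimistic fixpoint
- // above instead.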
- /// An optional value the associated value is assumed to fold to. That is, we
- /// assume the associated value (which is a call) can be replaced by this
- /// simplified value.
- std::optional<Value *> SimplifiedValue;
- /// The runtime function kind of the callee of the associated call site.
- RuntimeFunction RFKind;
- };
- } // namespace
- /// Register folding AAs for all call sites of the runtime function \p RF.
- void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
- auto &RFI = OMPInfoCache.RFIs[RF];
- RFI.foreachUse(SCC, [&](Use &U, Function &F) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
- if (!CI)
- return false;
- A.getOrCreateAAFor<AAFoldRuntimeCall>(
- IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
- DepClassTy::NONE, /* ForceUpdate */ false,
- /* UpdateAfterInit */ false);
- return false;
- });
- }
- void OpenMPOpt::registerAAs(bool IsModulePass) {
- if (SCC.empty())
- return;
- if (IsModulePass) {
- // Ensure we create the AAKernelInfo AAs first and without triggering an
- // update. This will make sure we register all value simplification
- // callbacks before any other AA has the chance to create an AAValueSimplify
- // or similar.
- auto CreateKernelInfoCB = [&](Use &, Function &Kernel) {
- A.getOrCreateAAFor<AAKernelInfo>(
- IRPosition::function(Kernel), /* QueryingAA */ nullptr,
- DepClassTy::NONE, /* ForceUpdate */ false,
- /* UpdateAfterInit */ false);
- return false;
- };
- OMPInformationCache::RuntimeFunctionInfo &InitRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- InitRFI.foreachUse(SCC, CreateKernelInfoCB);
- registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
- registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
- registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
- registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
- }
- // Create CallSite AA for all Getters.
- if (DeduceICVValues) {
- for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
- auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
- auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
- auto CreateAA = [&](Use &U, Function &Caller) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
- if (!CI)
- return false;
- auto &CB = cast<CallBase>(*CI);
- IRPosition CBPos = IRPosition::callsite_function(CB);
- A.getOrCreateAAFor<AAICVTracker>(CBPos);
- return false;
- };
- GetterRFI.foreachUse(SCC, CreateAA);
- }
- }
- // Create an ExecutionDomain AA for every function and a HeapToStack AA for
- // every function if there is a device kernel.
- if (!isOpenMPDevice(M))
- return;
- for (auto *F : SCC) {
- if (F->isDeclaration())
- continue;
- // We look at internal functions only on-demand, but if any use is not a
- // direct call, or is outside the current set of analyzed functions, we
- // have to do it eagerly.
- if (F->hasLocalLinkage()) {
- if (llvm::all_of(F->uses(), [this](const Use &U) {
- const auto *CB = dyn_cast<CallBase>(U.getUser());
- return CB && CB->isCallee(&U) &&
- A.isRunOn(const_cast<Function *>(CB->getCaller()));
- }))
- continue;
- }
- registerAAsForFunction(A, *F);
- }
- }
- void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
- if (!DisableOpenMPOptDeglobalization)
- A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
- A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
- if (!DisableOpenMPOptDeglobalization)
- A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
- for (auto &I : instructions(F)) {
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
- bool UsedAssumedInformation = false;
- A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
- UsedAssumedInformation, AA::Interprocedural);
- continue;
- }
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
- continue;
- }
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- if (II->getIntrinsicID() == Intrinsic::assume) {
- A.getOrCreateAAFor<AAPotentialValues>(
- IRPosition::value(*II->getArgOperand(0)));
- continue;
- }
- }
- }
- }
- const char AAICVTracker::ID = 0;
- const char AAKernelInfo::ID = 0;
- const char AAExecutionDomain::ID = 0;
- const char AAHeapToShared::ID = 0;
- const char AAFoldRuntimeCall::ID = 0;
- AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAICVTracker *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- llvm_unreachable("ICVTracker can only be created for function position!");
- case IRPosition::IRP_RETURNED:
- AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
- break;
- case IRPosition::IRP_CALL_SITE_RETURNED:
- AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
- break;
- case IRPosition::IRP_CALL_SITE:
- AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
- break;
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
- break;
- }
- return *AA;
- }
- AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAExecutionDomainFunction *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE:
- llvm_unreachable(
- "AAExecutionDomain can only be created for function position!");
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
- break;
- }
- return *AA;
- }
- AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAHeapToSharedFunction *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE:
- llvm_unreachable(
- "AAHeapToShared can only be created for function position!");
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
- break;
- }
- return *AA;
- }
- AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAKernelInfo *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- llvm_unreachable("KernelInfo can only be created for function position!");
- case IRPosition::IRP_CALL_SITE:
- AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
- break;
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
- break;
- }
- return *AA;
- }
- AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAFoldRuntimeCall *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- case IRPosition::IRP_FUNCTION:
- case IRPosition::IRP_CALL_SITE:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- llvm_unreachable("KernelInfo can only be created for call site position!");
- case IRPosition::IRP_CALL_SITE_RETURNED:
- AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
- break;
- }
- return *AA;
- }
- PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!containsOpenMP(M))
- return PreservedAnalyses::all();
- if (DisableOpenMPOptimizations)
- return PreservedAnalyses::all();
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- KernelSet Kernels = getDeviceKernels(M);
- if (PrintModuleBeforeOptimizations)
- LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M);
- auto IsCalled = [&](Function &F) {
- if (Kernels.contains(&F))
- return true;
- for (const User *U : F.users())
- if (!isa<BlockAddress>(U))
- return true;
- return false;
- };
- auto EmitRemark = [&](Function &F) {
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- ORE.emit([&]() {
- OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
- return ORA << "Could not internalize function. "
- << "Some optimizations may not be possible. [OMP140]";
- });
- };
- // Create internal copies of each function if this is a kernel module. This
- // allows interprocedural passes to see every call edge.
- DenseMap<Function *, Function *> InternalizedMap;
- if (isOpenMPDevice(M)) {
- SmallPtrSet<Function *, 16> InternalizeFns;
- for (Function &F : M)
- if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
- !DisableInternalization) {
- if (Attributor::isInternalizable(F)) {
- InternalizeFns.insert(&F);
- } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
- EmitRemark(F);
- }
- }
- Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
- }
- // Look at every function in the Module unless it was internalized.
- SetVector<Function *> Functions;
- SmallVector<Function *, 16> SCC;
- for (Function &F : M)
- if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
- SCC.push_back(&F);
- Functions.insert(&F);
- }
- if (SCC.empty())
- return PreservedAnalyses::all();
- AnalysisGetter AG(FAM);
- auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
- BumpPtrAllocator Allocator;
- CallGraphUpdater CGUpdater;
- bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
- LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
- OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels,
- PostLink);
- unsigned MaxFixpointIterations =
- (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
- AttributorConfig AC(CGUpdater);
- AC.DefaultInitializeLiveInternals = false;
- AC.IsModulePass = true;
- AC.RewriteSignatures = false;
- AC.MaxFixpointIterations = MaxFixpointIterations;
- AC.OREGetter = OREGetter;
- AC.PassName = DEBUG_TYPE;
- AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
- Attributor A(Functions, InfoCache, AC);
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Changed = OMPOpt.run(true);
- // Optionally inline device functions for potentially better performance.
- if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
- for (Function &F : M)
- if (!F.isDeclaration() && !Kernels.contains(&F) &&
- !F.hasFnAttribute(Attribute::NoInline))
- F.addFnAttr(Attribute::AlwaysInline);
- if (PrintModuleAfterOptimizations)
- LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
- if (Changed)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
- }
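- // Usage sketch (assuming the usual upstream registration name for this
- // pass): the module pass can be invoked directly via
- //   opt -passes=openmp-opt input.ll -S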
- PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- if (!containsOpenMP(*C.begin()->getFunction().getParent()))
- return PreservedAnalyses::all();
- if (DisableOpenMPOptimizations)
- return PreservedAnalyses::all();
- SmallVector<Function *, 16> SCC;
- // If there are kernels in the module, we have to run on all SCCs.
- for (LazyCallGraph::Node &N : C) {
- Function *Fn = &N.getFunction();
- SCC.push_back(Fn);
- }
- if (SCC.empty())
- return PreservedAnalyses::all();
- Module &M = *C.begin()->getFunction().getParent();
- if (PrintModuleBeforeOptimizations)
- LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M);
- KernelSet Kernels = getDeviceKernels(M);
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
- AnalysisGetter AG(FAM);
- auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
- BumpPtrAllocator Allocator;
- CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, C, AM, UR);
- bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
- LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ &Functions, Kernels, PostLink);
- unsigned MaxFixpointIterations =
- (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
- AttributorConfig AC(CGUpdater);
- AC.DefaultInitializeLiveInternals = false;
- AC.IsModulePass = false;
- AC.RewriteSignatures = false;
- AC.MaxFixpointIterations = MaxFixpointIterations;
- AC.OREGetter = OREGetter;
- AC.PassName = DEBUG_TYPE;
- AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
- Attributor A(Functions, InfoCache, AC);
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Changed = OMPOpt.run(false);
- if (PrintModuleAfterOptimizations)
- LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
- if (Changed)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
- }
- KernelSet llvm::omp::getDeviceKernels(Module &M) {
- // TODO: Create a more cross-platform way of determining device kernels.
- NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
- KernelSet Kernels;
- if (!MD)
- return Kernels;
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() < 2)
- continue;
- MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
- if (!KindID || KindID->getString() != "kernel")
- continue;
- Function *KernelFn =
- mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
- if (!KernelFn)
- continue;
- ++NumOpenMPTargetRegionKernels;
- Kernels.insert(KernelFn);
- }
- return Kernels;
- }
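- // A sketch of the metadata shape matched above (typical NVVM annotation;
- // names illustrative):
- //
- //   !nvvm.annotations = !{!0}
- //   !0 = !{ptr @kernel_fn, !"kernel", i32 1}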
- bool llvm::omp::containsOpenMP(Module &M) {
- Metadata *MD = M.getModuleFlag("openmp");
- if (!MD)
- return false;
- return true;
- }
- bool llvm::omp::isOpenMPDevice(Module &M) {
- Metadata *MD = M.getModuleFlag("openmp-device");
- if (!MD)
- return false;
- return true;
- }
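- // Both predicates key off module flags emitted by the frontend for OpenMP
- // (device) compilation; a hedged sketch of the expected IR:
- //
- //   !llvm.module.flags = !{!0, !1}
- //   !0 = !{i32 7, !"openmp", i32 50}
- //   !1 = !{i32 7, !"openmp-device", i32 50}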