AArch64FrameLowering.cpp

  1. //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains the AArch64 implementation of TargetFrameLowering class.
  10. //
  11. // On AArch64, stack frames are structured as follows:
  12. //
  13. // The stack grows downward.
  14. //
  15. // All of the individual frame areas shown below are optional, i.e. it's
  16. // possible to create a function such that a particular area isn't present
  17. // in the frame.
  18. //
  19. // At function entry, the "frame" looks as follows:
  20. //
  21. // | | Higher address
  22. // |-----------------------------------|
  23. // | |
  24. // | arguments passed on the stack |
  25. // | |
  26. // |-----------------------------------| <- sp
  27. // | | Lower address
  28. //
  29. //
  30. // After the prologue has run, the frame has the following general structure.
  31. // Note that this doesn't depict the case where a red-zone is used. Also,
  32. // technically the last frame area (VLAs) doesn't get created until the
  33. // main function body, after the prologue has run. However, it's depicted here
  34. // for completeness.
  35. //
  36. // | | Higher address
  37. // |-----------------------------------|
  38. // | |
  39. // | arguments passed on the stack |
  40. // | |
  41. // |-----------------------------------|
  42. // | |
  43. // | (Win64 only) varargs from reg |
  44. // | |
  45. // |-----------------------------------|
  46. // | |
  47. // | callee-saved gpr registers | <--.
  48. // | | | On Darwin platforms these
  49. // |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
  50. // | prev_lr | | (frame record first)
  51. // | prev_fp | <--'
  52. // | async context if needed |
  53. // | (a.k.a. "frame record") |
  54. // |-----------------------------------| <- fp(=x29)
  55. // | |
  56. // | callee-saved fp/simd/SVE regs |
  57. // | |
  58. // |-----------------------------------|
  59. // | |
  60. // | SVE stack objects |
  61. // | |
  62. // |-----------------------------------|
  63. // |.empty.space.to.make.part.below....|
  64. // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
  65. // |.the.standard.16-byte.alignment....| compile time; if present)
  66. // |-----------------------------------|
  67. // | |
  68. // | local variables of fixed size |
  69. // | including spill slots |
  70. // |-----------------------------------| <- bp(not defined by ABI,
  71. // |.variable-sized.local.variables....| LLVM chooses X19)
  72. // |.(VLAs)............................| (size of this area is unknown at
  73. // |...................................| compile time)
  74. // |-----------------------------------| <- sp
  75. // | | Lower address
  76. //
  77. //
  78. // To access the data in a frame, a constant offset to it must be computable
  79. // at compile time from one of the pointers (fp, bp, sp). The size
  80. // of the areas with a dotted background cannot be computed at compile time
  81. // if they are present, so all three of fp, bp and
  82. // sp must be set up in order to access all contents in the frame areas,
  83. // assuming all of the frame areas are non-empty.
  84. //
  85. // For most functions, some of the frame areas are empty. For those functions,
  86. // it may not be necessary to set up fp or bp:
  87. // * A base pointer is definitely needed when there are both VLAs and local
  88. // variables with more-than-default alignment requirements.
  89. // * A frame pointer is definitely needed when there are local variables with
  90. // more-than-default alignment requirements.
  91. //
  92. // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
  93. // callee-saved area, since the unwind encoding does not allow for encoding
  94. // this dynamically and existing tools depend on this layout. For other
  95. // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
  96. // area to allow SVE stack objects (allocated directly below the callee-saves,
  97. // if available) to be accessed directly from the framepointer.
  98. // The SVE spill/fill instructions have VL-scaled addressing modes such
  99. // as:
  100. // ldr z8, [fp, #-7 mul vl]
  101. // For SVE the size of the vector length (VL) is not known at compile-time, so
  102. // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
  103. // layout, we don't need to add an unscaled offset to the framepointer before
  104. // accessing the SVE object in the frame.
  105. //
  106. // In some cases when a base pointer is not strictly needed, it is generated
  107. // anyway when offsets from the frame pointer to access local variables become
  108. // so large that the offset can't be encoded in the immediate fields of loads
  109. // or stores.
  110. //
  111. // Outgoing function arguments must be at the bottom of the stack frame when
  112. // calling another function. If we do not have variable-sized stack objects, we
  113. // can allocate a "reserved call frame" area at the bottom of the local
  114. // variable area, large enough for all outgoing calls. If we do have VLAs, then
  115. // the stack pointer must be decremented and incremented around each call to
  116. // make space for the arguments below the VLAs.
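//
// As a rough illustrative sketch (not emitted verbatim by this pass), a call
// needing 32 bytes of outgoing argument space in a function with VLAs gets
// bracketed by explicit SP adjustments when the call-frame pseudos are
// eliminated:
//
//   sub sp, sp, #32        ; make room for outgoing arguments below the VLAs
//   stp x8, x9, [sp]       ; store the stack-passed arguments
//   bl  callee
//   add sp, sp, #32        ; release the argument area after the call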
  117. //
  118. // FIXME: also explain the redzone concept.
  119. //
  120. //===----------------------------------------------------------------------===//
  121. #include "AArch64FrameLowering.h"
  122. #include "AArch64InstrInfo.h"
  123. #include "AArch64MachineFunctionInfo.h"
  124. #include "AArch64RegisterInfo.h"
  125. #include "AArch64Subtarget.h"
  126. #include "AArch64TargetMachine.h"
  127. #include "MCTargetDesc/AArch64AddressingModes.h"
  128. #include "llvm/ADT/ScopeExit.h"
  129. #include "llvm/ADT/SmallVector.h"
  130. #include "llvm/ADT/Statistic.h"
  131. #include "llvm/CodeGen/LivePhysRegs.h"
  132. #include "llvm/CodeGen/MachineBasicBlock.h"
  133. #include "llvm/CodeGen/MachineFrameInfo.h"
  134. #include "llvm/CodeGen/MachineFunction.h"
  135. #include "llvm/CodeGen/MachineInstr.h"
  136. #include "llvm/CodeGen/MachineInstrBuilder.h"
  137. #include "llvm/CodeGen/MachineMemOperand.h"
  138. #include "llvm/CodeGen/MachineModuleInfo.h"
  139. #include "llvm/CodeGen/MachineOperand.h"
  140. #include "llvm/CodeGen/MachineRegisterInfo.h"
  141. #include "llvm/CodeGen/RegisterScavenging.h"
  142. #include "llvm/CodeGen/TargetInstrInfo.h"
  143. #include "llvm/CodeGen/TargetRegisterInfo.h"
  144. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  145. #include "llvm/CodeGen/WinEHFuncInfo.h"
  146. #include "llvm/IR/Attributes.h"
  147. #include "llvm/IR/CallingConv.h"
  148. #include "llvm/IR/DataLayout.h"
  149. #include "llvm/IR/DebugLoc.h"
  150. #include "llvm/IR/Function.h"
  151. #include "llvm/MC/MCAsmInfo.h"
  152. #include "llvm/MC/MCDwarf.h"
  153. #include "llvm/Support/CommandLine.h"
  154. #include "llvm/Support/Debug.h"
  155. #include "llvm/Support/ErrorHandling.h"
  156. #include "llvm/Support/LEB128.h"
  157. #include "llvm/Support/MathExtras.h"
  158. #include "llvm/Support/raw_ostream.h"
  159. #include "llvm/Target/TargetMachine.h"
  160. #include "llvm/Target/TargetOptions.h"
  161. #include <cassert>
  162. #include <cstdint>
  163. #include <iterator>
  164. #include <vector>
  165. using namespace llvm;
  166. #define DEBUG_TYPE "frame-info"
  167. static cl::opt<bool> EnableRedZone("aarch64-redzone",
  168. cl::desc("enable use of redzone on AArch64"),
  169. cl::init(false), cl::Hidden);
  170. static cl::opt<bool>
  171. ReverseCSRRestoreSeq("reverse-csr-restore-seq",
  172. cl::desc("reverse the CSR restore sequence"),
  173. cl::init(false), cl::Hidden);
  174. static cl::opt<bool> StackTaggingMergeSetTag(
  175. "stack-tagging-merge-settag",
  176. cl::desc("merge settag instruction in function epilog"), cl::init(true),
  177. cl::Hidden);
  178. static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
  179. cl::desc("sort stack allocations"),
  180. cl::init(true), cl::Hidden);
  181. cl::opt<bool> EnableHomogeneousPrologEpilog(
  182. "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
  183. cl::desc("Emit homogeneous prologue and epilogue for the size "
  184. "optimization (default = off)"));
  185. STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
  186. /// Returns how much of the incoming argument stack area (in bytes) we should
  187. /// clean up in an epilogue. For the C calling convention this will be 0, for
  188. /// guaranteed tail call conventions it can be positive (a normal return or a
  189. /// tail call to a function that uses less stack space for arguments) or
  190. /// negative (for a tail call to a function that needs more stack space than us
  191. /// for arguments).
  192. static int64_t getArgumentStackToRestore(MachineFunction &MF,
  193. MachineBasicBlock &MBB) {
  194. MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  195. bool IsTailCallReturn = false;
  196. if (MBB.end() != MBBI) {
  197. unsigned RetOpcode = MBBI->getOpcode();
  198. IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
  199. RetOpcode == AArch64::TCRETURNri ||
  200. RetOpcode == AArch64::TCRETURNriBTI;
  201. }
  202. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  203. int64_t ArgumentPopSize = 0;
  204. if (IsTailCallReturn) {
  205. MachineOperand &StackAdjust = MBBI->getOperand(1);
  206. // For a tail-call in a callee-pops-arguments environment, some or all of
  207. // the stack may actually be in use for the call's arguments, this is
  208. // calculated during LowerCall and consumed here...
  209. ArgumentPopSize = StackAdjust.getImm();
  210. } else {
  211. // ... otherwise the amount to pop is *all* of the argument space,
  212. // conveniently stored in the MachineFunctionInfo by
  213. // LowerFormalArguments. This will, of course, be zero for the C calling
  214. // convention.
  215. ArgumentPopSize = AFI->getArgumentStackToRestore();
  216. }
  217. return ArgumentPopSize;
  218. }
  219. static bool produceCompactUnwindFrame(MachineFunction &MF);
  220. static bool needsWinCFI(const MachineFunction &MF);
  221. static StackOffset getSVEStackSize(const MachineFunction &MF);
  222. /// Returns true if homogeneous prolog or epilog code can be emitted
  223. /// for the size optimization. If possible, a frame helper call is injected.
  224. /// When an Exit block is given, this check is for the epilog.
  225. bool AArch64FrameLowering::homogeneousPrologEpilog(
  226. MachineFunction &MF, MachineBasicBlock *Exit) const {
  227. if (!MF.getFunction().hasMinSize())
  228. return false;
  229. if (!EnableHomogeneousPrologEpilog)
  230. return false;
  231. if (ReverseCSRRestoreSeq)
  232. return false;
  233. if (EnableRedZone)
  234. return false;
  235. // TODO: Windows is not supported yet.
  236. if (needsWinCFI(MF))
  237. return false;
  238. // TODO: SVE is not supported yet.
  239. if (getSVEStackSize(MF))
  240. return false;
  241. // Bail on stack adjustment needed on return for simplicity.
  242. const MachineFrameInfo &MFI = MF.getFrameInfo();
  243. const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  244. if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
  245. return false;
  246. if (Exit && getArgumentStackToRestore(MF, *Exit))
  247. return false;
  248. return true;
  249. }
  250. /// Returns true if CSRs should be paired.
  251. bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  252. return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
  253. }
  254. /// This is the biggest offset to the stack pointer we can encode in aarch64
  255. /// instructions (without using a separate calculation and a temp register).
  256. /// Note that the exception here are vector stores/loads which cannot encode any
  257. /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
  258. static const unsigned DefaultSafeSPDisplacement = 255;
  259. /// Look at each instruction that references stack frames and return the stack
  260. /// size limit beyond which some of these instructions will require a scratch
  261. /// register during their expansion later.
  262. static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  263. // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  264. // range. We'll end up allocating an unnecessary spill slot a lot, but
  265. // realistically that's not a big deal at this stage of the game.
  266. for (MachineBasicBlock &MBB : MF) {
  267. for (MachineInstr &MI : MBB) {
  268. if (MI.isDebugInstr() || MI.isPseudo() ||
  269. MI.getOpcode() == AArch64::ADDXri ||
  270. MI.getOpcode() == AArch64::ADDSXri)
  271. continue;
  272. for (const MachineOperand &MO : MI.operands()) {
  273. if (!MO.isFI())
  274. continue;
  275. StackOffset Offset;
  276. if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
  277. AArch64FrameOffsetCannotUpdate)
  278. return 0;
  279. }
  280. }
  281. }
  282. return DefaultSafeSPDisplacement;
  283. }
  284. TargetStackID::Value
  285. AArch64FrameLowering::getStackIDForScalableVectors() const {
  286. return TargetStackID::ScalableVector;
  287. }
  288. /// Returns the size of the fixed object area (allocated next to sp on entry)
  289. /// On Win64 this may include a var args area and an UnwindHelp object for EH.
  290. static unsigned getFixedObjectSize(const MachineFunction &MF,
  291. const AArch64FunctionInfo *AFI, bool IsWin64,
  292. bool IsFunclet) {
  293. if (!IsWin64 || IsFunclet) {
  294. return AFI->getTailCallReservedStack();
  295. } else {
  296. if (AFI->getTailCallReservedStack() != 0)
  297. report_fatal_error("cannot generate ABI-changing tail call for Win64");
  298. // Var args are stored here in the primary function.
  299. const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
  300. // To support EH funclets we allocate an UnwindHelp object
  301. const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
  302. return alignTo(VarArgsArea + UnwindHelpObject, 16);
  303. }
  304. }
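// A worked example for getFixedObjectSize (values assumed for illustration):
// a variadic Win64 function with three named GPR arguments spills x3-x7 into
// the varargs area (5 * 8 = 40 bytes); with EH funclets present another 8
// bytes are reserved for the UnwindHelp object, and alignTo(48, 16) keeps the
// fixed object area at 48 bytes next to the incoming SP.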
  305. /// Returns the size of the entire SVE stack frame (callee saves + spills).
  306. static StackOffset getSVEStackSize(const MachineFunction &MF) {
  307. const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  308. return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
  309. }
  310. bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  311. if (!EnableRedZone)
  312. return false;
  313. // Don't use the red zone if the function explicitly asks us not to.
  314. // This is typically used for kernel code.
  315. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  316. const unsigned RedZoneSize =
  317. Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  318. if (!RedZoneSize)
  319. return false;
  320. const MachineFrameInfo &MFI = MF.getFrameInfo();
  321. const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  322. uint64_t NumBytes = AFI->getLocalStackSize();
  323. return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
  324. getSVEStackSize(MF));
  325. }
  326. /// hasFP - Return true if the specified function should have a dedicated frame
  327. /// pointer register.
  328. bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
  329. const MachineFrameInfo &MFI = MF.getFrameInfo();
  330. const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  331. // Win64 EH requires a frame pointer if funclets are present, as the locals
  332. // are accessed off the frame pointer in both the parent function and the
  333. // funclets.
  334. if (MF.hasEHFunclets())
  335. return true;
  336. // Retain behavior of always omitting the FP for leaf functions when possible.
  337. if (MF.getTarget().Options.DisableFramePointerElim(MF))
  338. return true;
  339. if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
  340. MFI.hasStackMap() || MFI.hasPatchPoint() ||
  341. RegInfo->hasStackRealignment(MF))
  342. return true;
  343. // With large callframes around we may need to use FP to access the scavenging
  344. // emergency spillslot.
  345. //
  346. // Unfortunately some calls to hasFP() like machine verifier ->
  347. // getReservedReg() -> hasFP in the middle of global isel are too early
  348. // to know the max call frame size. Hopefully conservatively returning "true"
  349. // in those cases is fine.
  350. // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  351. if (!MFI.isMaxCallFrameSizeComputed() ||
  352. MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
  353. return true;
  354. return false;
  355. }
  356. /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
  357. /// not required, we reserve argument space for call sites in the function
  358. /// immediately on entry to the current function. This eliminates the need for
  359. /// add/sub sp brackets around call sites. Returns true if the call frame is
  360. /// included as part of the stack frame.
  361. bool
  362. AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  363. return !MF.getFrameInfo().hasVarSizedObjects();
  364. }
  365. MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
  366. MachineFunction &MF, MachineBasicBlock &MBB,
  367. MachineBasicBlock::iterator I) const {
  368. const AArch64InstrInfo *TII =
  369. static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  370. DebugLoc DL = I->getDebugLoc();
  371. unsigned Opc = I->getOpcode();
  372. bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  373. uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
  374. if (!hasReservedCallFrame(MF)) {
  375. int64_t Amount = I->getOperand(0).getImm();
  376. Amount = alignTo(Amount, getStackAlign());
  377. if (!IsDestroy)
  378. Amount = -Amount;
  379. // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
  380. // doesn't have to pop anything), then the first operand will be zero too so
  381. // this adjustment is a no-op.
  382. if (CalleePopAmount == 0) {
  383. // FIXME: in-function stack adjustment for calls is limited to 24-bits
  384. // because there's no guaranteed temporary register available.
  385. //
  386. // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
  387. // 1) For offsets that fit in 12 bits, we use LSL #0.
  388. // 2) For offsets of 13 to 24 bits, we use two instructions. One uses
  389. // LSL #0, and the other uses LSL #12.
  390. //
  391. // Most call frames will be allocated at the start of a function so
  392. // this is OK, but it is a limitation that needs dealing with.
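// As a concrete illustration (offset chosen arbitrarily), an adjustment of
// 0x12345 bytes is materialized as two subtractions:
//   sub sp, sp, #0x12, lsl #12   ; subtracts 0x12000
//   sub sp, sp, #0x345           ; subtracts the remaining 0x345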
  393. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
  394. emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
  395. StackOffset::getFixed(Amount), TII);
  396. }
  397. } else if (CalleePopAmount != 0) {
  398. // If the calling convention demands that the callee pops arguments from the
  399. // stack, we want to add it back if we have a reserved call frame.
  400. assert(CalleePopAmount < 0xffffff && "call frame too large");
  401. emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
  402. StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  403. }
  404. return MBB.erase(I);
  405. }
  406. // Convenience function to create a DWARF expression for
  407. // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
  408. static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
  409. int NumBytes, int NumVGScaledBytes, unsigned VG,
  410. llvm::raw_string_ostream &Comment) {
  411. uint8_t buffer[16];
  412. if (NumBytes) {
  413. Expr.push_back(dwarf::DW_OP_consts);
  414. Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
  415. Expr.push_back((uint8_t)dwarf::DW_OP_plus);
  416. Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  417. }
  418. if (NumVGScaledBytes) {
  419. Expr.push_back((uint8_t)dwarf::DW_OP_consts);
  420. Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
  421. Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
  422. Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
  423. Expr.push_back(0);
  424. Expr.push_back((uint8_t)dwarf::DW_OP_mul);
  425. Expr.push_back((uint8_t)dwarf::DW_OP_plus);
  426. Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
  427. << std::abs(NumVGScaledBytes) << " * VG";
  428. }
  429. }
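// For appendVGScaledOffsetExpr, an illustrative example (values assumed):
// NumBytes = -16 and NumVGScaledBytes = -8 append the operations
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx <VG> 0, DW_OP_mul, DW_OP_plus
// so the expression evaluates Expr - 16 - 8 * VG and the comment string gains
// " - 16 - 8 * VG".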
  430. // Creates an MCCFIInstruction:
  431. // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
  432. MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
  433. const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
  434. int64_t NumBytes, NumVGScaledBytes;
  435. AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
  436. NumVGScaledBytes);
  437. std::string CommentBuffer = "sp";
  438. llvm::raw_string_ostream Comment(CommentBuffer);
  439. // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
  440. SmallString<64> Expr;
  441. Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
  442. Expr.push_back(0);
  443. appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
  444. TRI.getDwarfRegNum(AArch64::VG, true), Comment);
  445. // Wrap this into DW_CFA_def_cfa_expression.
  446. SmallString<64> DefCfaExpr;
  447. DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  448. uint8_t buffer[16];
  449. DefCfaExpr.append(buffer,
  450. buffer + encodeULEB128(Expr.size(), buffer));
  451. DefCfaExpr.append(Expr.str());
  452. return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
  453. Comment.str());
  454. }
  455. MCCFIInstruction AArch64FrameLowering::createCfaOffset(
  456. const TargetRegisterInfo &TRI, unsigned Reg,
  457. const StackOffset &OffsetFromDefCFA) const {
  458. int64_t NumBytes, NumVGScaledBytes;
  459. AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
  460. OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
  461. unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  462. // Non-scalable offsets can use DW_CFA_offset directly.
  463. if (!NumVGScaledBytes)
  464. return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
  465. std::string CommentBuffer;
  466. llvm::raw_string_ostream Comment(CommentBuffer);
  467. Comment << printReg(Reg, &TRI) << " @ cfa";
  468. // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  469. SmallString<64> OffsetExpr;
  470. appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
  471. TRI.getDwarfRegNum(AArch64::VG, true), Comment);
  472. // Wrap this into DW_CFA_expression
  473. SmallString<64> CfaExpr;
  474. CfaExpr.push_back(dwarf::DW_CFA_expression);
  475. uint8_t buffer[16];
  476. CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  477. CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  478. CfaExpr.append(OffsetExpr.str());
  479. return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
  480. }
  481. void AArch64FrameLowering::emitCalleeSavedFrameMoves(
  482. MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  483. MachineFunction &MF = *MBB.getParent();
  484. MachineFrameInfo &MFI = MF.getFrameInfo();
  485. const TargetSubtargetInfo &STI = MF.getSubtarget();
  486. const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  487. const TargetInstrInfo *TII = STI.getInstrInfo();
  488. DebugLoc DL = MBB.findDebugLoc(MBBI);
  489. // Add callee saved registers to move list.
  490. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  491. if (CSI.empty())
  492. return;
  493. for (const auto &Info : CSI) {
  494. Register Reg = Info.getReg();
  495. // Not all unwinders may know about SVE registers, so assume the lowest
  496. // common denominator.
  497. unsigned NewReg;
  498. if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
  499. Reg = NewReg;
  500. else
  501. continue;
  502. StackOffset Offset;
  503. if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
  504. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  505. Offset =
  506. StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
  507. StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
  508. } else {
  509. Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
  510. getOffsetOfLocalArea());
  511. }
  512. unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
  513. BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
  514. .addCFIIndex(CFIIndex)
  515. .setMIFlags(MachineInstr::FrameSetup);
  516. }
  517. }
  518. // Find a scratch register that we can use at the start of the prologue to
  519. // re-align the stack pointer. We avoid using callee-save registers since they
  520. // may appear to be free when this is called from canUseAsPrologue (during
  521. // shrink wrapping), but then no longer be free when this is called from
  522. // emitPrologue.
  523. //
  524. // FIXME: This is a bit conservative, since in the above case we could use one
  525. // of the callee-save registers as a scratch temp to re-align the stack pointer,
  526. // but we would then have to make sure that we were in fact saving at least one
  527. // callee-save register in the prologue, which is additional complexity that
  528. // doesn't seem worth the benefit.
  529. static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
  530. MachineFunction *MF = MBB->getParent();
  531. // If MBB is an entry block, use X9 as the scratch register
  532. if (&MF->front() == MBB)
  533. return AArch64::X9;
  534. const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  535. const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  536. LivePhysRegs LiveRegs(TRI);
  537. LiveRegs.addLiveIns(*MBB);
  538. // Mark callee saved registers as used so we will not choose them.
  539. const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  540. for (unsigned i = 0; CSRegs[i]; ++i)
  541. LiveRegs.addReg(CSRegs[i]);
  542. // Prefer X9 since it was historically used for the prologue scratch reg.
  543. const MachineRegisterInfo &MRI = MF->getRegInfo();
  544. if (LiveRegs.available(MRI, AArch64::X9))
  545. return AArch64::X9;
  546. for (unsigned Reg : AArch64::GPR64RegClass) {
  547. if (LiveRegs.available(MRI, Reg))
  548. return Reg;
  549. }
  550. return AArch64::NoRegister;
  551. }
  552. bool AArch64FrameLowering::canUseAsPrologue(
  553. const MachineBasicBlock &MBB) const {
  554. const MachineFunction *MF = MBB.getParent();
  555. MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  556. const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  557. const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  558. // Don't need a scratch register if we're not going to re-align the stack.
  559. if (!RegInfo->hasStackRealignment(*MF))
  560. return true;
  561. // Otherwise, we can use any block as long as it has a scratch register
  562. // available.
  563. return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
  564. }
  565. static bool windowsRequiresStackProbe(MachineFunction &MF,
  566. uint64_t StackSizeInBytes) {
  567. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  568. if (!Subtarget.isTargetWindows())
  569. return false;
  570. const Function &F = MF.getFunction();
  571. // TODO: When implementing stack protectors, take that into account
  572. // for the probe threshold.
  573. unsigned StackProbeSize = 4096;
  574. if (F.hasFnAttribute("stack-probe-size"))
  575. F.getFnAttribute("stack-probe-size")
  576. .getValueAsString()
  577. .getAsInteger(0, StackProbeSize);
  578. return (StackSizeInBytes >= StackProbeSize) &&
  579. !F.hasFnAttribute("no-stack-arg-probe");
  580. }
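// For example, a function carrying the IR attribute group
//   attributes #0 = { "stack-probe-size"="8192" }
// (an assumed value for illustration) only requires a probe once its stack
// size reaches 8192 bytes instead of the default 4096.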
  581. static bool needsWinCFI(const MachineFunction &MF) {
  582. const Function &F = MF.getFunction();
  583. return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
  584. F.needsUnwindTableEntry();
  585. }
  586. bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
  587. MachineFunction &MF, uint64_t StackBumpBytes) const {
  588. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  589. const MachineFrameInfo &MFI = MF.getFrameInfo();
  590. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  591. const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  592. if (homogeneousPrologEpilog(MF))
  593. return false;
  594. if (AFI->getLocalStackSize() == 0)
  595. return false;
  596. // For WinCFI, if optimizing for size, prefer to not combine the stack bump
  597. // (to force a stp with predecrement) to match the packed unwind format,
  598. // provided that there actually are any callee saved registers to merge the
  599. // decrement with.
  600. // This is potentially marginally slower, but allows using the packed
  601. // unwind format for functions that both have a local area and callee saved
  602. // registers. Using the packed unwind format notably reduces the size of
  603. // the unwind info.
  604. if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
  605. MF.getFunction().hasOptSize())
  606. return false;
  607. // 512 is the maximum immediate for stp/ldp that will be used for
  608. // callee-save save/restores
  609. if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
  610. return false;
  611. if (MFI.hasVarSizedObjects())
  612. return false;
  613. if (RegInfo->hasStackRealignment(MF))
  614. return false;
  615. // This isn't strictly necessary, but it simplifies things a bit since the
  616. // current RedZone handling code assumes the SP is adjusted by the
  617. // callee-save save/restore code.
  618. if (canUseRedZone(MF))
  619. return false;
  620. // When there is an SVE area on the stack, always allocate the
  621. // callee-saves and spills/locals separately.
  622. if (getSVEStackSize(MF))
  623. return false;
  624. return true;
  625. }
  626. bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
  627. MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
  628. if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
  629. return false;
  630. if (MBB.empty())
  631. return true;
  632. // Disable combined SP bump if the last instruction is an MTE tag store. It
  633. // is almost always better to merge SP adjustment into those instructions.
  634. MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
  635. MachineBasicBlock::iterator Begin = MBB.begin();
  636. while (LastI != Begin) {
  637. --LastI;
  638. if (LastI->isTransient())
  639. continue;
  640. if (!LastI->getFlag(MachineInstr::FrameDestroy))
  641. break;
  642. }
  643. switch (LastI->getOpcode()) {
  644. case AArch64::STGloop:
  645. case AArch64::STZGloop:
  646. case AArch64::STGOffset:
  647. case AArch64::STZGOffset:
  648. case AArch64::ST2GOffset:
  649. case AArch64::STZ2GOffset:
  650. return false;
  651. default:
  652. return true;
  653. }
  654. llvm_unreachable("unreachable");
  655. }
  656. // Given a load or a store instruction, generate an appropriate unwinding SEH
  657. // code on Windows.
  658. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
  659. const TargetInstrInfo &TII,
  660. MachineInstr::MIFlag Flag) {
  661. unsigned Opc = MBBI->getOpcode();
  662. MachineBasicBlock *MBB = MBBI->getParent();
  663. MachineFunction &MF = *MBB->getParent();
  664. DebugLoc DL = MBBI->getDebugLoc();
  665. unsigned ImmIdx = MBBI->getNumOperands() - 1;
  666. int Imm = MBBI->getOperand(ImmIdx).getImm();
  667. MachineInstrBuilder MIB;
  668. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  669. const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  670. switch (Opc) {
  671. default:
  672. llvm_unreachable("No SEH Opcode for this instruction");
  673. case AArch64::LDPDpost:
  674. Imm = -Imm;
  675. LLVM_FALLTHROUGH;
  676. case AArch64::STPDpre: {
  677. unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
  678. unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
  679. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
  680. .addImm(Reg0)
  681. .addImm(Reg1)
  682. .addImm(Imm * 8)
  683. .setMIFlag(Flag);
  684. break;
  685. }
  686. case AArch64::LDPXpost:
  687. Imm = -Imm;
  688. LLVM_FALLTHROUGH;
  689. case AArch64::STPXpre: {
  690. Register Reg0 = MBBI->getOperand(1).getReg();
  691. Register Reg1 = MBBI->getOperand(2).getReg();
  692. if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
  693. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
  694. .addImm(Imm * 8)
  695. .setMIFlag(Flag);
  696. else
  697. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
  698. .addImm(RegInfo->getSEHRegNum(Reg0))
  699. .addImm(RegInfo->getSEHRegNum(Reg1))
  700. .addImm(Imm * 8)
  701. .setMIFlag(Flag);
  702. break;
  703. }
  704. case AArch64::LDRDpost:
  705. Imm = -Imm;
  706. LLVM_FALLTHROUGH;
  707. case AArch64::STRDpre: {
  708. unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
  709. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
  710. .addImm(Reg)
  711. .addImm(Imm)
  712. .setMIFlag(Flag);
  713. break;
  714. }
  715. case AArch64::LDRXpost:
  716. Imm = -Imm;
  717. LLVM_FALLTHROUGH;
  718. case AArch64::STRXpre: {
  719. unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
  720. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
  721. .addImm(Reg)
  722. .addImm(Imm)
  723. .setMIFlag(Flag);
  724. break;
  725. }
  726. case AArch64::STPDi:
  727. case AArch64::LDPDi: {
  728. unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
  729. unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
  730. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
  731. .addImm(Reg0)
  732. .addImm(Reg1)
  733. .addImm(Imm * 8)
  734. .setMIFlag(Flag);
  735. break;
  736. }
  737. case AArch64::STPXi:
  738. case AArch64::LDPXi: {
  739. Register Reg0 = MBBI->getOperand(0).getReg();
  740. Register Reg1 = MBBI->getOperand(1).getReg();
  741. if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
  742. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
  743. .addImm(Imm * 8)
  744. .setMIFlag(Flag);
  745. else
  746. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
  747. .addImm(RegInfo->getSEHRegNum(Reg0))
  748. .addImm(RegInfo->getSEHRegNum(Reg1))
  749. .addImm(Imm * 8)
  750. .setMIFlag(Flag);
  751. break;
  752. }
  753. case AArch64::STRXui:
  754. case AArch64::LDRXui: {
  755. int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
  756. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
  757. .addImm(Reg)
  758. .addImm(Imm * 8)
  759. .setMIFlag(Flag);
  760. break;
  761. }
  762. case AArch64::STRDui:
  763. case AArch64::LDRDui: {
  764. unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
  765. MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
  766. .addImm(Reg)
  767. .addImm(Imm * 8)
  768. .setMIFlag(Flag);
  769. break;
  770. }
  771. }
  772. auto I = MBB->insertAfter(MBBI, MIB);
  773. return I;
  774. }
  775. // Fix up the SEH opcode associated with the save/restore instruction.
  776. static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
  777. unsigned LocalStackSize) {
  778. MachineOperand *ImmOpnd = nullptr;
  779. unsigned ImmIdx = MBBI->getNumOperands() - 1;
  780. switch (MBBI->getOpcode()) {
  781. default:
  782. llvm_unreachable("Fix the offset in the SEH instruction");
  783. case AArch64::SEH_SaveFPLR:
  784. case AArch64::SEH_SaveRegP:
  785. case AArch64::SEH_SaveReg:
  786. case AArch64::SEH_SaveFRegP:
  787. case AArch64::SEH_SaveFReg:
  788. ImmOpnd = &MBBI->getOperand(ImmIdx);
  789. break;
  790. }
  791. if (ImmOpnd)
  792. ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
  793. }
  794. // Convert a callee-save register save/restore instruction to also do the
  795. // stack pointer decrement/increment that allocates/deallocates the callee-save
  796. // stack area, by converting the store/load to its pre/post-increment version.
  797. static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
  798. MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
  799. const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
  800. bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
  801. // Ignore instructions that do not operate on SP, i.e. shadow call stack
  802. // instructions and associated CFI instruction.
  803. while (MBBI->getOpcode() == AArch64::STRXpost ||
  804. MBBI->getOpcode() == AArch64::LDRXpre ||
  805. MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
  806. if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
  807. assert(MBBI->getOperand(0).getReg() != AArch64::SP);
  808. ++MBBI;
  809. }
  810. unsigned NewOpc;
  811. switch (MBBI->getOpcode()) {
  812. default:
  813. llvm_unreachable("Unexpected callee-save save/restore opcode!");
  814. case AArch64::STPXi:
  815. NewOpc = AArch64::STPXpre;
  816. break;
  817. case AArch64::STPDi:
  818. NewOpc = AArch64::STPDpre;
  819. break;
  820. case AArch64::STPQi:
  821. NewOpc = AArch64::STPQpre;
  822. break;
  823. case AArch64::STRXui:
  824. NewOpc = AArch64::STRXpre;
  825. break;
  826. case AArch64::STRDui:
  827. NewOpc = AArch64::STRDpre;
  828. break;
  829. case AArch64::STRQui:
  830. NewOpc = AArch64::STRQpre;
  831. break;
  832. case AArch64::LDPXi:
  833. NewOpc = AArch64::LDPXpost;
  834. break;
  835. case AArch64::LDPDi:
  836. NewOpc = AArch64::LDPDpost;
  837. break;
  838. case AArch64::LDPQi:
  839. NewOpc = AArch64::LDPQpost;
  840. break;
  841. case AArch64::LDRXui:
  842. NewOpc = AArch64::LDRXpost;
  843. break;
  844. case AArch64::LDRDui:
  845. NewOpc = AArch64::LDRDpost;
  846. break;
  847. case AArch64::LDRQui:
  848. NewOpc = AArch64::LDRQpost;
  849. break;
  850. }
  851. // Get rid of the SEH code associated with the old instruction.
  852. if (NeedsWinCFI) {
  853. auto SEH = std::next(MBBI);
  854. if (AArch64InstrInfo::isSEHInstruction(*SEH))
  855. SEH->eraseFromParent();
  856. }
  857. TypeSize Scale = TypeSize::Fixed(1);
  858. unsigned Width;
  859. int64_t MinOffset, MaxOffset;
  860. bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
  861. NewOpc, Scale, Width, MinOffset, MaxOffset);
  862. (void)Success;
  863. assert(Success && "unknown load/store opcode");
  864. // If the first store isn't right where we want SP then we can't fold the
  865. // update in so create a normal arithmetic instruction instead.
  866. if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
  867. CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
  868. emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
  869. StackOffset::getFixed(CSStackSizeInc), TII,
  870. InProlog ? MachineInstr::FrameSetup
  871. : MachineInstr::FrameDestroy);
  872. return std::prev(MBBI);
  873. }
  874. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  875. MIB.addReg(AArch64::SP, RegState::Define);
  876. // Copy all operands other than the immediate offset.
  877. unsigned OpndIdx = 0;
  878. for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
  879. ++OpndIdx)
  880. MIB.add(MBBI->getOperand(OpndIdx));
  881. assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
  882. "Unexpected immediate offset in first/last callee-save save/restore "
  883. "instruction!");
  884. assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
  885. "Unexpected base register in callee-save save/restore instruction!");
  886. assert(CSStackSizeInc % Scale == 0);
  887. MIB.addImm(CSStackSizeInc / (int)Scale);
  888. MIB.setMIFlags(MBBI->getFlags());
  889. MIB.setMemRefs(MBBI->memoperands());
  890. // Generate a new SEH code that corresponds to the new instruction.
  891. if (NeedsWinCFI) {
  892. *HasWinCFI = true;
  893. InsertSEH(*MIB, *TII,
  894. InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
  895. }
  896. return std::prev(MBB.erase(MBBI));
  897. }
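// As a sketch of a typical prologue (assumed values), with a CSStackSizeInc of
// -16 the first callee-save store
//   stp x29, x30, [sp, #0]
// is rewritten above to the pre-increment form that also allocates the area:
//   stp x29, x30, [sp, #-16]!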
  898. // Fix up callee-save register save/restore instructions to take the combined
  899. // SP bump into account by adding the local stack size to the stack offsets.
  900. static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
  901. uint64_t LocalStackSize,
  902. bool NeedsWinCFI,
  903. bool *HasWinCFI) {
  904. if (AArch64InstrInfo::isSEHInstruction(MI))
  905. return;
  906. unsigned Opc = MI.getOpcode();
  907. // Ignore instructions that do not operate on SP, i.e. shadow call stack
  908. // instructions and associated CFI instruction.
  909. if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
  910. Opc == AArch64::CFI_INSTRUCTION) {
  911. if (Opc != AArch64::CFI_INSTRUCTION)
  912. assert(MI.getOperand(0).getReg() != AArch64::SP);
  913. return;
  914. }
  915. unsigned Scale;
  916. switch (Opc) {
  917. case AArch64::STPXi:
  918. case AArch64::STRXui:
  919. case AArch64::STPDi:
  920. case AArch64::STRDui:
  921. case AArch64::LDPXi:
  922. case AArch64::LDRXui:
  923. case AArch64::LDPDi:
  924. case AArch64::LDRDui:
  925. Scale = 8;
  926. break;
  927. case AArch64::STPQi:
  928. case AArch64::STRQui:
  929. case AArch64::LDPQi:
  930. case AArch64::LDRQui:
  931. Scale = 16;
  932. break;
  933. default:
  934. llvm_unreachable("Unexpected callee-save save/restore opcode!");
  935. }
  936. unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  937. assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
  938. "Unexpected base register in callee-save save/restore instruction!");
  939. // Last operand is immediate offset that needs fixing.
  940. MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  941. // All generated opcodes have scaled offsets.
  942. assert(LocalStackSize % Scale == 0);
  943. OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
  944. if (NeedsWinCFI) {
  945. *HasWinCFI = true;
  946. auto MBBI = std::next(MachineBasicBlock::iterator(MI));
  947. assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
  948. assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
  949. "Expecting a SEH instruction");
  950. fixupSEHOpcode(MBBI, LocalStackSize);
  951. }
  952. }
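
// Worked example (illustrative): with LocalStackSize = 64 and Scale = 8, a
// save such as "stp x22, x21, [sp, #32]" (scaled immediate 4) is rewritten to
// "stp x22, x21, [sp, #96]" (scaled immediate 12), because the combined SP
// bump placed the callee-save area 64 bytes further away from the new SP.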
static void adaptForLdStOpt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator FirstSPPopI,
                            MachineBasicBlock::iterator LastPopI) {
  // Sometimes (when we restore in the same order as we save), we can end up
  // with code like this:
  //
  //   ldp x26, x25, [sp]
  //   ldp x24, x23, [sp, #16]
  //   ldp x22, x21, [sp, #32]
  //   ldp x20, x19, [sp, #48]
  //   add sp, sp, #64
  //
  // In this case, it is always better to put the first ldp at the end, so
  // that the load-store optimizer can run and merge the ldp and the add into
  // a post-index ldp.
  // If we managed to grab the first pop instruction, move it to the end.
  if (ReverseCSRRestoreSeq)
    MBB.splice(FirstSPPopI, &MBB, LastPopI);
  // We should end up with something like this now:
  //
  //   ldp x24, x23, [sp, #16]
  //   ldp x22, x21, [sp, #32]
  //   ldp x20, x19, [sp, #48]
  //   ldp x26, x25, [sp]
  //   add sp, sp, #64
  //
  // and the load-store optimizer can merge the last two instructions into:
  //
  //   ldp x26, x25, [sp], #64
  //
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  }
}
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool needsFrameMoves =
      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress()) {
    unsigned PACI;
    if (MFnI.shouldSignWithBKey()) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
    } else {
      PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
    }

    auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
    if (Subtarget.hasPAuth())
      MI.addReg(AArch64::LR, RegState::Define)
          .addReg(AArch64::LR)
          .addReg(AArch64::SP, RegState::InternalRead);
    MI.setMIFlag(MachineInstr::FrameSetup);

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }
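
  // Illustrative sketch of the result (assuming the A key and no explicit
  // PAuth operands): the prologue now begins with
  //   paciasp
  //   .cfi_negate_ra_state
  // so the unwinder knows the return address in this frame is signed.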
  // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient; a Swift kernel is assumed to initialize the TBI
  // bits so that this remains true.
  if (HasFP && AFI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
        // The special symbol below is absolute and has a *value* that can be
        // combined with the frame pointer to signal an extended frame.
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
            .addExternalSymbol("swift_async_extendedFramePointerFlags",
                               AArch64II::MO_GOT);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
            .addUse(AArch64::FP)
            .addUse(AArch64::X16)
            .addImm(Subtarget.isTargetILP32() ? 32 : 0);
        break;
      }
      LLVM_FALLTHROUGH;
    case SwiftAsyncFramePointerMode::Always:
      // ORR x29, x29, #0x1000_0000_0000_0000
      // (0x1100 below is the encoded logical-immediate form of that mask.)
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
          .addUse(AArch64::FP)
          .addImm(0x1100)
          .setMIFlag(MachineInstr::FrameSetup);
      break;
    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }
  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match SP value after prologue.
  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet.  We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes)
      return;
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (needsFrameMoves) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }
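
  // Illustrative note: a leaf function with, say, 96 bytes of locals that can
  // use the red zone takes the early-return path above and allocates nothing;
  // its locals simply live below SP inside the 128-byte red zone.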
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    NumBytes = 0;
  } else if (HomPrologEpilog) {
    // Stack has been already adjusted.
    NumBytes -= PrologueSaveSize;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  MachineBasicBlock::iterator End = MBB.end();
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }
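
  // Illustrative sketch: with a 16-byte callee-save area and 32 bytes of
  // locals, the combined path above emits a single "sub sp, sp, #48" and the
  // loop then rewrites the callee-save offsets (via
  // fixupCalleeSaveRestoreStackOffset) so they are relative to the fully
  // dropped SP.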
  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    if (AFI->hasSwiftAsyncContext()) {
      // Before we update the live FP we have to ensure there's a valid (or
      // null) asynchronous context in its slot just before FP in the frame
      // record, so store it now.
      const auto &Attrs = MF.getFunction().getAttributes();
      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
      if (HaveInitialContext)
        MBB.addLiveIn(AArch64::X22);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
          .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
          .addUse(AArch64::SP)
          .addImm(FPOffset - 8)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    if (HomPrologEpilog) {
      auto Prolog = MBBI;
      --Prolog;
      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
    } else {
      // Issue    sub fp, sp, FPOffset or
      //          mov fp, sp   when FPOffset is zero.
      // Note: All stores of callee-saved registers are marked as "FrameSetup".
      // This code marks the instruction(s) that set the FP also.
      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                      StackOffset::getFixed(FPOffset), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    }
  }
  if (windowsRequiresStackProbe(MF, NumBytes)) {
    uint64_t NumWords = NumBytes >> 4;
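    // Note (explanatory, not from the original comments): __chkstk on
    // Windows/AArch64 expects the allocation size in X15 measured in 16-byte
    // units, which is why NumBytes is divided by 16 here and later scaled back
    // up with "sub sp, sp, x15, uxtx #4" below.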
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");

      uint32_t LowNumWords = NumWords & 0xFFFF;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    switch (MF.getTarget().getCodeModel()) {
    case CodeModel::Tiny:
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
          .addExternalSymbol("__chkstk")
          .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
          .addReg(AArch64::X16, RegState::Define)
          .addExternalSymbol("__chkstk")
          .addExternalSymbol("__chkstk")
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }

      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
          .addReg(AArch64::X16, RegState::Kill)
          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    }

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP, RegState::Kill)
        .addReg(AArch64::X15, RegState::Kill)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
        .setMIFlags(MachineInstr::FrameSetup);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
          .addImm(NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }
    NumBytes = 0;
  }
  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;

  // Process the SVE callee-saves to determine what space needs to be
  // allocated.
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    // Find callee save instructions in frame.
    CalleeSavesBegin = MBBI;
    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
      ++MBBI;
    CalleeSavesEnd = MBBI;

    AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
    AllocateAfter = SVEStackSize - AllocateBefore;
  }

  // Allocate space for the callee saves (if any).
  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
                  -AllocateBefore, TII,
                  MachineInstr::FrameSetup);

  // Finally allocate remaining SVE stack space.
  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
                  -AllocateAfter, TII,
                  MachineInstr::FrameSetup);

  // Allocate space for the rest of the frame.
  if (NumBytes) {
    // Alignment is required for the parent frame, not the funclet
    const bool NeedsRealignment =
        !IsFunclet && RegInfo->hasStackRealignment(MF);
    unsigned scratchSPReg = AArch64::SP;

    if (NeedsRealignment) {
      scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
      assert(scratchSPReg != AArch64::NoRegister);
    }

    // If we're a leaf function, try using the red zone.
    if (!canUseRedZone(MF))
      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
      // the correct value here, as NumBytes also includes padding bytes,
      // which shouldn't be counted here.
      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);

    if (NeedsRealignment) {
      const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
      assert(NrBitsToZero > 1);
      assert(scratchSPReg != AArch64::SP);

      // SUB X9, SP, NumBytes
      //   -- X9 is temporary register, so shouldn't contain any live data here,
      //   -- free to use. This is already produced by emitFrameOffset above.
      // AND SP, X9, 0b11111...0000
      // The logical immediates have a non-trivial encoding. The following
      // formula computes the encoded immediate with all ones but
      // NrBitsToZero zero bits as least significant bits.
      uint32_t andMaskEncoded = (1 << 12)                         // = N
                                | ((64 - NrBitsToZero) << 6)      // immr
                                | ((64 - NrBitsToZero - 1) << 0); // imms
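      // Worked example (explanatory, not from the original comments): for a
      // 16-byte MaxAlign, NrBitsToZero is 4, so andMaskEncoded is
      // 0x1000 | (60 << 6) | 59 = 0x1F3B, which the ANDXri below decodes to
      // the mask 0xFFFFFFFFFFFFFFF0, i.e. SP aligned down to 16 bytes.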
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
          .addReg(scratchSPReg, RegState::Kill)
          .addImm(andMaskEncoded);
      AFI->setStackRealigned(true);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(NumBytes & andMaskEncoded)
            .setMIFlag(MachineInstr::FrameSetup);
      }
    }
  }
  // If we need a base pointer, set it up here. It's whatever the value of the
  // stack pointer is at this point. Any variable size objects will be allocated
  // after this, so we can still use the base pointer to reference locals.
  //
  // FIXME: Clarify FrameSetup flags here.
  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
  // needed.
  // For funclets the BP belongs to the containing function.
  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                     false);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // The very last FrameSetup instruction indicates the end of prologue. Emit a
  // SEH opcode indicating the prologue end.
  if (NeedsWinCFI && HasWinCFI) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // SEH funclets are passed the frame pointer in X1. If the parent
  // function uses the base register, then the base register is used
  // directly, and is not retrieved from X1.
  if (IsFunclet && F.hasPersonalityFn()) {
    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
    if (isAsynchronousEHPersonality(Per)) {
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
          .addReg(AArch64::X1)
          .setMIFlag(MachineInstr::FrameSetup);
      MBB.addLiveIn(AArch64::X1);
    }
  }
  if (needsFrameMoves) {
    // An example of the prologue:
    //
    //     .globl __foo
    //     .align 2
    //  __foo:
    // Ltmp0:
    //     .cfi_startproc
    //     .cfi_personality 155, ___gxx_personality_v0
    // Leh_func_begin:
    //     .cfi_lsda 16, Lexception33
    //
    //     stp  xa,bx, [sp, -#offset]!
    //     ...
    //     stp  x28, x27, [sp, #offset-32]
    //     stp  fp, lr, [sp, #offset-16]
    //     add  fp, sp, #offset - 16
    //     sub  sp, sp, #1360
    //
    // The Stack:
    //       +-------------------------------------------+
    // 10000 | ........ | ........ | ........ | ........  |
    // 10004 | ........ | ........ | ........ | ........  |
    //       +-------------------------------------------+
    // 10008 | ........ | ........ | ........ | ........  |
    // 1000c | ........ | ........ | ........ | ........  |
    //       +===========================================+
    // 10010 |               X28 Register                |
    // 10014 |               X28 Register                |
    //       +-------------------------------------------+
    // 10018 |               X27 Register                |
    // 1001c |               X27 Register                |
    //       +===========================================+
    // 10020 |              Frame Pointer                |
    // 10024 |              Frame Pointer                |
    //       +-------------------------------------------+
    // 10028 |              Link Register                |
    // 1002c |              Link Register                |
    //       +===========================================+
    // 10030 | ........ | ........ | ........ | ........  |
    // 10034 | ........ | ........ | ........ | ........  |
    //       +-------------------------------------------+
    // 10038 | ........ | ........ | ........ | ........  |
    // 1003c | ........ | ........ | ........ | ........  |
    //       +-------------------------------------------+
    //
    //     [sp] = 10030        ::    >>initial value<<
    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
    //     fp = sp == 10020    ::  mov fp, sp
    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
    //     sp == 10010         ::    >>final value<<
    //
    // The frame pointer (w29) points to address 10020. If we use an offset of
    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
    // for w27, and -32 for w28:
    //
    //  Ltmp1:
    //     .cfi_def_cfa w29, 16
    //  Ltmp2:
    //     .cfi_offset w30, -8
    //  Ltmp3:
    //     .cfi_offset w29, -16
    //  Ltmp4:
    //     .cfi_offset w27, -24
    //  Ltmp5:
    //     .cfi_offset w28, -32
    if (HasFP) {
      const int OffsetToFirstCalleeSaveFromFP =
          AFI->getCalleeSaveBaseToFrameRecordOffset() -
          AFI->getCalleeSavedStackSize();
      Register FramePtr = RegInfo->getFrameRegister(MF);

      // Define the current CFA rule to use the provided FP.
      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
          nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    } else {
      unsigned CFIIndex;
      if (SVEStackSize) {
        const TargetSubtargetInfo &STI = MF.getSubtarget();
        const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
        StackOffset TotalSize =
            SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
        CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
      } else {
        // Encode the stack size of the leaf function.
        CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
      }
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // Now emit the moves for whatever callee saved regs we have (including FP,
    // LR if those are saved).
    emitCalleeSavedFrameMoves(MBB, MBBI);
  }
}
static void InsertReturnAddressAuth(MachineFunction &MF,
                                    MachineBasicBlock &MBB) {
  const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
  if (!MFI.shouldSignReturnAddress())
    return;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc DL;
  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  // The AUTIASP instruction assembles to a hint instruction before v8.3a, so
  // this instruction can safely be used for any v8a architecture.
  // From v8.3a onwards there are optimised authenticate LR and return
  // instructions, namely RETA{A,B}, that can be used instead.
  if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
      MBBI->getOpcode() == AArch64::RET_ReallyLR) {
    BuildMI(MBB, MBBI, DL,
            TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
        .copyImplicitOps(*MBBI);
    MBB.erase(MBBI);
  } else {
    BuildMI(
        MBB, MBBI, DL,
        TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
        .setMIFlag(MachineInstr::FrameDestroy);
  }
}
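
// Illustrative sketch: with PAuth available, the "autiasp; ret" pair collapses
// into a single combined "retaa" (or "retab" for the B key); without PAuth the
// epilogue keeps a separate AUTIASP/AUTIBSP hint in front of the return.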
static bool isFuncletReturnInstr(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::CATCHRET:
  case AArch64::CLEANUPRET:
    return true;
  }
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL;
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  bool IsFunclet = false;
  auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });

  if (MBB.end() != MBBI) {
    DL = MBBI->getDebugLoc();
    IsFunclet = isFuncletReturnInstr(*MBBI);
  }

  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // How much of the stack used by incoming arguments this function is expected
  // to restore in this particular epilogue.
  int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);

  // The stack frame should be like below,
  //
  //      ----------------------                       ---
  //      |                      |                      |
  //      | BytesInStackArgArea  |              CalleeArgStackSize
  //      | (NumReusableBytes)   |                (of tail call)
  //      |                      |                     ---
  //      |                      |                      |
  //      ---------------------|           ---          |
  //      |                      |          |           |
  //      |   CalleeSavedReg     |          |           |
  //      | (CalleeSavedStackSize)|         |           |
  //      |                      |          |           |
  //      ---------------------|           |        NumBytes
  //      |                      |      StackSize  (StackAdjustUp)
  //      |   LocalStackSize     |          |           |
  //      | (covering callee     |          |           |
  //      |       args)          |          |           |
  //      |                      |          |           |
  //      ----------------------           ---         ---
  //
  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
  //             = StackSize + ArgumentPopSize
  //
  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
  // it as the 2nd argument of AArch64ISD::TC_RETURN.

  auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  int64_t AfterCSRPopSize = ArgumentStackToRestore;
  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // We cannot rely on the local stack size set in emitPrologue if the function
  // has funclets, as funclets have different local stack size requirements,
  // and the current value set in emitPrologue may be that of the containing
  // function.
  if (MF.hasEHFunclets())
    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  if (homogeneousPrologEpilog(MF, &MBB)) {
    assert(!NeedsWinCFI);
    auto LastPopI = MBB.getFirstTerminator();
    if (LastPopI != MBB.begin()) {
      auto HomogeneousEpilog = std::prev(LastPopI);
      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
        LastPopI = HomogeneousEpilog;
    }

    // Adjust local stack
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(AFI->getLocalStackSize()), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI);

    // SP has been already adjusted while restoring callee save regs.
    // We've bailed-out the case with adjusting SP for arguments.
    assert(AfterCSRPopSize == 0);
    return;
  }
  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
  // Assume we can't combine the last pop with the sp restore.

  if (!CombineSPBump && PrologueSaveSize != 0) {
    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
    while (AArch64InstrInfo::isSEHInstruction(*Pop))
      Pop = std::prev(Pop);
    // Converting the last ldp to a post-index ldp is valid only if the last
    // ldp's offset is 0.
    const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
    // If the offset is 0 and the AfterCSR pop is not actually trying to
    // allocate more stack for arguments (in space that an untimely interrupt
    // may clobber), convert it to a post-index ldp.
    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
      convertCalleeSaveRestoreToSPPrePostIncDec(
          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
    else {
      // If not, make sure to emit an add after the last ldp.
      // We're doing this by transferring the size to be restored from the
      // adjustment *before* the CSR pops to the adjustment *after* the CSR
      // pops.
      AfterCSRPopSize += PrologueSaveSize;
    }
  }
  // Move past the restores of the callee-saved registers.
  // If we plan on combining the sp bump of the local stack size and the callee
  // save stack size, we might need to adjust the CSR save and restore offsets.
  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastPopI != Begin) {
    --LastPopI;
    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
        IsSVECalleeSave(LastPopI)) {
      ++LastPopI;
      break;
    } else if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
  }

  if (MF.hasWinCFI()) {
    // If the prologue didn't contain any SEH opcodes and didn't set the
    // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
    // EpilogStart - to avoid generating CFI for functions that don't need it.
    // (And as we didn't generate any prologue at all, it would be asymmetrical
    // to the epilogue.) By the end of the function, we assert that
    // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
    HasWinCFI = true;
    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
    // We need to reset FP to its untagged state on return. Bit 60 is currently
    // used to show the presence of an extended frame.

    // BIC x29, x29, #0x1000_0000_0000_0000
    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
            AArch64::FP)
        .addUse(AArch64::FP)
        .addImm(0x10fe)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // If there is a single SP update, insert it before the ret and we're done.
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
                    TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
                    &HasWinCFI);
    if (HasWinCFI)
      BuildMI(MBB, MBB.getFirstTerminator(), DL,
              TII->get(AArch64::SEH_EpilogEnd))
          .setMIFlag(MachineInstr::FrameDestroy);
    return;
  }
  NumBytes -= PrologueSaveSize;
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Process the SVE callee-saves to determine what space needs to be
  // deallocated.
  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    RestoreBegin = std::prev(RestoreEnd);
    while (RestoreBegin != MBB.begin() &&
           IsSVECalleeSave(std::prev(RestoreBegin)))
      --RestoreBegin;

    assert(IsSVECalleeSave(RestoreBegin) &&
           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");

    StackOffset CalleeSavedSizeAsOffset =
        StackOffset::getScalable(CalleeSavedSize);
    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
    DeallocateAfter = CalleeSavedSizeAsOffset;
  }

  // Deallocate the SVE area.
  if (SVEStackSize) {
    if (AFI->isStackRealigned()) {
      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
        // Set SP to start of SVE callee-save area from which they can
        // be reloaded. The code below will deallocate the stack space
        // by moving FP -> SP.
        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
                        StackOffset::getScalable(-CalleeSavedSize), TII,
                        MachineInstr::FrameDestroy);
    } else {
      if (AFI->getSVECalleeSavedStackSize()) {
        // Deallocate the non-SVE locals first before we can deallocate (and
        // restore callee saves) from the SVE area.
        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                        StackOffset::getFixed(NumBytes), TII,
                        MachineInstr::FrameDestroy);
        NumBytes = 0;
      }

      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                      DeallocateBefore, TII, MachineInstr::FrameDestroy);

      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                      DeallocateAfter, TII, MachineInstr::FrameDestroy);
    }
  }
  if (!hasFP(MF)) {
    bool RedZone = canUseRedZone(MF);
    // If this was a redzone leaf function, we don't need to restore the
    // stack pointer (but we may need to pop stack args for fastcc).
    if (RedZone && AfterCSRPopSize == 0)
      return;

    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
    if (NoCalleeSaveRestore)
      StackRestoreBytes += AfterCSRPopSize;

    // If we were able to combine the local stack pop with the argument pop,
    // then we're done.
    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;

    // If we're done after this, make sure to help the load store optimizer.
    if (Done)
      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);

    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(StackRestoreBytes), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
    if (Done) {
      if (HasWinCFI) {
        BuildMI(MBB, MBB.getFirstTerminator(), DL,
                TII->get(AArch64::SEH_EpilogEnd))
            .setMIFlag(MachineInstr::FrameDestroy);
      }
      return;
    }

    NumBytes = 0;
  }

  // Restore the original stack pointer.
  // FIXME: Rather than doing the math here, we should instead just use
  // non-post-indexed loads for the restores if we aren't actually going to
  // be able to save any instructions.
  if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
    emitFrameOffset(
        MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
        StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
        TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
  } else if (NumBytes)
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(NumBytes), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI);

  // This must be placed after the callee-save restore code because that code
  // assumes the SP is at the same location as it was after the callee-save save
  // code in the prologue.
  if (AfterCSRPopSize) {
    assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
                                  "interrupt may have clobbered");
    // Find an insertion point for the first ldp so that it goes before the
    // shadow call stack epilog instruction. This ensures that the restore of
    // lr from x18 is placed after the restore from sp.
    auto FirstSPPopI = MBB.getFirstTerminator();
    while (FirstSPPopI != Begin) {
      auto Prev = std::prev(FirstSPPopI);
      if (Prev->getOpcode() != AArch64::LDRXpre ||
          Prev->getOperand(0).getReg() == AArch64::SP)
        break;
      FirstSPPopI = Prev;
    }

    adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);

    emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(AfterCSRPopSize), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
  }
  if (HasWinCFI)
    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
        .setMIFlag(MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
StackOffset
AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                             Register &FrameReg) const {
  return resolveFrameIndexReference(
      MF, FI, FrameReg,
      /*PreferFP=*/
      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
      /*ForSimm=*/false);
}

StackOffset
AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
                                                     int FI) const {
  return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
}

static StackOffset getFPOffset(const MachineFunction &MF,
                               int64_t ObjectOffset) {
  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject =
      getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
  int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
  int64_t FPAdjust =
      CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
  return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
}

static StackOffset getStackOffset(const MachineFunction &MF,
                                  int64_t ObjectOffset) {
  const auto &MFI = MF.getFrameInfo();
  return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
}

// TODO: This function currently does not work for scalable vectors.
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
                                                 int FI) const {
  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
  return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
             ? getFPOffset(MF, ObjectOffset).getFixed()
             : getStackOffset(MF, ObjectOffset).getFixed();
}

StackOffset AArch64FrameLowering::resolveFrameIndexReference(
    const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
    bool ForSimm) const {
  const auto &MFI = MF.getFrameInfo();
  int64_t ObjectOffset = MFI.getObjectOffset(FI);
  bool isFixed = MFI.isFixedObjectIndex(FI);
  bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
  return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
                                     PreferFP, ForSimm);
}
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
    const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
    Register &FrameReg, bool PreferFP, bool ForSimm) const {
  const auto &MFI = MF.getFrameInfo();
  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();

  int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
  int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
  bool isCSR =
      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // Use frame pointer to reference fixed objects. Use it for locals if
  // there are VLAs or a dynamically realigned SP (and thus the SP isn't
  // reliable as a base). Make sure useFPForScavengingIndex() does the
  // right thing for the emergency spill slot.
  bool UseFP = false;
  if (AFI->hasStackFrame() && !isSVE) {
    // We shouldn't prefer using the FP when there is an SVE area
    // in between the FP and the non-SVE locals/spills.
    PreferFP &= !SVEStackSize;

    // Note: Keeping the following as multiple 'if' statements rather than
    // merging to a single expression for readability.
    //
    // Argument access should always use the FP.
    if (isFixed) {
      UseFP = hasFP(MF);
    } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
      // References to the CSR area must use FP if we're re-aligning the stack
      // since the dynamically-sized alignment padding is between the SP/BP and
      // the CSR area.
      assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
      UseFP = true;
    } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
      // If the FPOffset is negative and we're producing a signed immediate, we
      // have to keep in mind that the available offset range for negative
      // offsets is smaller than for positive ones. If an offset is available
      // via the FP and the SP, use whichever is closest.
      bool FPOffsetFits = !ForSimm || FPOffset >= -256;
      PreferFP |= Offset > -FPOffset;

      if (MFI.hasVarSizedObjects()) {
        // If we have variable sized objects, we can use either FP or BP, as
        // the SP offset is unknown. We can use the base pointer if we have one
        // and FP is not preferred. If not, we're stuck with using FP.
        bool CanUseBP = RegInfo->hasBasePointer(MF);
        if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
          UseFP = PreferFP;
        else if (!CanUseBP) // Can't use BP. Forced to use FP.
          UseFP = true;
        // else we can use BP and FP, but the offset from FP won't fit.
        // That will make us scavenge registers which we can probably avoid by
        // using BP. If it won't fit for BP either, we'll scavenge anyway.
      } else if (FPOffset >= 0) {
        // Use SP or FP, whichever gives us the best chance of the offset
        // being in range for direct access. If the FPOffset is positive,
        // that'll always be best, as the SP will be even further away.
        UseFP = true;
      } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
        // Funclets access the locals contained in the parent's stack frame
        // via the frame pointer, so we have to use the FP in the parent
        // function.
        (void) Subtarget;
        assert(
            Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
            "Funclets should only be present on Win64");
        UseFP = true;
      } else {
        // We have the choice between FP and (SP or BP).
        if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
          UseFP = true;
      }
    }
  }

  assert(
      ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
      "In the presence of dynamic stack pointer realignment, "
      "non-argument/CSR objects cannot be accessed through the frame pointer");

  if (isSVE) {
    StackOffset FPOffset =
        StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(),
                         ObjectOffset);
    StackOffset SPOffset =
        SVEStackSize +
        StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
                         ObjectOffset);
    // Always use the FP for SVE spills if available and beneficial.
    if (hasFP(MF) && (SPOffset.getFixed() ||
                      FPOffset.getScalable() < SPOffset.getScalable() ||
                      RegInfo->hasStackRealignment(MF))) {
      FrameReg = RegInfo->getFrameRegister(MF);
      return FPOffset;
    }

    FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
                                           : (unsigned)AArch64::SP;
    return SPOffset;
  }

  StackOffset ScalableOffset = {};
  if (UseFP && !(isFixed || isCSR))
    ScalableOffset = -SVEStackSize;
  if (!UseFP && (isFixed || isCSR))
    ScalableOffset = SVEStackSize;

  if (UseFP) {
    FrameReg = RegInfo->getFrameRegister(MF);
    return StackOffset::getFixed(FPOffset) + ScalableOffset;
  }

  // Use the base pointer if we have one.
  if (RegInfo->hasBasePointer(MF))
    FrameReg = RegInfo->getBaseRegister();
  else {
    assert(!MFI.hasVarSizedObjects() &&
           "Can't use SP when we have var sized objects.");
    FrameReg = AArch64::SP;
    // If we're using the red zone for this function, the SP won't actually
    // be adjusted, so the offsets will be negative. They're also all
    // within range of the signed 9-bit immediate instructions.
    if (canUseRedZone(MF))
      Offset -= AFI->getLocalStackSize();
  }

  return StackOffset::getFixed(Offset) + ScalableOffset;
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
  // Do not set a kill flag on values that are also marked as live-in. This
  // happens with the @llvm.returnaddress intrinsic and with arguments passed
  // in callee saved registers.
  // Omitting the kill flags is conservatively correct even if the live-in
  // is not used after all.
  bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
  return getKillRegState(!IsLiveIn);
}

static bool produceCompactUnwindFrame(MachineFunction &MF) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  AttributeList Attrs = MF.getFunction().getAttributes();
  return Subtarget.isTargetMachO() &&
         !(Subtarget.getTargetLowering()->supportSwiftError() &&
           Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
         MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
}
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
                                             bool NeedsWinCFI, bool IsFirst) {
  // If we are generating register pairs for a Windows function that requires
  // EH support, then pair consecutive registers only. There are no unwind
  // opcodes for saves/restores of non-consecutive register pairs.
  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
  // save_lrpair.
  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
  if (Reg2 == AArch64::FP)
    return true;
  if (!NeedsWinCFI)
    return false;
  if (Reg2 == Reg1 + 1)
    return false;
  // If pairing a GPR with LR, the pair can be described by the save_lrpair
  // opcode. If this is the first register pair, it would end up with a
  // predecrement, but there's no save_lrpair_x opcode, so we can only do this
  // if LR is paired with something other than the first register.
  // The save_lrpair opcode requires the first register to be an odd one.
  if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
      (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
    return false;
  return true;
}

/// Returns true if Reg1 and Reg2 cannot be paired using an ldp/stp instruction.
/// WindowsCFI requires that only consecutive registers can be paired.
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
                                      bool UsesWinAAPCS, bool NeedsWinCFI,
                                      bool NeedsFrameRecord, bool IsFirst) {
  if (UsesWinAAPCS)
    return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);

  // If we need to store the frame record, don't pair any register
  // with LR other than FP.
  if (NeedsFrameRecord)
    return Reg2 == AArch64::LR;

  return false;
}
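
// Illustrative example: when a frame record is needed (non-Windows ABI),
// pairing x21 with lr is rejected by the check above, so lr only ever ends up
// paired with fp; under WinCFI, a pair such as x20+x22 would likewise be
// rejected because the registers are not consecutive.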
namespace {

struct RegPairInfo {
  unsigned Reg1 = AArch64::NoRegister;
  unsigned Reg2 = AArch64::NoRegister;
  int FrameIdx;
  int Offset;
  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;

  RegPairInfo() = default;

  bool isPaired() const { return Reg2 != AArch64::NoRegister; }

  unsigned getScale() const {
    switch (Type) {
    case PPR:
      return 2;
    case GPR:
    case FPR64:
      return 8;
    case ZPR:
    case FPR128:
      return 16;
    }
    llvm_unreachable("Unsupported type");
  }

  bool isScalable() const { return Type == PPR || Type == ZPR; }
};

} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
    bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {

  if (CSI.empty())
    return;

  bool IsWindows = isTargetWindows(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  CallingConv::ID CC = MF.getFunction().getCallingConv();
  unsigned Count = CSI.size();
  (void)CC;
  // MachO's compact unwind format relies on all registers being stored in
  // pairs.
  assert((!produceCompactUnwindFrame(MF) ||
          CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
          (Count & 1) == 0) &&
         "Odd number of callee-saved regs to spill!");
  int ByteOffset = AFI->getCalleeSavedStackSize();
  int StackFillDir = -1;
  int RegInc = 1;
  unsigned FirstReg = 0;
  if (NeedsWinCFI) {
    // For WinCFI, fill the stack from the bottom up.
    ByteOffset = 0;
    StackFillDir = 1;
    // As the CSI array is reversed to match PrologEpilogInserter, iterate
    // backwards, to pair up registers starting from lower numbered registers.
    RegInc = -1;
    FirstReg = Count - 1;
  }
  int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
  bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();

  // When iterating backwards, the loop condition relies on unsigned wraparound.
  for (unsigned i = FirstReg; i < Count; i += RegInc) {
    RegPairInfo RPI;
    RPI.Reg1 = CSI[i].getReg();

    if (AArch64::GPR64RegClass.contains(RPI.Reg1))
      RPI.Type = RegPairInfo::GPR;
    else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
      RPI.Type = RegPairInfo::FPR64;
    else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
      RPI.Type = RegPairInfo::FPR128;
    else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
      RPI.Type = RegPairInfo::ZPR;
    else if (AArch64::PPRRegClass.contains(RPI.Reg1))
      RPI.Type = RegPairInfo::PPR;
    else
      llvm_unreachable("Unsupported register class.");

    // Add the next reg to the pair if it is in the same register class.
    if (unsigned(i + RegInc) < Count) {
      Register NextReg = CSI[i + RegInc].getReg();
      bool IsFirst = i == FirstReg;
      switch (RPI.Type) {
      case RegPairInfo::GPR:
        if (AArch64::GPR64RegClass.contains(NextReg) &&
            !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
                                       NeedsWinCFI, NeedsFrameRecord, IsFirst))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::FPR64:
        if (AArch64::FPR64RegClass.contains(NextReg) &&
            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
                                              IsFirst))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::FPR128:
        if (AArch64::FPR128RegClass.contains(NextReg))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::PPR:
      case RegPairInfo::ZPR:
        break;
      }
    }

    // If either of the registers to be saved is the lr register, it means that
    // we also need to save lr in the shadow call stack.
    if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
        MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
      if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
        report_fatal_error("Must reserve x18 to use shadow call stack");
      NeedShadowCallStackProlog = true;
    }
  2133. // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
  2134. // list to come in sorted by frame index so that we can issue the store
  2135. // pair instructions directly. Assert if we see anything otherwise.
  2136. //
  2137. // The order of the registers in the list is controlled by
  2138. // getCalleeSavedRegs(), so they will always be in-order, as well.
  2139. assert((!RPI.isPaired() ||
  2140. (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
  2141. "Out of order callee saved regs!");
  2142. assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
  2143. RPI.Reg1 == AArch64::LR) &&
  2144. "FrameRecord must be allocated together with LR");
  2145. // Windows AAPCS has FP and LR reversed.
  2146. assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
  2147. RPI.Reg2 == AArch64::LR) &&
  2148. "FrameRecord must be allocated together with LR");
  2149. // MachO's compact unwind format relies on all registers being stored in
  2150. // adjacent register pairs.
  2151. assert((!produceCompactUnwindFrame(MF) ||
  2152. CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
  2153. (RPI.isPaired() &&
  2154. ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
  2155. RPI.Reg1 + 1 == RPI.Reg2))) &&
  2156. "Callee-save registers not saved as adjacent register pair!");
  2157. RPI.FrameIdx = CSI[i].getFrameIdx();
  2158. if (NeedsWinCFI &&
  2159. RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
  2160. RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
  2161. int Scale = RPI.getScale();
  2162. int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
  2163. assert(OffsetPre % Scale == 0);
  2164. if (RPI.isScalable())
  2165. ScalableByteOffset += StackFillDir * Scale;
  2166. else
  2167. ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
  2168. // Swift's async context is directly before FP, so allocate an extra
  2169. // 8 bytes for it.
  2170. if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
  2171. RPI.Reg2 == AArch64::FP)
  2172. ByteOffset += StackFillDir * 8;
  2173. assert(!(RPI.isScalable() && RPI.isPaired()) &&
  2174. "Paired spill/fill instructions don't exist for SVE vectors");
  2175. // Round up size of non-pair to pair size if we need to pad the
  2176. // callee-save area to ensure 16-byte alignment.
  2177. if (NeedGapToAlignStack && !NeedsWinCFI &&
  2178. !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
  2179. !RPI.isPaired() && ByteOffset % 16 != 0) {
  2180. ByteOffset += 8 * StackFillDir;
  2181. assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
  2182. // A stack frame with a gap looks like this, bottom up:
  2183. // d9, d8. x21, gap, x20, x19.
  2184. // Set extra alignment on the x21 object to create the gap above it.
  2185. MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
  2186. NeedGapToAlignStack = false;
  2187. }
  2188. int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
  2189. assert(OffsetPost % Scale == 0);
  2190. // If filling top down (default), we want the offset after incrementing it.
2191. // If filling bottom up (WinCFI) we need the original offset.
  2192. int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
  2193. // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
  2194. // Swift context can directly precede FP.
  2195. if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
  2196. RPI.Reg2 == AArch64::FP)
  2197. Offset += 8;
  2198. RPI.Offset = Offset / Scale;
  2199. assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
  2200. (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
  2201. "Offset out of bounds for LDP/STP immediate");
  2202. // Save the offset to frame record so that the FP register can point to the
  2203. // innermost frame record (spilled FP and LR registers).
  2204. if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
  2205. RPI.Reg2 == AArch64::FP) ||
  2206. (IsWindows && RPI.Reg1 == AArch64::FP &&
  2207. RPI.Reg2 == AArch64::LR)))
  2208. AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
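// Illustrative (assumed) example: for the spill sequence shown in
// spillCalleeSavedRegisters below,
//   stp x22, x21, [sp, #0]
//   stp x20, x19, [sp, #16]
//   stp fp, lr,   [sp, #32]
// the recorded offset would be 32, letting the prologue point FP at the
// frame record (e.g. "add fp, sp, #32").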
  2209. RegPairs.push_back(RPI);
  2210. if (RPI.isPaired())
  2211. i += RegInc;
  2212. }
  2213. if (NeedsWinCFI) {
  2214. // If we need an alignment gap in the stack, align the topmost stack
  2215. // object. A stack frame with a gap looks like this, bottom up:
  2216. // x19, d8. d9, gap.
  2217. // Set extra alignment on the topmost stack object (the first element in
  2218. // CSI, which goes top down), to create the gap above it.
  2219. if (AFI->hasCalleeSaveStackFreeSpace())
  2220. MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
  2221. // We iterated bottom up over the registers; flip RegPairs back to top
  2222. // down order.
  2223. std::reverse(RegPairs.begin(), RegPairs.end());
  2224. }
  2225. }
  2226. bool AArch64FrameLowering::spillCalleeSavedRegisters(
  2227. MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
  2228. ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  2229. MachineFunction &MF = *MBB.getParent();
  2230. const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  2231. bool NeedsWinCFI = needsWinCFI(MF);
  2232. DebugLoc DL;
  2233. SmallVector<RegPairInfo, 8> RegPairs;
  2234. bool NeedShadowCallStackProlog = false;
  2235. computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
  2236. NeedShadowCallStackProlog, hasFP(MF));
  2237. const MachineRegisterInfo &MRI = MF.getRegInfo();
  2238. if (NeedShadowCallStackProlog) {
  2239. // Shadow call stack prolog: str x30, [x18], #8
  2240. BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
  2241. .addReg(AArch64::X18, RegState::Define)
  2242. .addReg(AArch64::LR)
  2243. .addReg(AArch64::X18)
  2244. .addImm(8)
  2245. .setMIFlag(MachineInstr::FrameSetup);
  2246. if (NeedsWinCFI)
  2247. BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
  2248. .setMIFlag(MachineInstr::FrameSetup);
  2249. // Emit a CFI instruction that causes 8 to be subtracted from the value of
  2250. // x18 when unwinding past this frame.
  2251. static const char CFIInst[] = {
  2252. dwarf::DW_CFA_val_expression,
  2253. 18, // register
  2254. 2, // length
  2255. static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
  2256. static_cast<char>(-8) & 0x7f, // addend (sleb128)
  2257. };
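// For reference (worked out here, not stated in the original source), the
// escape above encodes to the bytes 0x16 0x12 0x02 0x82 0x78, i.e.
// DW_CFA_val_expression, ULEB128 register 18, block length 2,
// DW_OP_breg18 (0x70 + 18), and sleb128(-8) == 0x78, so the unwinder
// recomputes x18 as x18 - 8.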
  2258. unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
  2259. nullptr, StringRef(CFIInst, sizeof(CFIInst))));
  2260. BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
  2261. .addCFIIndex(CFIIndex)
  2262. .setMIFlag(MachineInstr::FrameSetup);
  2263. // This instruction also makes x18 live-in to the entry block.
  2264. MBB.addLiveIn(AArch64::X18);
  2265. }
  2266. if (homogeneousPrologEpilog(MF)) {
  2267. auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
  2268. .setMIFlag(MachineInstr::FrameSetup);
  2269. for (auto &RPI : RegPairs) {
  2270. MIB.addReg(RPI.Reg1);
  2271. MIB.addReg(RPI.Reg2);
  2272. // Update register live in.
  2273. if (!MRI.isReserved(RPI.Reg1))
  2274. MBB.addLiveIn(RPI.Reg1);
  2275. if (!MRI.isReserved(RPI.Reg2))
  2276. MBB.addLiveIn(RPI.Reg2);
  2277. }
  2278. return true;
  2279. }
  2280. for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
  2281. unsigned Reg1 = RPI.Reg1;
  2282. unsigned Reg2 = RPI.Reg2;
  2283. unsigned StrOpc;
  2284. // Issue sequence of spills for cs regs. The first spill may be converted
  2285. // to a pre-decrement store later by emitPrologue if the callee-save stack
  2286. // area allocation can't be combined with the local stack area allocation.
  2287. // For example:
  2288. // stp x22, x21, [sp, #0] // addImm(+0)
  2289. // stp x20, x19, [sp, #16] // addImm(+2)
  2290. // stp fp, lr, [sp, #32] // addImm(+4)
  2291. // Rationale: This sequence saves uop updates compared to a sequence of
  2292. // pre-increment spills like stp xi,xj,[sp,#-16]!
  2293. // Note: Similar rationale and sequence for restores in epilog.
  2294. unsigned Size;
  2295. Align Alignment;
  2296. switch (RPI.Type) {
  2297. case RegPairInfo::GPR:
  2298. StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
  2299. Size = 8;
  2300. Alignment = Align(8);
  2301. break;
  2302. case RegPairInfo::FPR64:
  2303. StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
  2304. Size = 8;
  2305. Alignment = Align(8);
  2306. break;
  2307. case RegPairInfo::FPR128:
  2308. StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
  2309. Size = 16;
  2310. Alignment = Align(16);
  2311. break;
  2312. case RegPairInfo::ZPR:
  2313. StrOpc = AArch64::STR_ZXI;
  2314. Size = 16;
  2315. Alignment = Align(16);
  2316. break;
  2317. case RegPairInfo::PPR:
  2318. StrOpc = AArch64::STR_PXI;
  2319. Size = 2;
  2320. Alignment = Align(2);
  2321. break;
  2322. }
  2323. LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
  2324. if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
  2325. dbgs() << ") -> fi#(" << RPI.FrameIdx;
  2326. if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
  2327. dbgs() << ")\n");
  2328. assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2329. "Windows unwinding requires a consecutive (FP,LR) pair");
  2330. // Windows unwind codes require consecutive registers if registers are
  2331. // paired. Make the switch here, so that the code below will save (x,x+1)
  2332. // and not (x+1,x).
  2333. unsigned FrameIdxReg1 = RPI.FrameIdx;
  2334. unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
  2335. if (NeedsWinCFI && RPI.isPaired()) {
  2336. std::swap(Reg1, Reg2);
  2337. std::swap(FrameIdxReg1, FrameIdxReg2);
  2338. }
  2339. MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
  2340. if (!MRI.isReserved(Reg1))
  2341. MBB.addLiveIn(Reg1);
  2342. if (RPI.isPaired()) {
  2343. if (!MRI.isReserved(Reg2))
  2344. MBB.addLiveIn(Reg2);
  2345. MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
  2346. MIB.addMemOperand(MF.getMachineMemOperand(
  2347. MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
  2348. MachineMemOperand::MOStore, Size, Alignment));
  2349. }
  2350. MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
  2351. .addReg(AArch64::SP)
  2352. .addImm(RPI.Offset) // [sp, #offset*scale],
  2353. // where factor*scale is implicit
  2354. .setMIFlag(MachineInstr::FrameSetup);
  2355. MIB.addMemOperand(MF.getMachineMemOperand(
  2356. MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
  2357. MachineMemOperand::MOStore, Size, Alignment));
  2358. if (NeedsWinCFI)
  2359. InsertSEH(MIB, TII, MachineInstr::FrameSetup);
  2360. // Update the StackIDs of the SVE stack slots.
  2361. MachineFrameInfo &MFI = MF.getFrameInfo();
  2362. if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
  2363. MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
  2364. }
  2365. return true;
  2366. }
  2367. bool AArch64FrameLowering::restoreCalleeSavedRegisters(
  2368. MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
  2369. MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  2370. MachineFunction &MF = *MBB.getParent();
  2371. const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  2372. DebugLoc DL;
  2373. SmallVector<RegPairInfo, 8> RegPairs;
  2374. bool NeedsWinCFI = needsWinCFI(MF);
  2375. if (MI != MBB.end())
  2376. DL = MI->getDebugLoc();
  2377. bool NeedShadowCallStackProlog = false;
  2378. computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
  2379. NeedShadowCallStackProlog, hasFP(MF));
  2380. auto EmitMI = [&](const RegPairInfo &RPI) {
  2381. unsigned Reg1 = RPI.Reg1;
  2382. unsigned Reg2 = RPI.Reg2;
  2383. // Issue sequence of restores for cs regs. The last restore may be converted
  2384. // to a post-increment load later by emitEpilogue if the callee-save stack
  2385. // area allocation can't be combined with the local stack area allocation.
  2386. // For example:
  2387. // ldp fp, lr, [sp, #32] // addImm(+4)
  2388. // ldp x20, x19, [sp, #16] // addImm(+2)
  2389. // ldp x22, x21, [sp, #0] // addImm(+0)
  2390. // Note: see comment in spillCalleeSavedRegisters()
  2391. unsigned LdrOpc;
  2392. unsigned Size;
  2393. Align Alignment;
  2394. switch (RPI.Type) {
  2395. case RegPairInfo::GPR:
  2396. LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
  2397. Size = 8;
  2398. Alignment = Align(8);
  2399. break;
  2400. case RegPairInfo::FPR64:
  2401. LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
  2402. Size = 8;
  2403. Alignment = Align(8);
  2404. break;
  2405. case RegPairInfo::FPR128:
  2406. LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
  2407. Size = 16;
  2408. Alignment = Align(16);
  2409. break;
  2410. case RegPairInfo::ZPR:
  2411. LdrOpc = AArch64::LDR_ZXI;
  2412. Size = 16;
  2413. Alignment = Align(16);
  2414. break;
  2415. case RegPairInfo::PPR:
  2416. LdrOpc = AArch64::LDR_PXI;
  2417. Size = 2;
  2418. Alignment = Align(2);
  2419. break;
  2420. }
  2421. LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
  2422. if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
  2423. dbgs() << ") -> fi#(" << RPI.FrameIdx;
  2424. if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
  2425. dbgs() << ")\n");
  2426. // Windows unwind codes require consecutive registers if registers are
2427. // paired. Make the switch here, so that the code below will restore (x,x+1)
  2428. // and not (x+1,x).
  2429. unsigned FrameIdxReg1 = RPI.FrameIdx;
  2430. unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
  2431. if (NeedsWinCFI && RPI.isPaired()) {
  2432. std::swap(Reg1, Reg2);
  2433. std::swap(FrameIdxReg1, FrameIdxReg2);
  2434. }
  2435. MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
  2436. if (RPI.isPaired()) {
  2437. MIB.addReg(Reg2, getDefRegState(true));
  2438. MIB.addMemOperand(MF.getMachineMemOperand(
  2439. MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
  2440. MachineMemOperand::MOLoad, Size, Alignment));
  2441. }
  2442. MIB.addReg(Reg1, getDefRegState(true))
  2443. .addReg(AArch64::SP)
  2444. .addImm(RPI.Offset) // [sp, #offset*scale]
  2445. // where factor*scale is implicit
  2446. .setMIFlag(MachineInstr::FrameDestroy);
  2447. MIB.addMemOperand(MF.getMachineMemOperand(
  2448. MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
  2449. MachineMemOperand::MOLoad, Size, Alignment));
  2450. if (NeedsWinCFI)
  2451. InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
  2452. };
  2453. // SVE objects are always restored in reverse order.
  2454. for (const RegPairInfo &RPI : reverse(RegPairs))
  2455. if (RPI.isScalable())
  2456. EmitMI(RPI);
  2457. if (ReverseCSRRestoreSeq) {
  2458. for (const RegPairInfo &RPI : reverse(RegPairs))
  2459. if (!RPI.isScalable())
  2460. EmitMI(RPI);
  2461. } else if (homogeneousPrologEpilog(MF, &MBB)) {
  2462. auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
  2463. .setMIFlag(MachineInstr::FrameDestroy);
  2464. for (auto &RPI : RegPairs) {
  2465. MIB.addReg(RPI.Reg1, RegState::Define);
  2466. MIB.addReg(RPI.Reg2, RegState::Define);
  2467. }
  2468. return true;
  2469. } else
  2470. for (const RegPairInfo &RPI : RegPairs)
  2471. if (!RPI.isScalable())
  2472. EmitMI(RPI);
  2473. if (NeedShadowCallStackProlog) {
  2474. // Shadow call stack epilog: ldr x30, [x18, #-8]!
  2475. BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
  2476. .addReg(AArch64::X18, RegState::Define)
  2477. .addReg(AArch64::LR, RegState::Define)
  2478. .addReg(AArch64::X18)
  2479. .addImm(-8)
  2480. .setMIFlag(MachineInstr::FrameDestroy);
  2481. }
  2482. return true;
  2483. }
  2484. void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
  2485. BitVector &SavedRegs,
  2486. RegScavenger *RS) const {
  2487. // All calls are tail calls in GHC calling conv, and functions have no
  2488. // prologue/epilogue.
  2489. if (MF.getFunction().getCallingConv() == CallingConv::GHC)
  2490. return;
  2491. TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  2492. const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
  2493. MF.getSubtarget().getRegisterInfo());
  2494. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  2495. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  2496. unsigned UnspilledCSGPR = AArch64::NoRegister;
  2497. unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
  2498. MachineFrameInfo &MFI = MF.getFrameInfo();
  2499. const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  2500. unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
  2501. ? RegInfo->getBaseRegister()
  2502. : (unsigned)AArch64::NoRegister;
  2503. unsigned ExtraCSSpill = 0;
  2504. // Figure out which callee-saved registers to save/restore.
  2505. for (unsigned i = 0; CSRegs[i]; ++i) {
  2506. const unsigned Reg = CSRegs[i];
  2507. // Add the base pointer register to SavedRegs if it is callee-save.
  2508. if (Reg == BasePointerReg)
  2509. SavedRegs.set(Reg);
  2510. bool RegUsed = SavedRegs.test(Reg);
  2511. unsigned PairedReg = AArch64::NoRegister;
  2512. if (AArch64::GPR64RegClass.contains(Reg) ||
  2513. AArch64::FPR64RegClass.contains(Reg) ||
  2514. AArch64::FPR128RegClass.contains(Reg))
  2515. PairedReg = CSRegs[i ^ 1];
  2516. if (!RegUsed) {
  2517. if (AArch64::GPR64RegClass.contains(Reg) &&
  2518. !RegInfo->isReservedReg(MF, Reg)) {
  2519. UnspilledCSGPR = Reg;
  2520. UnspilledCSGPRPaired = PairedReg;
  2521. }
  2522. continue;
  2523. }
  2524. // MachO's compact unwind format relies on all registers being stored in
  2525. // pairs.
  2526. // FIXME: the usual format is actually better if unwinding isn't needed.
  2527. if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
  2528. !SavedRegs.test(PairedReg)) {
  2529. SavedRegs.set(PairedReg);
  2530. if (AArch64::GPR64RegClass.contains(PairedReg) &&
  2531. !RegInfo->isReservedReg(MF, PairedReg))
  2532. ExtraCSSpill = PairedReg;
  2533. }
  2534. }
  2535. if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
  2536. !Subtarget.isTargetWindows()) {
2537. // For the Windows calling convention on a non-Windows OS, where X18 is treated
2538. // as reserved, back up X18 when entering non-Windows code (marked with the
  2539. // Windows calling convention) and restore when returning regardless of
  2540. // whether the individual function uses it - it might call other functions
  2541. // that clobber it.
  2542. SavedRegs.set(AArch64::X18);
  2543. }
  2544. // Calculates the callee saved stack size.
  2545. unsigned CSStackSize = 0;
  2546. unsigned SVECSStackSize = 0;
  2547. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  2548. const MachineRegisterInfo &MRI = MF.getRegInfo();
  2549. for (unsigned Reg : SavedRegs.set_bits()) {
  2550. auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
  2551. if (AArch64::PPRRegClass.contains(Reg) ||
  2552. AArch64::ZPRRegClass.contains(Reg))
  2553. SVECSStackSize += RegSize;
  2554. else
  2555. CSStackSize += RegSize;
  2556. }
  2557. // Save number of saved regs, so we can easily update CSStackSize later.
  2558. unsigned NumSavedRegs = SavedRegs.count();
  2559. // The frame record needs to be created by saving the appropriate registers
  2560. uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
  2561. if (hasFP(MF) ||
  2562. windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
  2563. SavedRegs.set(AArch64::FP);
  2564. SavedRegs.set(AArch64::LR);
  2565. }
  2566. LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
  2567. for (unsigned Reg
  2568. : SavedRegs.set_bits()) dbgs()
  2569. << ' ' << printReg(Reg, RegInfo);
  2570. dbgs() << "\n";);
  2571. // If any callee-saved registers are used, the frame cannot be eliminated.
  2572. int64_t SVEStackSize =
  2573. alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
  2574. bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
  2575. // The CSR spill slots have not been allocated yet, so estimateStackSize
  2576. // won't include them.
  2577. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
  2578. // Conservatively always assume BigStack when there are SVE spills.
  2579. bool BigStack = SVEStackSize ||
  2580. (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
  2581. if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
  2582. AFI->setHasStackFrame(true);
  2583. // Estimate if we might need to scavenge a register at some point in order
  2584. // to materialize a stack offset. If so, either spill one additional
  2585. // callee-saved register or reserve a special spill slot to facilitate
  2586. // register scavenging. If we already spilled an extra callee-saved register
  2587. // above to keep the number of spills even, we don't need to do anything else
  2588. // here.
  2589. if (BigStack) {
  2590. if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
  2591. LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
  2592. << " to get a scratch register.\n");
  2593. SavedRegs.set(UnspilledCSGPR);
  2594. // MachO's compact unwind format relies on all registers being stored in
  2595. // pairs, so if we need to spill one extra for BigStack, then we need to
  2596. // store the pair.
  2597. if (producePairRegisters(MF))
  2598. SavedRegs.set(UnspilledCSGPRPaired);
  2599. ExtraCSSpill = UnspilledCSGPR;
  2600. }
  2601. // If we didn't find an extra callee-saved register to spill, create
  2602. // an emergency spill slot.
  2603. if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
  2604. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  2605. const TargetRegisterClass &RC = AArch64::GPR64RegClass;
  2606. unsigned Size = TRI->getSpillSize(RC);
  2607. Align Alignment = TRI->getSpillAlign(RC);
  2608. int FI = MFI.CreateStackObject(Size, Alignment, false);
  2609. RS->addScavengingFrameIndex(FI);
  2610. LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
  2611. << " as the emergency spill slot.\n");
  2612. }
  2613. }
2614. // Add the size of the additional 64-bit GPR saves.
  2615. CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
  2616. // A Swift asynchronous context extends the frame record with a pointer
  2617. // directly before FP.
  2618. if (hasFP(MF) && AFI->hasSwiftAsyncContext())
  2619. CSStackSize += 8;
  2620. uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
  2621. LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
  2622. << EstimatedStackSize + AlignedCSStackSize
  2623. << " bytes.\n");
  2624. assert((!MFI.isCalleeSavedInfoValid() ||
  2625. AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
  2626. "Should not invalidate callee saved info");
  2627. // Round up to register pair alignment to avoid additional SP adjustment
  2628. // instructions.
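// Illustrative example (assumed counts): saving five 8-byte GPRs gives
// CSStackSize == 40, which rounds up to AlignedCSStackSize == 48; the spare
// 8 bytes are recorded as callee-save free space and can later be reused
// (see enableStackSlotScavenging and the alignment-gap handling above).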
  2629. AFI->setCalleeSavedStackSize(AlignedCSStackSize);
  2630. AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
  2631. AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
  2632. }
  2633. bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
  2634. MachineFunction &MF, const TargetRegisterInfo *RegInfo,
  2635. std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
  2636. unsigned &MaxCSFrameIndex) const {
  2637. bool NeedsWinCFI = needsWinCFI(MF);
  2638. // To match the canonical windows frame layout, reverse the list of
  2639. // callee saved registers to get them laid out by PrologEpilogInserter
  2640. // in the right order. (PrologEpilogInserter allocates stack objects top
  2641. // down. Windows canonical prologs store higher numbered registers at
  2642. // the top, thus have the CSI array start from the highest registers.)
  2643. if (NeedsWinCFI)
  2644. std::reverse(CSI.begin(), CSI.end());
  2645. if (CSI.empty())
  2646. return true; // Early exit if no callee saved registers are modified!
  2647. // Now that we know which registers need to be saved and restored, allocate
  2648. // stack slots for them.
  2649. MachineFrameInfo &MFI = MF.getFrameInfo();
  2650. auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  2651. for (auto &CS : CSI) {
  2652. Register Reg = CS.getReg();
  2653. const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
  2654. unsigned Size = RegInfo->getSpillSize(*RC);
  2655. Align Alignment(RegInfo->getSpillAlign(*RC));
  2656. int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
  2657. CS.setFrameIdx(FrameIdx);
  2658. if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
  2659. if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
  2660. // Grab 8 bytes below FP for the extended asynchronous frame info.
  2661. if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
  2662. FrameIdx = MFI.CreateStackObject(8, Alignment, true);
  2663. AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
  2664. if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
  2665. if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
  2666. }
  2667. }
  2668. return true;
  2669. }
  2670. bool AArch64FrameLowering::enableStackSlotScavenging(
  2671. const MachineFunction &MF) const {
  2672. const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  2673. return AFI->hasCalleeSaveStackFreeSpace();
  2674. }
2675. /// Returns true if there are any SVE callee saves.
  2676. static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
  2677. int &Min, int &Max) {
  2678. Min = std::numeric_limits<int>::max();
  2679. Max = std::numeric_limits<int>::min();
  2680. if (!MFI.isCalleeSavedInfoValid())
  2681. return false;
  2682. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  2683. for (auto &CS : CSI) {
  2684. if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
  2685. AArch64::PPRRegClass.contains(CS.getReg())) {
  2686. assert((Max == std::numeric_limits<int>::min() ||
  2687. Max + 1 == CS.getFrameIdx()) &&
  2688. "SVE CalleeSaves are not consecutive");
  2689. Min = std::min(Min, CS.getFrameIdx());
  2690. Max = std::max(Max, CS.getFrameIdx());
  2691. }
  2692. }
  2693. return Min != std::numeric_limits<int>::max();
  2694. }
  2695. // Process all the SVE stack objects and determine offsets for each
  2696. // object. If AssignOffsets is true, the offsets get assigned.
  2697. // Fills in the first and last callee-saved frame indices into
  2698. // Min/MaxCSFrameIndex, respectively.
  2699. // Returns the size of the stack.
  2700. static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
  2701. int &MinCSFrameIndex,
  2702. int &MaxCSFrameIndex,
  2703. bool AssignOffsets) {
  2704. #ifndef NDEBUG
  2705. // First process all fixed stack objects.
  2706. for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
  2707. assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
  2708. "SVE vectors should never be passed on the stack by value, only by "
  2709. "reference.");
  2710. #endif
  2711. auto Assign = [&MFI](int FI, int64_t Offset) {
  2712. LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
  2713. MFI.setObjectOffset(FI, Offset);
  2714. };
  2715. int64_t Offset = 0;
  2716. // Then process all callee saved slots.
  2717. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
  2718. // Assign offsets to the callee save slots.
  2719. for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
  2720. Offset += MFI.getObjectSize(I);
  2721. Offset = alignTo(Offset, MFI.getObjectAlign(I));
  2722. if (AssignOffsets)
  2723. Assign(I, -Offset);
  2724. }
  2725. }
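// Illustrative example: two consecutive 16-byte SVE callee-save slots are
// assigned offsets -16 and -32 here; because their stack ID is
// ScalableVector, these byte counts are implicitly scaled by the vector
// length at runtime.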
2726. // Ensure that the callee-save area is aligned to 16 bytes.
  2727. Offset = alignTo(Offset, Align(16U));
  2728. // Create a buffer of SVE objects to allocate and sort it.
  2729. SmallVector<int, 8> ObjectsToAllocate;
  2730. // If we have a stack protector, and we've previously decided that we have SVE
  2731. // objects on the stack and thus need it to go in the SVE stack area, then it
  2732. // needs to go first.
  2733. int StackProtectorFI = -1;
  2734. if (MFI.hasStackProtectorIndex()) {
  2735. StackProtectorFI = MFI.getStackProtectorIndex();
  2736. if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
  2737. ObjectsToAllocate.push_back(StackProtectorFI);
  2738. }
  2739. for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
  2740. unsigned StackID = MFI.getStackID(I);
  2741. if (StackID != TargetStackID::ScalableVector)
  2742. continue;
  2743. if (I == StackProtectorFI)
  2744. continue;
  2745. if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
  2746. continue;
  2747. if (MFI.isDeadObjectIndex(I))
  2748. continue;
  2749. ObjectsToAllocate.push_back(I);
  2750. }
  2751. // Allocate all SVE locals and spills
  2752. for (unsigned FI : ObjectsToAllocate) {
  2753. Align Alignment = MFI.getObjectAlign(FI);
  2754. // FIXME: Given that the length of SVE vectors is not necessarily a power of
  2755. // two, we'd need to align every object dynamically at runtime if the
  2756. // alignment is larger than 16. This is not yet supported.
  2757. if (Alignment > Align(16))
  2758. report_fatal_error(
  2759. "Alignment of scalable vectors > 16 bytes is not yet supported");
  2760. Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
  2761. if (AssignOffsets)
  2762. Assign(FI, -Offset);
  2763. }
  2764. return Offset;
  2765. }
  2766. int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
  2767. MachineFrameInfo &MFI) const {
  2768. int MinCSFrameIndex, MaxCSFrameIndex;
  2769. return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
  2770. }
  2771. int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
  2772. MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
  2773. return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
  2774. true);
  2775. }
  2776. void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
  2777. MachineFunction &MF, RegScavenger *RS) const {
  2778. MachineFrameInfo &MFI = MF.getFrameInfo();
  2779. assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
  2780. "Upwards growing stack unsupported");
  2781. int MinCSFrameIndex, MaxCSFrameIndex;
  2782. int64_t SVEStackSize =
  2783. assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
  2784. AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  2785. AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
  2786. AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
  2787. // If this function isn't doing Win64-style C++ EH, we don't need to do
  2788. // anything.
  2789. if (!MF.hasEHFunclets())
  2790. return;
  2791. const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  2792. WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
  2793. MachineBasicBlock &MBB = MF.front();
  2794. auto MBBI = MBB.begin();
  2795. while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
  2796. ++MBBI;
  2797. // Create an UnwindHelp object.
  2798. // The UnwindHelp object is allocated at the start of the fixed object area
  2799. int64_t FixedObject =
  2800. getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
  2801. int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
  2802. /*SPOffset*/ -FixedObject,
  2803. /*IsImmutable=*/false);
  2804. EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
  2805. // We need to store -2 into the UnwindHelp object at the start of the
  2806. // function.
  2807. DebugLoc DL;
  2808. RS->enterBasicBlockEnd(MBB);
  2809. RS->backward(std::prev(MBBI));
  2810. Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
  2811. assert(DstReg && "There must be a free register after frame setup");
  2812. BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  2813. BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
  2814. .addReg(DstReg, getKillRegState(true))
  2815. .addFrameIndex(UnwindHelpFI)
  2816. .addImm(0);
  2817. }
  2818. namespace {
  2819. struct TagStoreInstr {
  2820. MachineInstr *MI;
  2821. int64_t Offset, Size;
  2822. explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
  2823. : MI(MI), Offset(Offset), Size(Size) {}
  2824. };
  2825. class TagStoreEdit {
  2826. MachineFunction *MF;
  2827. MachineBasicBlock *MBB;
  2828. MachineRegisterInfo *MRI;
  2829. // Tag store instructions that are being replaced.
  2830. SmallVector<TagStoreInstr, 8> TagStores;
  2831. // Combined memref arguments of the above instructions.
  2832. SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
  2833. // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
  2834. // FrameRegOffset + Size) with the address tag of SP.
  2835. Register FrameReg;
  2836. StackOffset FrameRegOffset;
  2837. int64_t Size;
  2838. // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
  2839. Optional<int64_t> FrameRegUpdate;
  2840. // MIFlags for any FrameReg updating instructions.
  2841. unsigned FrameRegUpdateFlags;
  2842. // Use zeroing instruction variants.
  2843. bool ZeroData;
  2844. DebugLoc DL;
  2845. void emitUnrolled(MachineBasicBlock::iterator InsertI);
  2846. void emitLoop(MachineBasicBlock::iterator InsertI);
  2847. public:
  2848. TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
  2849. : MBB(MBB), ZeroData(ZeroData) {
  2850. MF = MBB->getParent();
  2851. MRI = &MF->getRegInfo();
  2852. }
  2853. // Add an instruction to be replaced. Instructions must be added in the
  2854. // ascending order of Offset, and have to be adjacent.
  2855. void addInstruction(TagStoreInstr I) {
  2856. assert((TagStores.empty() ||
  2857. TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
  2858. "Non-adjacent tag store instructions.");
  2859. TagStores.push_back(I);
  2860. }
  2861. void clear() { TagStores.clear(); }
  2862. // Emit equivalent code at the given location, and erase the current set of
  2863. // instructions. May skip if the replacement is not profitable. May invalidate
  2864. // the input iterator and replace it with a valid one.
  2865. void emitCode(MachineBasicBlock::iterator &InsertI,
  2866. const AArch64FrameLowering *TFI, bool IsLast);
  2867. };
  2868. void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
  2869. const AArch64InstrInfo *TII =
  2870. MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
  2871. const int64_t kMinOffset = -256 * 16;
  2872. const int64_t kMaxOffset = 255 * 16;
  2873. Register BaseReg = FrameReg;
  2874. int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
  2875. if (BaseRegOffsetBytes < kMinOffset ||
  2876. BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
  2877. Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  2878. emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
  2879. StackOffset::getFixed(BaseRegOffsetBytes), TII);
  2880. BaseReg = ScratchReg;
  2881. BaseRegOffsetBytes = 0;
  2882. }
  2883. MachineInstr *LastI = nullptr;
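// Illustrative example: for Size == 48 starting at BaseRegOffsetBytes == 0,
// the loop below emits an ST2G covering [Base, Base+32) and an STG covering
// [Base+32, Base+48); the splice afterwards moves the store at offset #0 to
// the end so a later SP adjustment can be folded into it.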
  2884. while (Size) {
  2885. int64_t InstrSize = (Size > 16) ? 32 : 16;
  2886. unsigned Opcode =
  2887. InstrSize == 16
  2888. ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
  2889. : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
  2890. MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
  2891. .addReg(AArch64::SP)
  2892. .addReg(BaseReg)
  2893. .addImm(BaseRegOffsetBytes / 16)
  2894. .setMemRefs(CombinedMemRefs);
  2895. // A store to [BaseReg, #0] should go last for an opportunity to fold the
  2896. // final SP adjustment in the epilogue.
  2897. if (BaseRegOffsetBytes == 0)
  2898. LastI = I;
  2899. BaseRegOffsetBytes += InstrSize;
  2900. Size -= InstrSize;
  2901. }
  2902. if (LastI)
  2903. MBB->splice(InsertI, MBB, LastI);
  2904. }
  2905. void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
  2906. const AArch64InstrInfo *TII =
  2907. MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
  2908. Register BaseReg = FrameRegUpdate
  2909. ? FrameReg
  2910. : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  2911. Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  2912. emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
  2913. int64_t LoopSize = Size;
  2914. // If the loop size is not a multiple of 32, split off one 16-byte store at
  2915. // the end to fold BaseReg update into.
  2916. if (FrameRegUpdate && *FrameRegUpdate)
  2917. LoopSize -= LoopSize % 32;
  2918. MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
  2919. TII->get(ZeroData ? AArch64::STZGloop_wback
  2920. : AArch64::STGloop_wback))
  2921. .addDef(SizeReg)
  2922. .addDef(BaseReg)
  2923. .addImm(LoopSize)
  2924. .addReg(BaseReg)
  2925. .setMemRefs(CombinedMemRefs);
  2926. if (FrameRegUpdate)
  2927. LoopI->setFlags(FrameRegUpdateFlags);
  2928. int64_t ExtraBaseRegUpdate =
  2929. FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
  2930. if (LoopSize < Size) {
  2931. assert(FrameRegUpdate);
  2932. assert(Size - LoopSize == 16);
  2933. // Tag 16 more bytes at BaseReg and update BaseReg.
  2934. BuildMI(*MBB, InsertI, DL,
  2935. TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
  2936. .addDef(BaseReg)
  2937. .addReg(BaseReg)
  2938. .addReg(BaseReg)
  2939. .addImm(1 + ExtraBaseRegUpdate / 16)
  2940. .setMemRefs(CombinedMemRefs)
  2941. .setMIFlags(FrameRegUpdateFlags);
  2942. } else if (ExtraBaseRegUpdate) {
  2943. // Update BaseReg.
  2944. BuildMI(
  2945. *MBB, InsertI, DL,
  2946. TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
  2947. .addDef(BaseReg)
  2948. .addReg(BaseReg)
  2949. .addImm(std::abs(ExtraBaseRegUpdate))
  2950. .addImm(0)
  2951. .setMIFlags(FrameRegUpdateFlags);
  2952. }
  2953. }
  2954. // Check if *II is a register update that can be merged into STGloop that ends
2955. // at (Reg + Size). On success, *TotalOffset is set to the full offset of the
2956. // merged update; the part beyond Size is applied after the loop.
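// Illustrative example (assumed values): if the STGloop tags memory ending at
// Reg + 128 and is followed by "add Reg, Reg, #144", then Offset == 144 and
// AbsPostOffset == 16, so the update can be merged; the loop consumes 128 of
// it and the remaining 16 bytes are applied by a follow-up add/sub (or folded
// into the trailing post-indexed store when the size is not a multiple of 32).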
  2957. bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
  2958. int64_t Size, int64_t *TotalOffset) {
  2959. MachineInstr &MI = *II;
  2960. if ((MI.getOpcode() == AArch64::ADDXri ||
  2961. MI.getOpcode() == AArch64::SUBXri) &&
  2962. MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
  2963. unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
  2964. int64_t Offset = MI.getOperand(2).getImm() << Shift;
  2965. if (MI.getOpcode() == AArch64::SUBXri)
  2966. Offset = -Offset;
  2967. int64_t AbsPostOffset = std::abs(Offset - Size);
  2968. const int64_t kMaxOffset =
  2969. 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
  2970. if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
  2971. *TotalOffset = Offset;
  2972. return true;
  2973. }
  2974. }
  2975. return false;
  2976. }
  2977. void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
  2978. SmallVectorImpl<MachineMemOperand *> &MemRefs) {
  2979. MemRefs.clear();
  2980. for (auto &TS : TSE) {
  2981. MachineInstr *MI = TS.MI;
  2982. // An instruction without memory operands may access anything. Be
  2983. // conservative and return an empty list.
  2984. if (MI->memoperands_empty()) {
  2985. MemRefs.clear();
  2986. return;
  2987. }
  2988. MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
  2989. }
  2990. }
  2991. void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
  2992. const AArch64FrameLowering *TFI, bool IsLast) {
  2993. if (TagStores.empty())
  2994. return;
  2995. TagStoreInstr &FirstTagStore = TagStores[0];
  2996. TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
  2997. Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
  2998. DL = TagStores[0].MI->getDebugLoc();
  2999. Register Reg;
  3000. FrameRegOffset = TFI->resolveFrameOffsetReference(
  3001. *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
  3002. /*PreferFP=*/false, /*ForSimm=*/true);
  3003. FrameReg = Reg;
  3004. FrameRegUpdate = None;
  3005. mergeMemRefs(TagStores, CombinedMemRefs);
  3006. LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
  3007. for (const auto &Instr
  3008. : TagStores) { dbgs() << " " << *Instr.MI; });
  3009. // Size threshold where a loop becomes shorter than a linear sequence of
  3010. // tagging instructions.
  3011. const int kSetTagLoopThreshold = 176;
  3012. if (Size < kSetTagLoopThreshold) {
  3013. if (TagStores.size() < 2)
  3014. return;
  3015. emitUnrolled(InsertI);
  3016. } else {
  3017. MachineInstr *UpdateInstr = nullptr;
  3018. int64_t TotalOffset;
  3019. if (IsLast) {
  3020. // See if we can merge base register update into the STGloop.
  3021. // This is done in AArch64LoadStoreOptimizer for "normal" stores,
  3022. // but STGloop is way too unusual for that, and also it only
  3023. // realistically happens in function epilogue. Also, STGloop is expanded
  3024. // before that pass.
  3025. if (InsertI != MBB->end() &&
  3026. canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
  3027. &TotalOffset)) {
  3028. UpdateInstr = &*InsertI++;
  3029. LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
  3030. << *UpdateInstr);
  3031. }
  3032. }
  3033. if (!UpdateInstr && TagStores.size() < 2)
  3034. return;
  3035. if (UpdateInstr) {
  3036. FrameRegUpdate = TotalOffset;
  3037. FrameRegUpdateFlags = UpdateInstr->getFlags();
  3038. }
  3039. emitLoop(InsertI);
  3040. if (UpdateInstr)
  3041. UpdateInstr->eraseFromParent();
  3042. }
  3043. for (auto &TS : TagStores)
  3044. TS.MI->eraseFromParent();
  3045. }
  3046. bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
  3047. int64_t &Size, bool &ZeroData) {
  3048. MachineFunction &MF = *MI.getParent()->getParent();
  3049. const MachineFrameInfo &MFI = MF.getFrameInfo();
  3050. unsigned Opcode = MI.getOpcode();
  3051. ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
  3052. Opcode == AArch64::STZ2GOffset);
  3053. if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
  3054. if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
  3055. return false;
  3056. if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
  3057. return false;
  3058. Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
  3059. Size = MI.getOperand(2).getImm();
  3060. return true;
  3061. }
  3062. if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
  3063. Size = 16;
  3064. else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
  3065. Size = 32;
  3066. else
  3067. return false;
  3068. if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
  3069. return false;
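// Illustrative example: an STGOffset of a frame index whose object offset is
// -32, with immediate 1, describes the 16-byte granule at
// Offset == -32 + 16 * 1 == -16.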
  3070. Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
  3071. 16 * MI.getOperand(2).getImm();
  3072. return true;
  3073. }
  3074. // Detect a run of memory tagging instructions for adjacent stack frame slots,
  3075. // and replace them with a shorter instruction sequence:
  3076. // * replace STG + STG with ST2G
  3077. // * replace STGloop + STGloop with STGloop
  3078. // This code needs to run when stack slot offsets are already known, but before
  3079. // FrameIndex operands in STG instructions are eliminated.
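// Illustrative example (assumed layout): two neighbouring 16-byte STG stores
// covering [FI, FI+16) and [FI+16, FI+32) are rewritten by TagStoreEdit into
// a single ST2G covering 32 bytes, and longer contiguous runs into an STGloop.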
  3080. MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
  3081. const AArch64FrameLowering *TFI,
  3082. RegScavenger *RS) {
  3083. bool FirstZeroData;
  3084. int64_t Size, Offset;
  3085. MachineInstr &MI = *II;
  3086. MachineBasicBlock *MBB = MI.getParent();
  3087. MachineBasicBlock::iterator NextI = ++II;
  3088. if (&MI == &MBB->instr_back())
  3089. return II;
  3090. if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
  3091. return II;
  3092. SmallVector<TagStoreInstr, 4> Instrs;
  3093. Instrs.emplace_back(&MI, Offset, Size);
  3094. constexpr int kScanLimit = 10;
  3095. int Count = 0;
  3096. for (MachineBasicBlock::iterator E = MBB->end();
  3097. NextI != E && Count < kScanLimit; ++NextI) {
  3098. MachineInstr &MI = *NextI;
  3099. bool ZeroData;
  3100. int64_t Size, Offset;
  3101. // Collect instructions that update memory tags with a FrameIndex operand
  3102. // and (when applicable) constant size, and whose output registers are dead
  3103. // (the latter is almost always the case in practice). Since these
  3104. // instructions effectively have no inputs or outputs, we are free to skip
  3105. // any non-aliasing instructions in between without tracking used registers.
  3106. if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
  3107. if (ZeroData != FirstZeroData)
  3108. break;
  3109. Instrs.emplace_back(&MI, Offset, Size);
  3110. continue;
  3111. }
  3112. // Only count non-transient, non-tagging instructions toward the scan
  3113. // limit.
  3114. if (!MI.isTransient())
  3115. ++Count;
  3116. // Just in case, stop before the epilogue code starts.
  3117. if (MI.getFlag(MachineInstr::FrameSetup) ||
  3118. MI.getFlag(MachineInstr::FrameDestroy))
  3119. break;
  3120. // Reject anything that may alias the collected instructions.
  3121. if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
  3122. break;
  3123. }
  3124. // New code will be inserted after the last tagging instruction we've found.
  3125. MachineBasicBlock::iterator InsertI = Instrs.back().MI;
  3126. InsertI++;
  3127. llvm::stable_sort(Instrs,
  3128. [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
  3129. return Left.Offset < Right.Offset;
  3130. });
  3131. // Make sure that we don't have any overlapping stores.
  3132. int64_t CurOffset = Instrs[0].Offset;
  3133. for (auto &Instr : Instrs) {
  3134. if (CurOffset > Instr.Offset)
  3135. return NextI;
  3136. CurOffset = Instr.Offset + Instr.Size;
  3137. }
  3138. // Find contiguous runs of tagged memory and emit shorter instruction
3139. // sequences for them when possible.
  3140. TagStoreEdit TSE(MBB, FirstZeroData);
  3141. Optional<int64_t> EndOffset;
  3142. for (auto &Instr : Instrs) {
  3143. if (EndOffset && *EndOffset != Instr.Offset) {
  3144. // Found a gap.
  3145. TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
  3146. TSE.clear();
  3147. }
  3148. TSE.addInstruction(Instr);
  3149. EndOffset = Instr.Offset + Instr.Size;
  3150. }
  3151. TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
  3152. return InsertI;
  3153. }
  3154. } // namespace
  3155. void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
  3156. MachineFunction &MF, RegScavenger *RS = nullptr) const {
  3157. if (StackTaggingMergeSetTag)
  3158. for (auto &BB : MF)
  3159. for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
  3160. II = tryMergeAdjacentSTG(II, this, RS);
  3161. }
  3162. /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
  3163. /// before the update. This is easily retrieved as it is exactly the offset
  3164. /// that is set in processFunctionBeforeFrameFinalized.
  3165. StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
  3166. const MachineFunction &MF, int FI, Register &FrameReg,
  3167. bool IgnoreSPUpdates) const {
  3168. const MachineFrameInfo &MFI = MF.getFrameInfo();
  3169. if (IgnoreSPUpdates) {
  3170. LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
  3171. << MFI.getObjectOffset(FI) << "\n");
  3172. FrameReg = AArch64::SP;
  3173. return StackOffset::getFixed(MFI.getObjectOffset(FI));
  3174. }
  3175. // Go to common code if we cannot provide sp + offset.
  3176. if (MFI.hasVarSizedObjects() ||
  3177. MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
  3178. MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
  3179. return getFrameIndexReference(MF, FI, FrameReg);
  3180. FrameReg = AArch64::SP;
  3181. return getStackOffset(MF, MFI.getObjectOffset(FI));
  3182. }
  3183. /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
  3184. /// the parent's frame pointer
  3185. unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
  3186. const MachineFunction &MF) const {
  3187. return 0;
  3188. }
  3189. /// Funclets only need to account for space for the callee saved registers,
  3190. /// as the locals are accounted for in the parent's stack frame.
  3191. unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
  3192. const MachineFunction &MF) const {
  3193. // This is the size of the pushed CSRs.
  3194. unsigned CSSize =
  3195. MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
  3196. // This is the amount of stack a funclet needs to allocate.
  3197. return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
  3198. getStackAlign());
  3199. }
  3200. namespace {
  3201. struct FrameObject {
  3202. bool IsValid = false;
  3203. // Index of the object in MFI.
  3204. int ObjectIndex = 0;
  3205. // Group ID this object belongs to.
  3206. int GroupIndex = -1;
  3207. // This object should be placed first (closest to SP).
  3208. bool ObjectFirst = false;
  3209. // This object's group (which always contains the object with
  3210. // ObjectFirst==true) should be placed first.
  3211. bool GroupFirst = false;
  3212. };
  3213. class GroupBuilder {
  3214. SmallVector<int, 8> CurrentMembers;
  3215. int NextGroupIndex = 0;
  3216. std::vector<FrameObject> &Objects;
  3217. public:
  3218. GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  3219. void AddMember(int Index) { CurrentMembers.push_back(Index); }
  3220. void EndCurrentGroup() {
  3221. if (CurrentMembers.size() > 1) {
  3222. // Create a new group with the current member list. This might remove them
  3223. // from their pre-existing groups. That's OK, dealing with overlapping
  3224. // groups is too hard and unlikely to make a difference.
  3225. LLVM_DEBUG(dbgs() << "group:");
  3226. for (int Index : CurrentMembers) {
  3227. Objects[Index].GroupIndex = NextGroupIndex;
  3228. LLVM_DEBUG(dbgs() << " " << Index);
  3229. }
  3230. LLVM_DEBUG(dbgs() << "\n");
  3231. NextGroupIndex++;
  3232. }
  3233. CurrentMembers.clear();
  3234. }
  3235. };
  3236. bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  3237. // Objects at a lower index are closer to FP; objects at a higher index are
  3238. // closer to SP.
  3239. //
  3240. // For consistency in our comparison, all invalid objects are placed
  3241. // at the end. This also allows us to stop walking when we hit the
  3242. // first invalid item after it's all sorted.
  3243. //
  3244. // The "first" object goes first (closest to SP), followed by the members of
  3245. // the "first" group.
  3246. //
  3247. // The rest are sorted by the group index to keep the groups together.
  3248. // Higher numbered groups are more likely to be around longer (i.e. untagged
  3249. // in the function epilogue and not at some earlier point). Place them closer
  3250. // to SP.
  3251. //
  3252. // If all else equal, sort by the object index to keep the objects in the
  3253. // original order.
  3254. return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
  3255. A.ObjectIndex) <
  3256. std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
  3257. B.ObjectIndex);
  3258. }
  3259. } // namespace
  3260. void AArch64FrameLowering::orderFrameObjects(
  3261. const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  3262. if (!OrderFrameObjects || ObjectsToAllocate.empty())
  3263. return;
  3264. const MachineFrameInfo &MFI = MF.getFrameInfo();
  3265. std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  3266. for (auto &Obj : ObjectsToAllocate) {
  3267. FrameObjects[Obj].IsValid = true;
  3268. FrameObjects[Obj].ObjectIndex = Obj;
  3269. }
  3270. // Identify stack slots that are tagged at the same time.
  3271. GroupBuilder GB(FrameObjects);
  3272. for (auto &MBB : MF) {
  3273. for (auto &MI : MBB) {
  3274. if (MI.isDebugInstr())
  3275. continue;
  3276. int OpIndex;
  3277. switch (MI.getOpcode()) {
  3278. case AArch64::STGloop:
  3279. case AArch64::STZGloop:
  3280. OpIndex = 3;
  3281. break;
  3282. case AArch64::STGOffset:
  3283. case AArch64::STZGOffset:
  3284. case AArch64::ST2GOffset:
  3285. case AArch64::STZ2GOffset:
  3286. OpIndex = 1;
  3287. break;
  3288. default:
  3289. OpIndex = -1;
  3290. }
  3291. int TaggedFI = -1;
  3292. if (OpIndex >= 0) {
  3293. const MachineOperand &MO = MI.getOperand(OpIndex);
  3294. if (MO.isFI()) {
  3295. int FI = MO.getIndex();
  3296. if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
  3297. FrameObjects[FI].IsValid)
  3298. TaggedFI = FI;
  3299. }
  3300. }
  3301. // If this is a stack tagging instruction for a slot that is not part of a
  3302. // group yet, either start a new group or add it to the current one.
  3303. if (TaggedFI >= 0)
  3304. GB.AddMember(TaggedFI);
  3305. else
  3306. GB.EndCurrentGroup();
  3307. }
  3308. // Groups should never span multiple basic blocks.
  3309. GB.EndCurrentGroup();
  3310. }
  3311. // If the function's tagged base pointer is pinned to a stack slot, we want to
  3312. // put that slot first when possible. This will likely place it at SP + 0,
  3313. // and save one instruction when generating the base pointer because IRG does
  3314. // not allow an immediate offset.
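// Illustrative example: if that slot ends up at SP + 0, the tagged base can
// be materialized with a single "irg x0, sp"; if it were at SP + 16, a
// separate "add" would be needed first because IRG takes no immediate offset.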
  3315. const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  3316. Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  3317. if (TBPI) {
  3318. FrameObjects[*TBPI].ObjectFirst = true;
  3319. FrameObjects[*TBPI].GroupFirst = true;
  3320. int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
  3321. if (FirstGroupIndex >= 0)
  3322. for (FrameObject &Object : FrameObjects)
  3323. if (Object.GroupIndex == FirstGroupIndex)
  3324. Object.GroupFirst = true;
  3325. }
  3326. llvm::stable_sort(FrameObjects, FrameObjectCompare);
  3327. int i = 0;
  3328. for (auto &Obj : FrameObjects) {
  3329. // All invalid items are sorted at the end, so it's safe to stop.
  3330. if (!Obj.IsValid)
  3331. break;
  3332. ObjectsToAllocate[i++] = Obj.ObjectIndex;
  3333. }
  3334. LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
  3335. : FrameObjects) {
  3336. if (!Obj.IsValid)
  3337. break;
  3338. dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
  3339. if (Obj.ObjectFirst)
  3340. dbgs() << ", first";
  3341. if (Obj.GroupFirst)
  3342. dbgs() << ", group-first";
  3343. dbgs() << "\n";
  3344. });
  3345. }