AArch64LegalizerInfo.cpp

  1. //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. /// This file implements the targeting of the MachineLegalizer class for
  10. /// AArch64.
  11. /// \todo This should be generated by TableGen.
  12. //===----------------------------------------------------------------------===//
  13. #include "AArch64LegalizerInfo.h"
  14. #include "AArch64RegisterBankInfo.h"
  15. #include "AArch64Subtarget.h"
  16. #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  17. #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
  18. #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
  19. #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  20. #include "llvm/CodeGen/GlobalISel/Utils.h"
  21. #include "llvm/CodeGen/MachineInstr.h"
  22. #include "llvm/CodeGen/MachineRegisterInfo.h"
  23. #include "llvm/CodeGen/TargetOpcodes.h"
  24. #include "llvm/CodeGen/ValueTypes.h"
  25. #include "llvm/IR/DerivedTypes.h"
  26. #include "llvm/IR/Intrinsics.h"
  27. #include "llvm/IR/IntrinsicsAArch64.h"
  28. #include "llvm/IR/Type.h"
  29. #include "llvm/Support/MathExtras.h"
  30. #include <initializer_list>
  31. #define DEBUG_TYPE "aarch64-legalinfo"
  32. using namespace llvm;
  33. using namespace LegalizeActions;
  34. using namespace LegalizeMutations;
  35. using namespace LegalityPredicates;
  36. using namespace MIPatternMatch;
  37. AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
  38. : ST(&ST) {
  39. using namespace TargetOpcode;
  40. const LLT p0 = LLT::pointer(0, 64);
  41. const LLT s8 = LLT::scalar(8);
  42. const LLT s16 = LLT::scalar(16);
  43. const LLT s32 = LLT::scalar(32);
  44. const LLT s64 = LLT::scalar(64);
  45. const LLT s128 = LLT::scalar(128);
  46. const LLT v16s8 = LLT::fixed_vector(16, 8);
  47. const LLT v8s8 = LLT::fixed_vector(8, 8);
  48. const LLT v4s8 = LLT::fixed_vector(4, 8);
  49. const LLT v8s16 = LLT::fixed_vector(8, 16);
  50. const LLT v4s16 = LLT::fixed_vector(4, 16);
  51. const LLT v2s16 = LLT::fixed_vector(2, 16);
  52. const LLT v2s32 = LLT::fixed_vector(2, 32);
  53. const LLT v4s32 = LLT::fixed_vector(4, 32);
  54. const LLT v2s64 = LLT::fixed_vector(2, 64);
  55. const LLT v2p0 = LLT::fixed_vector(2, p0);
  56. std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
  57. v16s8, v8s16, v4s32,
  58. v2s64, v2p0,
  59. /* End 128bit types */
  60. /* Begin 64bit types */
  61. v8s8, v4s16, v2s32};
  62. const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
  63. // FIXME: support subtargets which have neon/fp-armv8 disabled.
  64. if (!ST.hasNEON() || !ST.hasFPARMv8()) {
  65. getLegacyLegalizerInfo().computeTables();
  66. return;
  67. }
  68. // Some instructions only support s16 if the subtarget has full 16-bit FP
  69. // support.
  70. const bool HasFP16 = ST.hasFullFP16();
  71. const LLT &MinFPScalar = HasFP16 ? s16 : s32;
  72. const bool HasCSSC = ST.hasCSSC();
  73. getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
  74. .legalFor({p0, s8, s16, s32, s64})
  75. .legalFor(PackedVectorAllTypeList)
  76. .widenScalarToNextPow2(0)
  77. .clampScalar(0, s8, s64)
  78. .fewerElementsIf(
  79. [=](const LegalityQuery &Query) {
  80. return Query.Types[0].isVector() &&
  81. (Query.Types[0].getElementType() != s64 ||
  82. Query.Types[0].getNumElements() != 2);
  83. },
  84. [=](const LegalityQuery &Query) {
  85. LLT EltTy = Query.Types[0].getElementType();
  86. if (EltTy == s64)
  87. return std::make_pair(0, LLT::fixed_vector(2, 64));
  88. return std::make_pair(0, EltTy);
  89. });
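        // G_PHI results are clamped so the whole value fits in a single 128-bit
        // Q register; e.g. an <8 x s32> PHI is narrowed to <4 x s32>.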
  90. getActionDefinitionsBuilder(G_PHI)
  91. .legalFor({p0, s16, s32, s64})
  92. .legalFor(PackedVectorAllTypeList)
  93. .widenScalarToNextPow2(0)
  94. .clampScalar(0, s16, s64)
  95. // Maximum: sN * k = 128
  96. .clampMaxNumElements(0, s8, 16)
  97. .clampMaxNumElements(0, s16, 8)
  98. .clampMaxNumElements(0, s32, 4)
  99. .clampMaxNumElements(0, s64, 2)
  100. .clampMaxNumElements(0, p0, 2);
  101. getActionDefinitionsBuilder(G_BSWAP)
  102. .legalFor({s32, s64, v4s32, v2s32, v2s64})
  103. .widenScalarToNextPow2(0)
  104. .clampScalar(0, s32, s64);
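        // v2s64 G_MUL is scalarized below: NEON has no 64-bit-element integer
        // multiply, while the remaining ops in this group stay legal on v2s64.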
  105. getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
  106. .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
  107. .scalarizeIf(
  108. [=](const LegalityQuery &Query) {
  109. return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
  110. },
  111. 0)
  112. .legalFor({v2s64})
  113. .widenScalarToNextPow2(0)
  114. .clampScalar(0, s32, s64)
  115. .clampNumElements(0, v2s32, v4s32)
  116. .clampNumElements(0, v2s64, v2s64)
  117. .moreElementsToNextPow2(0);
  118. getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
  119. .customIf([=](const LegalityQuery &Query) {
  120. const auto &SrcTy = Query.Types[0];
  121. const auto &AmtTy = Query.Types[1];
  122. return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
  123. AmtTy.getSizeInBits() == 32;
  124. })
  125. .legalFor({
  126. {s32, s32},
  127. {s32, s64},
  128. {s64, s64},
  129. {v8s8, v8s8},
  130. {v16s8, v16s8},
  131. {v4s16, v4s16},
  132. {v8s16, v8s16},
  133. {v2s32, v2s32},
  134. {v4s32, v4s32},
  135. {v2s64, v2s64},
  136. })
  137. .widenScalarToNextPow2(0)
  138. .clampScalar(1, s32, s64)
  139. .clampScalar(0, s32, s64)
  140. .clampNumElements(0, v2s32, v4s32)
  141. .clampNumElements(0, v2s64, v2s64)
  142. .moreElementsToNextPow2(0)
  143. .minScalarSameAs(1, 0);
  144. getActionDefinitionsBuilder(G_PTR_ADD)
  145. .legalFor({{p0, s64}, {v2p0, v2s64}})
  146. .clampScalar(1, s64, s64);
  147. getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
  148. getActionDefinitionsBuilder({G_SDIV, G_UDIV})
  149. .legalFor({s32, s64})
  150. .libcallFor({s128})
  151. .clampScalar(0, s32, s64)
  152. .widenScalarToNextPow2(0)
  153. .scalarize(0);
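        // AArch64 has no integer remainder instruction, so the rem/divrem ops
        // are lowered to a divide followed by a multiply and subtract.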
  154. getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
  155. .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
  156. .widenScalarOrEltToNextPow2(0)
  157. .clampScalarOrElt(0, s32, s64)
  158. .clampNumElements(0, v2s32, v4s32)
  159. .clampNumElements(0, v2s64, v2s64)
  160. .moreElementsToNextPow2(0);
  161. getActionDefinitionsBuilder({G_SMULO, G_UMULO})
  162. .widenScalarToNextPow2(0, /*Min = */ 32)
  163. .clampScalar(0, s32, s64)
  164. .lower();
  165. getActionDefinitionsBuilder({G_SMULH, G_UMULH})
  166. .legalFor({s64, v8s16, v16s8, v4s32})
  167. .lower();
  168. auto &MinMaxActions = getActionDefinitionsBuilder(
  169. {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
  170. if (HasCSSC)
  171. MinMaxActions
  172. .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
  173. // The clamp is only applied when CSSC is available: without legal scalar
  174. // types we lower to CMP, which can fold one of the two sxtb's we'd
  175. // otherwise need if we detect a type smaller than 32 bits.
  176. .minScalar(0, s32);
  177. else
  178. MinMaxActions
  179. .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
  180. MinMaxActions
  181. .clampNumElements(0, v8s8, v16s8)
  182. .clampNumElements(0, v4s16, v8s16)
  183. .clampNumElements(0, v2s32, v4s32)
  184. // FIXME: This shouldn't be needed as v2s64 types are going to
  185. // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
  186. .clampNumElements(0, v2s64, v2s64)
  187. .lower();
  188. getActionDefinitionsBuilder(
  189. {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
  190. .legalFor({{s32, s32}, {s64, s32}})
  191. .clampScalar(0, s32, s64)
  192. .clampScalar(1, s32, s64)
  193. .widenScalarToNextPow2(0);
  194. getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
  195. .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
  196. .clampScalar(0, MinFPScalar, s64)
  197. .clampNumElements(0, v2s32, v4s32)
  198. .clampNumElements(0, v2s64, v2s64);
  199. getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
  200. getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
  201. G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
  202. G_FNEARBYINT, G_INTRINSIC_LRINT})
  203. // If we don't have full FP16 support, then scalarize the elements of
  204. // vectors containing fp16 types.
  205. .fewerElementsIf(
  206. [=, &ST](const LegalityQuery &Query) {
  207. const auto &Ty = Query.Types[0];
  208. return Ty.isVector() && Ty.getElementType() == s16 &&
  209. !ST.hasFullFP16();
  210. },
  211. [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
  212. // If we don't have full FP16 support, then widen s16 to s32 if we
  213. // encounter it.
  214. .widenScalarIf(
  215. [=, &ST](const LegalityQuery &Query) {
  216. return Query.Types[0] == s16 && !ST.hasFullFP16();
  217. },
  218. [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
  219. .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
  220. getActionDefinitionsBuilder(
  221. {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
  222. // We need a call for these, so we always need to scalarize.
  223. .scalarize(0)
  224. // Regardless of FP16 support, widen 16-bit elements to 32-bits.
  225. .minScalar(0, s32)
  226. .libcallFor({s32, s64, v2s32, v4s32, v2s64});
  227. getActionDefinitionsBuilder(G_INSERT)
  228. .legalIf(all(typeInSet(0, {s32, s64, p0}),
  229. typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
  230. .widenScalarToNextPow2(0)
  231. .clampScalar(0, s32, s64)
  232. .widenScalarToNextPow2(1)
  233. .minScalar(1, s8)
  234. .maxScalarIf(typeInSet(0, {s32}), 1, s16)
  235. .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
  236. getActionDefinitionsBuilder(G_EXTRACT)
  237. .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
  238. typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
  239. .widenScalarToNextPow2(1)
  240. .clampScalar(1, s32, s128)
  241. .widenScalarToNextPow2(0)
  242. .minScalar(0, s16)
  243. .maxScalarIf(typeInSet(1, {s32}), 0, s16)
  244. .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
  245. .maxScalarIf(typeInSet(1, {s128}), 0, s64);
  246. for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
  247. auto &Actions = getActionDefinitionsBuilder(Op);
  248. if (Op == G_SEXTLOAD)
  249. Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
  250. // Atomics have zero extending behavior.
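        // Each entry below is {result type, pointer type, memory type, minimum
        // alignment}; e.g. {s32, p0, s8, 8} is an extending load of an 8-bit
        // value into s32.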
  251. Actions
  252. .legalForTypesWithMemDesc({{s32, p0, s8, 8},
  253. {s32, p0, s16, 8},
  254. {s32, p0, s32, 8},
  255. {s64, p0, s8, 2},
  256. {s64, p0, s16, 2},
  257. {s64, p0, s32, 4},
  258. {s64, p0, s64, 8},
  259. {p0, p0, s64, 8},
  260. {v2s32, p0, s64, 8}})
  261. .widenScalarToNextPow2(0)
  262. .clampScalar(0, s32, s64)
  263. // TODO: We could support sum-of-pow2's but the lowering code doesn't know
  264. // how to do that yet.
  265. .unsupportedIfMemSizeNotPow2()
  266. // Lower anything left over into G_*EXT and G_LOAD
  267. .lower();
  268. }
  269. auto IsPtrVecPred = [=](const LegalityQuery &Query) {
  270. const LLT &ValTy = Query.Types[0];
  271. if (!ValTy.isVector())
  272. return false;
  273. const LLT EltTy = ValTy.getElementType();
  274. return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  275. };
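        // Vectors of address-space-0 pointers take the custom action below:
        // legalizeLoadStore bitcasts them to integer-element vectors so the
        // existing s64 patterns can be reused.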
  276. getActionDefinitionsBuilder(G_LOAD)
  277. .customIf([=](const LegalityQuery &Query) {
  278. return Query.Types[0] == s128 &&
  279. Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
  280. })
  281. .legalForTypesWithMemDesc({{s8, p0, s8, 8},
  282. {s16, p0, s16, 8},
  283. {s32, p0, s32, 8},
  284. {s64, p0, s64, 8},
  285. {p0, p0, s64, 8},
  286. {s128, p0, s128, 8},
  287. {v8s8, p0, s64, 8},
  288. {v16s8, p0, s128, 8},
  289. {v4s16, p0, s64, 8},
  290. {v8s16, p0, s128, 8},
  291. {v2s32, p0, s64, 8},
  292. {v4s32, p0, s128, 8},
  293. {v2s64, p0, s128, 8}})
  294. // These extends are also legal
  295. .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
  296. .widenScalarToNextPow2(0, /* MinSize = */8)
  297. .lowerIfMemSizeNotByteSizePow2()
  298. .clampScalar(0, s8, s64)
  299. .narrowScalarIf([=](const LegalityQuery &Query) {
  300. // Clamp extending load results to 32-bits.
  301. return Query.Types[0].isScalar() &&
  302. Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
  303. Query.Types[0].getSizeInBits() > 32;
  304. },
  305. changeTo(0, s32))
  306. .clampMaxNumElements(0, s8, 16)
  307. .clampMaxNumElements(0, s16, 8)
  308. .clampMaxNumElements(0, s32, 4)
  309. .clampMaxNumElements(0, s64, 2)
  310. .clampMaxNumElements(0, p0, 2)
  311. .customIf(IsPtrVecPred)
  312. .scalarizeIf(typeIs(0, v2s16), 0);
  313. getActionDefinitionsBuilder(G_STORE)
  314. .customIf([=](const LegalityQuery &Query) {
  315. return Query.Types[0] == s128 &&
  316. Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
  317. })
  318. .legalForTypesWithMemDesc({{s8, p0, s8, 8},
  319. {s16, p0, s8, 8}, // truncstorei8 from s16
  320. {s32, p0, s8, 8}, // truncstorei8 from s32
  321. {s64, p0, s8, 8}, // truncstorei8 from s64
  322. {s16, p0, s16, 8},
  323. {s32, p0, s16, 8}, // truncstorei16 from s32
  324. {s64, p0, s16, 8}, // truncstorei16 from s64
  325. {s32, p0, s8, 8},
  326. {s32, p0, s16, 8},
  327. {s32, p0, s32, 8},
  328. {s64, p0, s64, 8},
  329. {s64, p0, s32, 8}, // truncstorei32 from s64
  330. {p0, p0, s64, 8},
  331. {s128, p0, s128, 8},
  332. {v16s8, p0, s128, 8},
  333. {v8s8, p0, s64, 8},
  334. {v4s16, p0, s64, 8},
  335. {v8s16, p0, s128, 8},
  336. {v2s32, p0, s64, 8},
  337. {v4s32, p0, s128, 8},
  338. {v2s64, p0, s128, 8}})
  339. .clampScalar(0, s8, s64)
  340. .lowerIf([=](const LegalityQuery &Query) {
  341. return Query.Types[0].isScalar() &&
  342. Query.Types[0] != Query.MMODescrs[0].MemoryTy;
  343. })
  344. // Maximum: sN * k = 128
  345. .clampMaxNumElements(0, s8, 16)
  346. .clampMaxNumElements(0, s16, 8)
  347. .clampMaxNumElements(0, s32, 4)
  348. .clampMaxNumElements(0, s64, 2)
  349. .clampMaxNumElements(0, p0, 2)
  350. .lowerIfMemSizeNotPow2()
  351. .customIf(IsPtrVecPred)
  352. .scalarizeIf(typeIs(0, v2s16), 0);
  353. // Constants
  354. getActionDefinitionsBuilder(G_CONSTANT)
  355. .legalFor({p0, s8, s16, s32, s64})
  356. .widenScalarToNextPow2(0)
  357. .clampScalar(0, s8, s64);
  358. getActionDefinitionsBuilder(G_FCONSTANT)
  359. .legalIf([=](const LegalityQuery &Query) {
  360. const auto &Ty = Query.Types[0];
  361. if (HasFP16 && Ty == s16)
  362. return true;
  363. return Ty == s32 || Ty == s64 || Ty == s128;
  364. })
  365. .clampScalar(0, MinFPScalar, s128);
  366. getActionDefinitionsBuilder({G_ICMP, G_FCMP})
  367. .legalFor({{s32, s32},
  368. {s32, s64},
  369. {s32, p0},
  370. {v4s32, v4s32},
  371. {v2s32, v2s32},
  372. {v2s64, v2s64},
  373. {v2s64, v2p0},
  374. {v4s16, v4s16},
  375. {v8s16, v8s16},
  376. {v8s8, v8s8},
  377. {v16s8, v16s8}})
  378. .widenScalarOrEltToNextPow2(1)
  379. .clampScalar(1, s32, s64)
  380. .clampScalar(0, s32, s32)
  381. .minScalarEltSameAsIf(
  382. [=](const LegalityQuery &Query) {
  383. const LLT &Ty = Query.Types[0];
  384. const LLT &SrcTy = Query.Types[1];
  385. return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
  386. Ty.getElementType() != SrcTy.getElementType();
  387. },
  388. 0, 1)
  389. .minScalarOrEltIf(
  390. [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
  391. 1, s32)
  392. .minScalarOrEltIf(
  393. [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
  394. s64)
  395. .clampNumElements(0, v2s32, v4s32);
  396. // Extensions
  397. auto ExtLegalFunc = [=](const LegalityQuery &Query) {
  398. unsigned DstSize = Query.Types[0].getSizeInBits();
  399. if (DstSize == 128 && !Query.Types[0].isVector())
  400. return false; // Extending to a scalar s128 needs narrowing.
  401. // Make sure that we have something that will fit in a register, and
  402. // make sure it's a power of 2.
  403. if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
  404. return false;
  405. const LLT &SrcTy = Query.Types[1];
  406. // Make sure we fit in a register otherwise. Don't bother checking that
  407. // the source type is below 128 bits. We shouldn't be allowing anything
  408. // through which is wider than the destination in the first place.
  409. unsigned SrcSize = SrcTy.getSizeInBits();
  410. if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
  411. return false;
  412. return true;
  413. };
  414. getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
  415. .legalIf(ExtLegalFunc)
  416. .clampScalar(0, s64, s64); // Just for s128, others are handled above.
  417. getActionDefinitionsBuilder(G_TRUNC)
  418. .minScalarOrEltIf(
  419. [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
  420. 0, s8)
  421. .customIf([=](const LegalityQuery &Query) {
  422. LLT DstTy = Query.Types[0];
  423. LLT SrcTy = Query.Types[1];
  424. return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
  425. })
  426. .alwaysLegal();
  427. getActionDefinitionsBuilder(G_SEXT_INREG)
  428. .legalFor({s32, s64})
  429. .legalFor(PackedVectorAllTypeList)
  430. .lower();
  431. // FP conversions
  432. getActionDefinitionsBuilder(G_FPTRUNC)
  433. .legalFor(
  434. {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
  435. .clampMaxNumElements(0, s32, 2);
  436. getActionDefinitionsBuilder(G_FPEXT)
  437. .legalFor(
  438. {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
  439. .clampMaxNumElements(0, s64, 2);
  440. // Conversions
  441. getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
  442. .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
  443. .widenScalarToNextPow2(0)
  444. .clampScalar(0, s32, s64)
  445. .widenScalarToNextPow2(1)
  446. .clampScalar(1, s32, s64);
  447. getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
  448. .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
  449. .clampScalar(1, s32, s64)
  450. .minScalarSameAs(1, 0)
  451. .clampScalar(0, s32, s64)
  452. .widenScalarToNextPow2(0);
  453. // Control-flow
  454. getActionDefinitionsBuilder(G_BRCOND)
  455. .legalFor({s32})
  456. .clampScalar(0, s32, s32);
  457. getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
  458. getActionDefinitionsBuilder(G_SELECT)
  459. .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
  460. .widenScalarToNextPow2(0)
  461. .clampScalar(0, s32, s64)
  462. .clampScalar(1, s32, s32)
  463. .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
  464. .lowerIf(isVector(0));
  465. // Pointer-handling
  466. getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
  467. if (TM.getCodeModel() == CodeModel::Small)
  468. getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  469. else
  470. getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
  471. getActionDefinitionsBuilder(G_PTRTOINT)
  472. .legalFor({{s64, p0}, {v2s64, v2p0}})
  473. .widenScalarToNextPow2(0, 64)
  474. .clampScalar(0, s64, s64);
  475. getActionDefinitionsBuilder(G_INTTOPTR)
  476. .unsupportedIf([&](const LegalityQuery &Query) {
  477. return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
  478. })
  479. .legalFor({{p0, s64}, {v2p0, v2s64}});
  481. // Casts for 32 and 64-bit width types are just copies.
  481. // Same for 128-bit width type, except they are on the FPR bank.
  482. getActionDefinitionsBuilder(G_BITCAST)
  483. // FIXME: This is wrong since G_BITCAST is not allowed to change the
  484. // number of bits but it's what the previous code described and fixing
  485. // it breaks tests.
  486. .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
  487. v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
  488. v2p0});
  489. getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
  490. // va_list must be a pointer, but most sized types are pretty easy to handle
  491. // as the destination.
  492. getActionDefinitionsBuilder(G_VAARG)
  493. .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
  494. .clampScalar(0, s8, s64)
  495. .widenScalarToNextPow2(0, /*Min*/ 8);
  496. getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
  497. .lowerIf(
  498. all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
  499. getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
  500. .customIf([](const LegalityQuery &Query) {
  501. return Query.Types[0].getSizeInBits() == 128;
  502. })
  503. .clampScalar(0, s32, s64)
  504. .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
  505. getActionDefinitionsBuilder(
  506. {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
  507. G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
  508. G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
  509. .clampScalar(0, s32, s64)
  510. .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
  511. getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
  512. // Merge/Unmerge
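        // A merge/unmerge stays legal only when the wide type is 32/64/128 bits
        // and the narrow type is 8/16/32/64 bits; e.g. unmerging an s128 into
        // two s64 halves is legal, while an s16 wide type is first widened to s32.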
  513. for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
  514. unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
  515. unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
  516. getActionDefinitionsBuilder(Op)
  517. .widenScalarToNextPow2(LitTyIdx, 8)
  518. .widenScalarToNextPow2(BigTyIdx, 32)
  519. .clampScalar(LitTyIdx, s8, s64)
  520. .clampScalar(BigTyIdx, s32, s128)
  521. .legalIf([=](const LegalityQuery &Q) {
  522. switch (Q.Types[BigTyIdx].getSizeInBits()) {
  523. case 32:
  524. case 64:
  525. case 128:
  526. break;
  527. default:
  528. return false;
  529. }
  530. switch (Q.Types[LitTyIdx].getSizeInBits()) {
  531. case 8:
  532. case 16:
  533. case 32:
  534. case 64:
  535. return true;
  536. default:
  537. return false;
  538. }
  539. });
  540. }
  541. getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
  542. .unsupportedIf([=](const LegalityQuery &Query) {
  543. const LLT &EltTy = Query.Types[1].getElementType();
  544. return Query.Types[0] != EltTy;
  545. })
  546. .minScalar(2, s64)
  547. .legalIf([=](const LegalityQuery &Query) {
  548. const LLT &VecTy = Query.Types[1];
  549. return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
  550. VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
  551. VecTy == v8s8 || VecTy == v16s8 ||
  552. VecTy == v2p0;
  553. })
  554. .minScalarOrEltIf(
  555. [=](const LegalityQuery &Query) {
  556. // We want to promote <M x s1> to <M x s64> if that wouldn't
  557. // cause the total vec size to be > 128b.
  558. return Query.Types[1].getNumElements() <= 2;
  559. },
  560. 0, s64)
  561. .minScalarOrEltIf(
  562. [=](const LegalityQuery &Query) {
  563. return Query.Types[1].getNumElements() <= 4;
  564. },
  565. 0, s32)
  566. .minScalarOrEltIf(
  567. [=](const LegalityQuery &Query) {
  568. return Query.Types[1].getNumElements() <= 8;
  569. },
  570. 0, s16)
  571. .minScalarOrEltIf(
  572. [=](const LegalityQuery &Query) {
  573. return Query.Types[1].getNumElements() <= 16;
  574. },
  575. 0, s8)
  576. .minScalarOrElt(0, s8) // Worst case, we need at least s8.
  577. .clampMaxNumElements(1, s64, 2)
  578. .clampMaxNumElements(1, s32, 4)
  579. .clampMaxNumElements(1, s16, 8)
  580. .clampMaxNumElements(1, p0, 2);
  581. getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
  582. .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
  583. getActionDefinitionsBuilder(G_BUILD_VECTOR)
  584. .legalFor({{v8s8, s8},
  585. {v16s8, s8},
  586. {v2s16, s16},
  587. {v4s16, s16},
  588. {v8s16, s16},
  589. {v2s32, s32},
  590. {v4s32, s32},
  591. {v2p0, p0},
  592. {v2s64, s64}})
  593. .clampNumElements(0, v4s32, v4s32)
  594. .clampNumElements(0, v2s64, v2s64)
  595. .minScalarOrElt(0, s8)
  596. .minScalarSameAs(1, 0);
  597. getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
  598. getActionDefinitionsBuilder(G_CTLZ)
  599. .legalForCartesianProduct(
  600. {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
  601. .scalarize(1);
  602. getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
  603. // TODO: Custom lowering for v2s32, v4s32, v2s64.
  604. getActionDefinitionsBuilder(G_BITREVERSE)
  605. .legalFor({s32, s64, v8s8, v16s8})
  606. .widenScalarToNextPow2(0, /*Min = */ 32)
  607. .clampScalar(0, s32, s64);
  608. getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
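        // Without CSSC there is no scalar count-trailing-zeros instruction; the
        // custom action (legalizeCTTZ) emits a bit-reverse followed by G_CTLZ.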
  609. getActionDefinitionsBuilder(G_CTTZ)
  610. .lowerIf(isVector(0))
  611. .clampScalar(0, s32, s64)
  612. .scalarSameSizeAs(1, 0)
  613. .legalIf([=](const LegalityQuery &Query) {
  614. return (HasCSSC && typeInSet(0, {s32, s64})(Query));
  615. })
  616. .customIf([=](const LegalityQuery &Query) {
  617. return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
  618. });
  619. getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
  620. .legalIf([=](const LegalityQuery &Query) {
  621. const LLT &DstTy = Query.Types[0];
  622. const LLT &SrcTy = Query.Types[1];
  623. // For now just support the TBL2 variant which needs the source vectors
  624. // to be the same size as the dest.
  625. if (DstTy != SrcTy)
  626. return false;
  627. return llvm::is_contained({v2s32, v4s32, v2s64, v2p0, v16s8, v8s16},
  628. DstTy);
  629. })
  630. // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
  631. // just want those lowered into G_BUILD_VECTOR.
  632. .lowerIf([=](const LegalityQuery &Query) {
  633. return !Query.Types[1].isVector();
  634. })
  635. .moreElementsIf(
  636. [](const LegalityQuery &Query) {
  637. return Query.Types[0].isVector() && Query.Types[1].isVector() &&
  638. Query.Types[0].getNumElements() >
  639. Query.Types[1].getNumElements();
  640. },
  641. changeTo(1, 0))
  642. .moreElementsToNextPow2(0)
  643. .clampNumElements(0, v4s32, v4s32)
  644. .clampNumElements(0, v2s64, v2s64);
  645. getActionDefinitionsBuilder(G_CONCAT_VECTORS)
  646. .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
  647. getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
  648. getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
  649. return Query.Types[0] == p0 && Query.Types[1] == s64;
  650. });
  651. getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
  652. if (ST.hasMOPS()) {
  653. // G_BZERO is not supported. Currently it is only emitted by
  654. // PreLegalizerCombiner for G_MEMSET with zero constant.
  655. getActionDefinitionsBuilder(G_BZERO).unsupported();
  656. getActionDefinitionsBuilder(G_MEMSET)
  657. .legalForCartesianProduct({p0}, {s64}, {s64})
  658. .customForCartesianProduct({p0}, {s8}, {s64})
  659. .immIdx(0); // Inform verifier imm idx 0 is handled.
  660. getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
  661. .legalForCartesianProduct({p0}, {p0}, {s64})
  662. .immIdx(0); // Inform verifier imm idx 0 is handled.
  663. // G_MEMCPY_INLINE does not have a tailcall immediate
  664. getActionDefinitionsBuilder(G_MEMCPY_INLINE)
  665. .legalForCartesianProduct({p0}, {p0}, {s64});
  666. } else {
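        // Without the MOPS extension these memory ops become libcalls to the
        // corresponding runtime routines (memset/memcpy/memmove).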
  667. getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
  668. .libcall();
  669. }
  670. // FIXME: Legal vector types are only legal with NEON.
  671. auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
  672. if (HasCSSC)
  673. ABSActions
  674. .legalFor({s32, s64});
  675. ABSActions
  676. .legalFor(PackedVectorAllTypeList)
  677. .lowerIf(isScalar(0));
  678. getActionDefinitionsBuilder(G_VECREDUCE_FADD)
  679. // We only have FADDP to do reduction-like operations. Lower the rest.
  680. .legalFor({{s32, v2s32}, {s64, v2s64}})
  681. .clampMaxNumElements(1, s64, 2)
  682. .clampMaxNumElements(1, s32, 2)
  683. .lower();
  684. getActionDefinitionsBuilder(G_VECREDUCE_ADD)
  685. .legalFor(
  686. {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
  687. .clampMaxNumElements(1, s64, 2)
  688. .clampMaxNumElements(1, s32, 4)
  689. .lower();
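        // Bitwise reductions: e.g. a v4s32 source is first split into two v2s32
        // halves so 64-bit vector ops can still be used, then scalarized and lowered.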
  690. getActionDefinitionsBuilder(
  691. {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
  692. // Try to break down into smaller vectors as long as they're at least 64
  693. // bits. This lets us use vector operations for some parts of the
  694. // reduction.
  695. .fewerElementsIf(
  696. [=](const LegalityQuery &Q) {
  697. LLT SrcTy = Q.Types[1];
  698. if (SrcTy.isScalar())
  699. return false;
  700. if (!isPowerOf2_32(SrcTy.getNumElements()))
  701. return false;
  702. // We can usually perform 64b vector operations.
  703. return SrcTy.getSizeInBits() > 64;
  704. },
  705. [=](const LegalityQuery &Q) {
  706. LLT SrcTy = Q.Types[1];
  707. return std::make_pair(1, SrcTy.divide(2));
  708. })
  709. .scalarize(1)
  710. .lower();
  711. getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
  712. .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
  713. getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
  714. getActionDefinitionsBuilder(G_ROTR)
  715. .legalFor({{s32, s64}, {s64, s64}})
  716. .customIf([=](const LegalityQuery &Q) {
  717. return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
  718. })
  719. .lower();
  720. getActionDefinitionsBuilder(G_ROTL).lower();
  721. getActionDefinitionsBuilder({G_SBFX, G_UBFX})
  722. .customFor({{s32, s32}, {s64, s64}});
  723. auto always = [=](const LegalityQuery &Q) { return true; };
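        // Scalar G_CTPOP is only legal with CSSC; otherwise it is custom-lowered
        // in legalizeCTPOP via NEON CNT plus a horizontal add.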
  724. auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
  725. if (HasCSSC)
  726. CTPOPActions
  727. .legalFor({{s32, s32},
  728. {s64, s64},
  729. {v8s8, v8s8},
  730. {v16s8, v16s8}})
  731. .customFor({{s128, s128},
  732. {v2s64, v2s64},
  733. {v2s32, v2s32},
  734. {v4s32, v4s32},
  735. {v4s16, v4s16},
  736. {v8s16, v8s16}});
  737. else
  738. CTPOPActions
  739. .legalFor({{v8s8, v8s8},
  740. {v16s8, v16s8}})
  741. .customFor({{s32, s32},
  742. {s64, s64},
  743. {s128, s128},
  744. {v2s64, v2s64},
  745. {v2s32, v2s32},
  746. {v4s32, v4s32},
  747. {v4s16, v4s16},
  748. {v8s16, v8s16}});
  749. CTPOPActions
  750. .clampScalar(0, s32, s128)
  751. .widenScalarToNextPow2(0)
  752. .minScalarEltSameAsIf(always, 1, 0)
  753. .maxScalarEltSameAsIf(always, 1, 0);
  754. // TODO: Vector types.
  755. getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
  756. // TODO: Vector types.
  757. getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
  758. .legalFor({MinFPScalar, s32, s64})
  759. .libcallFor({s128})
  760. .minScalar(0, MinFPScalar);
  761. getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM})
  762. .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
  763. .legalIf([=](const LegalityQuery &Query) {
  764. const auto &Ty = Query.Types[0];
  765. return (Ty == v8s16 || Ty == v4s16) && HasFP16;
  766. })
  767. .minScalar(0, MinFPScalar)
  768. .clampNumElements(0, v4s16, v8s16)
  769. .clampNumElements(0, v2s32, v4s32)
  770. .clampNumElements(0, v2s64, v2s64);
  771. // TODO: Libcall support for s128.
  772. // TODO: s16 should be legal with full FP16 support.
  773. getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
  774. .legalFor({{s64, s32}, {s64, s64}});
  775. // TODO: Custom legalization for vector types.
  776. // TODO: Custom legalization for mismatched types.
  777. // TODO: s16 support.
  778. getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});
  779. getActionDefinitionsBuilder(G_FMAD).lower();
  780. getLegacyLegalizerInfo().computeTables();
  781. verify(*ST.getInstrInfo());
  782. }
  783. bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
  784. MachineInstr &MI) const {
  785. MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  786. MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  787. GISelChangeObserver &Observer = Helper.Observer;
  788. switch (MI.getOpcode()) {
  789. default:
  790. // No idea what to do.
  791. return false;
  792. case TargetOpcode::G_VAARG:
  793. return legalizeVaArg(MI, MRI, MIRBuilder);
  794. case TargetOpcode::G_LOAD:
  795. case TargetOpcode::G_STORE:
  796. return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  797. case TargetOpcode::G_SHL:
  798. case TargetOpcode::G_ASHR:
  799. case TargetOpcode::G_LSHR:
  800. return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  801. case TargetOpcode::G_GLOBAL_VALUE:
  802. return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  803. case TargetOpcode::G_TRUNC:
  804. return legalizeVectorTrunc(MI, Helper);
  805. case TargetOpcode::G_SBFX:
  806. case TargetOpcode::G_UBFX:
  807. return legalizeBitfieldExtract(MI, MRI, Helper);
  808. case TargetOpcode::G_ROTR:
  809. return legalizeRotate(MI, MRI, Helper);
  810. case TargetOpcode::G_CTPOP:
  811. return legalizeCTPOP(MI, MRI, Helper);
  812. case TargetOpcode::G_ATOMIC_CMPXCHG:
  813. return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  814. case TargetOpcode::G_CTTZ:
  815. return legalizeCTTZ(MI, Helper);
  816. case TargetOpcode::G_BZERO:
  817. case TargetOpcode::G_MEMCPY:
  818. case TargetOpcode::G_MEMMOVE:
  819. case TargetOpcode::G_MEMSET:
  820. return legalizeMemOps(MI, Helper);
  821. case TargetOpcode::G_FCOPYSIGN:
  822. return legalizeFCopySign(MI, Helper);
  823. }
  824. llvm_unreachable("expected switch to return");
  825. }
  826. bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
  827. MachineRegisterInfo &MRI,
  828. LegalizerHelper &Helper) const {
  829. // To allow for imported patterns to match, we ensure that the rotate amount
  830. // is 64b with an extension.
  831. Register AmtReg = MI.getOperand(2).getReg();
  832. LLT AmtTy = MRI.getType(AmtReg);
  833. (void)AmtTy;
  834. assert(AmtTy.isScalar() && "Expected a scalar rotate");
  835. assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  836. auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
  837. Helper.Observer.changingInstr(MI);
  838. MI.getOperand(2).setReg(NewAmt.getReg(0));
  839. Helper.Observer.changedInstr(MI);
  840. return true;
  841. }
  842. static void extractParts(Register Reg, MachineRegisterInfo &MRI,
  843. MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
  844. SmallVectorImpl<Register> &VRegs) {
  845. for (int I = 0; I < NumParts; ++I)
  846. VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  847. MIRBuilder.buildUnmerge(VRegs, Reg);
  848. }
  849. bool AArch64LegalizerInfo::legalizeVectorTrunc(
  850. MachineInstr &MI, LegalizerHelper &Helper) const {
  851. MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  852. MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  853. // Similar to how operand splitting is done in SelectionDAG, we can handle
  854. // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  855. // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  856. // %lo16(<4 x s16>) = G_TRUNC %inlo
  857. // %hi16(<4 x s16>) = G_TRUNC %inhi
  858. // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  859. // %res(<8 x s8>) = G_TRUNC %in16
  860. Register DstReg = MI.getOperand(0).getReg();
  861. Register SrcReg = MI.getOperand(1).getReg();
  862. LLT DstTy = MRI.getType(DstReg);
  863. LLT SrcTy = MRI.getType(SrcReg);
  864. assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
  865. isPowerOf2_32(SrcTy.getSizeInBits()));
  866. // Split input type.
  867. LLT SplitSrcTy =
  868. SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  869. // First, split the source into two smaller vectors.
  870. SmallVector<Register, 2> SplitSrcs;
  871. extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
  872. // Truncate the splits into intermediate narrower elements.
  873. LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  874. for (unsigned I = 0; I < SplitSrcs.size(); ++I)
  875. SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
  876. auto Concat = MIRBuilder.buildConcatVectors(
  877. DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
  878. Helper.Observer.changingInstr(MI);
  879. MI.getOperand(1).setReg(Concat.getReg(0));
  880. Helper.Observer.changedInstr(MI);
  881. return true;
  882. }
  883. bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
  884. MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
  885. GISelChangeObserver &Observer) const {
  886. assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  887. // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  888. // G_ADD_LOW instructions.
  889. // By splitting this here, we can optimize accesses in the small code model by
  890. // folding the G_ADD_LOW into the load/store offset.
  891. auto &GlobalOp = MI.getOperand(1);
  892. const auto* GV = GlobalOp.getGlobal();
  893. if (GV->isThreadLocal())
  894. return true; // Don't want to modify TLS vars.
  895. auto &TM = ST->getTargetLowering()->getTargetMachine();
  896. unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
  897. if (OpFlags & AArch64II::MO_GOT)
  898. return true;
  899. auto Offset = GlobalOp.getOffset();
  900. Register DstReg = MI.getOperand(0).getReg();
  901. auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
  902. .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  903. // Set the regclass on the dest reg too.
  904. MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  905. // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  906. // by creating a MOVK that sets bits 48-63 of the register to (global address
  907. // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  908. // prevent an incorrect tag being generated during relocation when the
  909. // global appears before the code section. Without the offset, a global at
  910. // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  911. // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  912. // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  913. // instead of `0xf`.
  914. // This assumes that we're in the small code model so we can assume a binary
  915. // size of <= 4GB, which makes the untagged PC relative offset positive. The
  916. // binary must also be loaded into address range [0, 2^48). Both of these
  917. // properties need to be ensured at runtime when using tagged addresses.
  918. if (OpFlags & AArch64II::MO_TAGGED) {
  919. assert(!Offset &&
  920. "Should not have folded in an offset for a tagged global!");
  921. ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
  922. .addGlobalAddress(GV, 0x100000000,
  923. AArch64II::MO_PREL | AArch64II::MO_G3)
  924. .addImm(48);
  925. MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  926. }
  927. MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
  928. .addGlobalAddress(GV, Offset,
  929. OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  930. MI.eraseFromParent();
  931. return true;
  932. }
  933. bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
  934. MachineInstr &MI) const {
  935. switch (MI.getIntrinsicID()) {
  936. case Intrinsic::vacopy: {
  937. unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
  938. unsigned VaListSize =
  939. (ST->isTargetDarwin() || ST->isTargetWindows())
  940. ? PtrSize
  941. : ST->isTargetILP32() ? 20 : 32;
  942. MachineFunction &MF = *MI.getMF();
  943. auto Val = MF.getRegInfo().createGenericVirtualRegister(
  944. LLT::scalar(VaListSize * 8));
  945. MachineIRBuilder MIB(MI);
  946. MIB.buildLoad(Val, MI.getOperand(2),
  947. *MF.getMachineMemOperand(MachinePointerInfo(),
  948. MachineMemOperand::MOLoad,
  949. VaListSize, Align(PtrSize)));
  950. MIB.buildStore(Val, MI.getOperand(1),
  951. *MF.getMachineMemOperand(MachinePointerInfo(),
  952. MachineMemOperand::MOStore,
  953. VaListSize, Align(PtrSize)));
  954. MI.eraseFromParent();
  955. return true;
  956. }
  957. case Intrinsic::get_dynamic_area_offset: {
  958. MachineIRBuilder &MIB = Helper.MIRBuilder;
  959. MIB.buildConstant(MI.getOperand(0).getReg(), 0);
  960. MI.eraseFromParent();
  961. return true;
  962. }
  963. case Intrinsic::aarch64_mops_memset_tag: {
  964. assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  965. // Widen the value operand to 64 bits (an any-extend is sufficient here)
  966. MachineIRBuilder MIB(MI);
  967. auto &Value = MI.getOperand(3);
  968. Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
  969. Value.setReg(ZExtValueReg);
  970. return true;
  971. }
  972. case Intrinsic::prefetch: {
  973. MachineIRBuilder MIB(MI);
  974. auto &AddrVal = MI.getOperand(1);
  975. int64_t IsWrite = MI.getOperand(2).getImm();
  976. int64_t Locality = MI.getOperand(3).getImm();
  977. int64_t IsData = MI.getOperand(4).getImm();
  978. bool IsStream = Locality == 0;
  979. if (Locality != 0) {
  980. assert(Locality <= 3 && "Prefetch locality out-of-range");
  981. // The intrinsic's locality degree runs opposite to the PRFM cache-level
  982. // encoding (which starts at 0 for L1), so invert the value to select the
  983. // cache level.
  984. Locality = 3 - Locality;
  985. }
  986. unsigned PrfOp =
  987. (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
  988. MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
  989. MI.eraseFromParent();
  990. return true;
  991. }
  992. case Intrinsic::aarch64_prefetch: {
  993. MachineIRBuilder MIB(MI);
  994. auto &AddrVal = MI.getOperand(1);
  995. int64_t IsWrite = MI.getOperand(2).getImm();
  996. int64_t Target = MI.getOperand(3).getImm();
  997. int64_t IsStream = MI.getOperand(4).getImm();
  998. int64_t IsData = MI.getOperand(5).getImm();
  999. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
  1000. (!IsData << 3) | // IsDataCache bit
  1001. (Target << 1) | // Cache level bits
  1002. (unsigned)IsStream; // Stream bit
  1003. MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
  1004. MI.eraseFromParent();
  1005. return true;
  1006. }
  1007. }
  1008. return true;
  1009. }
  1010. bool AArch64LegalizerInfo::legalizeShlAshrLshr(
  1011. MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
  1012. GISelChangeObserver &Observer) const {
  1013. assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
  1014. MI.getOpcode() == TargetOpcode::G_LSHR ||
  1015. MI.getOpcode() == TargetOpcode::G_SHL);
  1016. // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  1017. // imported patterns can select it later. Either way, it will be legal.
  1018. Register AmtReg = MI.getOperand(2).getReg();
  1019. auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  1020. if (!VRegAndVal)
  1021. return true;
  1022. // Check the shift amount is in range for an immediate form.
  1023. int64_t Amount = VRegAndVal->Value.getSExtValue();
  1024. if (Amount > 31)
  1025. return true; // This will have to remain a register variant.
  1026. auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  1027. Observer.changingInstr(MI);
  1028. MI.getOperand(2).setReg(ExtCst.getReg(0));
  1029. Observer.changedInstr(MI);
  1030. return true;
  1031. }
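        // Fold a constant G_PTR_ADD offset into the LDP/STP immediate when it fits
        // the signed 7-bit, 8-byte-scaled range [-512, 504] checked by
        // isShiftedInt<7, 3>.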
  1032. static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
  1033. MachineRegisterInfo &MRI) {
  1034. Base = Root;
  1035. Offset = 0;
  1036. Register NewBase;
  1037. int64_t NewOffset;
  1038. if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
  1039. isShiftedInt<7, 3>(NewOffset)) {
  1040. Base = NewBase;
  1041. Offset = NewOffset;
  1042. }
  1043. }
  1044. // FIXME: This should be removed and replaced with the generic bitcast legalize
  1045. // action.
  1046. bool AArch64LegalizerInfo::legalizeLoadStore(
  1047. MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
  1048. GISelChangeObserver &Observer) const {
  1049. assert(MI.getOpcode() == TargetOpcode::G_STORE ||
  1050. MI.getOpcode() == TargetOpcode::G_LOAD);
  1051. // Here we just try to handle vector loads/stores where our value type might
  1052. // have pointer elements, which the SelectionDAG importer can't handle. To
  1053. // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  1054. // the value to use s64 types.
  1055. // Custom legalization requires that the instruction, if not deleted, be fully
  1056. // legalized. In order to allow further legalization of the inst, we create
  1057. // a new instruction and erase the existing one.
  1058. Register ValReg = MI.getOperand(0).getReg();
  1059. const LLT ValTy = MRI.getType(ValReg);
  1060. if (ValTy == LLT::scalar(128)) {
  1061. assert((*MI.memoperands_begin())->getSuccessOrdering() ==
  1062. AtomicOrdering::Monotonic ||
  1063. (*MI.memoperands_begin())->getSuccessOrdering() ==
  1064. AtomicOrdering::Unordered);
  1065. assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
  1066. LLT s64 = LLT::scalar(64);
  1067. MachineInstrBuilder NewI;
  1068. if (MI.getOpcode() == TargetOpcode::G_LOAD) {
  1069. NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
  1070. MIRBuilder.buildMergeLikeInstr(
  1071. ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
  1072. } else {
  1073. auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
  1074. NewI = MIRBuilder.buildInstr(
  1075. AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
  1076. }
  1077. Register Base;
  1078. int Offset;
  1079. matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
  1080. NewI.addUse(Base);
  1081. NewI.addImm(Offset / 8);
  1082. NewI.cloneMemRefs(MI);
  1083. constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
  1084. *MRI.getTargetRegisterInfo(),
  1085. *ST->getRegBankInfo());
  1086. MI.eraseFromParent();
  1087. return true;
  1088. }
  1089. if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
  1090. ValTy.getElementType().getAddressSpace() != 0) {
  1091. LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
  1092. return false;
  1093. }
  1094. unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  1095. const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  1096. auto &MMO = **MI.memoperands_begin();
  1097. MMO.setType(NewTy);
  1098. if (MI.getOpcode() == TargetOpcode::G_STORE) {
  1099. auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
  1100. MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  1101. } else {
  1102. auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
  1103. MIRBuilder.buildBitcast(ValReg, NewLoad);
  1104. }
  1105. MI.eraseFromParent();
  1106. return true;
  1107. }
  1108. bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
  1109. MachineRegisterInfo &MRI,
  1110. MachineIRBuilder &MIRBuilder) const {
  1111. MachineFunction &MF = MIRBuilder.getMF();
  1112. Align Alignment(MI.getOperand(2).getImm());
  1113. Register Dst = MI.getOperand(0).getReg();
  1114. Register ListPtr = MI.getOperand(1).getReg();
  1115. LLT PtrTy = MRI.getType(ListPtr);
  1116. LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
  1117. const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  1118. const Align PtrAlign = Align(PtrSize);
  1119. auto List = MIRBuilder.buildLoad(
  1120. PtrTy, ListPtr,
  1121. *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
  1122. PtrTy, PtrAlign));
  1123. MachineInstrBuilder DstPtr;
  1124. if (Alignment > PtrAlign) {
  1125. // Realign the list to the actual required alignment.
  1126. auto AlignMinus1 =
  1127. MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
  1128. auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
  1129. DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  1130. } else
  1131. DstPtr = List;
  1132. LLT ValTy = MRI.getType(Dst);
  1133. uint64_t ValSize = ValTy.getSizeInBits() / 8;
  1134. MIRBuilder.buildLoad(
  1135. Dst, DstPtr,
  1136. *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
  1137. ValTy, std::max(Alignment, PtrAlign)));
  1138. auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
  1139. auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
  1140. MIRBuilder.buildStore(NewList, ListPtr,
  1141. *MF.getMachineMemOperand(MachinePointerInfo(),
  1142. MachineMemOperand::MOStore,
  1143. PtrTy, PtrAlign));
  1144. MI.eraseFromParent();
  1145. return true;
  1146. }
  1147. bool AArch64LegalizerInfo::legalizeBitfieldExtract(
  1148. MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  1149. // Only legal if we can select immediate forms.
  1150. // TODO: Lower this otherwise.
  1151. return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
  1152. getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  1153. }
  1154. bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
  1155. MachineRegisterInfo &MRI,
  1156. LegalizerHelper &Helper) const {
  1157. // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  1158. // it can be more efficiently lowered to the following sequence that uses
  1159. // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  1160. // registers are cheap.
  1161. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
  1162. // CNT V0.8B, V0.8B // 8xbyte pop-counts
  1163. // ADDV B0, V0.8B // sum 8xbyte pop-counts
  1164. // UMOV X0, V0.B[0] // copy byte result back to integer reg
  1165. //
  1166. // For 128 bit vector popcounts, we lower to the following sequence:
  1167. // cnt.16b v0, v0 // v8s16, v4s32, v2s64
  1168. // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
  1169. // uaddlp.4s v0, v0 // v4s32, v2s64
  1170. // uaddlp.2d v0, v0 // v2s64
  1171. //
  1172. // For 64 bit vector popcounts, we lower to the following sequence:
  1173. // cnt.8b v0, v0 // v4s16, v2s32
  1174. // uaddlp.4h v0, v0 // v4s16, v2s32
  1175. // uaddlp.2s v0, v0 // v2s32
  1176. MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  1177. Register Dst = MI.getOperand(0).getReg();
  1178. Register Val = MI.getOperand(1).getReg();
  1179. LLT Ty = MRI.getType(Val);
  1180. unsigned Size = Ty.getSizeInBits();
  1181. assert(Ty == MRI.getType(Dst) &&
  1182. "Expected src and dst to have the same type!");
  1183. if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
  1184. LLT s64 = LLT::scalar(64);
  1185. auto Split = MIRBuilder.buildUnmerge(s64, Val);
  1186. auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
  1187. auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
  1188. auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
  1189. MIRBuilder.buildZExt(Dst, Add);
  1190. MI.eraseFromParent();
  1191. return true;
  1192. }
  1193. if (!ST->hasNEON() ||
  1194. MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
  1195. // Use generic lowering when custom lowering is not possible.
  1196. return Ty.isScalar() && (Size == 32 || Size == 64) &&
  1197. Helper.lowerBitCount(MI) ==
  1198. LegalizerHelper::LegalizeResult::Legalized;
  1199. }
  1200. // Pre-conditioning: widen Val up to the nearest vector type.
  1201. // s32,s64,v4s16,v2s32 -> v8s8
  1202. // v8s16,v4s32,v2s64 -> v16s8
  1203. LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  1204. if (Ty.isScalar()) {
  1205. assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
  1206. if (Size == 32) {
  1207. Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
  1208. }
  1209. }
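
  // From here on, e.g. for a plain s64 popcount, the code below builds
  // roughly the following MIR (a sketch; register names are illustrative):
  //   %v:_(<8 x s8>) = G_BITCAST %val(s64)
  //   %c:_(<8 x s8>) = G_CTPOP %v
  //   %s:_(s32)      = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %c
  //   %dst:_(s64)    = G_ZEXT %s(s32)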
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects=*/false)
               .addUse(HSum);
    HSum = UADD.getReg(0);
  }
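
  // For the vector cases each uaddlp step halves the lane count and doubles
  // the lane width, so e.g. a v4s32 popcount pairs the byte counts up twice
  // (sketch only):
  //   %h:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), %c
  //   %w:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), %h
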
  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up
    // with the rest of the MIR, so we must reassemble the extracted registers
    // into a 128-bit known-regclass value with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because
    // LDXP/STXP can take arbitrary registers, so it just has the normal GPR64
    // operands the rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
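    // These CMP_SWAP_128* pseudos survive until after register allocation and
    // are then expanded into an LDXP/STXP retry loop, which is why they can
    // use ordinary GPR64 operands here.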
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
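
  // Without a dedicated count-trailing-zeros instruction, RBIT + CLZ computes
  // the same result: cttz(x) == ctlz(bitreverse(x)). For an s64 input the
  // emitted MIR is roughly:
  //   %rev:_(s64) = G_BITREVERSE %x
  //   %dst:_(s64) = G_CTLZ %rev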
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic.
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Extend the value operand to 64 bits (an any-extend is enough; only the
    // low byte of the value is ever written by a memset).
    auto &Value = MI.getOperand(1);
    Register ZExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ZExtValueReg);
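    // This custom action is used for the MOPS (FEAT_MOPS) lowering; the
    // SETP/SETM/SETE instructions take the fill value in a 64-bit register
    // (only its low byte is written to memory), hence the widening above.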
    return true;
  }

  return false;
}

bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
                                             LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy.isScalar() && "Only expected scalars right now!");
  const unsigned DstSize = DstTy.getSizeInBits();
  assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
  assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
         "Expected homogeneous types!");

  // We want to materialize a mask with the high bit set.
  uint64_t EltMask;
  LLT VecTy;

  // TODO: s16 support.
  switch (DstSize) {
  default:
    llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
  case 64: {
    // AdvSIMD immediate moves cannot materialize our mask in a single
    // instruction for 64-bit elements. Instead, materialize zero and then
    // negate it.
    EltMask = 0;
    VecTy = LLT::fixed_vector(2, DstTy);
    break;
  }
  case 32:
    EltMask = 0x80000000ULL;
    VecTy = LLT::fixed_vector(4, DstTy);
    break;
  }

  // Widen In1 and In2 to 128 bits. We want these to eventually become
  // INSERT_SUBREGs.
  auto Undef = MIRBuilder.buildUndef(VecTy);
  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto Ins1 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(1).getReg(), Zero);
  auto Ins2 = MIRBuilder.buildInsertVectorElement(
      VecTy, Undef, MI.getOperand(2).getReg(), Zero);

  // Construct the mask.
  auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
  if (DstSize == 64)
    Mask = MIRBuilder.buildFNeg(VecTy, Mask);
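  // For 64-bit elements the mask ends up as a splat of 0x8000000000000000:
  // fneg of +0.0 flips only the sign bit, so negating a zero splat is a cheap
  // way to build a sign-bit-only mask. Roughly:
  //   %zero:_(<2 x s64>) = G_BUILD_VECTOR %c0, %c0   ; %c0:_(s64) = 0
  //   %mask:_(<2 x s64>) = G_FNEG %zero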
  auto Sel = MIRBuilder.buildInstr(AArch64::G_BIT, {VecTy}, {Ins1, Ins2, Mask});
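  // G_BIT takes bits from the second value operand wherever the mask bit is
  // set, i.e. roughly (Ins1 & ~Mask) | (Ins2 & Mask): the magnitude comes
  // from operand 1 and the sign bit from operand 2, matching G_FCOPYSIGN's
  // semantics.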

  // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
  // want this to eventually become an EXTRACT_SUBREG.
  SmallVector<Register, 2> DstRegs(1, Dst);
  for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
    DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
  MIRBuilder.buildUnmerge(DstRegs, Sel);
  MI.eraseFromParent();
  return true;
}