//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers, making them unavailable to the register
// allocator while still allowing them to satisfy ABI requirements, such as
// passing arguments to a function call.
static cl::list<std::string> ReservedRegsForRA(
    "reserve-regs-for-regalloc",
    cl::desc("Reserve physical registers, so they can't be used by register "
             "allocator. Should only be used for testing register allocator."),
    cl::CommaSeparated, cl::Hidden);

static cl::opt<bool>
    ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
                                cl::init(false), cl::Hidden);

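// Return the subtarget's base cost for a vector insert/extract element,
// unless it was overridden on the command line with
// -aarch64-insert-extract-base-cost.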
unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

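// Resolve default CPU/tuning-CPU names, parse the feature string, and derive
// the CPU-specific tuning properties. Called via the InstrInfo member's
// initializer so that feature parsing happens before InstrInfo and TLInfo
// are constructed.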
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics
  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexX2:
  case CortexX3:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEModeDisabled)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      StreamingSVEModeDisabled(StreamingSVEModeDisabled),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
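  // Reserve X18 on platforms whose ABI designates it as the platform register
  // (e.g. Darwin, Windows, Android, Fuchsia).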
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

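  // Wire up the GlobalISel pipeline objects: call lowering, inline-asm
  // lowering, the legalizer, the register bank info, and the instruction
  // selector.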
  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

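  // Honor -reserve-regs-for-regalloc: any X register named there becomes
  // unavailable to the register allocator.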
  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

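/// Find the target operand flags that describe how a direct reference to a
/// global function should be lowered for the current subtarget.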
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures; on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

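// Top-byte ignore (TBI) is only assumed on OS/version combinations known to
// preserve the top address byte for userspace: DriverKit, and iOS 8 or later.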
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;

  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

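// When -force-streaming-compatible-sve is set, restrict codegen to what is
// legal in SVE streaming mode; this only makes sense when SVE or SME is
// available, which the assert below enforces.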
bool AArch64Subtarget::forceStreamingCompatibleSVE() const {
  if (ForceStreamingCompatibleSVE) {
    assert(hasSVEorSME() && "Expected SVE to be available");
    return hasSVEorSME();
  }
  return false;
}