AArch64Subtarget.cpp

//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"
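// The GET_SUBTARGETINFO_* guards above select the TableGen-generated
// subtarget constructor and the CPU/feature tables from
// AArch64GenSubtargetInfo.inc.
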
static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
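// The cl::opt flags above are ordinary command-line options; for example, a
// plausible way to turn the early if-converter off for one llc run is:
//   llc -mtriple=aarch64 -aarch64-early-ifcvt=false test.ll
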
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}
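// For example, CPUString == "cortex-a57" with an empty TuneCPUString selects
// the Cortex-A57 feature set and also tunes for that core, since TuneCPU
// defaults to CPU above.
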
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}
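// Note that the *LogAlignment properties set above are log2 values: for
// example, PrefFunctionLogAlignment = 4 requests 2^4 = 16-byte function
// alignment.
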
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}
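// Client code normally reaches this subtarget through the MachineFunction
// rather than constructing it directly, e.g. (illustrative only):
//   const auto &STI = MF.getSubtarget<AArch64Subtarget>();
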
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}
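// For example, a preemptable global on ELF is returned as MO_GOT and is
// typically lowered to an ADRP + LDR of its GOT slot, whereas an MO_NO_FLAG
// global is materialized with a direct ADRP + ADD pair.
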
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}
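// For example, with -aarch64-enable-nonlazybind a call to a function declared
// in IR as
//   declare void @callee() nonlazybind
// is routed through the GOT unless the callee is known to be DSO-local.
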
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}
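// With TBI the hardware ignores the top byte (bits 63:56) of a data address,
// so a tagged pointer such as 0xAB00007FFFFF1000 accesses the same memory as
// 0x0000007FFFFF1000; this predicate gates optimizations that rely on that
// behavior.
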
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}
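// balanceFPOps() corresponds to the "balance-fp-ops" subtarget feature (set
// for cores such as Cortex-A57); the A57ChainingConstraint then biases PBQP
// register allocation to balance FP multiply(-accumulate) chains across
// odd/even D-registers.
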
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }