//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"
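
// Internal option (e.g. "llc -arm-memtransfer-tploop=allow", or
// "-mllvm -arm-memtransfer-tploop=allow" from clang) controlling whether
// memcpy/memset may be lowered to MVE tail-predicated (WLSTP) loops; the
// transformation is off by default.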
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));

// Emit, if possible, a specialized version of the given Libcall. Typically
// this means selecting the appropriately aligned version, but we also convert
// memset of 0 into memclr.
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();

  // Only use a specialized AEABI function if the default version of this
  // Libcall is an AEABI function.
  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
    return SDValue();

  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
  // able to translate memset to memclr and use the value to index the function
  // name array.
  enum {
    AEABI_MEMCPY = 0,
    AEABI_MEMMOVE,
    AEABI_MEMSET,
    AEABI_MEMCLR
  } AEABILibcall;
  switch (LC) {
  case RTLIB::MEMCPY:
    AEABILibcall = AEABI_MEMCPY;
    break;
  case RTLIB::MEMMOVE:
    AEABILibcall = AEABI_MEMMOVE;
    break;
  case RTLIB::MEMSET:
    AEABILibcall = AEABI_MEMSET;
    if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
      if (ConstantSrc->getZExtValue() == 0)
        AEABILibcall = AEABI_MEMCLR;
    break;
  default:
    return SDValue();
  }

  // Choose the most-aligned libcall variant that we can.
  enum {
    ALIGN1 = 0,
    ALIGN4,
    ALIGN8
  } AlignVariant;
  if ((Align & 7) == 0)
    AlignVariant = ALIGN8;
  else if ((Align & 3) == 0)
    AlignVariant = ALIGN4;
  else
    AlignVariant = ALIGN1;
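
  // For example, an 8-byte-aligned memcpy selects __aeabi_memcpy8 below, a
  // 4-byte-aligned one selects __aeabi_memcpy4, and a 2-byte-aligned one
  // falls back to the unaligned __aeabi_memcpy entry point.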

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  Entry.Node = Dst;
  Args.push_back(Entry);
  if (AEABILibcall == AEABI_MEMCLR) {
    Entry.Node = Size;
    Args.push_back(Entry);
  } else if (AEABILibcall == AEABI_MEMSET) {
    // Adjust the parameters for memset: the EABI uses the format
    // (ptr, size, value), whereas the GNU library uses (ptr, value, size).
    // See RTABI section 4.3.4.
    Entry.Node = Size;
    Args.push_back(Entry);

    // Extend or truncate the argument to be an i32 value for the call.
    if (Src.getValueType().bitsGT(MVT::i32))
      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    else if (Src.getValueType().bitsLT(MVT::i32))
      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
    Entry.Node = Src;
    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    Entry.IsSExt = false;
    Args.push_back(Entry);
  } else {
    Entry.Node = Src;
    Args.push_back(Entry);
    Entry.Node = Size;
    Args.push_back(Entry);
  }
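
  // Candidate entry points, indexed first by AEABILibcall and then by the
  // AlignVariant chosen above.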
  char const *FunctionNames[4][3] = {
    { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
    { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
  };
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(
          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
                                TLI->getPointerTy(DAG.getDataLayout())),
          std::move(Args))
      .setDiscardResult();
  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);

  return CallResult.second;
}
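
// Decide whether a memcpy/memset should be expanded as an MVE tail-predicated
// (WLSTP) loop rather than a libcall or an LDM/STM sequence, based on the
// arm-memtransfer-tploop option, the optimization level, and the transfer's
// size and alignment.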
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                       const SelectionDAG &DAG,
                                       ConstantSDNode *ConstantSize,
                                       Align Alignment, bool IsMemcpy) {
  auto &F = DAG.getMachineFunction().getFunction();
  if (!EnableMemtransferTPLoop)
    return false;
  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    return true;

  // Do not generate an inline TP loop if optimizations are disabled, or if
  // optimization for size (-Os or -Oz) is on.
  if (F.hasOptNone() || F.hasOptSize())
    return false;

  // If the option is set to "allow", always generate an inline TP loop for
  // memset; for memcpy, check some further conditions.
  if (!IsMemcpy)
    return true;
  if (!ConstantSize && Alignment >= Align(4))
    return true;
  if (ConstantSize &&
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
      ConstantSize->getZExtValue() <
          Subtarget.getMaxMemcpyTPInlineSizeThreshold())
    return true;
  return false;
}
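
// Inline expansion of memcpy: prefer an MVE tail-predicated loop when
// profitable, otherwise expand small, 4-byte-aligned, constant-size copies
// into MEMCPY pseudos (later lowered to LDM/STM) plus a few scalar operations
// for the trailing bytes, and otherwise try the specialized __aeabi_memcpy*
// libcalls.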
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();

  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to
  // map onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if the expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }
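
  // For example, a 27-byte copy on Thumb1 (MaxLoadsInLDM == 4) needs
  // NumMemOps == 6 word-sized operations, split 3+3 across two MEMCPY pseudos;
  // the remaining 3 bytes are handled by the i16/i8 loads and stores below.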
  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1-3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };
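
  // Emit all of the trailing loads first and merge their chains with a single
  // TokenFactor; only then emit the matching stores, so the loads are not
  // serialized behind the stores.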
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     makeArrayRef(TFOps, i));
}
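
// Memmove is never expanded inline here; we only try to use the specialized
// __aeabi_memmove* entry points and otherwise defer to the generic lowering.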
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMMOVE);
}
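
// For memset we either emit an MVE tail-predicated loop, splatting the byte
// value into a v16i8 vector, or fall back to the specialized
// __aeabi_memset*/__aeabi_memclr* libcalls.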
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Generate TP loop for llvm.memset
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 false)) {
    Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
                                  DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
  }

  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMSET);
}