  1. //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. /// This file implements a TargetTransformInfo analysis pass specific to the
  10. /// X86 target machine. It uses the target's detailed information to provide
  11. /// more precise answers to certain TTI queries, while letting the target
  12. /// independent and default TTI implementations handle the rest.
  13. ///
  14. //===----------------------------------------------------------------------===//
  15. #include "X86TargetTransformInfo.h"
  16. #include "llvm/IR/IntrinsicInst.h"
  17. #include "llvm/IR/IntrinsicsX86.h"
  18. #include "llvm/Support/KnownBits.h"
  19. #include "llvm/Transforms/InstCombine/InstCombiner.h"
  20. using namespace llvm;
  21. #define DEBUG_TYPE "x86tti"
  22. /// Return a constant boolean vector that has true elements in all positions
  23. /// where the input constant data vector has an element with the sign bit set.
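/// For example, a <2 x i64> constant <i64 -1, i64 8> yields
/// <2 x i1> <i1 true, i1 false>, since only the first element has its sign
/// bit set.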
  24. static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  25. VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  26. V = ConstantExpr::getBitCast(V, IntTy);
  27. V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
  28. V);
  29. return V;
  30. }
  31. /// Convert the x86 XMM integer vector mask to a vector of bools based on
  32. /// each element's most significant bit (the sign bit).
  33. static Value *getBoolVecFromMask(Value *Mask) {
  34. // Fold Constant Mask.
  35. if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
  36. return getNegativeIsTrueBoolVec(ConstantMask);
  37. // Mask was extended from a boolean vector.
  38. Value *ExtMask;
  39. if (PatternMatch::match(
  40. Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
  41. ExtMask->getType()->isIntOrIntVectorTy(1))
  42. return ExtMask;
  43. return nullptr;
  44. }
  45. // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  46. // XMM register mask efficiently, we could transform all x86 masked intrinsics
  47. // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  48. static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  49. Value *Ptr = II.getOperand(0);
  50. Value *Mask = II.getOperand(1);
  51. Constant *ZeroVec = Constant::getNullValue(II.getType());
  52. // Zero Mask - masked load instruction creates a zero vector.
  53. if (isa<ConstantAggregateZero>(Mask))
  54. return IC.replaceInstUsesWith(II, ZeroVec);
  55. // The mask is constant or extended from a bool vector. Convert this x86
  56. // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
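// E.g., an llvm.x86.avx.maskload.ps.256 whose mask operand is
// 'sext <8 x i1> %b to <8 x i32>' becomes an llvm.masked.load with %b as the
// mask, align 1 and a zeroinitializer pass-through (names are illustrative).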
  57. if (Value *BoolMask = getBoolVecFromMask(Mask)) {
  58. // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  59. // the LLVM intrinsic definition for the pointer argument.
  60. unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  61. PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  62. Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
  63. // The pass-through vector for an x86 masked load is a zero vector.
  64. CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
  65. II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
  66. return IC.replaceInstUsesWith(II, NewMaskedLoad);
  67. }
  68. return nullptr;
  69. }
  70. // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  71. // XMM register mask efficiently, we could transform all x86 masked intrinsics
  72. // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  73. static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  74. Value *Ptr = II.getOperand(0);
  75. Value *Mask = II.getOperand(1);
  76. Value *Vec = II.getOperand(2);
  77. // Zero Mask - this masked store instruction does nothing.
  78. if (isa<ConstantAggregateZero>(Mask)) {
  79. IC.eraseInstFromFunction(II);
  80. return true;
  81. }
  82. // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  83. // anything else at this level.
  84. if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
  85. return false;
  86. // The mask is constant or extended from a bool vector. Convert this x86
  87. // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  88. if (Value *BoolMask = getBoolVecFromMask(Mask)) {
  89. unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  90. PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  91. Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
  92. IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
  93. // 'Replace uses' doesn't work for stores. Erase the original masked store.
  94. IC.eraseInstFromFunction(II);
  95. return true;
  96. }
  97. return false;
  98. }
  99. static Value *simplifyX86immShift(const IntrinsicInst &II,
  100. InstCombiner::BuilderTy &Builder) {
  101. bool LogicalShift = false;
  102. bool ShiftLeft = false;
  103. bool IsImm = false;
  104. switch (II.getIntrinsicID()) {
  105. default:
  106. llvm_unreachable("Unexpected intrinsic!");
  107. case Intrinsic::x86_sse2_psrai_d:
  108. case Intrinsic::x86_sse2_psrai_w:
  109. case Intrinsic::x86_avx2_psrai_d:
  110. case Intrinsic::x86_avx2_psrai_w:
  111. case Intrinsic::x86_avx512_psrai_q_128:
  112. case Intrinsic::x86_avx512_psrai_q_256:
  113. case Intrinsic::x86_avx512_psrai_d_512:
  114. case Intrinsic::x86_avx512_psrai_q_512:
  115. case Intrinsic::x86_avx512_psrai_w_512:
  116. IsImm = true;
  117. LLVM_FALLTHROUGH;
  118. case Intrinsic::x86_sse2_psra_d:
  119. case Intrinsic::x86_sse2_psra_w:
  120. case Intrinsic::x86_avx2_psra_d:
  121. case Intrinsic::x86_avx2_psra_w:
  122. case Intrinsic::x86_avx512_psra_q_128:
  123. case Intrinsic::x86_avx512_psra_q_256:
  124. case Intrinsic::x86_avx512_psra_d_512:
  125. case Intrinsic::x86_avx512_psra_q_512:
  126. case Intrinsic::x86_avx512_psra_w_512:
  127. LogicalShift = false;
  128. ShiftLeft = false;
  129. break;
  130. case Intrinsic::x86_sse2_psrli_d:
  131. case Intrinsic::x86_sse2_psrli_q:
  132. case Intrinsic::x86_sse2_psrli_w:
  133. case Intrinsic::x86_avx2_psrli_d:
  134. case Intrinsic::x86_avx2_psrli_q:
  135. case Intrinsic::x86_avx2_psrli_w:
  136. case Intrinsic::x86_avx512_psrli_d_512:
  137. case Intrinsic::x86_avx512_psrli_q_512:
  138. case Intrinsic::x86_avx512_psrli_w_512:
  139. IsImm = true;
  140. LLVM_FALLTHROUGH;
  141. case Intrinsic::x86_sse2_psrl_d:
  142. case Intrinsic::x86_sse2_psrl_q:
  143. case Intrinsic::x86_sse2_psrl_w:
  144. case Intrinsic::x86_avx2_psrl_d:
  145. case Intrinsic::x86_avx2_psrl_q:
  146. case Intrinsic::x86_avx2_psrl_w:
  147. case Intrinsic::x86_avx512_psrl_d_512:
  148. case Intrinsic::x86_avx512_psrl_q_512:
  149. case Intrinsic::x86_avx512_psrl_w_512:
  150. LogicalShift = true;
  151. ShiftLeft = false;
  152. break;
  153. case Intrinsic::x86_sse2_pslli_d:
  154. case Intrinsic::x86_sse2_pslli_q:
  155. case Intrinsic::x86_sse2_pslli_w:
  156. case Intrinsic::x86_avx2_pslli_d:
  157. case Intrinsic::x86_avx2_pslli_q:
  158. case Intrinsic::x86_avx2_pslli_w:
  159. case Intrinsic::x86_avx512_pslli_d_512:
  160. case Intrinsic::x86_avx512_pslli_q_512:
  161. case Intrinsic::x86_avx512_pslli_w_512:
  162. IsImm = true;
  163. LLVM_FALLTHROUGH;
  164. case Intrinsic::x86_sse2_psll_d:
  165. case Intrinsic::x86_sse2_psll_q:
  166. case Intrinsic::x86_sse2_psll_w:
  167. case Intrinsic::x86_avx2_psll_d:
  168. case Intrinsic::x86_avx2_psll_q:
  169. case Intrinsic::x86_avx2_psll_w:
  170. case Intrinsic::x86_avx512_psll_d_512:
  171. case Intrinsic::x86_avx512_psll_q_512:
  172. case Intrinsic::x86_avx512_psll_w_512:
  173. LogicalShift = true;
  174. ShiftLeft = true;
  175. break;
  176. }
  177. assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
  178. Value *Vec = II.getArgOperand(0);
  179. Value *Amt = II.getArgOperand(1);
  180. auto *VT = cast<FixedVectorType>(Vec->getType());
  181. Type *SVT = VT->getElementType();
  182. Type *AmtVT = Amt->getType();
  183. unsigned VWidth = VT->getNumElements();
  184. unsigned BitWidth = SVT->getPrimitiveSizeInBits();
  185. // If the shift amount is guaranteed to be in-range we can replace it with a
  186. // generic shift. If it's guaranteed to be out of range, logical shifts combine
  187. // to zero and arithmetic shifts are clamped to (BitWidth - 1).
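// E.g., a psrli.d by a known amount of 35 folds to zero, while a psrai.d by
// 35 is clamped and becomes an ashr by 31.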
  188. if (IsImm) {
  189. assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
  190. KnownBits KnownAmtBits =
  191. llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  192. if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
  193. Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
  194. Amt = Builder.CreateVectorSplat(VWidth, Amt);
  195. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  196. : Builder.CreateLShr(Vec, Amt))
  197. : Builder.CreateAShr(Vec, Amt));
  198. }
  199. if (KnownAmtBits.getMinValue().uge(BitWidth)) {
  200. if (LogicalShift)
  201. return ConstantAggregateZero::get(VT);
  202. Amt = ConstantInt::get(SVT, BitWidth - 1);
  203. return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
  204. }
  205. } else {
  206. // Ensure the first element has an in-range value and the rest of the
  207. // elements in the bottom 64 bits are zero.
  208. assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
  209. cast<VectorType>(AmtVT)->getElementType() == SVT &&
  210. "Unexpected shift-by-scalar type");
  211. unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
  212. APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
  213. APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
  214. KnownBits KnownLowerBits = llvm::computeKnownBits(
  215. Amt, DemandedLower, II.getModule()->getDataLayout());
  216. KnownBits KnownUpperBits = llvm::computeKnownBits(
  217. Amt, DemandedUpper, II.getModule()->getDataLayout());
  218. if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
  219. (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
  220. SmallVector<int, 16> ZeroSplat(VWidth, 0);
  221. Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
  222. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  223. : Builder.CreateLShr(Vec, Amt))
  224. : Builder.CreateAShr(Vec, Amt));
  225. }
  226. }
  227. // Simplify if count is constant vector.
  228. auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  229. if (!CDV)
  230. return nullptr;
  231. // SSE2/AVX2 use only the first 64 bits of the 128-bit vector
  232. // operand to compute the shift amount.
  233. assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
  234. cast<VectorType>(AmtVT)->getElementType() == SVT &&
  235. "Unexpected shift-by-scalar type");
  236. // Concatenate the sub-elements to create the 64-bit value.
  237. APInt Count(64, 0);
  238. for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
  239. unsigned SubEltIdx = (NumSubElts - 1) - i;
  240. auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
  241. Count <<= BitWidth;
  242. Count |= SubElt->getValue().zextOrTrunc(64);
  243. }
  244. // If shift-by-zero then just return the original value.
  245. if (Count.isZero())
  246. return Vec;
  247. // Handle cases when Shift >= BitWidth.
  248. if (Count.uge(BitWidth)) {
  249. // If LogicalShift - just return zero.
  250. if (LogicalShift)
  251. return ConstantAggregateZero::get(VT);
  252. // If ArithmeticShift - clamp Shift to (BitWidth - 1).
  253. Count = APInt(64, BitWidth - 1);
  254. }
  255. // Get a constant vector of the same type as the first operand.
  256. auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  257. auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
  258. if (ShiftLeft)
  259. return Builder.CreateShl(Vec, ShiftVec);
  260. if (LogicalShift)
  261. return Builder.CreateLShr(Vec, ShiftVec);
  262. return Builder.CreateAShr(Vec, ShiftVec);
  263. }
  264. // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
  265. // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
  266. // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
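// E.g., psllv.d(x, <1, 2, 3, 4>) can become 'shl x, <1, 2, 3, 4>', but
// psllv.d(x, <1, 2, 3, 33>) cannot, since the out-of-range lane must yield 0.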
  267. static Value *simplifyX86varShift(const IntrinsicInst &II,
  268. InstCombiner::BuilderTy &Builder) {
  269. bool LogicalShift = false;
  270. bool ShiftLeft = false;
  271. switch (II.getIntrinsicID()) {
  272. default:
  273. llvm_unreachable("Unexpected intrinsic!");
  274. case Intrinsic::x86_avx2_psrav_d:
  275. case Intrinsic::x86_avx2_psrav_d_256:
  276. case Intrinsic::x86_avx512_psrav_q_128:
  277. case Intrinsic::x86_avx512_psrav_q_256:
  278. case Intrinsic::x86_avx512_psrav_d_512:
  279. case Intrinsic::x86_avx512_psrav_q_512:
  280. case Intrinsic::x86_avx512_psrav_w_128:
  281. case Intrinsic::x86_avx512_psrav_w_256:
  282. case Intrinsic::x86_avx512_psrav_w_512:
  283. LogicalShift = false;
  284. ShiftLeft = false;
  285. break;
  286. case Intrinsic::x86_avx2_psrlv_d:
  287. case Intrinsic::x86_avx2_psrlv_d_256:
  288. case Intrinsic::x86_avx2_psrlv_q:
  289. case Intrinsic::x86_avx2_psrlv_q_256:
  290. case Intrinsic::x86_avx512_psrlv_d_512:
  291. case Intrinsic::x86_avx512_psrlv_q_512:
  292. case Intrinsic::x86_avx512_psrlv_w_128:
  293. case Intrinsic::x86_avx512_psrlv_w_256:
  294. case Intrinsic::x86_avx512_psrlv_w_512:
  295. LogicalShift = true;
  296. ShiftLeft = false;
  297. break;
  298. case Intrinsic::x86_avx2_psllv_d:
  299. case Intrinsic::x86_avx2_psllv_d_256:
  300. case Intrinsic::x86_avx2_psllv_q:
  301. case Intrinsic::x86_avx2_psllv_q_256:
  302. case Intrinsic::x86_avx512_psllv_d_512:
  303. case Intrinsic::x86_avx512_psllv_q_512:
  304. case Intrinsic::x86_avx512_psllv_w_128:
  305. case Intrinsic::x86_avx512_psllv_w_256:
  306. case Intrinsic::x86_avx512_psllv_w_512:
  307. LogicalShift = true;
  308. ShiftLeft = true;
  309. break;
  310. }
  311. assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
  312. Value *Vec = II.getArgOperand(0);
  313. Value *Amt = II.getArgOperand(1);
  314. auto *VT = cast<FixedVectorType>(II.getType());
  315. Type *SVT = VT->getElementType();
  316. int NumElts = VT->getNumElements();
  317. int BitWidth = SVT->getIntegerBitWidth();
  318. // If the shift amount is guaranteed to be in-range we can replace it with a
  319. // generic shift.
  320. APInt UpperBits =
  321. APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  322. if (llvm::MaskedValueIsZero(Amt, UpperBits,
  323. II.getModule()->getDataLayout())) {
  324. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  325. : Builder.CreateLShr(Vec, Amt))
  326. : Builder.CreateAShr(Vec, Amt));
  327. }
  328. // Simplify if all shift amounts are constant/undef.
  329. auto *CShift = dyn_cast<Constant>(Amt);
  330. if (!CShift)
  331. return nullptr;
  332. // Collect each element's shift amount.
  333. // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  334. bool AnyOutOfRange = false;
  335. SmallVector<int, 8> ShiftAmts;
  336. for (int I = 0; I < NumElts; ++I) {
  337. auto *CElt = CShift->getAggregateElement(I);
  338. if (isa_and_nonnull<UndefValue>(CElt)) {
  339. ShiftAmts.push_back(-1);
  340. continue;
  341. }
  342. auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
  343. if (!COp)
  344. return nullptr;
  345. // Handle out of range shifts.
  346. // If LogicalShift - set to BitWidth (special case).
  347. // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
  348. APInt ShiftVal = COp->getValue();
  349. if (ShiftVal.uge(BitWidth)) {
  350. AnyOutOfRange = LogicalShift;
  351. ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
  352. continue;
  353. }
  354. ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  355. }
  356. // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
  357. // ArithmeticShift should only hit this if they are all UNDEF.
  358. auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  359. if (llvm::all_of(ShiftAmts, OutOfRange)) {
  360. SmallVector<Constant *, 8> ConstantVec;
  361. for (int Idx : ShiftAmts) {
  362. if (Idx < 0) {
  363. ConstantVec.push_back(UndefValue::get(SVT));
  364. } else {
  365. assert(LogicalShift && "Logical shift expected");
  366. ConstantVec.push_back(ConstantInt::getNullValue(SVT));
  367. }
  368. }
  369. return ConstantVector::get(ConstantVec);
  370. }
  371. // We can't handle only some out of range values with generic logical shifts.
  372. if (AnyOutOfRange)
  373. return nullptr;
  374. // Build the shift amount constant vector.
  375. SmallVector<Constant *, 8> ShiftVecAmts;
  376. for (int Idx : ShiftAmts) {
  377. if (Idx < 0)
  378. ShiftVecAmts.push_back(UndefValue::get(SVT));
  379. else
  380. ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  381. }
  382. auto ShiftVec = ConstantVector::get(ShiftVecAmts);
  383. if (ShiftLeft)
  384. return Builder.CreateShl(Vec, ShiftVec);
  385. if (LogicalShift)
  386. return Builder.CreateLShr(Vec, ShiftVec);
  387. return Builder.CreateAShr(Vec, ShiftVec);
  388. }
  389. static Value *simplifyX86pack(IntrinsicInst &II,
  390. InstCombiner::BuilderTy &Builder, bool IsSigned) {
  391. Value *Arg0 = II.getArgOperand(0);
  392. Value *Arg1 = II.getArgOperand(1);
  393. Type *ResTy = II.getType();
  394. // Fast all undef handling.
  395. if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
  396. return UndefValue::get(ResTy);
  397. auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  398. unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  399. unsigned NumSrcElts = ArgTy->getNumElements();
  400. assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
  401. "Unexpected packing types");
  402. unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  403. unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  404. unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  405. assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
  406. "Unexpected packing types");
  407. // Constant folding.
  408. if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
  409. return nullptr;
  410. // Clamp Values - signed/unsigned both use signed clamp values, but they
  411. // differ on the min/max values.
  412. APInt MinValue, MaxValue;
  413. if (IsSigned) {
  414. // PACKSS: Truncate signed value with signed saturation.
  415. // Source values less than dst minint are saturated to minint.
  416. // Source values greater than dst maxint are saturated to maxint.
  417. MinValue =
  418. APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  419. MaxValue =
  420. APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  421. } else {
  422. // PACKUS: Truncate signed value with unsigned saturation.
  423. // Source values less than zero are saturated to zero.
  424. // Source values greater than dst maxuint are saturated to maxuint.
  425. MinValue = APInt::getZero(SrcScalarSizeInBits);
  426. MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  427. }
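// E.g., for packsswb (i16 -> i8) the clamp constants are -128 and 127, so a
// source element of 300 is clamped to 127 before the truncating shuffle.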
  428. auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  429. auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  430. Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  431. Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  432. Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  433. Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
  434. // Shuffle clamped args together at the lane level.
  435. SmallVector<int, 32> PackMask;
  436. for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
  437. for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
  438. PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
  439. for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
  440. PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  441. }
  442. auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
  443. // Truncate to dst size.
  444. return Builder.CreateTrunc(Shuffle, ResTy);
  445. }
  446. static Value *simplifyX86movmsk(const IntrinsicInst &II,
  447. InstCombiner::BuilderTy &Builder) {
  448. Value *Arg = II.getArgOperand(0);
  449. Type *ResTy = II.getType();
  450. // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  451. if (isa<UndefValue>(Arg))
  452. return Constant::getNullValue(ResTy);
  453. auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  454. // We can't easily peek through x86_mmx types.
  455. if (!ArgTy)
  456. return nullptr;
  457. // Expand MOVMSK to compare/bitcast/zext:
  458. // e.g. PMOVMSKB(v16i8 x):
  459. // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  460. // %int = bitcast <16 x i1> %cmp to i16
  461. // %res = zext i16 %int to i32
  462. unsigned NumElts = ArgTy->getNumElements();
  463. Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  464. Type *IntegerTy = Builder.getIntNTy(NumElts);
  465. Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  466. Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  467. Res = Builder.CreateBitCast(Res, IntegerTy);
  468. Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  469. return Res;
  470. }
  471. static Value *simplifyX86addcarry(const IntrinsicInst &II,
  472. InstCombiner::BuilderTy &Builder) {
  473. Value *CarryIn = II.getArgOperand(0);
  474. Value *Op1 = II.getArgOperand(1);
  475. Value *Op2 = II.getArgOperand(2);
  476. Type *RetTy = II.getType();
  477. Type *OpTy = Op1->getType();
  478. assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
  479. RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
  480. "Unexpected types for x86 addcarry");
  481. // If carry-in is zero, this is just an unsigned add with overflow.
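// E.g., with a zero carry-in, the sum and carry-out match those of
// llvm.uadd.with.overflow, with the i1 overflow bit widened to i8.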
  482. if (match(CarryIn, PatternMatch::m_ZeroInt())) {
  483. Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
  484. {Op1, Op2});
  485. // The types have to be adjusted to match the x86 call types.
  486. Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
  487. Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
  488. Builder.getInt8Ty());
  489. Value *Res = UndefValue::get(RetTy);
  490. Res = Builder.CreateInsertValue(Res, UAddOV, 0);
  491. return Builder.CreateInsertValue(Res, UAddResult, 1);
  492. }
  493. return nullptr;
  494. }
  495. static Value *simplifyX86insertps(const IntrinsicInst &II,
  496. InstCombiner::BuilderTy &Builder) {
  497. auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  498. if (!CInt)
  499. return nullptr;
  500. auto *VecTy = cast<FixedVectorType>(II.getType());
  501. assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
  502. // The immediate permute control byte looks like this:
  503. // [3:0] - zero mask for each 32-bit lane
  504. // [5:4] - select one 32-bit destination lane
  505. // [7:6] - select one 32-bit source lane
  506. uint8_t Imm = CInt->getZExtValue();
  507. uint8_t ZMask = Imm & 0xf;
  508. uint8_t DestLane = (Imm >> 4) & 0x3;
  509. uint8_t SourceLane = (Imm >> 6) & 0x3;
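// E.g., an immediate of 0x60 (0b01'10'0000) selects source lane 1 and
// destination lane 2 with no zeroed lanes, so insertps(a, b, 0x60) yields
// <a[0], a[1], b[1], a[3]>.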
  510. ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
  511. // If all zero mask bits are set, this was just a weird way to
  512. // generate a zero vector.
  513. if (ZMask == 0xf)
  514. return ZeroVector;
  515. // Initialize by passing all of the first source bits through.
  516. int ShuffleMask[4] = {0, 1, 2, 3};
  517. // We may replace the second operand with the zero vector.
  518. Value *V1 = II.getArgOperand(1);
  519. if (ZMask) {
  520. // If the zero mask is being used with a single input or the zero mask
  521. // overrides the destination lane, this is a shuffle with the zero vector.
  522. if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
  523. (ZMask & (1 << DestLane))) {
  524. V1 = ZeroVector;
  525. // We may still move 32-bits of the first source vector from one lane
  526. // to another.
  527. ShuffleMask[DestLane] = SourceLane;
  528. // The zero mask may override the previous insert operation.
  529. for (unsigned i = 0; i < 4; ++i)
  530. if ((ZMask >> i) & 0x1)
  531. ShuffleMask[i] = i + 4;
  532. } else {
  533. // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
  534. return nullptr;
  535. }
  536. } else {
  537. // Replace the selected destination lane with the selected source lane.
  538. ShuffleMask[DestLane] = SourceLane + 4;
  539. }
  540. return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
  541. }
  542. /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
  543. /// or conversion to a shuffle vector.
  544. static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
  545. ConstantInt *CILength, ConstantInt *CIIndex,
  546. InstCombiner::BuilderTy &Builder) {
  547. auto LowConstantHighUndef = [&](uint64_t Val) {
  548. Type *IntTy64 = Type::getInt64Ty(II.getContext());
  549. Constant *Args[] = {ConstantInt::get(IntTy64, Val),
  550. UndefValue::get(IntTy64)};
  551. return ConstantVector::get(Args);
  552. };
  553. // See if we're dealing with constant values.
  554. auto *C0 = dyn_cast<Constant>(Op0);
  555. auto *CI0 =
  556. C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
  557. : nullptr;
  558. // Attempt to constant fold.
  559. if (CILength && CIIndex) {
  560. // From AMD documentation: "The bit index and field length are each six
  561. // bits in length; other bits of the field are ignored."
  562. APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
  563. APInt APLength = CILength->getValue().zextOrTrunc(6);
  564. unsigned Index = APIndex.getZExtValue();
  565. // From AMD documentation: "a value of zero in the field length is
  566. // defined as length of 64".
  567. unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
  568. // From AMD documentation: "If the sum of the bit index + length field
  569. // is greater than 64, the results are undefined".
  570. unsigned End = Index + Length;
  571. // Note that both field index and field length are 8-bit quantities.
  572. // Since variables 'Index' and 'Length' are unsigned values
  573. // obtained from zero-extending field index and field length
  574. // respectively, their sum should never wrap around.
  575. if (End > 64)
  576. return UndefValue::get(II.getType());
  577. // If we are extracting whole bytes, we can convert this to a shuffle.
  578. // Lowering can recognize EXTRQI shuffle masks.
  579. if ((Length % 8) == 0 && (Index % 8) == 0) {
  580. // Convert bit indices to byte indices.
  581. Length /= 8;
  582. Index /= 8;
  583. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  584. auto *ShufTy = FixedVectorType::get(IntTy8, 16);
  585. SmallVector<int, 16> ShuffleMask;
  586. for (int i = 0; i != (int)Length; ++i)
  587. ShuffleMask.push_back(i + Index);
  588. for (int i = Length; i != 8; ++i)
  589. ShuffleMask.push_back(i + 16);
  590. for (int i = 8; i != 16; ++i)
  591. ShuffleMask.push_back(-1);
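// E.g., a bit index of 8 with a field length of 16 (byte index 1, length 2)
// builds a mask that moves bytes 1-2 of Op0 into bytes 0-1, zeroes bytes 2-7
// and leaves the upper eight bytes undef.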
  592. Value *SV = Builder.CreateShuffleVector(
  593. Builder.CreateBitCast(Op0, ShufTy),
  594. ConstantAggregateZero::get(ShufTy), ShuffleMask);
  595. return Builder.CreateBitCast(SV, II.getType());
  596. }
  597. // Constant Fold - shift Index'th bit to lowest position and mask off
  598. // Length bits.
  599. if (CI0) {
  600. APInt Elt = CI0->getValue();
  601. Elt.lshrInPlace(Index);
  602. Elt = Elt.zextOrTrunc(Length);
  603. return LowConstantHighUndef(Elt.getZExtValue());
  604. }
  605. // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
  606. if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
  607. Value *Args[] = {Op0, CILength, CIIndex};
  608. Module *M = II.getModule();
  609. Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
  610. return Builder.CreateCall(F, Args);
  611. }
  612. }
  613. // Constant Fold - extraction from zero is always {zero, undef}.
  614. if (CI0 && CI0->isZero())
  615. return LowConstantHighUndef(0);
  616. return nullptr;
  617. }
  618. /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
  619. /// folding or conversion to a shuffle vector.
  620. static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
  621. APInt APLength, APInt APIndex,
  622. InstCombiner::BuilderTy &Builder) {
  623. // From AMD documentation: "The bit index and field length are each six bits
  624. // in length; other bits of the field are ignored."
  625. APIndex = APIndex.zextOrTrunc(6);
  626. APLength = APLength.zextOrTrunc(6);
  627. // Attempt to constant fold.
  628. unsigned Index = APIndex.getZExtValue();
  629. // From AMD documentation: "a value of zero in the field length is
  630. // defined as length of 64".
  631. unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
  632. // From AMD documentation: "If the sum of the bit index + length field
  633. // is greater than 64, the results are undefined".
  634. unsigned End = Index + Length;
  635. // Note that both field index and field length are 8-bit quantities.
  636. // Since variables 'Index' and 'Length' are unsigned values
  637. // obtained from zero-extending field index and field length
  638. // respectively, their sum should never wrap around.
  639. if (End > 64)
  640. return UndefValue::get(II.getType());
  641. // If we are inserting whole bytes, we can convert this to a shuffle.
  642. // Lowering can recognize INSERTQI shuffle masks.
  643. if ((Length % 8) == 0 && (Index % 8) == 0) {
  644. // Convert bit indices to byte indices.
  645. Length /= 8;
  646. Index /= 8;
  647. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  648. auto *ShufTy = FixedVectorType::get(IntTy8, 16);
  649. SmallVector<int, 16> ShuffleMask;
  650. for (int i = 0; i != (int)Index; ++i)
  651. ShuffleMask.push_back(i);
  652. for (int i = 0; i != (int)Length; ++i)
  653. ShuffleMask.push_back(i + 16);
  654. for (int i = Index + Length; i != 8; ++i)
  655. ShuffleMask.push_back(i);
  656. for (int i = 8; i != 16; ++i)
  657. ShuffleMask.push_back(-1);
  658. Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
  659. Builder.CreateBitCast(Op1, ShufTy),
  660. ShuffleMask);
  661. return Builder.CreateBitCast(SV, II.getType());
  662. }
  663. // See if we're dealing with constant values.
  664. auto *C0 = dyn_cast<Constant>(Op0);
  665. auto *C1 = dyn_cast<Constant>(Op1);
  666. auto *CI00 =
  667. C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
  668. : nullptr;
  669. auto *CI10 =
  670. C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
  671. : nullptr;
  672. // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  673. if (CI00 && CI10) {
  674. APInt V00 = CI00->getValue();
  675. APInt V10 = CI10->getValue();
  676. APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
  677. V00 = V00 & ~Mask;
  678. V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
  679. APInt Val = V00 | V10;
  680. Type *IntTy64 = Type::getInt64Ty(II.getContext());
  681. Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
  682. UndefValue::get(IntTy64)};
  683. return ConstantVector::get(Args);
  684. }
  685. // If we were an INSERTQ call, we'll save demanded elements if we convert to
  686. // INSERTQI.
  687. if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
  688. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  689. Constant *CILength = ConstantInt::get(IntTy8, Length, false);
  690. Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
  691. Value *Args[] = {Op0, Op1, CILength, CIIndex};
  692. Module *M = II.getModule();
  693. Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
  694. return Builder.CreateCall(F, Args);
  695. }
  696. return nullptr;
  697. }
  698. /// Attempt to convert pshufb* to shufflevector if the mask is constant.
  699. static Value *simplifyX86pshufb(const IntrinsicInst &II,
  700. InstCombiner::BuilderTy &Builder) {
  701. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  702. if (!V)
  703. return nullptr;
  704. auto *VecTy = cast<FixedVectorType>(II.getType());
  705. unsigned NumElts = VecTy->getNumElements();
  706. assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
  707. "Unexpected number of elements in shuffle mask!");
  708. // Construct a shuffle mask from constant integers or UNDEFs.
  709. int Indexes[64];
  710. // Each byte in the shuffle control mask forms an index to permute the
  711. // corresponding byte in the destination operand.
  712. for (unsigned I = 0; I < NumElts; ++I) {
  713. Constant *COp = V->getAggregateElement(I);
  714. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  715. return nullptr;
  716. if (isa<UndefValue>(COp)) {
  717. Indexes[I] = -1;
  718. continue;
  719. }
  720. int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
  721. // If the most significant bit (bit[7]) of each byte of the shuffle
  722. // control mask is set, then zero is written in the result byte.
  723. // The zero vector is in the right-hand side of the resulting
  724. // shufflevector.
  725. // The value of each index for the high 128-bit lane is the least
  726. // significant 4 bits of the respective shuffle control byte.
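// E.g., a control byte of 0x83 selects the zero vector (bit 7 is set), while
// 0x03 in the upper 128-bit lane of a 256-bit shuffle maps to source byte 19
// (3 + 16).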
  727. Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
  728. Indexes[I] = Index;
  729. }
  730. auto V1 = II.getArgOperand(0);
  731. auto V2 = Constant::getNullValue(VecTy);
  732. return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
  733. }
  734. /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
  735. static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
  736. InstCombiner::BuilderTy &Builder) {
  737. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  738. if (!V)
  739. return nullptr;
  740. auto *VecTy = cast<FixedVectorType>(II.getType());
  741. unsigned NumElts = VecTy->getNumElements();
  742. bool IsPD = VecTy->getScalarType()->isDoubleTy();
  743. unsigned NumLaneElts = IsPD ? 2 : 4;
  744. assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
  745. // Construct a shuffle mask from constant integers or UNDEFs.
  746. int Indexes[16];
  747. // The intrinsics only read one or two bits, clear the rest.
  748. for (unsigned I = 0; I < NumElts; ++I) {
  749. Constant *COp = V->getAggregateElement(I);
  750. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  751. return nullptr;
  752. if (isa<UndefValue>(COp)) {
  753. Indexes[I] = -1;
  754. continue;
  755. }
  756. APInt Index = cast<ConstantInt>(COp)->getValue();
  757. Index = Index.zextOrTrunc(32).getLoBits(2);
  758. // The PD variants use bit 1 to select the per-lane element index, so
  759. // shift down to convert to a generic shuffle mask index.
  760. if (IsPD)
  761. Index.lshrInPlace(1);
  762. // The _256 variants are a bit trickier since the mask bits always index
  763. // into the corresponding 128-bit half. In order to convert to a generic
  764. // shuffle, we have to make that explicit.
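// E.g., for vpermilvar.ps.256, a control value of 1 in element 6 becomes
// shuffle index 5 (1 plus the lane base of 4).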
  765. Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
  766. Indexes[I] = Index.getZExtValue();
  767. }
  768. auto V1 = II.getArgOperand(0);
  769. return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
  770. }
  771. /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
  772. static Value *simplifyX86vpermv(const IntrinsicInst &II,
  773. InstCombiner::BuilderTy &Builder) {
  774. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  775. if (!V)
  776. return nullptr;
  777. auto *VecTy = cast<FixedVectorType>(II.getType());
  778. unsigned Size = VecTy->getNumElements();
  779. assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
  780. "Unexpected shuffle mask size");
  781. // Construct a shuffle mask from constant integers or UNDEFs.
  782. int Indexes[64];
  783. for (unsigned I = 0; I < Size; ++I) {
  784. Constant *COp = V->getAggregateElement(I);
  785. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  786. return nullptr;
  787. if (isa<UndefValue>(COp)) {
  788. Indexes[I] = -1;
  789. continue;
  790. }
  791. uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
  792. Index &= Size - 1;
  793. Indexes[I] = Index;
  794. }
  795. auto V1 = II.getArgOperand(0);
  796. return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
  797. }
  798. Optional<Instruction *>
  799. X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  800. auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
  801. unsigned DemandedWidth) {
  802. APInt UndefElts(Width, 0);
  803. APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
  804. return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  805. };
  806. Intrinsic::ID IID = II.getIntrinsicID();
  807. switch (IID) {
  808. case Intrinsic::x86_bmi_bextr_32:
  809. case Intrinsic::x86_bmi_bextr_64:
  810. case Intrinsic::x86_tbm_bextri_u32:
  811. case Intrinsic::x86_tbm_bextri_u64:
  812. // If the RHS is a constant we can try some simplifications.
  813. if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  814. uint64_t Shift = C->getZExtValue();
  815. uint64_t Length = (Shift >> 8) & 0xff;
  816. Shift &= 0xff;
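// E.g., a control value of 0x0810 extracts 8 bits starting at bit 16, so
// bextr(0x12345678, 0x0810) == 0x34.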
  817. unsigned BitWidth = II.getType()->getIntegerBitWidth();
  818. // If the length is 0 or the shift is out of range, replace with zero.
  819. if (Length == 0 || Shift >= BitWidth) {
  820. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  821. }
  822. // If the LHS is also a constant, we can completely constant fold this.
  823. if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  824. uint64_t Result = InC->getZExtValue() >> Shift;
  825. if (Length > BitWidth)
  826. Length = BitWidth;
  827. Result &= maskTrailingOnes<uint64_t>(Length);
  828. return IC.replaceInstUsesWith(II,
  829. ConstantInt::get(II.getType(), Result));
  830. }
  831. // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
  832. // are only masking bits that a shift already cleared?
  833. }
  834. break;
  835. case Intrinsic::x86_bmi_bzhi_32:
  836. case Intrinsic::x86_bmi_bzhi_64:
  837. // If the RHS is a constant we can try some simplifications.
  838. if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  839. uint64_t Index = C->getZExtValue() & 0xff;
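// E.g., bzhi(x, 8) is equivalent to x & 0xff; the constant fold below applies
// exactly that trailing-ones mask.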
  840. unsigned BitWidth = II.getType()->getIntegerBitWidth();
  841. if (Index >= BitWidth) {
  842. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  843. }
  844. if (Index == 0) {
  845. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  846. }
  847. // If the LHS is also a constant, we can completely constant fold this.
  848. if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  849. uint64_t Result = InC->getZExtValue();
  850. Result &= maskTrailingOnes<uint64_t>(Index);
  851. return IC.replaceInstUsesWith(II,
  852. ConstantInt::get(II.getType(), Result));
  853. }
  854. // TODO should we convert this to an AND if the RHS is constant?
  855. }
  856. break;
  857. case Intrinsic::x86_bmi_pext_32:
  858. case Intrinsic::x86_bmi_pext_64:
  859. if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  860. if (MaskC->isNullValue()) {
  861. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  862. }
  863. if (MaskC->isAllOnesValue()) {
  864. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  865. }
  866. if (MaskC->getValue().isShiftedMask()) {
  867. // Any single contiguous sequence of 1s anywhere in the mask simply
  868. // describes a subset of the input bits shifted to the appropriate
  869. // position. Replace with the straightforward IR.
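// E.g., pext(x, 0x00f0) == (x & 0x00f0) >> 4.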
  870. unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
  871. Value *Input = II.getArgOperand(0);
  872. Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
  873. Value *Shifted = IC.Builder.CreateLShr(Masked,
  874. ConstantInt::get(II.getType(),
  875. ShiftAmount));
  876. return IC.replaceInstUsesWith(II, Shifted);
  877. }
  878. if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  879. uint64_t Src = SrcC->getZExtValue();
  880. uint64_t Mask = MaskC->getZExtValue();
  881. uint64_t Result = 0;
  882. uint64_t BitToSet = 1;
  883. while (Mask) {
  884. // Isolate lowest set bit.
  885. uint64_t BitToTest = Mask & -Mask;
  886. if (BitToTest & Src)
  887. Result |= BitToSet;
  888. BitToSet <<= 1;
  889. // Clear lowest set bit.
  890. Mask &= Mask - 1;
  891. }
  892. return IC.replaceInstUsesWith(II,
  893. ConstantInt::get(II.getType(), Result));
  894. }
  895. }
  896. break;
  897. case Intrinsic::x86_bmi_pdep_32:
  898. case Intrinsic::x86_bmi_pdep_64:
  899. if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  900. if (MaskC->isNullValue()) {
  901. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  902. }
  903. if (MaskC->isAllOnesValue()) {
  904. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  905. }
  906. if (MaskC->getValue().isShiftedMask()) {
  907. // Any single contiguous sequence of 1s anywhere in the mask simply
  908. // describes a subset of the input bits shifted to the appropriate
  909. // position. Replace with the straightforward IR.
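// E.g., pdep(x, 0x00f0) == (x << 4) & 0x00f0.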
  910. unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
  911. Value *Input = II.getArgOperand(0);
  912. Value *Shifted = IC.Builder.CreateShl(Input,
  913. ConstantInt::get(II.getType(),
  914. ShiftAmount));
  915. Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
  916. return IC.replaceInstUsesWith(II, Masked);
  917. }
  918. if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  919. uint64_t Src = SrcC->getZExtValue();
  920. uint64_t Mask = MaskC->getZExtValue();
  921. uint64_t Result = 0;
  922. uint64_t BitToTest = 1;
  923. while (Mask) {
  924. // Isolate lowest set bit.
  925. uint64_t BitToSet = Mask & -Mask;
  926. if (BitToTest & Src)
  927. Result |= BitToSet;
  928. BitToTest <<= 1;
  929. // Clear lowest set bit.
  930. Mask &= Mask - 1;
  931. }
  932. return IC.replaceInstUsesWith(II,
  933. ConstantInt::get(II.getType(), Result));
  934. }
  935. }
  936. break;
  937. case Intrinsic::x86_sse_cvtss2si:
  938. case Intrinsic::x86_sse_cvtss2si64:
  939. case Intrinsic::x86_sse_cvttss2si:
  940. case Intrinsic::x86_sse_cvttss2si64:
  941. case Intrinsic::x86_sse2_cvtsd2si:
  942. case Intrinsic::x86_sse2_cvtsd2si64:
  943. case Intrinsic::x86_sse2_cvttsd2si:
  944. case Intrinsic::x86_sse2_cvttsd2si64:
  945. case Intrinsic::x86_avx512_vcvtss2si32:
  946. case Intrinsic::x86_avx512_vcvtss2si64:
  947. case Intrinsic::x86_avx512_vcvtss2usi32:
  948. case Intrinsic::x86_avx512_vcvtss2usi64:
  949. case Intrinsic::x86_avx512_vcvtsd2si32:
  950. case Intrinsic::x86_avx512_vcvtsd2si64:
  951. case Intrinsic::x86_avx512_vcvtsd2usi32:
  952. case Intrinsic::x86_avx512_vcvtsd2usi64:
  953. case Intrinsic::x86_avx512_cvttss2si:
  954. case Intrinsic::x86_avx512_cvttss2si64:
  955. case Intrinsic::x86_avx512_cvttss2usi:
  956. case Intrinsic::x86_avx512_cvttss2usi64:
  957. case Intrinsic::x86_avx512_cvttsd2si:
  958. case Intrinsic::x86_avx512_cvttsd2si64:
  959. case Intrinsic::x86_avx512_cvttsd2usi:
  960. case Intrinsic::x86_avx512_cvttsd2usi64: {
  961. // These intrinsics only demand the 0th element of their input vectors. If
  962. // we can simplify the input based on that, do so now.
  963. Value *Arg = II.getArgOperand(0);
  964. unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
  965. if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
  966. return IC.replaceOperand(II, 0, V);
  967. }
  968. break;
  969. }
  970. case Intrinsic::x86_mmx_pmovmskb:
  971. case Intrinsic::x86_sse_movmsk_ps:
  972. case Intrinsic::x86_sse2_movmsk_pd:
  973. case Intrinsic::x86_sse2_pmovmskb_128:
  974. case Intrinsic::x86_avx_movmsk_pd_256:
  975. case Intrinsic::x86_avx_movmsk_ps_256:
  976. case Intrinsic::x86_avx2_pmovmskb:
  977. if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
  978. return IC.replaceInstUsesWith(II, V);
  979. }
  980. break;
  981. case Intrinsic::x86_sse_comieq_ss:
  982. case Intrinsic::x86_sse_comige_ss:
  983. case Intrinsic::x86_sse_comigt_ss:
  984. case Intrinsic::x86_sse_comile_ss:
  985. case Intrinsic::x86_sse_comilt_ss:
  986. case Intrinsic::x86_sse_comineq_ss:
  987. case Intrinsic::x86_sse_ucomieq_ss:
  988. case Intrinsic::x86_sse_ucomige_ss:
  989. case Intrinsic::x86_sse_ucomigt_ss:
  990. case Intrinsic::x86_sse_ucomile_ss:
  991. case Intrinsic::x86_sse_ucomilt_ss:
  992. case Intrinsic::x86_sse_ucomineq_ss:
  993. case Intrinsic::x86_sse2_comieq_sd:
  994. case Intrinsic::x86_sse2_comige_sd:
  995. case Intrinsic::x86_sse2_comigt_sd:
  996. case Intrinsic::x86_sse2_comile_sd:
  997. case Intrinsic::x86_sse2_comilt_sd:
  998. case Intrinsic::x86_sse2_comineq_sd:
  999. case Intrinsic::x86_sse2_ucomieq_sd:
  1000. case Intrinsic::x86_sse2_ucomige_sd:
  1001. case Intrinsic::x86_sse2_ucomigt_sd:
  1002. case Intrinsic::x86_sse2_ucomile_sd:
  1003. case Intrinsic::x86_sse2_ucomilt_sd:
  1004. case Intrinsic::x86_sse2_ucomineq_sd:
  1005. case Intrinsic::x86_avx512_vcomi_ss:
  1006. case Intrinsic::x86_avx512_vcomi_sd:
  1007. case Intrinsic::x86_avx512_mask_cmp_ss:
  1008. case Intrinsic::x86_avx512_mask_cmp_sd: {
  1009. // These intrinsics only demand the 0th element of their input vectors. If
  1010. // we can simplify the input based on that, do so now.
  1011. bool MadeChange = false;
  1012. Value *Arg0 = II.getArgOperand(0);
  1013. Value *Arg1 = II.getArgOperand(1);
  1014. unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
  1015. if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
  1016. IC.replaceOperand(II, 0, V);
  1017. MadeChange = true;
  1018. }
  1019. if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
  1020. IC.replaceOperand(II, 1, V);
  1021. MadeChange = true;
  1022. }
  1023. if (MadeChange) {
  1024. return &II;
  1025. }
  1026. break;
  1027. }
  1028. case Intrinsic::x86_avx512_add_ps_512:
  1029. case Intrinsic::x86_avx512_div_ps_512:
  1030. case Intrinsic::x86_avx512_mul_ps_512:
  1031. case Intrinsic::x86_avx512_sub_ps_512:
  1032. case Intrinsic::x86_avx512_add_pd_512:
  1033. case Intrinsic::x86_avx512_div_pd_512:
  1034. case Intrinsic::x86_avx512_mul_pd_512:
  1035. case Intrinsic::x86_avx512_sub_pd_512:
  1036. // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
  1037. // IR operations.
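// E.g., llvm.x86.avx512.add.ps.512(a, b, i32 4) is simply 'fadd' on the two
// <16 x float> operands.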
  1038. if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
  1039. if (R->getValue() == 4) {
  1040. Value *Arg0 = II.getArgOperand(0);
  1041. Value *Arg1 = II.getArgOperand(1);
  1042. Value *V;
  1043. switch (IID) {
  1044. default:
  1045. llvm_unreachable("Case stmts out of sync!");
  1046. case Intrinsic::x86_avx512_add_ps_512:
  1047. case Intrinsic::x86_avx512_add_pd_512:
  1048. V = IC.Builder.CreateFAdd(Arg0, Arg1);
  1049. break;
  1050. case Intrinsic::x86_avx512_sub_ps_512:
  1051. case Intrinsic::x86_avx512_sub_pd_512:
  1052. V = IC.Builder.CreateFSub(Arg0, Arg1);
  1053. break;
  1054. case Intrinsic::x86_avx512_mul_ps_512:
  1055. case Intrinsic::x86_avx512_mul_pd_512:
  1056. V = IC.Builder.CreateFMul(Arg0, Arg1);
  1057. break;
  1058. case Intrinsic::x86_avx512_div_ps_512:
  1059. case Intrinsic::x86_avx512_div_pd_512:
  1060. V = IC.Builder.CreateFDiv(Arg0, Arg1);
  1061. break;
  1062. }
  1063. return IC.replaceInstUsesWith(II, V);
  1064. }
  1065. }
  1066. break;
  1067. case Intrinsic::x86_avx512_mask_add_ss_round:
  1068. case Intrinsic::x86_avx512_mask_div_ss_round:
  1069. case Intrinsic::x86_avx512_mask_mul_ss_round:
  1070. case Intrinsic::x86_avx512_mask_sub_ss_round:
  1071. case Intrinsic::x86_avx512_mask_add_sd_round:
  1072. case Intrinsic::x86_avx512_mask_div_sd_round:
  1073. case Intrinsic::x86_avx512_mask_mul_sd_round:
  1074. case Intrinsic::x86_avx512_mask_sub_sd_round:
  1075. // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
  1076. // IR operations.
  1077. if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
  1078. if (R->getValue() == 4) {
  1079. // Extract the elements as scalars.
  1080. Value *Arg0 = II.getArgOperand(0);
  1081. Value *Arg1 = II.getArgOperand(1);
  1082. Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
  1083. Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
  1084. Value *V;
  1085. switch (IID) {
  1086. default:
  1087. llvm_unreachable("Case stmts out of sync!");
  1088. case Intrinsic::x86_avx512_mask_add_ss_round:
  1089. case Intrinsic::x86_avx512_mask_add_sd_round:
  1090. V = IC.Builder.CreateFAdd(LHS, RHS);
  1091. break;
  1092. case Intrinsic::x86_avx512_mask_sub_ss_round:
  1093. case Intrinsic::x86_avx512_mask_sub_sd_round:
  1094. V = IC.Builder.CreateFSub(LHS, RHS);
  1095. break;
  1096. case Intrinsic::x86_avx512_mask_mul_ss_round:
  1097. case Intrinsic::x86_avx512_mask_mul_sd_round:
  1098. V = IC.Builder.CreateFMul(LHS, RHS);
  1099. break;
  1100. case Intrinsic::x86_avx512_mask_div_ss_round:
  1101. case Intrinsic::x86_avx512_mask_div_sd_round:
  1102. V = IC.Builder.CreateFDiv(LHS, RHS);
  1103. break;
  1104. }
  1105. // Handle the masking aspect of the intrinsic.
  1106. Value *Mask = II.getArgOperand(3);
  1107. auto *C = dyn_cast<ConstantInt>(Mask);
  1108. // We don't need a select if we know the mask bit is a 1.
  1109. if (!C || !C->getValue()[0]) {
  1110. // Cast the mask to an i1 vector and then extract the lowest element.
  1111. auto *MaskTy = FixedVectorType::get(
  1112. IC.Builder.getInt1Ty(),
  1113. cast<IntegerType>(Mask->getType())->getBitWidth());
  1114. Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
  1115. Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
  1116. // Extract the lowest element from the passthru operand.
  1117. Value *Passthru =
  1118. IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
  1119. V = IC.Builder.CreateSelect(Mask, V, Passthru);
  1120. }
  1121. // Insert the result back into the original argument 0.
  1122. V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
  1123. return IC.replaceInstUsesWith(II, V);
  1124. }
  1125. }
  1126. break;
  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
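  // For example, with an in-range constant count the immediate shift folds to
  // a generic IR shift (illustrative IR, operand shapes assumed):
  //   %r = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 3)
  // becomes
  //   %r = shl <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>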
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
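    // E.g. for a <4 x i32> count operand only elements 0 and 1 (the low
    // 64 bits) are demanded, so the upper half can be simplified away below.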
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
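    // When every per-element shift amount is a known in-range constant,
    // simplifyX86varShift can lower this to a generic IR shift, e.g.
    // (illustrative, operand shapes assumed):
    //   @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
    //   --> shl <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>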
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();
      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();
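      // Bit 0 of the immediate selects the high (odd) or low (even) 64-bit
      // element of Arg0 within each 128-bit lane, and bit 4 does the same for
      // Arg1, so only those elements are marked as demanded.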
      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }
      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }
      // If all of the demanded elements of either input are undef, the result
      // is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }
      if (MadeChange) {
        return &II;
      }
    }
    break;
  }
  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;
    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operands and the lowest 16-bits of the second.
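    // I.e. for the <2 x i64> Op0 only element 0 is demanded, and for the
    // <16 x i8> Op1 only elements 0 and 1 (the length/index byte pair).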
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");
    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;
    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }
    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");
    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }
    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }
    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }
    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }
    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
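    // Illustrative IR (operand shapes assumed): a blendvps whose mask is a
    // sign-extended i1 vector
    //   %m  = sext <4 x i1> %c to <4 x i32>
    //   %mf = bitcast <4 x i32> %m to <4 x float>
    //   %r  = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x,
    //                                                   <4 x float> %y,
    //                                                   <4 x float> %mf)
    // becomes
    //   %r = select <4 x i1> %c, <4 x float> %y, <4 x float> %x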
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");
      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }
      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }
    break;
  }
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;
  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;
  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  default:
    break;
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
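    // E.g. movmsk.ps on <4 x float> produces an i32 in which only bits 0-3
    // can ever be set, so bits 4 and above are reported as known zero below.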
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }
    // If we don't need any of the low bits then return zero; we already know
    // that DemandedMask is non-zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }
    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are spec'd to zero the upper bits,
    // not pass them through like other scalar intrinsics. So we shouldn't just
    // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
    // Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }
    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;
  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
    // checks).
    break;
  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);
    break;
  }
  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }
  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only lower element is used for operand 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
    // Lower element is undefined if all three lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;
  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
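    // E.g. if only the odd (add) lanes of addsub.ps are demanded, the whole
    // call can be replaced by a plain fadd of the two operands; likewise the
    // even (sub) lanes alone become an fsub (lane parity per the masks below).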
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }
      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }
  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }
  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
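  // E.g. for a <2 x i64> result this marks element 1 as undef
  // (setHighBits(VWidth / 2) with VWidth == 2).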
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}