  1. //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. /// This file implements a TargetTransformInfo analysis pass specific to the
  10. /// X86 target machine. It uses the target's detailed information to provide
  11. /// more precise answers to certain TTI queries, while letting the target
  12. /// independent and default TTI implementations handle the rest.
  13. ///
  14. //===----------------------------------------------------------------------===//
  15. #include "X86TargetTransformInfo.h"
  16. #include "llvm/IR/IntrinsicInst.h"
  17. #include "llvm/IR/IntrinsicsX86.h"
  18. #include "llvm/Support/KnownBits.h"
  19. #include "llvm/Transforms/InstCombine/InstCombiner.h"
  20. #include <optional>
  21. using namespace llvm;
  22. #define DEBUG_TYPE "x86tti"
  23. /// Return a constant boolean vector that has true elements in all positions
  24. /// where the input constant data vector has an element with the sign bit set.
  25. static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  26. VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  27. V = ConstantExpr::getBitCast(V, IntTy);
  28. V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
  29. V);
  30. return V;
  31. }
  32. /// Convert the x86 XMM integer vector mask to a vector of bools based on
  33. /// each element's most significant bit (the sign bit).
  34. static Value *getBoolVecFromMask(Value *Mask) {
  35. // Fold Constant Mask.
  36. if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
  37. return getNegativeIsTrueBoolVec(ConstantMask);
  38. // Mask was extended from a boolean vector.
  39. Value *ExtMask;
  40. if (PatternMatch::match(
  41. Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
  42. ExtMask->getType()->isIntOrIntVectorTy(1))
  43. return ExtMask;
  44. return nullptr;
  45. }
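// For example (illustrative), getBoolVecFromMask recognizes a mask built as
//   %m = sext <4 x i1> %cond to <4 x i32>
// and returns %cond directly, while a ConstantDataVector mask is converted by
// testing the sign bit of each element via getNegativeIsTrueBoolVec.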
  46. // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  47. // XMM register mask efficiently, we could transform all x86 masked intrinsics
  48. // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  49. static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  50. Value *Ptr = II.getOperand(0);
  51. Value *Mask = II.getOperand(1);
  52. Constant *ZeroVec = Constant::getNullValue(II.getType());
  53. // Zero Mask - masked load instruction creates a zero vector.
  54. if (isa<ConstantAggregateZero>(Mask))
  55. return IC.replaceInstUsesWith(II, ZeroVec);
  56. // The mask is constant or extended from a bool vector. Convert this x86
  57. // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  58. if (Value *BoolMask = getBoolVecFromMask(Mask)) {
  59. // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  60. // the LLVM intrinsic definition for the pointer argument.
  61. unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  62. PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  63. Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
  64. // The pass-through vector for an x86 masked load is a zero vector.
  65. CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
  66. II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
  67. return IC.replaceInstUsesWith(II, NewMaskedLoad);
  68. }
  69. return nullptr;
  70. }
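// Sketch of the rewrite performed by simplifyX86MaskedLoad (illustrative; the
// exact intrinsic overloads depend on the caller). An AVX masked load such as
//   %m = sext <4 x i1> %b to <4 x i32>
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
// becomes roughly
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
//                             <4 x i1> %b, <4 x float> zeroinitializer)
// i.e. an align-1 LLVM masked load with a zero pass-through vector.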
  71. // TODO: If the x86 backend knew how to convert a bool vector mask back to an
  72. // XMM register mask efficiently, we could transform all x86 masked intrinsics
  73. // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
  74. static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  75. Value *Ptr = II.getOperand(0);
  76. Value *Mask = II.getOperand(1);
  77. Value *Vec = II.getOperand(2);
  78. // Zero Mask - this masked store instruction does nothing.
  79. if (isa<ConstantAggregateZero>(Mask)) {
  80. IC.eraseInstFromFunction(II);
  81. return true;
  82. }
  83. // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do

  84. // anything else at this level.
  85. if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
  86. return false;
  87. // The mask is constant or extended from a bool vector. Convert this x86
  88. // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  89. if (Value *BoolMask = getBoolVecFromMask(Mask)) {
  90. unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  91. PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  92. Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
  93. IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
  94. // 'Replace uses' doesn't work for stores. Erase the original masked store.
  95. IC.eraseInstFromFunction(II);
  96. return true;
  97. }
  98. return false;
  99. }
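// Sketch of the rewrite performed by simplifyX86MaskedStore (illustrative).
// An AVX masked store such as
//   %m = sext <4 x i1> %b to <4 x i32>
//   call void @llvm.x86.avx.maskstore.ps(ptr %p, <4 x i32> %m, <4 x float> %v)
// becomes roughly
//   call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1,
//                                         <4 x i1> %b)
// and the original x86 intrinsic call is erased.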
  100. static Value *simplifyX86immShift(const IntrinsicInst &II,
  101. InstCombiner::BuilderTy &Builder) {
  102. bool LogicalShift = false;
  103. bool ShiftLeft = false;
  104. bool IsImm = false;
  105. switch (II.getIntrinsicID()) {
  106. default:
  107. llvm_unreachable("Unexpected intrinsic!");
  108. case Intrinsic::x86_sse2_psrai_d:
  109. case Intrinsic::x86_sse2_psrai_w:
  110. case Intrinsic::x86_avx2_psrai_d:
  111. case Intrinsic::x86_avx2_psrai_w:
  112. case Intrinsic::x86_avx512_psrai_q_128:
  113. case Intrinsic::x86_avx512_psrai_q_256:
  114. case Intrinsic::x86_avx512_psrai_d_512:
  115. case Intrinsic::x86_avx512_psrai_q_512:
  116. case Intrinsic::x86_avx512_psrai_w_512:
  117. IsImm = true;
  118. [[fallthrough]];
  119. case Intrinsic::x86_sse2_psra_d:
  120. case Intrinsic::x86_sse2_psra_w:
  121. case Intrinsic::x86_avx2_psra_d:
  122. case Intrinsic::x86_avx2_psra_w:
  123. case Intrinsic::x86_avx512_psra_q_128:
  124. case Intrinsic::x86_avx512_psra_q_256:
  125. case Intrinsic::x86_avx512_psra_d_512:
  126. case Intrinsic::x86_avx512_psra_q_512:
  127. case Intrinsic::x86_avx512_psra_w_512:
  128. LogicalShift = false;
  129. ShiftLeft = false;
  130. break;
  131. case Intrinsic::x86_sse2_psrli_d:
  132. case Intrinsic::x86_sse2_psrli_q:
  133. case Intrinsic::x86_sse2_psrli_w:
  134. case Intrinsic::x86_avx2_psrli_d:
  135. case Intrinsic::x86_avx2_psrli_q:
  136. case Intrinsic::x86_avx2_psrli_w:
  137. case Intrinsic::x86_avx512_psrli_d_512:
  138. case Intrinsic::x86_avx512_psrli_q_512:
  139. case Intrinsic::x86_avx512_psrli_w_512:
  140. IsImm = true;
  141. [[fallthrough]];
  142. case Intrinsic::x86_sse2_psrl_d:
  143. case Intrinsic::x86_sse2_psrl_q:
  144. case Intrinsic::x86_sse2_psrl_w:
  145. case Intrinsic::x86_avx2_psrl_d:
  146. case Intrinsic::x86_avx2_psrl_q:
  147. case Intrinsic::x86_avx2_psrl_w:
  148. case Intrinsic::x86_avx512_psrl_d_512:
  149. case Intrinsic::x86_avx512_psrl_q_512:
  150. case Intrinsic::x86_avx512_psrl_w_512:
  151. LogicalShift = true;
  152. ShiftLeft = false;
  153. break;
  154. case Intrinsic::x86_sse2_pslli_d:
  155. case Intrinsic::x86_sse2_pslli_q:
  156. case Intrinsic::x86_sse2_pslli_w:
  157. case Intrinsic::x86_avx2_pslli_d:
  158. case Intrinsic::x86_avx2_pslli_q:
  159. case Intrinsic::x86_avx2_pslli_w:
  160. case Intrinsic::x86_avx512_pslli_d_512:
  161. case Intrinsic::x86_avx512_pslli_q_512:
  162. case Intrinsic::x86_avx512_pslli_w_512:
  163. IsImm = true;
  164. [[fallthrough]];
  165. case Intrinsic::x86_sse2_psll_d:
  166. case Intrinsic::x86_sse2_psll_q:
  167. case Intrinsic::x86_sse2_psll_w:
  168. case Intrinsic::x86_avx2_psll_d:
  169. case Intrinsic::x86_avx2_psll_q:
  170. case Intrinsic::x86_avx2_psll_w:
  171. case Intrinsic::x86_avx512_psll_d_512:
  172. case Intrinsic::x86_avx512_psll_q_512:
  173. case Intrinsic::x86_avx512_psll_w_512:
  174. LogicalShift = true;
  175. ShiftLeft = true;
  176. break;
  177. }
  178. assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
  179. Value *Vec = II.getArgOperand(0);
  180. Value *Amt = II.getArgOperand(1);
  181. auto *VT = cast<FixedVectorType>(Vec->getType());
  182. Type *SVT = VT->getElementType();
  183. Type *AmtVT = Amt->getType();
  184. unsigned VWidth = VT->getNumElements();
  185. unsigned BitWidth = SVT->getPrimitiveSizeInBits();
  186. // If the shift amount is guaranteed to be in-range we can replace it with a
  187. // generic shift. If it's guaranteed to be out of range, logical shifts combine
  188. // to zero and arithmetic shifts are clamped to (BitWidth - 1).
  189. if (IsImm) {
  190. assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
  191. KnownBits KnownAmtBits =
  192. llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  193. if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
  194. Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
  195. Amt = Builder.CreateVectorSplat(VWidth, Amt);
  196. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  197. : Builder.CreateLShr(Vec, Amt))
  198. : Builder.CreateAShr(Vec, Amt));
  199. }
  200. if (KnownAmtBits.getMinValue().uge(BitWidth)) {
  201. if (LogicalShift)
  202. return ConstantAggregateZero::get(VT);
  203. Amt = ConstantInt::get(SVT, BitWidth - 1);
  204. return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
  205. }
  206. } else {
  207. // Ensure the first element has an in-range value and the rest of the
  208. // elements in the bottom 64 bits are zero.
  209. assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
  210. cast<VectorType>(AmtVT)->getElementType() == SVT &&
  211. "Unexpected shift-by-scalar type");
  212. unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
  213. APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
  214. APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
  215. KnownBits KnownLowerBits = llvm::computeKnownBits(
  216. Amt, DemandedLower, II.getModule()->getDataLayout());
  217. KnownBits KnownUpperBits = llvm::computeKnownBits(
  218. Amt, DemandedUpper, II.getModule()->getDataLayout());
  219. if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
  220. (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
  221. SmallVector<int, 16> ZeroSplat(VWidth, 0);
  222. Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
  223. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  224. : Builder.CreateLShr(Vec, Amt))
  225. : Builder.CreateAShr(Vec, Amt));
  226. }
  227. }
  228. // Simplify if the shift count is a constant vector.
  229. auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  230. if (!CDV)
  231. return nullptr;
  232. // SSE2/AVX2 use only the first 64 bits of the 128-bit vector
  233. // operand to compute the shift amount.
  234. assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
  235. cast<VectorType>(AmtVT)->getElementType() == SVT &&
  236. "Unexpected shift-by-scalar type");
  237. // Concatenate the sub-elements to create the 64-bit value.
  238. APInt Count(64, 0);
  239. for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
  240. unsigned SubEltIdx = (NumSubElts - 1) - i;
  241. auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
  242. Count <<= BitWidth;
  243. Count |= SubElt->getValue().zextOrTrunc(64);
  244. }
  245. // If shift-by-zero then just return the original value.
  246. if (Count.isZero())
  247. return Vec;
  248. // Handle cases when Shift >= BitWidth.
  249. if (Count.uge(BitWidth)) {
  250. // If LogicalShift - just return zero.
  251. if (LogicalShift)
  252. return ConstantAggregateZero::get(VT);
  253. // If ArithmeticShift - clamp Shift to (BitWidth - 1).
  254. Count = APInt(64, BitWidth - 1);
  255. }
  256. // Get a constant vector of the same type as the first operand.
  257. auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  258. auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
  259. if (ShiftLeft)
  260. return Builder.CreateShl(Vec, ShiftVec);
  261. if (LogicalShift)
  262. return Builder.CreateLShr(Vec, ShiftVec);
  263. return Builder.CreateAShr(Vec, ShiftVec);
  264. }
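// Illustrative examples for simplifyX86immShift:
//   %r = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3)
// has a known in-range shift amount and becomes
//   %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>
// whereas a logical shift whose amount is known to be >= the element width
// (e.g. psrli.d by 32) folds to zeroinitializer, and an arithmetic shift's
// amount is clamped to a splat of (BitWidth - 1).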
  265. // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
  266. // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
  267. // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
  268. static Value *simplifyX86varShift(const IntrinsicInst &II,
  269. InstCombiner::BuilderTy &Builder) {
  270. bool LogicalShift = false;
  271. bool ShiftLeft = false;
  272. switch (II.getIntrinsicID()) {
  273. default:
  274. llvm_unreachable("Unexpected intrinsic!");
  275. case Intrinsic::x86_avx2_psrav_d:
  276. case Intrinsic::x86_avx2_psrav_d_256:
  277. case Intrinsic::x86_avx512_psrav_q_128:
  278. case Intrinsic::x86_avx512_psrav_q_256:
  279. case Intrinsic::x86_avx512_psrav_d_512:
  280. case Intrinsic::x86_avx512_psrav_q_512:
  281. case Intrinsic::x86_avx512_psrav_w_128:
  282. case Intrinsic::x86_avx512_psrav_w_256:
  283. case Intrinsic::x86_avx512_psrav_w_512:
  284. LogicalShift = false;
  285. ShiftLeft = false;
  286. break;
  287. case Intrinsic::x86_avx2_psrlv_d:
  288. case Intrinsic::x86_avx2_psrlv_d_256:
  289. case Intrinsic::x86_avx2_psrlv_q:
  290. case Intrinsic::x86_avx2_psrlv_q_256:
  291. case Intrinsic::x86_avx512_psrlv_d_512:
  292. case Intrinsic::x86_avx512_psrlv_q_512:
  293. case Intrinsic::x86_avx512_psrlv_w_128:
  294. case Intrinsic::x86_avx512_psrlv_w_256:
  295. case Intrinsic::x86_avx512_psrlv_w_512:
  296. LogicalShift = true;
  297. ShiftLeft = false;
  298. break;
  299. case Intrinsic::x86_avx2_psllv_d:
  300. case Intrinsic::x86_avx2_psllv_d_256:
  301. case Intrinsic::x86_avx2_psllv_q:
  302. case Intrinsic::x86_avx2_psllv_q_256:
  303. case Intrinsic::x86_avx512_psllv_d_512:
  304. case Intrinsic::x86_avx512_psllv_q_512:
  305. case Intrinsic::x86_avx512_psllv_w_128:
  306. case Intrinsic::x86_avx512_psllv_w_256:
  307. case Intrinsic::x86_avx512_psllv_w_512:
  308. LogicalShift = true;
  309. ShiftLeft = true;
  310. break;
  311. }
  312. assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
  313. Value *Vec = II.getArgOperand(0);
  314. Value *Amt = II.getArgOperand(1);
  315. auto *VT = cast<FixedVectorType>(II.getType());
  316. Type *SVT = VT->getElementType();
  317. int NumElts = VT->getNumElements();
  318. int BitWidth = SVT->getIntegerBitWidth();
  319. // If the shift amount is guaranteed to be in-range we can replace it with a
  320. // generic shift.
  321. KnownBits KnownAmt =
  322. llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  323. if (KnownAmt.getMaxValue().ult(BitWidth)) {
  324. return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
  325. : Builder.CreateLShr(Vec, Amt))
  326. : Builder.CreateAShr(Vec, Amt));
  327. }
  328. // Simplify if all shift amounts are constant/undef.
  329. auto *CShift = dyn_cast<Constant>(Amt);
  330. if (!CShift)
  331. return nullptr;
  332. // Collect each element's shift amount.
  333. // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  334. bool AnyOutOfRange = false;
  335. SmallVector<int, 8> ShiftAmts;
  336. for (int I = 0; I < NumElts; ++I) {
  337. auto *CElt = CShift->getAggregateElement(I);
  338. if (isa_and_nonnull<UndefValue>(CElt)) {
  339. ShiftAmts.push_back(-1);
  340. continue;
  341. }
  342. auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
  343. if (!COp)
  344. return nullptr;
  345. // Handle out of range shifts.
  346. // If LogicalShift - set to BitWidth (special case).
  347. // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
  348. APInt ShiftVal = COp->getValue();
  349. if (ShiftVal.uge(BitWidth)) {
  350. AnyOutOfRange = LogicalShift;
  351. ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
  352. continue;
  353. }
  354. ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  355. }
  356. // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
  357. // ArithmeticShift should only hit this if they are all UNDEF.
  358. auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  359. if (llvm::all_of(ShiftAmts, OutOfRange)) {
  360. SmallVector<Constant *, 8> ConstantVec;
  361. for (int Idx : ShiftAmts) {
  362. if (Idx < 0) {
  363. ConstantVec.push_back(UndefValue::get(SVT));
  364. } else {
  365. assert(LogicalShift && "Logical shift expected");
  366. ConstantVec.push_back(ConstantInt::getNullValue(SVT));
  367. }
  368. }
  369. return ConstantVector::get(ConstantVec);
  370. }
  371. // We can't use generic logical shifts when only some of the values are out of range.
  372. if (AnyOutOfRange)
  373. return nullptr;
  374. // Build the shift amount constant vector.
  375. SmallVector<Constant *, 8> ShiftVecAmts;
  376. for (int Idx : ShiftAmts) {
  377. if (Idx < 0)
  378. ShiftVecAmts.push_back(UndefValue::get(SVT));
  379. else
  380. ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  381. }
  382. auto ShiftVec = ConstantVector::get(ShiftVecAmts);
  383. if (ShiftLeft)
  384. return Builder.CreateShl(Vec, ShiftVec);
  385. if (LogicalShift)
  386. return Builder.CreateLShr(Vec, ShiftVec);
  387. return Builder.CreateAShr(Vec, ShiftVec);
  388. }
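// Illustrative example for simplifyX86varShift: with per-element amounts that
// are all known to be in range, e.g.
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
//                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
// the call becomes a plain IR shift
//   %r = lshr <4 x i32> %v, <i32 0, i32 1, i32 2, i32 3>
// A constant amount vector that is entirely out of range folds to
// zeroinitializer for logical shifts, while arithmetic shifts clamp each
// out-of-range amount to (BitWidth - 1), splatting the sign bit.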
  389. static Value *simplifyX86pack(IntrinsicInst &II,
  390. InstCombiner::BuilderTy &Builder, bool IsSigned) {
  391. Value *Arg0 = II.getArgOperand(0);
  392. Value *Arg1 = II.getArgOperand(1);
  393. Type *ResTy = II.getType();
  394. // Fast path when both inputs are undef.
  395. if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
  396. return UndefValue::get(ResTy);
  397. auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  398. unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  399. unsigned NumSrcElts = ArgTy->getNumElements();
  400. assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
  401. "Unexpected packing types");
  402. unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  403. unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  404. unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  405. assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
  406. "Unexpected packing types");
  407. // Constant folding.
  408. if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
  409. return nullptr;
  410. // Clamp Values - signed/unsigned both use signed clamp values, but they
  411. // differ on the min/max values.
  412. APInt MinValue, MaxValue;
  413. if (IsSigned) {
  414. // PACKSS: Truncate signed value with signed saturation.
  415. // Source values less than dst minint are saturated to minint.
  416. // Source values greater than dst maxint are saturated to maxint.
  417. MinValue =
  418. APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  419. MaxValue =
  420. APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  421. } else {
  422. // PACKUS: Truncate signed value with unsigned saturation.
  423. // Source values less than zero are saturated to zero.
  424. // Source values greater than dst maxuint are saturated to maxuint.
  425. MinValue = APInt::getZero(SrcScalarSizeInBits);
  426. MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  427. }
  428. auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  429. auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  430. Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  431. Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  432. Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  433. Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
  434. // Shuffle clamped args together at the lane level.
  435. SmallVector<int, 32> PackMask;
  436. for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
  437. for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
  438. PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
  439. for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
  440. PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  441. }
  442. auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
  443. // Truncate to dst size.
  444. return Builder.CreateTrunc(Shuffle, ResTy);
  445. }
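// Illustrative shape of the expansion in simplifyX86pack (only reached when
// both inputs are constants, so everything below constant-folds): a call like
//   @llvm.x86.sse2.packsswb.128(<8 x i16> C0, <8 x i16> C1)
// is rewritten as icmp/select clamps of C0 and C1 to [-128, 127], a
// shufflevector that interleaves the two inputs per 128-bit lane, and a
// trunc of the result to <16 x i8>.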
  446. static Value *simplifyX86movmsk(const IntrinsicInst &II,
  447. InstCombiner::BuilderTy &Builder) {
  448. Value *Arg = II.getArgOperand(0);
  449. Type *ResTy = II.getType();
  450. // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  451. if (isa<UndefValue>(Arg))
  452. return Constant::getNullValue(ResTy);
  453. auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  454. // We can't easily peek through x86_mmx types.
  455. if (!ArgTy)
  456. return nullptr;
  457. // Expand MOVMSK to compare/bitcast/zext:
  458. // e.g. PMOVMSKB(v16i8 x):
  459. // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  460. // %int = bitcast <16 x i1> %cmp to i16
  461. // %res = zext i16 %int to i32
  462. unsigned NumElts = ArgTy->getNumElements();
  463. Type *IntegerTy = Builder.getIntNTy(NumElts);
  464. Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  465. Res = Builder.CreateIsNeg(Res);
  466. Res = Builder.CreateBitCast(Res, IntegerTy);
  467. Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  468. return Res;
  469. }
  470. static Value *simplifyX86addcarry(const IntrinsicInst &II,
  471. InstCombiner::BuilderTy &Builder) {
  472. Value *CarryIn = II.getArgOperand(0);
  473. Value *Op1 = II.getArgOperand(1);
  474. Value *Op2 = II.getArgOperand(2);
  475. Type *RetTy = II.getType();
  476. Type *OpTy = Op1->getType();
  477. assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
  478. RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
  479. "Unexpected types for x86 addcarry");
  480. // If carry-in is zero, this is just an unsigned add with overflow.
  481. if (match(CarryIn, PatternMatch::m_ZeroInt())) {
  482. Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
  483. {Op1, Op2});
  484. // The types have to be adjusted to match the x86 call types.
  485. Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
  486. Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
  487. Builder.getInt8Ty());
  488. Value *Res = PoisonValue::get(RetTy);
  489. Res = Builder.CreateInsertValue(Res, UAddOV, 0);
  490. return Builder.CreateInsertValue(Res, UAddResult, 1);
  491. }
  492. return nullptr;
  493. }
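// Illustrative example for simplifyX86addcarry: when the carry-in is a known
// zero, e.g.
//   %r = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
// the call becomes roughly
//   %u   = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
//   %sum = extractvalue { i64, i1 } %u, 0
//   %c   = extractvalue { i64, i1 } %u, 1
//   %ov  = zext i1 %c to i8
// with the pieces re-packed via insertvalue into { i8, i64 } (carry-out in
// field 0, sum in field 1) to match the x86 call's result type.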
  494. static Value *simplifyX86insertps(const IntrinsicInst &II,
  495. InstCombiner::BuilderTy &Builder) {
  496. auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  497. if (!CInt)
  498. return nullptr;
  499. auto *VecTy = cast<FixedVectorType>(II.getType());
  500. assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
  501. // The immediate permute control byte looks like this:
  502. // [3:0] - zero mask for each 32-bit lane
  503. // [5:4] - select one 32-bit destination lane
  504. // [7:6] - select one 32-bit source lane
  505. uint8_t Imm = CInt->getZExtValue();
  506. uint8_t ZMask = Imm & 0xf;
  507. uint8_t DestLane = (Imm >> 4) & 0x3;
  508. uint8_t SourceLane = (Imm >> 6) & 0x3;
  509. ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
  510. // If all zero mask bits are set, this was just a weird way to
  511. // generate a zero vector.
  512. if (ZMask == 0xf)
  513. return ZeroVector;
  514. // Initialize by passing all of the first source bits through.
  515. int ShuffleMask[4] = {0, 1, 2, 3};
  516. // We may replace the second operand with the zero vector.
  517. Value *V1 = II.getArgOperand(1);
  518. if (ZMask) {
  519. // If the zero mask is being used with a single input or the zero mask
  520. // overrides the destination lane, this is a shuffle with the zero vector.
  521. if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
  522. (ZMask & (1 << DestLane))) {
  523. V1 = ZeroVector;
  524. // We may still move 32-bits of the first source vector from one lane
  525. // to another.
  526. ShuffleMask[DestLane] = SourceLane;
  527. // The zero mask may override the previous insert operation.
  528. for (unsigned i = 0; i < 4; ++i)
  529. if ((ZMask >> i) & 0x1)
  530. ShuffleMask[i] = i + 4;
  531. } else {
  532. // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
  533. return nullptr;
  534. }
  535. } else {
  536. // Replace the selected destination lane with the selected source lane.
  537. ShuffleMask[DestLane] = SourceLane + 4;
  538. }
  539. return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
  540. }
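// Illustrative example for simplifyX86insertps: with no zero mask, e.g.
//   @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %b, i8 64)
// (source lane 1, destination lane 0), the call becomes
//   shufflevector <4 x float> %a, <4 x float> %b,
//                 <4 x i32> <i32 5, i32 1, i32 2, i32 3>
// and an immediate with all four zero-mask bits set folds directly to
// zeroinitializer.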
  541. /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
  542. /// or conversion to a shuffle vector.
  543. static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
  544. ConstantInt *CILength, ConstantInt *CIIndex,
  545. InstCombiner::BuilderTy &Builder) {
  546. auto LowConstantHighUndef = [&](uint64_t Val) {
  547. Type *IntTy64 = Type::getInt64Ty(II.getContext());
  548. Constant *Args[] = {ConstantInt::get(IntTy64, Val),
  549. UndefValue::get(IntTy64)};
  550. return ConstantVector::get(Args);
  551. };
  552. // See if we're dealing with constant values.
  553. auto *C0 = dyn_cast<Constant>(Op0);
  554. auto *CI0 =
  555. C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
  556. : nullptr;
  557. // Attempt to constant fold.
  558. if (CILength && CIIndex) {
  559. // From AMD documentation: "The bit index and field length are each six
  560. // bits in length other bits of the field are ignored."
  561. APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
  562. APInt APLength = CILength->getValue().zextOrTrunc(6);
  563. unsigned Index = APIndex.getZExtValue();
  564. // From AMD documentation: "a value of zero in the field length is
  565. // defined as length of 64".
  566. unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
  567. // From AMD documentation: "If the sum of the bit index + length field
  568. // is greater than 64, the results are undefined".
  569. unsigned End = Index + Length;
  570. // Note that both field index and field length are 8-bit quantities.
  571. // Since variables 'Index' and 'Length' are unsigned values
  572. // obtained from zero-extending field index and field length
  573. // respectively, their sum should never wrap around.
  574. if (End > 64)
  575. return UndefValue::get(II.getType());
  576. // If we are inserting whole bytes, we can convert this to a shuffle.
  577. // Lowering can recognize EXTRQI shuffle masks.
  578. if ((Length % 8) == 0 && (Index % 8) == 0) {
  579. // Convert bit indices to byte indices.
  580. Length /= 8;
  581. Index /= 8;
  582. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  583. auto *ShufTy = FixedVectorType::get(IntTy8, 16);
  584. SmallVector<int, 16> ShuffleMask;
  585. for (int i = 0; i != (int)Length; ++i)
  586. ShuffleMask.push_back(i + Index);
  587. for (int i = Length; i != 8; ++i)
  588. ShuffleMask.push_back(i + 16);
  589. for (int i = 8; i != 16; ++i)
  590. ShuffleMask.push_back(-1);
  591. Value *SV = Builder.CreateShuffleVector(
  592. Builder.CreateBitCast(Op0, ShufTy),
  593. ConstantAggregateZero::get(ShufTy), ShuffleMask);
  594. return Builder.CreateBitCast(SV, II.getType());
  595. }
  596. // Constant Fold - shift Index'th bit to lowest position and mask off
  597. // Length bits.
  598. if (CI0) {
  599. APInt Elt = CI0->getValue();
  600. Elt.lshrInPlace(Index);
  601. Elt = Elt.zextOrTrunc(Length);
  602. return LowConstantHighUndef(Elt.getZExtValue());
  603. }
  604. // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
  605. if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
  606. Value *Args[] = {Op0, CILength, CIIndex};
  607. Module *M = II.getModule();
  608. Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
  609. return Builder.CreateCall(F, Args);
  610. }
  611. }
  612. // Constant Fold - extraction from zero is always {zero, undef}.
  613. if (CI0 && CI0->isZero())
  614. return LowConstantHighUndef(0);
  615. return nullptr;
  616. }
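// Illustrative example for simplifyX86extrq: a byte-aligned extraction such as
//   @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 16, i8 16)
// (length 16 bits, index 16 bits) is rewritten as a <16 x i8> shufflevector
// that moves source bytes 2-3 into bytes 0-1, zero-fills bytes 2-7 (the rest
// of the low 64 bits), leaves the upper 64 bits undefined, and bitcasts the
// result back to <2 x i64>.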
  617. /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
  618. /// folding or conversion to a shuffle vector.
  619. static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
  620. APInt APLength, APInt APIndex,
  621. InstCombiner::BuilderTy &Builder) {
  622. // From AMD documentation: "The bit index and field length are each six bits
  623. // in length other bits of the field are ignored."
  624. APIndex = APIndex.zextOrTrunc(6);
  625. APLength = APLength.zextOrTrunc(6);
  626. // Attempt to constant fold.
  627. unsigned Index = APIndex.getZExtValue();
  628. // From AMD documentation: "a value of zero in the field length is
  629. // defined as length of 64".
  630. unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
  631. // From AMD documentation: "If the sum of the bit index + length field
  632. // is greater than 64, the results are undefined".
  633. unsigned End = Index + Length;
  634. // Note that both field index and field length are 8-bit quantities.
  635. // Since variables 'Index' and 'Length' are unsigned values
  636. // obtained from zero-extending field index and field length
  637. // respectively, their sum should never wrap around.
  638. if (End > 64)
  639. return UndefValue::get(II.getType());
  640. // If we are inserting whole bytes, we can convert this to a shuffle.
  641. // Lowering can recognize INSERTQI shuffle masks.
  642. if ((Length % 8) == 0 && (Index % 8) == 0) {
  643. // Convert bit indices to byte indices.
  644. Length /= 8;
  645. Index /= 8;
  646. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  647. auto *ShufTy = FixedVectorType::get(IntTy8, 16);
  648. SmallVector<int, 16> ShuffleMask;
  649. for (int i = 0; i != (int)Index; ++i)
  650. ShuffleMask.push_back(i);
  651. for (int i = 0; i != (int)Length; ++i)
  652. ShuffleMask.push_back(i + 16);
  653. for (int i = Index + Length; i != 8; ++i)
  654. ShuffleMask.push_back(i);
  655. for (int i = 8; i != 16; ++i)
  656. ShuffleMask.push_back(-1);
  657. Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
  658. Builder.CreateBitCast(Op1, ShufTy),
  659. ShuffleMask);
  660. return Builder.CreateBitCast(SV, II.getType());
  661. }
  662. // See if we're dealing with constant values.
  663. auto *C0 = dyn_cast<Constant>(Op0);
  664. auto *C1 = dyn_cast<Constant>(Op1);
  665. auto *CI00 =
  666. C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
  667. : nullptr;
  668. auto *CI10 =
  669. C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
  670. : nullptr;
  671. // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  672. if (CI00 && CI10) {
  673. APInt V00 = CI00->getValue();
  674. APInt V10 = CI10->getValue();
  675. APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
  676. V00 = V00 & ~Mask;
  677. V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
  678. APInt Val = V00 | V10;
  679. Type *IntTy64 = Type::getInt64Ty(II.getContext());
  680. Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
  681. UndefValue::get(IntTy64)};
  682. return ConstantVector::get(Args);
  683. }
  684. // If we were an INSERTQ call, we'll save demanded elements if we convert to
  685. // INSERTQI.
  686. if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
  687. Type *IntTy8 = Type::getInt8Ty(II.getContext());
  688. Constant *CILength = ConstantInt::get(IntTy8, Length, false);
  689. Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
  690. Value *Args[] = {Op0, Op1, CILength, CIIndex};
  691. Module *M = II.getModule();
  692. Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
  693. return Builder.CreateCall(F, Args);
  694. }
  695. return nullptr;
  696. }
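// Illustrative constant fold for simplifyX86insertq: a non-byte-aligned
// insertion with constant low elements, e.g.
//   @llvm.x86.sse4a.insertqi(<2 x i64> <i64 0, i64 0>,
//                            <2 x i64> <i64 255, i64 0>, i8 4, i8 4)
// (insert the low 4 bits of the second operand at bit 4), folds directly to
//   <2 x i64> <i64 240, i64 undef>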
  697. /// Attempt to convert pshufb* to shufflevector if the mask is constant.
  698. static Value *simplifyX86pshufb(const IntrinsicInst &II,
  699. InstCombiner::BuilderTy &Builder) {
  700. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  701. if (!V)
  702. return nullptr;
  703. auto *VecTy = cast<FixedVectorType>(II.getType());
  704. unsigned NumElts = VecTy->getNumElements();
  705. assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
  706. "Unexpected number of elements in shuffle mask!");
  707. // Construct a shuffle mask from constant integers or UNDEFs.
  708. int Indexes[64];
  709. // Each byte in the shuffle control mask forms an index to permute the
  710. // corresponding byte in the destination operand.
  711. for (unsigned I = 0; I < NumElts; ++I) {
  712. Constant *COp = V->getAggregateElement(I);
  713. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  714. return nullptr;
  715. if (isa<UndefValue>(COp)) {
  716. Indexes[I] = -1;
  717. continue;
  718. }
  719. int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
  720. // If the most significant bit (bit[7]) of each byte of the shuffle
  721. // control mask is set, then zero is written in the result byte.
  722. // The zero vector is in the right-hand side of the resulting
  723. // shufflevector.
  724. // The value of each index for the high 128-bit lane is the least
  725. // significant 4 bits of the respective shuffle control byte.
  726. Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
  727. Indexes[I] = Index;
  728. }
  729. auto V1 = II.getArgOperand(0);
  730. auto V2 = Constant::getNullValue(VecTy);
  731. return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
  732. }
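// Illustrative behaviour of simplifyX86pshufb with a constant control mask:
//   %r = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %x, <16 x i8> C)
// becomes a single shufflevector of %x with a zero vector, where each control
// byte with bit 7 set selects the zero element and every other control byte
// selects element (low 4 bits + 128-bit lane base) of %x.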
  733. /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
  734. static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
  735. InstCombiner::BuilderTy &Builder) {
  736. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  737. if (!V)
  738. return nullptr;
  739. auto *VecTy = cast<FixedVectorType>(II.getType());
  740. unsigned NumElts = VecTy->getNumElements();
  741. bool IsPD = VecTy->getScalarType()->isDoubleTy();
  742. unsigned NumLaneElts = IsPD ? 2 : 4;
  743. assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
  744. // Construct a shuffle mask from constant integers or UNDEFs.
  745. int Indexes[16];
  746. // The intrinsics only read one or two bits, clear the rest.
  747. for (unsigned I = 0; I < NumElts; ++I) {
  748. Constant *COp = V->getAggregateElement(I);
  749. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  750. return nullptr;
  751. if (isa<UndefValue>(COp)) {
  752. Indexes[I] = -1;
  753. continue;
  754. }
  755. APInt Index = cast<ConstantInt>(COp)->getValue();
  756. Index = Index.zextOrTrunc(32).getLoBits(2);
  757. // The PD variants use bit 1 to select the per-lane element index, so
  758. // shift down to convert to generic shuffle mask index.
  759. if (IsPD)
  760. Index.lshrInPlace(1);
  761. // The _256 variants are a bit trickier since the mask bits always index
  762. // into the corresponding 128-bit half. In order to convert to a generic
  763. // shuffle, we have to make that explicit.
  764. Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
  765. Indexes[I] = Index.getZExtValue();
  766. }
  767. auto V1 = II.getArgOperand(0);
  768. return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
  769. }
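// Illustrative example for simplifyX86vpermilvar with a constant mask:
//   @llvm.x86.avx.vpermilvar.ps(<4 x float> %v,
//                               <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
// becomes roughly
//   shufflevector <4 x float> %v, <4 x float> poison,
//                 <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// For the PD variants bit 1 of each control element is used, and for the
// 256-bit variants the per-lane indices are rebased onto the whole vector.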
  770. /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
  771. static Value *simplifyX86vpermv(const IntrinsicInst &II,
  772. InstCombiner::BuilderTy &Builder) {
  773. auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  774. if (!V)
  775. return nullptr;
  776. auto *VecTy = cast<FixedVectorType>(II.getType());
  777. unsigned Size = VecTy->getNumElements();
  778. assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
  779. "Unexpected shuffle mask size");
  780. // Construct a shuffle mask from constant integers or UNDEFs.
  781. int Indexes[64];
  782. for (unsigned I = 0; I < Size; ++I) {
  783. Constant *COp = V->getAggregateElement(I);
  784. if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
  785. return nullptr;
  786. if (isa<UndefValue>(COp)) {
  787. Indexes[I] = -1;
  788. continue;
  789. }
  790. uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
  791. Index &= Size - 1;
  792. Indexes[I] = Index;
  793. }
  794. auto V1 = II.getArgOperand(0);
  795. return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
  796. }
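// Illustrative example for simplifyX86vpermv with a constant index vector:
//   @llvm.x86.avx2.permd(<8 x i32> %v,
//     <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
// becomes roughly
//   shufflevector <8 x i32> %v, <8 x i32> poison,
//     <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// with each index masked to (Size - 1).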
  797. std::optional<Instruction *>
  798. X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  799. auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
  800. unsigned DemandedWidth) {
  801. APInt UndefElts(Width, 0);
  802. APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
  803. return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  804. };
  805. Intrinsic::ID IID = II.getIntrinsicID();
  806. switch (IID) {
  807. case Intrinsic::x86_bmi_bextr_32:
  808. case Intrinsic::x86_bmi_bextr_64:
  809. case Intrinsic::x86_tbm_bextri_u32:
  810. case Intrinsic::x86_tbm_bextri_u64:
  811. // If the RHS is a constant we can try some simplifications.
  812. if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  813. uint64_t Shift = C->getZExtValue();
  814. uint64_t Length = (Shift >> 8) & 0xff;
  815. Shift &= 0xff;
  816. unsigned BitWidth = II.getType()->getIntegerBitWidth();
  817. // If the length is 0 or the shift is out of range, replace with zero.
  818. if (Length == 0 || Shift >= BitWidth) {
  819. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  820. }
  821. // If the LHS is also a constant, we can completely constant fold this.
  822. if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  823. uint64_t Result = InC->getZExtValue() >> Shift;
  824. if (Length > BitWidth)
  825. Length = BitWidth;
  826. Result &= maskTrailingOnes<uint64_t>(Length);
  827. return IC.replaceInstUsesWith(II,
  828. ConstantInt::get(II.getType(), Result));
  829. }
  830. // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
  831. // are only masking bits that a shift already cleared?
  832. }
  833. break;
  834. case Intrinsic::x86_bmi_bzhi_32:
  835. case Intrinsic::x86_bmi_bzhi_64:
  836. // If the RHS is a constant we can try some simplifications.
  837. if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  838. uint64_t Index = C->getZExtValue() & 0xff;
  839. unsigned BitWidth = II.getType()->getIntegerBitWidth();
  840. if (Index >= BitWidth) {
  841. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  842. }
  843. if (Index == 0) {
  844. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  845. }
  846. // If the LHS is also a constant, we can completely constant fold this.
  847. if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  848. uint64_t Result = InC->getZExtValue();
  849. Result &= maskTrailingOnes<uint64_t>(Index);
  850. return IC.replaceInstUsesWith(II,
  851. ConstantInt::get(II.getType(), Result));
  852. }
  853. // TODO should we convert this to an AND if the RHS is constant?
  854. }
  855. break;
  856. case Intrinsic::x86_bmi_pext_32:
  857. case Intrinsic::x86_bmi_pext_64:
  858. if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  859. if (MaskC->isNullValue()) {
  860. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  861. }
  862. if (MaskC->isAllOnesValue()) {
  863. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  864. }
  865. unsigned MaskIdx, MaskLen;
  866. if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
  867. // Any single contiguous sequence of 1s anywhere in the mask simply
  868. // describes a subset of the input bits shifted to the appropriate
  869. // position. Replace with the straightforward IR.
  870. Value *Input = II.getArgOperand(0);
  871. Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
  872. Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
  873. Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
  874. return IC.replaceInstUsesWith(II, Shifted);
  875. }
  876. if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  877. uint64_t Src = SrcC->getZExtValue();
  878. uint64_t Mask = MaskC->getZExtValue();
  879. uint64_t Result = 0;
  880. uint64_t BitToSet = 1;
  881. while (Mask) {
  882. // Isolate lowest set bit.
  883. uint64_t BitToTest = Mask & -Mask;
  884. if (BitToTest & Src)
  885. Result |= BitToSet;
  886. BitToSet <<= 1;
  887. // Clear lowest set bit.
  888. Mask &= Mask - 1;
  889. }
  890. return IC.replaceInstUsesWith(II,
  891. ConstantInt::get(II.getType(), Result));
  892. }
  893. }
  894. break;
  895. case Intrinsic::x86_bmi_pdep_32:
  896. case Intrinsic::x86_bmi_pdep_64:
  897. if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
  898. if (MaskC->isNullValue()) {
  899. return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
  900. }
  901. if (MaskC->isAllOnesValue()) {
  902. return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  903. }
  904. unsigned MaskIdx, MaskLen;
  905. if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
  906. // Any single contiguous sequence of 1s anywhere in the mask simply
  907. // describes a subset of the input bits shifted to the appropriate
  908. // position. Replace with the straightforward IR.
  909. Value *Input = II.getArgOperand(0);
  910. Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
  911. Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
  912. Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
  913. return IC.replaceInstUsesWith(II, Masked);
  914. }
  915. if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
  916. uint64_t Src = SrcC->getZExtValue();
  917. uint64_t Mask = MaskC->getZExtValue();
  918. uint64_t Result = 0;
  919. uint64_t BitToTest = 1;
  920. while (Mask) {
  921. // Isolate lowest set bit.
  922. uint64_t BitToSet = Mask & -Mask;
  923. if (BitToTest & Src)
  924. Result |= BitToSet;
  925. BitToTest <<= 1;
  926. // Clear lowest set bit.
  927. Mask &= Mask - 1;
  928. }
  929. return IC.replaceInstUsesWith(II,
  930. ConstantInt::get(II.getType(), Result));
  931. }
  932. }
  933. break;
  934. case Intrinsic::x86_sse_cvtss2si:
  935. case Intrinsic::x86_sse_cvtss2si64:
  936. case Intrinsic::x86_sse_cvttss2si:
  937. case Intrinsic::x86_sse_cvttss2si64:
  938. case Intrinsic::x86_sse2_cvtsd2si:
  939. case Intrinsic::x86_sse2_cvtsd2si64:
  940. case Intrinsic::x86_sse2_cvttsd2si:
  941. case Intrinsic::x86_sse2_cvttsd2si64:
  942. case Intrinsic::x86_avx512_vcvtss2si32:
  943. case Intrinsic::x86_avx512_vcvtss2si64:
  944. case Intrinsic::x86_avx512_vcvtss2usi32:
  945. case Intrinsic::x86_avx512_vcvtss2usi64:
  946. case Intrinsic::x86_avx512_vcvtsd2si32:
  947. case Intrinsic::x86_avx512_vcvtsd2si64:
  948. case Intrinsic::x86_avx512_vcvtsd2usi32:
  949. case Intrinsic::x86_avx512_vcvtsd2usi64:
  950. case Intrinsic::x86_avx512_cvttss2si:
  951. case Intrinsic::x86_avx512_cvttss2si64:
  952. case Intrinsic::x86_avx512_cvttss2usi:
  953. case Intrinsic::x86_avx512_cvttss2usi64:
  954. case Intrinsic::x86_avx512_cvttsd2si:
  955. case Intrinsic::x86_avx512_cvttsd2si64:
  956. case Intrinsic::x86_avx512_cvttsd2usi:
  957. case Intrinsic::x86_avx512_cvttsd2usi64: {
  958. // These intrinsics only demand the 0th element of their input vectors. If
  959. // we can simplify the input based on that, do so now.
  960. Value *Arg = II.getArgOperand(0);
  961. unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
  962. if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
  963. return IC.replaceOperand(II, 0, V);
  964. }
  965. break;
  966. }
  967. case Intrinsic::x86_mmx_pmovmskb:
  968. case Intrinsic::x86_sse_movmsk_ps:
  969. case Intrinsic::x86_sse2_movmsk_pd:
  970. case Intrinsic::x86_sse2_pmovmskb_128:
  971. case Intrinsic::x86_avx_movmsk_pd_256:
  972. case Intrinsic::x86_avx_movmsk_ps_256:
  973. case Intrinsic::x86_avx2_pmovmskb:
  974. if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
  975. return IC.replaceInstUsesWith(II, V);
  976. }
  977. break;
  978. case Intrinsic::x86_sse_comieq_ss:
  979. case Intrinsic::x86_sse_comige_ss:
  980. case Intrinsic::x86_sse_comigt_ss:
  981. case Intrinsic::x86_sse_comile_ss:
  982. case Intrinsic::x86_sse_comilt_ss:
  983. case Intrinsic::x86_sse_comineq_ss:
  984. case Intrinsic::x86_sse_ucomieq_ss:
  985. case Intrinsic::x86_sse_ucomige_ss:
  986. case Intrinsic::x86_sse_ucomigt_ss:
  987. case Intrinsic::x86_sse_ucomile_ss:
  988. case Intrinsic::x86_sse_ucomilt_ss:
  989. case Intrinsic::x86_sse_ucomineq_ss:
  990. case Intrinsic::x86_sse2_comieq_sd:
  991. case Intrinsic::x86_sse2_comige_sd:
  992. case Intrinsic::x86_sse2_comigt_sd:
  993. case Intrinsic::x86_sse2_comile_sd:
  994. case Intrinsic::x86_sse2_comilt_sd:
  995. case Intrinsic::x86_sse2_comineq_sd:
  996. case Intrinsic::x86_sse2_ucomieq_sd:
  997. case Intrinsic::x86_sse2_ucomige_sd:
  998. case Intrinsic::x86_sse2_ucomigt_sd:
  999. case Intrinsic::x86_sse2_ucomile_sd:
  1000. case Intrinsic::x86_sse2_ucomilt_sd:
  1001. case Intrinsic::x86_sse2_ucomineq_sd:
  1002. case Intrinsic::x86_avx512_vcomi_ss:
  1003. case Intrinsic::x86_avx512_vcomi_sd:
  1004. case Intrinsic::x86_avx512_mask_cmp_ss:
  1005. case Intrinsic::x86_avx512_mask_cmp_sd: {
  1006. // These intrinsics only demand the 0th element of their input vectors. If
  1007. // we can simplify the input based on that, do so now.
  1008. bool MadeChange = false;
  1009. Value *Arg0 = II.getArgOperand(0);
  1010. Value *Arg1 = II.getArgOperand(1);
  1011. unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
  1012. if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
  1013. IC.replaceOperand(II, 0, V);
  1014. MadeChange = true;
  1015. }
  1016. if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
  1017. IC.replaceOperand(II, 1, V);
  1018. MadeChange = true;
  1019. }
  1020. if (MadeChange) {
  1021. return &II;
  1022. }
  1023. break;
  1024. }
  1025. case Intrinsic::x86_avx512_add_ps_512:
  1026. case Intrinsic::x86_avx512_div_ps_512:
  1027. case Intrinsic::x86_avx512_mul_ps_512:
  1028. case Intrinsic::x86_avx512_sub_ps_512:
  1029. case Intrinsic::x86_avx512_add_pd_512:
  1030. case Intrinsic::x86_avx512_div_pd_512:
  1031. case Intrinsic::x86_avx512_mul_pd_512:
  1032. case Intrinsic::x86_avx512_sub_pd_512:
  1033. // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
  1034. // IR operations.
  1035. if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
  1036. if (R->getValue() == 4) {
  1037. Value *Arg0 = II.getArgOperand(0);
  1038. Value *Arg1 = II.getArgOperand(1);
  1039. Value *V;
  1040. switch (IID) {
  1041. default:
  1042. llvm_unreachable("Case stmts out of sync!");
  1043. case Intrinsic::x86_avx512_add_ps_512:
  1044. case Intrinsic::x86_avx512_add_pd_512:
  1045. V = IC.Builder.CreateFAdd(Arg0, Arg1);
  1046. break;
  1047. case Intrinsic::x86_avx512_sub_ps_512:
  1048. case Intrinsic::x86_avx512_sub_pd_512:
  1049. V = IC.Builder.CreateFSub(Arg0, Arg1);
  1050. break;
  1051. case Intrinsic::x86_avx512_mul_ps_512:
  1052. case Intrinsic::x86_avx512_mul_pd_512:
  1053. V = IC.Builder.CreateFMul(Arg0, Arg1);
  1054. break;
  1055. case Intrinsic::x86_avx512_div_ps_512:
  1056. case Intrinsic::x86_avx512_div_pd_512:
  1057. V = IC.Builder.CreateFDiv(Arg0, Arg1);
  1058. break;
  1059. }
  1060. return IC.replaceInstUsesWith(II, V);
  1061. }
  1062. }
  1063. break;
  1064. case Intrinsic::x86_avx512_mask_add_ss_round:
  1065. case Intrinsic::x86_avx512_mask_div_ss_round:
  1066. case Intrinsic::x86_avx512_mask_mul_ss_round:
  1067. case Intrinsic::x86_avx512_mask_sub_ss_round:
  1068. case Intrinsic::x86_avx512_mask_add_sd_round:
  1069. case Intrinsic::x86_avx512_mask_div_sd_round:
  1070. case Intrinsic::x86_avx512_mask_mul_sd_round:
  1071. case Intrinsic::x86_avx512_mask_sub_sd_round:
  1072. // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
  1073. // IR operations.
  1074. if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
  1075. if (R->getValue() == 4) {
  1076. // Extract the elements as scalars.
  1077. Value *Arg0 = II.getArgOperand(0);
  1078. Value *Arg1 = II.getArgOperand(1);
  1079. Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
  1080. Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
  1081. Value *V;
  1082. switch (IID) {
  1083. default:
  1084. llvm_unreachable("Case stmts out of sync!");
  1085. case Intrinsic::x86_avx512_mask_add_ss_round:
  1086. case Intrinsic::x86_avx512_mask_add_sd_round:
  1087. V = IC.Builder.CreateFAdd(LHS, RHS);
  1088. break;
  1089. case Intrinsic::x86_avx512_mask_sub_ss_round:
  1090. case Intrinsic::x86_avx512_mask_sub_sd_round:
  1091. V = IC.Builder.CreateFSub(LHS, RHS);
  1092. break;
  1093. case Intrinsic::x86_avx512_mask_mul_ss_round:
  1094. case Intrinsic::x86_avx512_mask_mul_sd_round:
  1095. V = IC.Builder.CreateFMul(LHS, RHS);
  1096. break;
  1097. case Intrinsic::x86_avx512_mask_div_ss_round:
  1098. case Intrinsic::x86_avx512_mask_div_sd_round:
  1099. V = IC.Builder.CreateFDiv(LHS, RHS);
  1100. break;
  1101. }
  1102. // Handle the masking aspect of the intrinsic.
  1103. Value *Mask = II.getArgOperand(3);
  1104. auto *C = dyn_cast<ConstantInt>(Mask);
  1105. // We don't need a select if we know the mask bit is a 1.
  1106. if (!C || !C->getValue()[0]) {
  1107. // Cast the mask to an i1 vector and then extract the lowest element.
  1108. auto *MaskTy = FixedVectorType::get(
  1109. IC.Builder.getInt1Ty(),
  1110. cast<IntegerType>(Mask->getType())->getBitWidth());
  1111. Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
  1112. Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
  1113. // Extract the lowest element from the passthru operand.
  1114. Value *Passthru =
  1115. IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
  1116. V = IC.Builder.CreateSelect(Mask, V, Passthru);
  1117. }
  1118. // Insert the result back into the original argument 0.
  1119. V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
  1120. return IC.replaceInstUsesWith(II, V);
  1121. }
  1122. }
  1123. break;
  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
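  // With a constant immediate count, simplifyX86immShift can turn these into
  // generic IR shifts with a splatted shift amount, e.g.
  //   psrli.d(x, 3) --> lshr <4 x i32> x, <i32 3, i32 3, i32 3, i32 3>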
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // SSE2/AVX2 use only the first 64 bits of the 128-bit vector
    // operand to compute the shift amount.
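    // Only the low half of the shift-amount vector is therefore demanded;
    // e.g. for psll.d the <4 x i32> count operand only needs elements 0 and 1,
    // so the upper elements can be simplified away below.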
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
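  // PACKSS/PACKUS: per 128-bit lane, saturate each element of the first source
  // and then the second to the narrower element type (signed saturation for
  // PACKSS, unsigned for PACKUS) and concatenate the results; simplifyX86pack
  // attempts to fold these when the inputs allow it.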
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
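    // PCLMULQDQ carry-less multiplies one 64-bit half of each source (per
    // 128-bit lane): bit 0 of the immediate selects the half of operand 0 and
    // bit 4 selects the half of operand 1, so only the selected element of
    // each <2 x i64> pair is actually demanded.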
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();
      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();
      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }
      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }
      // If either input's demanded elements are all undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }
      if (MadeChange) {
        return &II;
      }
    }
    break;
  }
  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;
    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
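    // For example, extrqi(x, 8, 16) yields bits [23:16] of x's low qword,
    // zero-extended into the low 64-bits of the result.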
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");
    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;
    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }
    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }
  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64-bits
    // are undefined.
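    // For example, insertqi(x, y, 8, 16) replaces bits [23:16] of x's low
    // qword with bits [7:0] of y's low qword.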
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");
    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }
    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }
    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }
    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }
    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
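    // e.g. blendvps(x, y, bitcast(sext(<4 x i1> %cond))) becomes
    //      select <4 x i1> %cond, <4 x float> y, <4 x float> x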
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");
      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }
      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }
    break;
  }
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;
  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;
  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  default:
    break;
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
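    // e.g. movmsk.ps produces a 4-bit result in an i32, so bits 4..31 are
    // known zero and a user that demands none of the low 4 bits can be
    // folded to zero below.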
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }
    // If we don't need any of the low bits then return zero; we know that
    // DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }
    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics, so we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
    // other intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }
    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;
  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low elt is demanded, lower SQRT to FSQRT (with
    // rounding/exceptions checks).
    break;
  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // The lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);
    break;
  }
  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }
  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
    // The lower element is undefined if all three lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;
  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
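    // e.g. if only the odd (add) lanes of the result are demanded, the whole
    // addsub collapses to a single fadd; if only the even (sub) lanes are
    // demanded, it collapses to an fsub.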
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }
  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }
      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }
  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }
  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}