12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030 |
- //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- /// \file
- /// This file implements a TargetTransformInfo analysis pass specific to the
- /// X86 target machine. It uses the target's detailed information to provide
- /// more precise answers to certain TTI queries, while letting the target
- /// independent and default TTI implementations handle the rest.
- ///
- //===----------------------------------------------------------------------===//
- #include "X86TargetTransformInfo.h"
- #include "llvm/IR/IntrinsicInst.h"
- #include "llvm/IR/IntrinsicsX86.h"
- #include "llvm/Support/KnownBits.h"
- #include "llvm/Transforms/InstCombine/InstCombiner.h"
- #include <optional>
- using namespace llvm;
- #define DEBUG_TYPE "x86tti"
- /// Return a constant boolean vector that has true elements in all positions
- /// where the input constant data vector has an element with the sign bit set.
- static Constant *getNegativeIsTrueBoolVec(Constant *V) {
- VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
- V = ConstantExpr::getBitCast(V, IntTy);
- V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
- V);
- return V;
- }
- /// Convert the x86 XMM integer vector mask to a vector of bools based on
- /// each element's most significant bit (the sign bit).
- static Value *getBoolVecFromMask(Value *Mask) {
- // Fold Constant Mask.
- if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
- return getNegativeIsTrueBoolVec(ConstantMask);
- // Mask was extended from a boolean vector.
- Value *ExtMask;
- if (PatternMatch::match(
- Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
- ExtMask->getType()->isIntOrIntVectorTy(1))
- return ExtMask;
- return nullptr;
- }
- // TODO: If the x86 backend knew how to convert a bool vector mask back to an
- // XMM register mask efficiently, we could transform all x86 masked intrinsics
- // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
- static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
- Value *Ptr = II.getOperand(0);
- Value *Mask = II.getOperand(1);
- Constant *ZeroVec = Constant::getNullValue(II.getType());
- // Zero Mask - masked load instruction creates a zero vector.
- if (isa<ConstantAggregateZero>(Mask))
- return IC.replaceInstUsesWith(II, ZeroVec);
- // The mask is constant or extended from a bool vector. Convert this x86
- // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
- if (Value *BoolMask = getBoolVecFromMask(Mask)) {
- // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
- // the LLVM intrinsic definition for the pointer argument.
- unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
- Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
- // The pass-through vector for an x86 masked load is a zero vector.
- CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
- II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
- return IC.replaceInstUsesWith(II, NewMaskedLoad);
- }
- return nullptr;
- }
- // TODO: If the x86 backend knew how to convert a bool vector mask back to an
- // XMM register mask efficiently, we could transform all x86 masked intrinsics
- // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
- static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
- Value *Ptr = II.getOperand(0);
- Value *Mask = II.getOperand(1);
- Value *Vec = II.getOperand(2);
- // Zero Mask - this masked store instruction does nothing.
- if (isa<ConstantAggregateZero>(Mask)) {
- IC.eraseInstFromFunction(II);
- return true;
- }
- // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
- // anything else at this level.
- if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
- return false;
- // The mask is constant or extended from a bool vector. Convert this x86
- // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
- if (Value *BoolMask = getBoolVecFromMask(Mask)) {
- unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
- Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
- IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
- // 'Replace uses' doesn't work for stores. Erase the original masked store.
- IC.eraseInstFromFunction(II);
- return true;
- }
- return false;
- }
- static Value *simplifyX86immShift(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- bool LogicalShift = false;
- bool ShiftLeft = false;
- bool IsImm = false;
- switch (II.getIntrinsicID()) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- IsImm = true;
- [[fallthrough]];
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- LogicalShift = false;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- IsImm = true;
- [[fallthrough]];
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- LogicalShift = true;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- IsImm = true;
- [[fallthrough]];
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512:
- LogicalShift = true;
- ShiftLeft = true;
- break;
- }
- assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- Value *Vec = II.getArgOperand(0);
- Value *Amt = II.getArgOperand(1);
- auto *VT = cast<FixedVectorType>(Vec->getType());
- Type *SVT = VT->getElementType();
- Type *AmtVT = Amt->getType();
- unsigned VWidth = VT->getNumElements();
- unsigned BitWidth = SVT->getPrimitiveSizeInBits();
- // If the shift amount is guaranteed to be in-range we can replace it with a
- // generic shift. If its guaranteed to be out of range, logical shifts combine
- // to zero and arithmetic shifts are clamped to (BitWidth - 1).
- if (IsImm) {
- assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
- KnownBits KnownAmtBits =
- llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
- if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
- Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
- Amt = Builder.CreateVectorSplat(VWidth, Amt);
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
- if (KnownAmtBits.getMinValue().uge(BitWidth)) {
- if (LogicalShift)
- return ConstantAggregateZero::get(VT);
- Amt = ConstantInt::get(SVT, BitWidth - 1);
- return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
- }
- } else {
- // Ensure the first element has an in-range value and the rest of the
- // elements in the bottom 64 bits are zero.
- assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
- cast<VectorType>(AmtVT)->getElementType() == SVT &&
- "Unexpected shift-by-scalar type");
- unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
- APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
- APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
- KnownBits KnownLowerBits = llvm::computeKnownBits(
- Amt, DemandedLower, II.getModule()->getDataLayout());
- KnownBits KnownUpperBits = llvm::computeKnownBits(
- Amt, DemandedUpper, II.getModule()->getDataLayout());
- if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
- (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
- SmallVector<int, 16> ZeroSplat(VWidth, 0);
- Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
- }
- // Simplify if count is constant vector.
- auto *CDV = dyn_cast<ConstantDataVector>(Amt);
- if (!CDV)
- return nullptr;
- // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
- // operand to compute the shift amount.
- assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
- cast<VectorType>(AmtVT)->getElementType() == SVT &&
- "Unexpected shift-by-scalar type");
- // Concatenate the sub-elements to create the 64-bit value.
- APInt Count(64, 0);
- for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
- unsigned SubEltIdx = (NumSubElts - 1) - i;
- auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
- Count <<= BitWidth;
- Count |= SubElt->getValue().zextOrTrunc(64);
- }
- // If shift-by-zero then just return the original value.
- if (Count.isZero())
- return Vec;
- // Handle cases when Shift >= BitWidth.
- if (Count.uge(BitWidth)) {
- // If LogicalShift - just return zero.
- if (LogicalShift)
- return ConstantAggregateZero::get(VT);
- // If ArithmeticShift - clamp Shift to (BitWidth - 1).
- Count = APInt(64, BitWidth - 1);
- }
- // Get a constant vector of the same type as the first operand.
- auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
- auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
- if (ShiftLeft)
- return Builder.CreateShl(Vec, ShiftVec);
- if (LogicalShift)
- return Builder.CreateLShr(Vec, ShiftVec);
- return Builder.CreateAShr(Vec, ShiftVec);
- }
- // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
- // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
- // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
- static Value *simplifyX86varShift(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- bool LogicalShift = false;
- bool ShiftLeft = false;
- switch (II.getIntrinsicID()) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_512:
- case Intrinsic::x86_avx512_psrav_w_128:
- case Intrinsic::x86_avx512_psrav_w_256:
- case Intrinsic::x86_avx512_psrav_w_512:
- LogicalShift = false;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx512_psrlv_w_128:
- case Intrinsic::x86_avx512_psrlv_w_256:
- case Intrinsic::x86_avx512_psrlv_w_512:
- LogicalShift = true;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx512_psllv_w_128:
- case Intrinsic::x86_avx512_psllv_w_256:
- case Intrinsic::x86_avx512_psllv_w_512:
- LogicalShift = true;
- ShiftLeft = true;
- break;
- }
- assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- Value *Vec = II.getArgOperand(0);
- Value *Amt = II.getArgOperand(1);
- auto *VT = cast<FixedVectorType>(II.getType());
- Type *SVT = VT->getElementType();
- int NumElts = VT->getNumElements();
- int BitWidth = SVT->getIntegerBitWidth();
- // If the shift amount is guaranteed to be in-range we can replace it with a
- // generic shift.
- KnownBits KnownAmt =
- llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
- if (KnownAmt.getMaxValue().ult(BitWidth)) {
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
- // Simplify if all shift amounts are constant/undef.
- auto *CShift = dyn_cast<Constant>(Amt);
- if (!CShift)
- return nullptr;
- // Collect each element's shift amount.
- // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
- bool AnyOutOfRange = false;
- SmallVector<int, 8> ShiftAmts;
- for (int I = 0; I < NumElts; ++I) {
- auto *CElt = CShift->getAggregateElement(I);
- if (isa_and_nonnull<UndefValue>(CElt)) {
- ShiftAmts.push_back(-1);
- continue;
- }
- auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
- if (!COp)
- return nullptr;
- // Handle out of range shifts.
- // If LogicalShift - set to BitWidth (special case).
- // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
- APInt ShiftVal = COp->getValue();
- if (ShiftVal.uge(BitWidth)) {
- AnyOutOfRange = LogicalShift;
- ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
- continue;
- }
- ShiftAmts.push_back((int)ShiftVal.getZExtValue());
- }
- // If all elements out of range or UNDEF, return vector of zeros/undefs.
- // ArithmeticShift should only hit this if they are all UNDEF.
- auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
- if (llvm::all_of(ShiftAmts, OutOfRange)) {
- SmallVector<Constant *, 8> ConstantVec;
- for (int Idx : ShiftAmts) {
- if (Idx < 0) {
- ConstantVec.push_back(UndefValue::get(SVT));
- } else {
- assert(LogicalShift && "Logical shift expected");
- ConstantVec.push_back(ConstantInt::getNullValue(SVT));
- }
- }
- return ConstantVector::get(ConstantVec);
- }
- // We can't handle only some out of range values with generic logical shifts.
- if (AnyOutOfRange)
- return nullptr;
- // Build the shift amount constant vector.
- SmallVector<Constant *, 8> ShiftVecAmts;
- for (int Idx : ShiftAmts) {
- if (Idx < 0)
- ShiftVecAmts.push_back(UndefValue::get(SVT));
- else
- ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
- }
- auto ShiftVec = ConstantVector::get(ShiftVecAmts);
- if (ShiftLeft)
- return Builder.CreateShl(Vec, ShiftVec);
- if (LogicalShift)
- return Builder.CreateLShr(Vec, ShiftVec);
- return Builder.CreateAShr(Vec, ShiftVec);
- }
- static Value *simplifyX86pack(IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder, bool IsSigned) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Type *ResTy = II.getType();
- // Fast all undef handling.
- if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
- return UndefValue::get(ResTy);
- auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
- unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
- unsigned NumSrcElts = ArgTy->getNumElements();
- assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
- "Unexpected packing types");
- unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
- unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
- unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
- assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
- "Unexpected packing types");
- // Constant folding.
- if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
- return nullptr;
- // Clamp Values - signed/unsigned both use signed clamp values, but they
- // differ on the min/max values.
- APInt MinValue, MaxValue;
- if (IsSigned) {
- // PACKSS: Truncate signed value with signed saturation.
- // Source values less than dst minint are saturated to minint.
- // Source values greater than dst maxint are saturated to maxint.
- MinValue =
- APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
- MaxValue =
- APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
- } else {
- // PACKUS: Truncate signed value with unsigned saturation.
- // Source values less than zero are saturated to zero.
- // Source values greater than dst maxuint are saturated to maxuint.
- MinValue = APInt::getZero(SrcScalarSizeInBits);
- MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
- }
- auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
- auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
- Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
- Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
- Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
- Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
- // Shuffle clamped args together at the lane level.
- SmallVector<int, 32> PackMask;
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
- PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
- for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
- PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
- }
- auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
- // Truncate to dst size.
- return Builder.CreateTrunc(Shuffle, ResTy);
- }
- static Value *simplifyX86movmsk(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *Arg = II.getArgOperand(0);
- Type *ResTy = II.getType();
- // movmsk(undef) -> zero as we must ensure the upper bits are zero.
- if (isa<UndefValue>(Arg))
- return Constant::getNullValue(ResTy);
- auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
- // We can't easily peek through x86_mmx types.
- if (!ArgTy)
- return nullptr;
- // Expand MOVMSK to compare/bitcast/zext:
- // e.g. PMOVMSKB(v16i8 x):
- // %cmp = icmp slt <16 x i8> %x, zeroinitializer
- // %int = bitcast <16 x i1> %cmp to i16
- // %res = zext i16 %int to i32
- unsigned NumElts = ArgTy->getNumElements();
- Type *IntegerTy = Builder.getIntNTy(NumElts);
- Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
- Res = Builder.CreateIsNeg(Res);
- Res = Builder.CreateBitCast(Res, IntegerTy);
- Res = Builder.CreateZExtOrTrunc(Res, ResTy);
- return Res;
- }
- static Value *simplifyX86addcarry(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *CarryIn = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- Value *Op2 = II.getArgOperand(2);
- Type *RetTy = II.getType();
- Type *OpTy = Op1->getType();
- assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
- RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
- "Unexpected types for x86 addcarry");
- // If carry-in is zero, this is just an unsigned add with overflow.
- if (match(CarryIn, PatternMatch::m_ZeroInt())) {
- Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
- {Op1, Op2});
- // The types have to be adjusted to match the x86 call types.
- Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
- Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
- Builder.getInt8Ty());
- Value *Res = PoisonValue::get(RetTy);
- Res = Builder.CreateInsertValue(Res, UAddOV, 0);
- return Builder.CreateInsertValue(Res, UAddResult, 1);
- }
- return nullptr;
- }
- static Value *simplifyX86insertps(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
- if (!CInt)
- return nullptr;
- auto *VecTy = cast<FixedVectorType>(II.getType());
- assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
- // The immediate permute control byte looks like this:
- // [3:0] - zero mask for each 32-bit lane
- // [5:4] - select one 32-bit destination lane
- // [7:6] - select one 32-bit source lane
- uint8_t Imm = CInt->getZExtValue();
- uint8_t ZMask = Imm & 0xf;
- uint8_t DestLane = (Imm >> 4) & 0x3;
- uint8_t SourceLane = (Imm >> 6) & 0x3;
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
- // If all zero mask bits are set, this was just a weird way to
- // generate a zero vector.
- if (ZMask == 0xf)
- return ZeroVector;
- // Initialize by passing all of the first source bits through.
- int ShuffleMask[4] = {0, 1, 2, 3};
- // We may replace the second operand with the zero vector.
- Value *V1 = II.getArgOperand(1);
- if (ZMask) {
- // If the zero mask is being used with a single input or the zero mask
- // overrides the destination lane, this is a shuffle with the zero vector.
- if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
- (ZMask & (1 << DestLane))) {
- V1 = ZeroVector;
- // We may still move 32-bits of the first source vector from one lane
- // to another.
- ShuffleMask[DestLane] = SourceLane;
- // The zero mask may override the previous insert operation.
- for (unsigned i = 0; i < 4; ++i)
- if ((ZMask >> i) & 0x1)
- ShuffleMask[i] = i + 4;
- } else {
- // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
- return nullptr;
- }
- } else {
- // Replace the selected destination lane with the selected source lane.
- ShuffleMask[DestLane] = SourceLane + 4;
- }
- return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
- }
- /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
- /// or conversion to a shuffle vector.
- static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
- ConstantInt *CILength, ConstantInt *CIIndex,
- InstCombiner::BuilderTy &Builder) {
- auto LowConstantHighUndef = [&](uint64_t Val) {
- Type *IntTy64 = Type::getInt64Ty(II.getContext());
- Constant *Args[] = {ConstantInt::get(IntTy64, Val),
- UndefValue::get(IntTy64)};
- return ConstantVector::get(Args);
- };
- // See if we're dealing with constant values.
- auto *C0 = dyn_cast<Constant>(Op0);
- auto *CI0 =
- C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
- : nullptr;
- // Attempt to constant fold.
- if (CILength && CIIndex) {
- // From AMD documentation: "The bit index and field length are each six
- // bits in length other bits of the field are ignored."
- APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
- APInt APLength = CILength->getValue().zextOrTrunc(6);
- unsigned Index = APIndex.getZExtValue();
- // From AMD documentation: "a value of zero in the field length is
- // defined as length of 64".
- unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
- // From AMD documentation: "If the sum of the bit index + length field
- // is greater than 64, the results are undefined".
- unsigned End = Index + Length;
- // Note that both field index and field length are 8-bit quantities.
- // Since variables 'Index' and 'Length' are unsigned values
- // obtained from zero-extending field index and field length
- // respectively, their sum should never wrap around.
- if (End > 64)
- return UndefValue::get(II.getType());
- // If we are inserting whole bytes, we can convert this to a shuffle.
- // Lowering can recognize EXTRQI shuffle masks.
- if ((Length % 8) == 0 && (Index % 8) == 0) {
- // Convert bit indices to byte indices.
- Length /= 8;
- Index /= 8;
- Type *IntTy8 = Type::getInt8Ty(II.getContext());
- auto *ShufTy = FixedVectorType::get(IntTy8, 16);
- SmallVector<int, 16> ShuffleMask;
- for (int i = 0; i != (int)Length; ++i)
- ShuffleMask.push_back(i + Index);
- for (int i = Length; i != 8; ++i)
- ShuffleMask.push_back(i + 16);
- for (int i = 8; i != 16; ++i)
- ShuffleMask.push_back(-1);
- Value *SV = Builder.CreateShuffleVector(
- Builder.CreateBitCast(Op0, ShufTy),
- ConstantAggregateZero::get(ShufTy), ShuffleMask);
- return Builder.CreateBitCast(SV, II.getType());
- }
- // Constant Fold - shift Index'th bit to lowest position and mask off
- // Length bits.
- if (CI0) {
- APInt Elt = CI0->getValue();
- Elt.lshrInPlace(Index);
- Elt = Elt.zextOrTrunc(Length);
- return LowConstantHighUndef(Elt.getZExtValue());
- }
- // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
- if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
- Value *Args[] = {Op0, CILength, CIIndex};
- Module *M = II.getModule();
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
- return Builder.CreateCall(F, Args);
- }
- }
- // Constant Fold - extraction from zero is always {zero, undef}.
- if (CI0 && CI0->isZero())
- return LowConstantHighUndef(0);
- return nullptr;
- }
- /// Try to simplify an SSE4A INSERTQ/INSERTQI call.
- ///
- /// \p APLength and \p APIndex are the field length and bit index; only their
- /// low six bits are used. Depending on the operands the result is undef, a
- /// byte shuffle of the two sources, a folded constant vector, or (for a plain
- /// INSERTQ call) an equivalent INSERTQI call. Returns nullptr when none of
- /// these apply.
- static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
-                                  APInt APLength, APInt APIndex,
-                                  InstCombiner::BuilderTy &Builder) {
-   // From AMD documentation: "The bit index and field length are each six bits
-   // in length other bits of the field are ignored."
-   unsigned Index = APIndex.zextOrTrunc(6).getZExtValue();
-   unsigned Length = APLength.zextOrTrunc(6).getZExtValue();
-   // From AMD documentation: "a value of zero in the field length is
-   // defined as length of 64".
-   if (Length == 0)
-     Length = 64;
-   // From AMD documentation: "If the sum of the bit index + length field
-   // is greater than 64, the results are undefined". Both values come from
-   // six-bit fields, so the unsigned sum cannot wrap around.
-   if (Index + Length > 64)
-     return UndefValue::get(II.getType());
-   auto &Ctx = II.getContext();
-   // Whole-byte insertions become a shuffle; lowering can recognize INSERTQI
-   // shuffle masks.
-   const bool ByteAligned = (Length % 8) == 0 && (Index % 8) == 0;
-   if (ByteAligned) {
-     // Switch from bit units to byte units.
-     const unsigned ByteLen = Length / 8;
-     const unsigned ByteIdx = Index / 8;
-     auto *ShufTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
-     // Start with every lane undef (-1); only the low eight bytes get a source.
-     SmallVector<int, 16> ShuffleMask(16, -1);
-     for (unsigned Byte = 0; Byte != 8; ++Byte) {
-       // Bytes inside the inserted field come from the second shuffle operand
-       // (indices 16 and up); all others keep the byte of the first operand.
-       const bool InField = Byte >= ByteIdx && Byte < ByteIdx + ByteLen;
-       ShuffleMask[Byte] = InField ? static_cast<int>(16 + (Byte - ByteIdx))
-                                   : static_cast<int>(Byte);
-     }
-     Value *Shuffled = Builder.CreateShuffleVector(
-         Builder.CreateBitCast(Op0, ShufTy), Builder.CreateBitCast(Op1, ShufTy),
-         ShuffleMask);
-     return Builder.CreateBitCast(Shuffled, II.getType());
-   }
-   // Returns element 0 of Op as a ConstantInt, or nullptr if Op is not a
-   // constant whose first element is a constant integer.
-   auto LowEltAsConstantInt = [](Value *Op) -> ConstantInt * {
-     auto *C = dyn_cast<Constant>(Op);
-     if (!C)
-       return nullptr;
-     return dyn_cast_or_null<ConstantInt>(C->getAggregateElement(0u));
-   };
-   ConstantInt *DstLow = LowEltAsConstantInt(Op0);
-   ConstantInt *SrcLow = LowEltAsConstantInt(Op1);
-   // Constant fold: overwrite Length bits of the destination, starting at bit
-   // Index, with the bottom Length bits of the source.
-   if (DstLow && SrcLow) {
-     const APInt FieldMask = APInt::getLowBitsSet(64, Length).shl(Index);
-     const APInt Kept = DstLow->getValue() & ~FieldMask;
-     const APInt Inserted =
-         SrcLow->getValue().zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
-     const APInt Folded = Kept | Inserted;
-     Type *I64Ty = Type::getInt64Ty(Ctx);
-     // The upper 64 bits of the result are left undefined.
-     Constant *Elts[] = {ConstantInt::get(I64Ty, Folded.getZExtValue()),
-                         UndefValue::get(I64Ty)};
-     return ConstantVector::get(Elts);
-   }
-   // Nothing further to do unless this is the register form (INSERTQ).
-   if (II.getIntrinsicID() != Intrinsic::x86_sse4a_insertq)
-     return nullptr;
-   // For an INSERTQ call, converting to INSERTQI saves demanded elements.
-   Type *I8Ty = Type::getInt8Ty(Ctx);
-   Constant *CILength = ConstantInt::get(I8Ty, Length, false);
-   Constant *CIIndex = ConstantInt::get(I8Ty, Index, false);
-   Value *CallArgs[] = {Op0, Op1, CILength, CIIndex};
-   Function *InsertQI =
-       Intrinsic::getDeclaration(II.getModule(), Intrinsic::x86_sse4a_insertqi);
-   return Builder.CreateCall(InsertQI, CallArgs);
- }
- /// Rewrite a pshufb* call as a shufflevector when its control operand is a
- /// constant made only of integer and undef elements. Returns nullptr
- /// otherwise.
- static Value *simplifyX86pshufb(const IntrinsicInst &II,
-                                 InstCombiner::BuilderTy &Builder) {
-   auto *Control = dyn_cast<Constant>(II.getArgOperand(1));
-   if (!Control)
-     return nullptr;
-   auto *VecTy = cast<FixedVectorType>(II.getType());
-   const unsigned NumElts = VecTy->getNumElements();
-   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
-          "Unexpected number of elements in shuffle mask!");
-   // Build the shuffle mask. Entries left at -1 stand for undef control bytes.
-   SmallVector<int, 64> ShuffleMask(NumElts, -1);
-   // Each control byte selects the source byte for the matching result byte.
-   for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
-     Constant *Elt = Control->getAggregateElement(Lane);
-     if (!Elt)
-       return nullptr;
-     if (isa<UndefValue>(Elt))
-       continue;
-     auto *EltInt = dyn_cast<ConstantInt>(Elt);
-     if (!EltInt)
-       return nullptr;
-     // Read the control byte as signed so that a set bit 7 shows up as a
-     // negative value.
-     const auto CtrlByte =
-         static_cast<int8_t>(EltInt->getValue().getZExtValue());
-     // With bit 7 set the result byte is zero: point into the all-zero
-     // right-hand shuffle operand (indices NumElts and up). Otherwise the low
-     // four bits pick a byte within the 128-bit lane this result byte is in.
-     const unsigned InLane =
-         CtrlByte < 0 ? NumElts : static_cast<unsigned>(CtrlByte & 0x0F);
-     // (Lane & 0xF0) is the index of the first byte of the current lane.
-     const unsigned LaneBase = Lane & 0xF0;
-     ShuffleMask[Lane] = static_cast<int>(InLane + LaneBase);
-   }
-   Value *Src = II.getArgOperand(0);
-   Constant *Zeros = Constant::getNullValue(VecTy);
-   return Builder.CreateShuffleVector(Src, Zeros, ShuffleMask);
- }
- /// Rewrite a vpermilvar* call as a single-source shufflevector when its
- /// control operand is a constant made only of integer and undef elements.
- /// Returns nullptr otherwise.
- static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
-                                     InstCombiner::BuilderTy &Builder) {
-   auto *Control = dyn_cast<Constant>(II.getArgOperand(1));
-   if (!Control)
-     return nullptr;
-   auto *VecTy = cast<FixedVectorType>(II.getType());
-   const unsigned NumElts = VecTy->getNumElements();
-   const bool IsPD = VecTy->getScalarType()->isDoubleTy();
-   // Elements per lane: two for the double variants, four otherwise.
-   const unsigned NumLaneElts = IsPD ? 2 : 4;
-   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
-   // Build the shuffle mask. Entries left at -1 stand for undef control values.
-   SmallVector<int, 16> ShuffleMask(NumElts, -1);
-   for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
-     Constant *Elt = Control->getAggregateElement(Lane);
-     if (!Elt)
-       return nullptr;
-     if (isa<UndefValue>(Elt))
-       continue;
-     auto *EltInt = dyn_cast<ConstantInt>(Elt);
-     if (!EltInt)
-       return nullptr;
-     // The intrinsics only read one or two bits, so discard the rest.
-     unsigned Sel =
-         EltInt->getValue().zextOrTrunc(32).getLoBits(2).getZExtValue();
-     // The PD variants use bit 1 as the per-lane element index, so shift it
-     // down to get a generic shuffle mask index.
-     if (IsPD)
-       Sel >>= 1;
-     // The control bits always index within the element's own lane. A generic
-     // shuffle needs that made explicit, so add the lane's first index.
-     const unsigned LaneBase = Lane - (Lane % NumLaneElts);
-     ShuffleMask[Lane] = static_cast<int>(LaneBase + Sel);
-   }
-   Value *Src = II.getArgOperand(0);
-   return Builder.CreateShuffleVector(Src, ShuffleMask);
- }
- /// Rewrite a vpermd/vpermps call as a single-source shufflevector when its
- /// index operand is a constant made only of integer and undef elements.
- /// Returns nullptr otherwise.
- static Value *simplifyX86vpermv(const IntrinsicInst &II,
-                                 InstCombiner::BuilderTy &Builder) {
-   auto *Control = dyn_cast<Constant>(II.getArgOperand(1));
-   if (!Control)
-     return nullptr;
-   auto *VecTy = cast<FixedVectorType>(II.getType());
-   const unsigned Size = VecTy->getNumElements();
-   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
-          "Unexpected shuffle mask size");
-   // Every accepted Size is a power of two, so this mask keeps exactly the
-   // low bits of an index that address an element of the vector.
-   const unsigned WrapMask = Size - 1;
-   // Build the shuffle mask. Entries left at -1 stand for undef indices.
-   SmallVector<int, 64> ShuffleMask(Size, -1);
-   for (unsigned Lane = 0; Lane != Size; ++Lane) {
-     Constant *Elt = Control->getAggregateElement(Lane);
-     if (!Elt)
-       return nullptr;
-     if (isa<UndefValue>(Elt))
-       continue;
-     auto *EltInt = dyn_cast<ConstantInt>(Elt);
-     if (!EltInt)
-       return nullptr;
-     ShuffleMask[Lane] = static_cast<int>(EltInt->getZExtValue() & WrapMask);
-   }
-   Value *Src = II.getArgOperand(0);
-   return Builder.CreateShuffleVector(Src, ShuffleMask);
- }
- std::optional<Instruction *>
- X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
- auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
- unsigned DemandedWidth) {
- APInt UndefElts(Width, 0);
- APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
- return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
- };
- Intrinsic::ID IID = II.getIntrinsicID();
- switch (IID) {
- case Intrinsic::x86_bmi_bextr_32:
- case Intrinsic::x86_bmi_bextr_64:
- case Intrinsic::x86_tbm_bextri_u32:
- case Intrinsic::x86_tbm_bextri_u64:
- // If the RHS is a constant we can try some simplifications.
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- uint64_t Shift = C->getZExtValue();
- uint64_t Length = (Shift >> 8) & 0xff;
- Shift &= 0xff;
- unsigned BitWidth = II.getType()->getIntegerBitWidth();
- // If the length is 0 or the shift is out of range, replace with zero.
- if (Length == 0 || Shift >= BitWidth) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- // If the LHS is also a constant, we can completely constant fold this.
- if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Result = InC->getZExtValue() >> Shift;
- if (Length > BitWidth)
- Length = BitWidth;
- Result &= maskTrailingOnes<uint64_t>(Length);
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
- // are only masking bits that a shift already cleared?
- }
- break;
- case Intrinsic::x86_bmi_bzhi_32:
- case Intrinsic::x86_bmi_bzhi_64:
- // If the RHS is a constant we can try some simplifications.
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- uint64_t Index = C->getZExtValue() & 0xff;
- unsigned BitWidth = II.getType()->getIntegerBitWidth();
- if (Index >= BitWidth) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
- if (Index == 0) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- // If the LHS is also a constant, we can completely constant fold this.
- if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Result = InC->getZExtValue();
- Result &= maskTrailingOnes<uint64_t>(Index);
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- // TODO should we convert this to an AND if the RHS is constant?
- }
- break;
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
- unsigned MaskIdx, MaskLen;
- if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
- // Any single contiguous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straightforward IR.
- Value *Input = II.getArgOperand(0);
- Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
- Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
- Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
- return IC.replaceInstUsesWith(II, Shifted);
- }
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToSet = 1;
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToTest = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
- BitToSet <<= 1;
- // Clear lowest set bit.
- Mask &= Mask - 1;
- }
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
- unsigned MaskIdx, MaskLen;
- if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
- // Any single contiguous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straightforward IR.
- Value *Input = II.getArgOperand(0);
- Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
- Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
- Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
- return IC.replaceInstUsesWith(II, Masked);
- }
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToTest = 1;
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToSet = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
- BitToTest <<= 1;
- // Clear lowest set bit;
- Mask &= Mask - 1;
- }
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
- case Intrinsic::x86_sse_cvtss2si:
- case Intrinsic::x86_sse_cvtss2si64:
- case Intrinsic::x86_sse_cvttss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- case Intrinsic::x86_sse2_cvtsd2si:
- case Intrinsic::x86_sse2_cvtsd2si64:
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse2_cvttsd2si64:
- case Intrinsic::x86_avx512_vcvtss2si32:
- case Intrinsic::x86_avx512_vcvtss2si64:
- case Intrinsic::x86_avx512_vcvtss2usi32:
- case Intrinsic::x86_avx512_vcvtss2usi64:
- case Intrinsic::x86_avx512_vcvtsd2si32:
- case Intrinsic::x86_avx512_vcvtsd2si64:
- case Intrinsic::x86_avx512_vcvtsd2usi32:
- case Intrinsic::x86_avx512_vcvtsd2usi64:
- case Intrinsic::x86_avx512_cvttss2si:
- case Intrinsic::x86_avx512_cvttss2si64:
- case Intrinsic::x86_avx512_cvttss2usi:
- case Intrinsic::x86_avx512_cvttss2usi64:
- case Intrinsic::x86_avx512_cvttsd2si:
- case Intrinsic::x86_avx512_cvttsd2si64:
- case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvttsd2usi64: {
- // These intrinsics only demand the 0th element of their input vectors. If
- // we can simplify the input based on that, do so now.
- Value *Arg = II.getArgOperand(0);
- unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_avx2_pmovmskb:
- if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_sse_comieq_ss:
- case Intrinsic::x86_sse_comige_ss:
- case Intrinsic::x86_sse_comigt_ss:
- case Intrinsic::x86_sse_comile_ss:
- case Intrinsic::x86_sse_comilt_ss:
- case Intrinsic::x86_sse_comineq_ss:
- case Intrinsic::x86_sse_ucomieq_ss:
- case Intrinsic::x86_sse_ucomige_ss:
- case Intrinsic::x86_sse_ucomigt_ss:
- case Intrinsic::x86_sse_ucomile_ss:
- case Intrinsic::x86_sse_ucomilt_ss:
- case Intrinsic::x86_sse_ucomineq_ss:
- case Intrinsic::x86_sse2_comieq_sd:
- case Intrinsic::x86_sse2_comige_sd:
- case Intrinsic::x86_sse2_comigt_sd:
- case Intrinsic::x86_sse2_comile_sd:
- case Intrinsic::x86_sse2_comilt_sd:
- case Intrinsic::x86_sse2_comineq_sd:
- case Intrinsic::x86_sse2_ucomieq_sd:
- case Intrinsic::x86_sse2_ucomige_sd:
- case Intrinsic::x86_sse2_ucomigt_sd:
- case Intrinsic::x86_sse2_ucomile_sd:
- case Intrinsic::x86_sse2_ucomilt_sd:
- case Intrinsic::x86_sse2_ucomineq_sd:
- case Intrinsic::x86_avx512_vcomi_ss:
- case Intrinsic::x86_avx512_vcomi_sd:
- case Intrinsic::x86_avx512_mask_cmp_ss:
- case Intrinsic::x86_avx512_mask_cmp_sd: {
- // These intrinsics only demand the 0th element of their input vectors. If
- // we can simplify the input based on that, do so now.
- bool MadeChange = false;
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
- case Intrinsic::x86_avx512_add_ps_512:
- case Intrinsic::x86_avx512_div_ps_512:
- case Intrinsic::x86_avx512_mul_ps_512:
- case Intrinsic::x86_avx512_sub_ps_512:
- case Intrinsic::x86_avx512_add_pd_512:
- case Intrinsic::x86_avx512_div_pd_512:
- case Intrinsic::x86_avx512_mul_pd_512:
- case Intrinsic::x86_avx512_sub_pd_512:
- // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
- // IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- if (R->getValue() == 4) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Value *V;
- switch (IID) {
- default:
- llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_add_ps_512:
- case Intrinsic::x86_avx512_add_pd_512:
- V = IC.Builder.CreateFAdd(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_sub_ps_512:
- case Intrinsic::x86_avx512_sub_pd_512:
- V = IC.Builder.CreateFSub(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_mul_ps_512:
- case Intrinsic::x86_avx512_mul_pd_512:
- V = IC.Builder.CreateFMul(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_div_ps_512:
- case Intrinsic::x86_avx512_div_pd_512:
- V = IC.Builder.CreateFDiv(Arg0, Arg1);
- break;
- }
- return IC.replaceInstUsesWith(II, V);
- }
- }
- break;
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
- // IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
- if (R->getValue() == 4) {
- // Extract the element as scalars.
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
- Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
- Value *V;
- switch (IID) {
- default:
- llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- V = IC.Builder.CreateFAdd(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- V = IC.Builder.CreateFSub(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- V = IC.Builder.CreateFMul(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- V = IC.Builder.CreateFDiv(LHS, RHS);
- break;
- }
- // Handle the masking aspect of the intrinsic.
- Value *Mask = II.getArgOperand(3);
- auto *C = dyn_cast<ConstantInt>(Mask);
- // We don't need a select if we know the mask bit is a 1.
- if (!C || !C->getValue()[0]) {
- // Cast the mask to an i1 vector and then extract the lowest element.
- auto *MaskTy = FixedVectorType::get(
- IC.Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
- Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
- // Extract the lowest element from the passthru operand.
- Value *Passthru =
- IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
- V = IC.Builder.CreateSelect(Mask, V, Passthru);
- }
- // Insert the result back into the original argument 0.
- V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
- return IC.replaceInstUsesWith(II, V);
- }
- }
- break;
- // Constant fold ashr( <A x Bi>, Ci ).
- // Constant fold lshr( <A x Bi>, Ci ).
- // Constant fold shl( <A x Bi>, Ci ).
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- if (Value *V = simplifyX86immShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512: {
- if (Value *V = simplifyX86immShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
- // operand to compute the shift amount.
- Value *Arg1 = II.getArgOperand(1);
- assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
- "Unexpected packed shift size");
- unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
- return IC.replaceOperand(II, 1, V);
- }
- break;
- }
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx512_psllv_w_128:
- case Intrinsic::x86_avx512_psllv_w_256:
- case Intrinsic::x86_avx512_psllv_w_512:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_512:
- case Intrinsic::x86_avx512_psrav_w_128:
- case Intrinsic::x86_avx512_psrav_w_256:
- case Intrinsic::x86_avx512_psrav_w_512:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx512_psrlv_w_128:
- case Intrinsic::x86_avx512_psrlv_w_256:
- case Intrinsic::x86_avx512_psrlv_w_512:
- if (Value *V = simplifyX86varShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx512_packssdw_512:
- case Intrinsic::x86_avx512_packsswb_512:
- if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx512_packusdw_512:
- case Intrinsic::x86_avx512_packuswb_512:
- if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_pclmulqdq:
- case Intrinsic::x86_pclmulqdq_256:
- case Intrinsic::x86_pclmulqdq_512: {
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- unsigned Imm = C->getZExtValue();
- bool MadeChange = false;
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- unsigned VWidth =
- cast<FixedVectorType>(Arg0->getType())->getNumElements();
- APInt UndefElts1(VWidth, 0);
- APInt DemandedElts1 =
- APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
- if (Value *V =
- IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- APInt UndefElts2(VWidth, 0);
- APInt DemandedElts2 =
- APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
- if (Value *V =
- IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- // If either input elements are undef, the result is zero.
- if (DemandedElts1.isSubsetOf(UndefElts1) ||
- DemandedElts2.isSubsetOf(UndefElts2)) {
- return IC.replaceInstUsesWith(II,
- ConstantAggregateZero::get(II.getType()));
- }
- if (MadeChange) {
- return &II;
- }
- }
- break;
- }
- case Intrinsic::x86_sse41_insertps:
- if (Value *V = simplifyX86insertps(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_sse4a_extrq: {
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
- unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
- VWidth1 == 16 && "Unexpected operand sizes");
- // See if we're dealing with constant values.
- auto *C1 = dyn_cast<Constant>(Op1);
- auto *CILength =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
- : nullptr;
- auto *CIIndex =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
- : nullptr;
- // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
- if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
- // operands and the lowest 16-bits of the second.
- bool MadeChange = false;
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
- case Intrinsic::x86_sse4a_extrqi: {
- // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
- // bits of the lower 64-bits. The upper 64-bits are undefined.
- Value *Op0 = II.getArgOperand(0);
- unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
- "Unexpected operand size");
- // See if we're dealing with constant values.
- auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
- auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
- // Attempt to simplify to a constant or shuffle vector.
- if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
- // operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
- case Intrinsic::x86_sse4a_insertq: {
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
- cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
- "Unexpected operand size");
- // See if we're dealing with constant values.
- auto *C1 = dyn_cast<Constant>(Op1);
- auto *CI11 =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
- : nullptr;
- // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
- if (CI11) {
- const APInt &V11 = CI11->getValue();
- APInt Len = V11.zextOrTrunc(6);
- APInt Idx = V11.lshr(8).zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- }
- // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
- // operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
- case Intrinsic::x86_sse4a_insertqi: {
- // INSERTQI: Extract lowest Length bits from lower half of second source and
- // insert over first source starting at Index bit. The upper 64-bits are
- // undefined.
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
- unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
- VWidth1 == 2 && "Unexpected operand sizes");
- // See if we're dealing with constant values.
- auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
- auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
- // Attempt to simplify to a constant or shuffle vector.
- if (CILength && CIIndex) {
- APInt Len = CILength->getValue().zextOrTrunc(6);
- APInt Idx = CIIndex->getValue().zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- }
- // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
- // operands.
- bool MadeChange = false;
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
- case Intrinsic::x86_sse41_pblendvb:
- case Intrinsic::x86_sse41_blendvps:
- case Intrinsic::x86_sse41_blendvpd:
- case Intrinsic::x86_avx_blendv_ps_256:
- case Intrinsic::x86_avx_blendv_pd_256:
- case Intrinsic::x86_avx2_pblendvb: {
- // fold (blend A, A, Mask) -> A
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- Value *Mask = II.getArgOperand(2);
- if (Op0 == Op1) {
- return IC.replaceInstUsesWith(II, Op0);
- }
- // Zero Mask - select 1st argument.
- if (isa<ConstantAggregateZero>(Mask)) {
- return IC.replaceInstUsesWith(II, Op0);
- }
- // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
- if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
- Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
- return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
- }
- // Convert to a vector select if we can bypass casts and find a boolean
- // vector condition value.
- Value *BoolVec;
- Mask = InstCombiner::peekThroughBitcast(Mask);
- if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
- BoolVec->getType()->isVectorTy() &&
- BoolVec->getType()->getScalarSizeInBits() == 1) {
- assert(Mask->getType()->getPrimitiveSizeInBits() ==
- II.getType()->getPrimitiveSizeInBits() &&
- "Not expecting mask and operands with different sizes");
- unsigned NumMaskElts =
- cast<FixedVectorType>(Mask->getType())->getNumElements();
- unsigned NumOperandElts =
- cast<FixedVectorType>(II.getType())->getNumElements();
- if (NumMaskElts == NumOperandElts) {
- return SelectInst::Create(BoolVec, Op1, Op0);
- }
- // If the mask has less elements than the operands, each mask bit maps to
- // multiple elements of the operands. Bitcast back and forth.
- if (NumMaskElts < NumOperandElts) {
- Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
- Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
- Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
- return new BitCastInst(Sel, II.getType());
- }
- }
- break;
- }
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- case Intrinsic::x86_avx512_pshuf_b_512:
- if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_avx_vpermilvar_ps:
- case Intrinsic::x86_avx_vpermilvar_ps_256:
- case Intrinsic::x86_avx512_vpermilvar_ps_512:
- case Intrinsic::x86_avx_vpermilvar_pd:
- case Intrinsic::x86_avx_vpermilvar_pd_256:
- case Intrinsic::x86_avx512_vpermilvar_pd_512:
- if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- case Intrinsic::x86_avx512_permvar_df_256:
- case Intrinsic::x86_avx512_permvar_df_512:
- case Intrinsic::x86_avx512_permvar_di_256:
- case Intrinsic::x86_avx512_permvar_di_512:
- case Intrinsic::x86_avx512_permvar_hi_128:
- case Intrinsic::x86_avx512_permvar_hi_256:
- case Intrinsic::x86_avx512_permvar_hi_512:
- case Intrinsic::x86_avx512_permvar_qi_128:
- case Intrinsic::x86_avx512_permvar_qi_256:
- case Intrinsic::x86_avx512_permvar_qi_512:
- case Intrinsic::x86_avx512_permvar_sf_512:
- case Intrinsic::x86_avx512_permvar_si_512:
- if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- case Intrinsic::x86_avx_maskload_ps:
- case Intrinsic::x86_avx_maskload_pd:
- case Intrinsic::x86_avx_maskload_ps_256:
- case Intrinsic::x86_avx_maskload_pd_256:
- case Intrinsic::x86_avx2_maskload_d:
- case Intrinsic::x86_avx2_maskload_q:
- case Intrinsic::x86_avx2_maskload_d_256:
- case Intrinsic::x86_avx2_maskload_q_256:
- if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
- return I;
- }
- break;
- case Intrinsic::x86_sse2_maskmov_dqu:
- case Intrinsic::x86_avx_maskstore_ps:
- case Intrinsic::x86_avx_maskstore_pd:
- case Intrinsic::x86_avx_maskstore_ps_256:
- case Intrinsic::x86_avx_maskstore_pd_256:
- case Intrinsic::x86_avx2_maskstore_d:
- case Intrinsic::x86_avx2_maskstore_q:
- case Intrinsic::x86_avx2_maskstore_d_256:
- case Intrinsic::x86_avx2_maskstore_q_256:
- if (simplifyX86MaskedStore(II, IC)) {
- return nullptr;
- }
- break;
- case Intrinsic::x86_addcarry_32:
- case Intrinsic::x86_addcarry_64:
- if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
- default:
- break;
- }
- return std::nullopt;
- }
- std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
- InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
- bool &KnownBitsComputed) const {
- switch (II.getIntrinsicID()) {
- default:
- break;
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_avx2_pmovmskb: {
- // MOVMSK copies the vector elements' sign bits to the low bits
- // and zeros the high bits.
- unsigned ArgWidth;
- if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
- ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
- } else {
- auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
- ArgWidth = ArgType->getNumElements();
- }
- // If we don't need any of low bits then return zero,
- // we know that DemandedMask is non-zero already.
- APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
- Type *VTy = II.getType();
- if (DemandedElts.isZero()) {
- return ConstantInt::getNullValue(VTy);
- }
- // We know that the upper bits are set to zero.
- Known.Zero.setBitsFrom(ArgWidth);
- KnownBitsComputed = true;
- break;
- }
- }
- return std::nullopt;
- }
- std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
- InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
- APInt &UndefElts2, APInt &UndefElts3,
- std::function<void(Instruction *, unsigned, APInt, APInt &)>
- simplifyAndSetOp) const {
- unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
- switch (II.getIntrinsicID()) {
- default:
- break;
- case Intrinsic::x86_xop_vfrcz_ss:
- case Intrinsic::x86_xop_vfrcz_sd:
- // The instructions for these intrinsics are speced to zero upper bits not
- // pass them through like other scalar intrinsics. So we shouldn't just
- // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
- // Instead we should return a zero vector.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return ConstantAggregateZero::get(II.getType());
- }
- // Only the lower element is used.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- // Only the lower element is undefined. The high elements are zero.
- UndefElts = UndefElts[0];
- break;
- // Unary scalar-as-vector operations that work column-wise.
- case Intrinsic::x86_sse_rcp_ss:
- case Intrinsic::x86_sse_rsqrt_ss:
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
- // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
- // checks).
- break;
- // Binary scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0. The low element is a function of both
- // operands.
- case Intrinsic::x86_sse_min_ss:
- case Intrinsic::x86_sse_max_ss:
- case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_min_sd:
- case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_cmp_sd: {
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
- // Only lower element is used for operand 1.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- // Lower element is undefined if both lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0])
- UndefElts.clearBit(0);
- break;
- }
- // Binary scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0 and the low element comes from operand 1.
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd: {
- // Don't use the low element of operand 0.
- APInt DemandedElts2 = DemandedElts;
- DemandedElts2.clearBit(0);
- simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
- // Only lower element is used for operand 1.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- // Take the high undef elements from operand 0 and take the lower element
- // from operand 1.
- UndefElts.clearBit(0);
- UndefElts |= UndefElts2[0];
- break;
- }
- // Three input scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0 and the low element is a function of all
- // three inputs.
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_max_ss_round:
- case Intrinsic::x86_avx512_mask_min_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- case Intrinsic::x86_avx512_mask_max_sd_round:
- case Intrinsic::x86_avx512_mask_min_sd_round:
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
- // Only lower element is used for operand 1 and 2.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
- // Lower element is undefined if all three lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0] || !UndefElts3[0])
- UndefElts.clearBit(0);
- break;
- // TODO: Add fmaddsub support?
- case Intrinsic::x86_sse3_addsub_pd:
- case Intrinsic::x86_sse3_addsub_ps:
- case Intrinsic::x86_avx_addsub_pd_256:
- case Intrinsic::x86_avx_addsub_ps_256: {
- // If none of the even or none of the odd lanes are required, turn this
- // into a generic FP math instruction.
- APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
- APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
- bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
- bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
- if (IsSubOnly || IsAddOnly) {
- assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
- IRBuilderBase::InsertPointGuard Guard(IC.Builder);
- IC.Builder.SetInsertPoint(&II);
- Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
- return IC.Builder.CreateBinOp(
- IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
- }
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- UndefElts &= UndefElts2;
- break;
- }
- // General per-element vector operations.
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256: {
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- UndefElts &= UndefElts2;
- break;
- }
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx512_packssdw_512:
- case Intrinsic::x86_avx512_packsswb_512:
- case Intrinsic::x86_avx512_packusdw_512:
- case Intrinsic::x86_avx512_packuswb_512: {
- auto *Ty0 = II.getArgOperand(0)->getType();
- unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
- assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
- unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
- unsigned VWidthPerLane = VWidth / NumLanes;
- unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
- // Per lane, pack the elements of the first input and then the second.
- // e.g.
- // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
- // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
- for (int OpNum = 0; OpNum != 2; ++OpNum) {
- APInt OpDemandedElts(InnerVWidth, 0);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- unsigned LaneIdx = Lane * VWidthPerLane;
- for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
- unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
- if (DemandedElts[Idx])
- OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
- }
- }
- // Demand elements from the operand.
- APInt OpUndefElts(InnerVWidth, 0);
- simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
- // Pack the operand's UNDEF elements, one lane at a time.
- OpUndefElts = OpUndefElts.zext(VWidth);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
- LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
- LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
- UndefElts |= LaneElts;
- }
- }
- break;
- }
- // PSHUFB
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- case Intrinsic::x86_avx512_pshuf_b_512:
- // PERMILVAR
- case Intrinsic::x86_avx_vpermilvar_ps:
- case Intrinsic::x86_avx_vpermilvar_ps_256:
- case Intrinsic::x86_avx512_vpermilvar_ps_512:
- case Intrinsic::x86_avx_vpermilvar_pd:
- case Intrinsic::x86_avx_vpermilvar_pd_256:
- case Intrinsic::x86_avx512_vpermilvar_pd_512:
- // PERMV
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps: {
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
- break;
- }
- // SSE4A instructions leave the upper 64-bits of the 128-bit result
- // in an undefined state.
- case Intrinsic::x86_sse4a_extrq:
- case Intrinsic::x86_sse4a_extrqi:
- case Intrinsic::x86_sse4a_insertq:
- case Intrinsic::x86_sse4a_insertqi:
- UndefElts.setHighBits(VWidth / 2);
- break;
- }
- return std::nullopt;
- }
|