//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in {
      def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
               Sched<[sched]>;
    }
    def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
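
// A minimal, hypothetical instantiation sketch of the multiclass above, for
// illustration only. The opcode, name, node, and scheduling class are
// placeholders and do not reflect how this file actually instantiates the
// class elsewhere:
//
//   defm EXAMPLE : sse12_fp_scalar<0x58, "addss", any_fadd, FR32, f32mem,
//                                  SSEPackedSingle, SchedWriteFAdd.Scl>;
//
// This would produce two records, EXAMPLErr (register-register) and EXAMPLErm
// (register-memory), with the memory form scheduled on the folded-load
// resources (sched.Folded, sched.ReadAfterFold).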

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
    def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                 Sched<[sched]>;
    let mayLoad = 1 in
    def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
           Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
           Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}
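
// Illustrative expansion (a sketch of the ExpandPostRAPseudos behavior noted
// above): after register allocation, a pseudo such as
//   %xmm0 = FsFLD0SS
// is rewritten to a self-xor, e.g. "xorps %xmm0, %xmm0" (or "vxorps" with
// AVX), materializing +0.0 without a constant-pool load.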

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
  def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
  def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
  def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
  def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
  def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
  def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
  def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}
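
// Illustrative expansion of V_SET0, restating the comment above with an
// example: the pseudo becomes "xorps %xmmN, %xmmN" (or "vxorps" under AVX),
// and ExecutionDomainFix may later rewrite it to "pxor %xmmN, %xmmN" when the
// surrounding code is in the integer domain.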

// The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it: on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector integer instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
  def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                   [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
    def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                            [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
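
// Illustrative note (an assumption about the usual expansion, not stated in
// this file): the all-ones pseudos are materialized as a compare-equal of a
// register with itself, e.g. "pcmpeqd %xmm0, %xmm0" (or the VEX forms for the
// 256-bit variants), which sets every destination bit without a memory load.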

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
           Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
               Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                  VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;

  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}
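
// Illustrative assembly for the two forms produced by sse12_move (register
// names are arbitrary): the SSE variant ties $src1 to $dst and prints as a
// two-operand instruction,
//   movss %xmm1, %xmm0
// while the AVX variant is the three-operand
//   vmovss %xmm2, %xmm1, %xmm0
// matching the "$src2, $src1, $dst" AT&T operand order in the asm strings above.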

// Loading from memory automatically zeroes the upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                  VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                        VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                      Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}
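
// Usage note (illustrative): a scalar load such as "movss (%rdi), %xmm0"
// matches the vzloadfrag patterns above because the instruction zeroes bits
// 32..127 of the destination, so the full 128-bit result is well defined.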

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
  let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
  let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}
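
// Instantiation note (illustrative, restating the aligned/unaligned split
// below): the "movaps"/"movapd" instantiations pass an alignedload fragment,
// since the aligned forms fault on a misaligned 16-byte address, while the
// "movups"/"movupd" instantiations use the plain load fragment and accept any
// alignment.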

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                  SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                 PS, VEX, VEX_WIG;
  defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                  SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                 PD, VEX, VEX_WIG;
  defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                  SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                 PS, VEX, VEX_WIG;
  defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                  SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                 PD, VEX, VEX_WIG;

  defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                   SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                  PS, VEX, VEX_L, VEX_WIG;
  defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                   SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                  PD, VEX, VEX_L, VEX_WIG;
  defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                   SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                  PS, VEX, VEX_L, VEX_WIG;
  defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                   SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                  PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
  defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                PS;
  defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                PS;
}

let Predicates = [UseSSE2] in {
  defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                PD;
  defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                PD;
}

let Predicates = [HasAVX, NoVLX] in {
  let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
    def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}",
                         [(store (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}",
                         [(store (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
  } // SchedRW

  let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
    def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movaps\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movapd\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movups\t{$src, $dst|$dst, $src}",
                          [(store (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movupd\t{$src, $dst|$dst, $src}",
                          [(store (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
  } // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
  let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
    def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movaps\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
    def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movapd\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
    def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movups\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
    def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movupd\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
  } // SchedRW

  let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
    def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movaps\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
    def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movapd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
    def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movups\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
    def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movupd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
  } // SchedRW
} // Predicate

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
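
// Alias example (illustrative): writing "vmovaps.s %xmm0, %xmm1" selects the
// _REV record above, i.e. the MRMDestReg encoding (opcode 0x29) of the
// register-to-register move rather than the default 0x28 form; both encodings
// perform the same architectural move.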

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
  def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
  def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
  def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>;
  def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedloadv16f16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  569. def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
  570. (MOVAPSmr addr:$dst, VR128:$src)>;
  571. def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
  572. (MOVAPSmr addr:$dst, VR128:$src)>;
  573. def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
  574. (MOVAPSmr addr:$dst, VR128:$src)>;
  575. def : Pat<(store (v2i64 VR128:$src), addr:$dst),
  576. (MOVUPSmr addr:$dst, VR128:$src)>;
  577. def : Pat<(store (v4i32 VR128:$src), addr:$dst),
  578. (MOVUPSmr addr:$dst, VR128:$src)>;
  579. def : Pat<(store (v8i16 VR128:$src), addr:$dst),
  580. (MOVUPSmr addr:$dst, VR128:$src)>;
  581. def : Pat<(store (v16i8 VR128:$src), addr:$dst),
  582. (MOVUPSmr addr:$dst, VR128:$src)>;
  583. }
  584. let Predicates = [UseSSE2] in {
  585. def : Pat<(alignedloadv8f16 addr:$src),
  586. (MOVAPSrm addr:$src)>;
  587. def : Pat<(loadv8f16 addr:$src),
  588. (MOVUPSrm addr:$src)>;
  589. def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
  590. (MOVAPSmr addr:$dst, VR128:$src)>;
  591. def : Pat<(store (v8f16 VR128:$src), addr:$dst),
  592. (MOVUPSmr addr:$dst, VR128:$src)>;
  593. }
  594. //===----------------------------------------------------------------------===//
  595. // SSE 1 & 2 - Move Low packed FP Instructions
  596. //===----------------------------------------------------------------------===//
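// sse12_mov_hilo_packed_base - shared memory-source forms for the MOVLP*/MOVHP*
// instructions; the PS variants carry no pattern, while the PD variants fold a
// scalar f64 load.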
multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDPatternOperator pdnode,
string base_opc, string asm_opr> {
// No patterns, as these need to be special cased between high and low.
let hasSideEffects = 0, mayLoad = 1 in
def PSrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "s", asm_opr),
[], SSEPackedSingle>, PS,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
def PDrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
SSEPackedDouble>, PD,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
string base_opc> {
let Predicates = [UseAVX] in
defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
"\t{$src2, $dst|$dst, $src2}">;
}
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[]>,
VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)]>,
VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)]>;
} // SchedRW
let Predicates = [UseSSE1] in {
// This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
// end up with a movsd or a blend instead of a shufp.
// There is no need for an aligned load; we're only loading 64 bits.
def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
(i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v4f32 (X86vzload64 addr:$src)),
(MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
(MOVLPSmr addr:$dst, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
let SchedRW = [WriteFStore] in {
// A v2f64 extract of element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[]>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)]>;
} // SchedRW
let Predicates = [UseAVX] in {
// MOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE1] in {
// This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
// end up with a movsd or a blend instead of a shufp.
// There is no need for an aligned load; we're only loading 64 bits.
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
addr:$dst),
(MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
// MOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
// Use MOVLPD to load into the low bits from a full vector unless we can use
// BLENDPD.
def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
let Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
Sched<[SchedWriteFShuffle.XMM]>;
let isCommutable = 1 in
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//
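// sse12_cvt_s - scalar conversions between FP and integer registers, with
// register and folded-load forms.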
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, string mem, X86FoldableSchedWrite sched,
Domain d,
SchedRead Int2Fpu = ReadDefault> {
let ExeDomain = d in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpNode SrcRC:$src))]>,
Sched<[sched, Int2Fpu]>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
mem#"\t{$src, $dst|$dst, $src}",
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
Sched<[sched.Folded]>;
}
}
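// sse12_cvt_p - packed signed-integer to floating-point conversions (used for
// the cvtdq2ps family below).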
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
[(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
[(set RC:$dst, (DstTy (any_sint_to_fp
(SrcTy (ld_frag addr:$src)))))], d>,
Sched<[sched.Folded]>;
}
}
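// sse12_vcvt_avx - AVX scalar integer-to-FP conversions that take an explicit
// pass-through $src1 operand; no patterns here, selection happens through the
// patterns that follow.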
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm, string mem,
X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}
let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_W, VEX_LIG;
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_W, VEX_LIG;
}
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used.
// Provide explicit "l" and "q" assembly forms to address this where
// appropriate.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (any_sint_to_fp GR32:$src)),
(VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f32 (any_sint_to_fp GR64:$src)),
(VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(f64 (any_sint_to_fp GR32:$src)),
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (any_sint_to_fp GR64:$src)),
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
"cvtsi2ss", "cvtsi2ss{l}",
WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
"cvtsi2ss", "cvtsi2ss{q}",
WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
"cvtsi2sd", "cvtsi2sd{l}",
WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
"cvtsi2sd", "cvtsi2sd{q}",
WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
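// lrint of a scalar FP value selects the corresponding cvtss2si/cvtsd2si
// instruction defined above.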
let Predicates = [UseSSE1] in {
def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}
let Predicates = [UseSSE2] in {
def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
ValueType DstVT, ValueType SrcVT, SDNode OpNode,
Operand memop, PatFrags mem_frags, string asm,
X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
Sched<[sched.Folded]>;
}
}
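// sse12_cvt_sint_3addr - three-operand intrinsic forms that merge the result
// into an existing XMM register (used by the (v)cvtsi2ss/sd intrinsic
// instructions).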
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, X86MemOperand x86memop,
string asm, string mem, X86FoldableSchedWrite sched,
Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[]>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
SSEPackedDouble>, XD, REX_W;
}
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
XD;
defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
XD, REX_W, SIMD_EXC;
}
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
(CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
(CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
(CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
(CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
(CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
(CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
/// SSE 1 Only
// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
XD, REX_W;
}
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
(CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
(CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
(CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
(CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PS>,
PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PSY>,
PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
}
// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
(CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
(CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
(CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
(CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
(CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
(CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
/// SSE 2 Only
// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XD, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
def : Pat<(f32 (any_fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (any_fpround FR64:$src))]>,
Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
XD, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XS, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XS, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0
def : Pat<(f64 (any_fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend FR32:$src))]>,
XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
XS, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, XS, VEX_4V, VEX_LIG, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[]>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[]>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0
// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang that would otherwise produce
// unnecessary vmovs{s,d} instructions.
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
(f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
(VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
(f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
(VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]
let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
(f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
(CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
(f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
(CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
(CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
(CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
(CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
(CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
(CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
(CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
(CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
(CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]
let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
Sched<[WriteCvtPD2ILd]>, VEX_WIG;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
Sched<[WriteCvtPD2I]>, SIMD_EXC;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
VEX, VEX_L,
Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>;
}
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
(VCVTTPD2DQYrr VR256:$src)>;
def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
(VCVTTPD2DQYrm addr:$src)>;
}
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
  1533. // Convert packed single to packed double
  1534. let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1535. // SSE2 instructions without OpSize prefix
  1536. def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1537. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1538. [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
  1539. PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
  1540. def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
  1541. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1542. [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
  1543. PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
  1544. def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
  1545. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1546. [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
  1547. PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
  1548. def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
  1549. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1550. [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
  1551. PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
  1552. }
  1553. let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1554. def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1555. "cvtps2pd\t{$src, $dst|$dst, $src}",
  1556. [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
  1557. PS, Sched<[WriteCvtPS2PD]>;
  1558. def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
  1559. "cvtps2pd\t{$src, $dst|$dst, $src}",
  1560. [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
  1561. PS, Sched<[WriteCvtPS2PD.Folded]>;
  1562. }
  1563. // Convert Packed DW Integers to Packed Double FP
  1564. let Predicates = [HasAVX, NoVLX] in {
  1565. let hasSideEffects = 0, mayLoad = 1 in
  1566. def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  1567. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1568. [(set VR128:$dst,
  1569. (v2f64 (X86any_VSintToFP
  1570. (bc_v4i32
  1571. (v2i64 (scalar_to_vector
  1572. (loadi64 addr:$src)))))))]>,
  1573. VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
  1574. def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1575. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1576. [(set VR128:$dst,
  1577. (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
  1578. VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
  1579. def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
  1580. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1581. [(set VR256:$dst,
  1582. (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
  1583. VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
  1584. VEX_WIG;
  1585. def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
  1586. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1587. [(set VR256:$dst,
  1588. (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
  1589. VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
  1590. }
  1591. let hasSideEffects = 0, mayLoad = 1 in
  1592. def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  1593. "cvtdq2pd\t{$src, $dst|$dst, $src}",
  1594. [(set VR128:$dst,
  1595. (v2f64 (X86any_VSintToFP
  1596. (bc_v4i32
  1597. (v2i64 (scalar_to_vector
  1598. (loadi64 addr:$src)))))))]>,
  1599. Sched<[WriteCvtI2PDLd]>;
  1600. def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1601. "cvtdq2pd\t{$src, $dst|$dst, $src}",
  1602. [(set VR128:$dst,
  1603. (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
  1604. Sched<[WriteCvtI2PD]>;
// AVX register conversion intrinsics
  1606. let Predicates = [HasAVX, NoVLX] in {
  1607. def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  1608. (VCVTDQ2PDrm addr:$src)>;
  1609. } // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
  1611. let Predicates = [UseSSE2] in {
  1612. def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  1613. (CVTDQ2PDrm addr:$src)>;
  1614. } // Predicates = [UseSSE2]
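// These patterns fold a 64-bit zero-extending vector load (X86vzload64)
// directly into the rm forms, since cvtdq2pd only reads the low two
// doubleword elements of its source.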
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
  1621. def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1622. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1623. [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
  1624. VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
  1625. def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1626. "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
  1627. [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
  1628. VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
  1629. def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
  1630. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1631. [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
  1632. VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
  1633. def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
  1634. "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
  1635. [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
  1636. VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
  1637. } // Predicates = [HasAVX, NoVLX]
  1638. def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
  1639. (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
  1640. def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
  1641. (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
  1642. def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1643. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1644. [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
  1645. Sched<[WriteCvtPD2PS]>, SIMD_EXC;
  1646. def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1647. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1648. [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
  1649. Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
  1654. multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
  1655. Operand memop, SDNode OpNode, ValueType VT,
  1656. PatFrag ld_frag, string asm,
  1657. X86FoldableSchedWrite sched,
  1658. PatFrags mem_frags> {
  1659. def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
  1660. (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
  1661. [(set VR128:$dst, (OpNode (VT VR128:$src1),
  1662. VR128:$src2, timm:$cc))]>,
  1663. Sched<[sched]>, SIMD_EXC;
  1664. let mayLoad = 1 in
  1665. def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
  1666. (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
  1667. [(set VR128:$dst, (OpNode (VT VR128:$src1),
  1668. (mem_frags addr:$src2), timm:$cc))]>,
  1669. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1670. let isCodeGenOnly = 1 in {
  1671. let isCommutable = 1 in
  1672. def rr : SIi8<0xC2, MRMSrcReg,
  1673. (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
  1674. [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
  1675. Sched<[sched]>, SIMD_EXC;
  1676. def rm : SIi8<0xC2, MRMSrcMem,
  1677. (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
  1678. [(set RC:$dst, (OpNode RC:$src1,
  1679. (ld_frag addr:$src2), timm:$cc))]>,
  1680. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1681. }
  1682. }
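// The rr_Int/rm_Int forms above take full VR128 operands and preserve the
// upper elements, matching the vector-typed X86cmps node; the isCodeGenOnly
// rr/rm forms operate directly on FR32/FR64 scalar values.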
  1683. let ExeDomain = SSEPackedSingle in
  1684. defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
  1685. "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1686. SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
  1687. XS, VEX_4V, VEX_LIG, VEX_WIG;
  1688. let ExeDomain = SSEPackedDouble in
  1689. defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
  1690. "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1691. SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
  1692. XD, VEX_4V, VEX_LIG, VEX_WIG;
  1693. let Constraints = "$src1 = $dst" in {
  1694. let ExeDomain = SSEPackedSingle in
  1695. defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
  1696. "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1697. SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  1698. let ExeDomain = SSEPackedDouble in
  1699. defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
  1700. "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1701. SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
  1702. }
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
  1704. multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
  1705. ValueType vt, X86MemOperand x86memop,
  1706. PatFrag ld_frag, string OpcodeStr, Domain d,
  1707. X86FoldableSchedWrite sched = WriteFComX> {
  1708. let ExeDomain = d in {
  1709. def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
  1710. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1711. [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
  1712. Sched<[sched]>, SIMD_EXC;
  1713. let mayLoad = 1 in
  1714. def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
  1715. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1716. [(set EFLAGS, (OpNode (vt RC:$src1),
  1717. (ld_frag addr:$src2)))]>,
  1718. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1719. }
  1720. }
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
  1722. multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
  1723. ValueType vt, Operand memop,
  1724. PatFrags mem_frags, string OpcodeStr,
  1725. Domain d,
  1726. X86FoldableSchedWrite sched = WriteFComX> {
  1727. let ExeDomain = d in {
  1728. def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
  1729. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1730. [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
  1731. Sched<[sched]>, SIMD_EXC;
  1732. let mayLoad = 1 in
  1733. def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
  1734. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1735. [(set EFLAGS, (OpNode (vt RC:$src1),
  1736. (mem_frags addr:$src2)))]>,
  1737. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1738. }
  1739. }
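// ucomis* (mapped to X86any_fcmp/X86ucomi below) performs a quiet compare
// that raises #IA only for signaling NaNs, while comis* (X86strict_fcmps/
// X86comi) also raises it for quiet NaNs.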
  1740. let Defs = [EFLAGS] in {
  1741. defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
  1742. "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1743. defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
  1744. "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1745. defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
  1746. "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1747. defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
  1748. "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1749. let isCodeGenOnly = 1 in {
  1750. defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
  1751. sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1752. defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
  1753. sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1754. defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
  1755. sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1756. defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
  1757. sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1758. }
  1759. defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
  1760. "ucomiss", SSEPackedSingle>, PS;
  1761. defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
  1762. "ucomisd", SSEPackedDouble>, PD;
  1763. defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
  1764. "comiss", SSEPackedSingle>, PS;
  1765. defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
  1766. "comisd", SSEPackedDouble>, PD;
  1767. let isCodeGenOnly = 1 in {
  1768. defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
  1769. sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
  1770. defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
  1771. sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
  1772. defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
  1773. sse_load_f32, "comiss", SSEPackedSingle>, PS;
  1774. defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
  1775. sse_load_f64, "comisd", SSEPackedDouble>, PD;
  1776. }
  1777. } // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
  1779. multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
  1780. ValueType VT, string asm,
  1781. X86FoldableSchedWrite sched,
  1782. Domain d, PatFrag ld_frag> {
  1783. let isCommutable = 1 in
  1784. def rri : PIi8<0xC2, MRMSrcReg,
  1785. (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
  1786. [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
  1787. Sched<[sched]>, SIMD_EXC;
  1788. def rmi : PIi8<0xC2, MRMSrcMem,
  1789. (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
  1790. [(set RC:$dst,
  1791. (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
  1792. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1793. }
  1794. defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
  1795. "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1796. SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
  1797. defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
  1798. "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1799. SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
  1800. defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
  1801. "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1802. SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
  1803. defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
  1804. "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1805. SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
  1806. let Constraints = "$src1 = $dst" in {
  1807. defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
  1808. "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1809. SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  1810. defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
  1811. "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1812. SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
  1813. }
def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
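// Immediates 0, 3, 4 and 7 are EQ, UNORD, NEQ and ORD. These predicates are
// symmetric in their operands, so the compare can be commuted to allow the
// load to be folded, as in the patterns below.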
// Patterns to select compares with loads in the first operand.
  1819. let Predicates = [HasAVX] in {
  1820. def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
  1821. CommutableCMPCC:$cc)),
  1822. (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
  1823. def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
  1824. CommutableCMPCC:$cc)),
  1825. (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
  1826. def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
  1827. CommutableCMPCC:$cc)),
  1828. (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1829. def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
  1830. CommutableCMPCC:$cc)),
  1831. (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1832. def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
  1833. CommutableCMPCC:$cc)),
  1834. (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
  1835. def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
  1836. CommutableCMPCC:$cc)),
  1837. (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
  1838. }
  1839. let Predicates = [UseSSE2] in {
  1840. def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
  1841. CommutableCMPCC:$cc)),
  1842. (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1843. def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
  1844. CommutableCMPCC:$cc)),
  1845. (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
  1846. }
  1847. let Predicates = [UseSSE1] in {
  1848. def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
  1849. CommutableCMPCC:$cc)),
  1850. (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1851. def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
  1852. CommutableCMPCC:$cc)),
  1853. (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
  1854. }
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
  1859. multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
  1860. ValueType vt, string asm, PatFrag mem_frag,
  1861. X86FoldableSchedWrite sched, Domain d,
  1862. bit IsCommutable = 0> {
  1863. def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
  1864. (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
  1865. [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
  1866. (i8 timm:$src3))))], d>,
  1867. Sched<[sched.Folded, sched.ReadAfterFold]>;
  1868. let isCommutable = IsCommutable in
  1869. def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
  1870. (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
  1871. [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
  1872. (i8 timm:$src3))))], d>,
  1873. Sched<[sched]>;
  1874. }
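// For shufps the 8-bit immediate selects two elements of $src1 for the low
// half of the result and two elements of $src2 for the high half; for shufpd
// a single immediate bit selects the element taken from each source.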
  1875. let Predicates = [HasAVX, NoVLX] in {
  1876. defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
  1877. "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1878. loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
  1879. PS, VEX_4V, VEX_WIG;
  1880. defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
  1881. "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1882. loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
  1883. PS, VEX_4V, VEX_L, VEX_WIG;
  1884. defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
  1885. "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1886. loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
  1887. PD, VEX_4V, VEX_WIG;
  1888. defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
  1889. "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1890. loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
  1891. PD, VEX_4V, VEX_L, VEX_WIG;
  1892. }
  1893. let Constraints = "$src1 = $dst" in {
  1894. defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
  1895. "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  1896. memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1897. defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
  1898. "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  1899. memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  1900. }
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
  1905. multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
  1906. PatFrag mem_frag, RegisterClass RC,
  1907. X86MemOperand x86memop, string asm,
  1908. X86FoldableSchedWrite sched, Domain d,
  1909. bit IsCommutable = 0> {
  1910. let isCommutable = IsCommutable in
  1911. def rr : PI<opc, MRMSrcReg,
  1912. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  1913. asm, [(set RC:$dst,
  1914. (vt (OpNode RC:$src1, RC:$src2)))], d>,
  1915. Sched<[sched]>;
  1916. def rm : PI<opc, MRMSrcMem,
  1917. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  1918. asm, [(set RC:$dst,
  1919. (vt (OpNode RC:$src1,
  1920. (mem_frag addr:$src2))))], d>,
  1921. Sched<[sched.Folded, sched.ReadAfterFold]>;
  1922. }
  1923. let Predicates = [HasAVX, NoVLX] in {
  1924. defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
  1925. VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1926. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
  1927. defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
  1928. VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1929. SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
  1930. defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
  1931. VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1932. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
  1933. defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
  1934. VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1935. SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
  1936. defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
  1937. VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1938. SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
  1939. defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
  1940. VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1941. SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
  1942. defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
  1943. VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1944. SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
  1945. defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
  1946. VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1947. SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
  1948. }// Predicates = [HasAVX, NoVLX]
  1949. let Constraints = "$src1 = $dst" in {
  1950. defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
  1951. VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
  1952. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1953. defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
  1954. VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
  1955. SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  1956. defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
  1957. VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
  1958. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1959. defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
  1960. VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
  1961. SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
  1962. } // Constraints = "$src1 = $dst"
  1963. let Predicates = [HasAVX1Only] in {
  1964. def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
  1965. (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  1966. def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
  1967. (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  1968. def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
  1969. (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  1970. def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
  1971. (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
  1972. def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
  1973. (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  1974. def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
  1975. (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  1976. def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
  1977. (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  1978. def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
  1979. (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
  1980. }
  1981. let Predicates = [UseSSE2] in {
// Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
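// unpcklpd of a register with a 64-bit scalar load produces the same value as
// movhpd, and movhpd has no alignment requirement.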
  1983. def : Pat<(v2f64 (X86Unpckl VR128:$src1,
  1984. (v2f64 (simple_load addr:$src2)))),
  1985. (MOVHPDrm VR128:$src1, addr:$src2)>;
  1986. }
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//
/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
  1991. multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
  1992. string asm, Domain d> {
  1993. def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
  1994. !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
  1995. [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
  1996. Sched<[WriteFMOVMSK]>;
  1997. }
  1998. let Predicates = [HasAVX] in {
  1999. defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
  2000. SSEPackedSingle>, PS, VEX, VEX_WIG;
  2001. defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
  2002. SSEPackedDouble>, PD, VEX, VEX_WIG;
  2003. defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
  2004. SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  2005. defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
  2006. SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
// Also support integer VTs to avoid an int->fp bitcast in the DAG.
  2008. def : Pat<(X86movmsk (v4i32 VR128:$src)),
  2009. (VMOVMSKPSrr VR128:$src)>;
  2010. def : Pat<(X86movmsk (v2i64 VR128:$src)),
  2011. (VMOVMSKPDrr VR128:$src)>;
  2012. def : Pat<(X86movmsk (v8i32 VR256:$src)),
  2013. (VMOVMSKPSYrr VR256:$src)>;
  2014. def : Pat<(X86movmsk (v4i64 VR256:$src)),
  2015. (VMOVMSKPDYrr VR256:$src)>;
  2016. }
  2017. defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
  2018. SSEPackedSingle>, PS;
  2019. defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
  2020. SSEPackedDouble>, PD;
  2021. let Predicates = [UseSSE2] in {
// Also support integer VTs to avoid an int->fp bitcast in the DAG.
  2023. def : Pat<(X86movmsk (v4i32 VR128:$src)),
  2024. (MOVMSKPSrr VR128:$src)>;
  2025. def : Pat<(X86movmsk (v2i64 VR128:$src)),
  2026. (MOVMSKPDrr VR128:$src)>;
  2027. }
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm - Simple SSE2 binary operator.
  2033. multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
  2034. ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
  2035. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  2036. bit IsCommutable, bit Is2Addr> {
  2037. let isCommutable = IsCommutable in
  2038. def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
  2039. (ins RC:$src1, RC:$src2),
  2040. !if(Is2Addr,
  2041. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  2042. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  2043. [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
  2044. Sched<[sched]>;
  2045. def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
  2046. (ins RC:$src1, x86memop:$src2),
  2047. !if(Is2Addr,
  2048. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  2049. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  2050. [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
  2051. Sched<[sched.Folded, sched.ReadAfterFold]>;
  2052. }
  2053. } // ExeDomain = SSEPackedInt
  2054. multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
  2055. ValueType OpVT128, ValueType OpVT256,
  2056. X86SchedWriteWidths sched, bit IsCommutable,
  2057. Predicate prd> {
  2058. let Predicates = [HasAVX, prd] in
  2059. defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
  2060. VR128, load, i128mem, sched.XMM,
  2061. IsCommutable, 0>, VEX_4V, VEX_WIG;
  2062. let Constraints = "$src1 = $dst" in
  2063. defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
  2064. memop, i128mem, sched.XMM, IsCommutable, 1>;
  2065. let Predicates = [HasAVX2, prd] in
  2066. defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
  2067. OpVT256, VR256, load, i256mem, sched.YMM,
  2068. IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
  2069. }
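// Each PDI_binop_all instantiation below expands to three flavors: the legacy
// SSE2 form (e.g. PAND), the 128-bit VEX form (VPAND) and the 256-bit AVX2
// form (VPANDY).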
// These are ordered here for pattern ordering requirements with the fp versions.
  2071. defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
  2072. SchedWriteVecLogic, 1, NoVLX>;
  2073. defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
  2074. SchedWriteVecLogic, 1, NoVLX>;
  2075. defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
  2076. SchedWriteVecLogic, 1, NoVLX>;
  2077. defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
  2078. SchedWriteVecLogic, 0, NoVLX>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
  2086. multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
  2087. X86SchedWriteWidths sched> {
  2088. let Predicates = [HasAVX, NoVLX] in {
  2089. defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
  2090. !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
  2091. [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  2092. defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
  2093. !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
  2094. [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  2095. defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
  2096. !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
  2097. [], [], 0>, PS, VEX_4V, VEX_WIG;
  2098. defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
  2099. !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
  2100. [], [], 0>, PD, VEX_4V, VEX_WIG;
  2101. }
  2102. let Constraints = "$src1 = $dst" in {
  2103. defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
  2104. !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
  2105. [], []>, PS;
  2106. defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
  2107. !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
  2108. [], []>, PD;
  2109. }
  2110. }
  2111. defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
  2112. defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
  2113. defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
  2114. let isCommutable = 0 in
  2115. defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
  2116. let Predicates = [HasAVX2, NoVLX] in {
  2117. def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
  2118. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2119. def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
  2120. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2121. def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
  2122. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2123. def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
  2124. (VPORYrr VR256:$src1, VR256:$src2)>;
  2125. def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
  2126. (VPORYrr VR256:$src1, VR256:$src2)>;
  2127. def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
  2128. (VPORYrr VR256:$src1, VR256:$src2)>;
  2129. def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
  2130. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2131. def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
  2132. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2133. def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
  2134. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2135. def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
  2136. (VPANDNYrr VR256:$src1, VR256:$src2)>;
  2137. def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
  2138. (VPANDNYrr VR256:$src1, VR256:$src2)>;
  2139. def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
  2140. (VPANDNYrr VR256:$src1, VR256:$src2)>;
  2141. def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
  2142. (VPANDYrm VR256:$src1, addr:$src2)>;
  2143. def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
  2144. (VPANDYrm VR256:$src1, addr:$src2)>;
  2145. def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
  2146. (VPANDYrm VR256:$src1, addr:$src2)>;
  2147. def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
  2148. (VPORYrm VR256:$src1, addr:$src2)>;
  2149. def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
  2150. (VPORYrm VR256:$src1, addr:$src2)>;
  2151. def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
  2152. (VPORYrm VR256:$src1, addr:$src2)>;
  2153. def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
  2154. (VPXORYrm VR256:$src1, addr:$src2)>;
  2155. def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
  2156. (VPXORYrm VR256:$src1, addr:$src2)>;
  2157. def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
  2158. (VPXORYrm VR256:$src1, addr:$src2)>;
  2159. def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
  2160. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2161. def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
  2162. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2163. def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
  2164. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2165. }
// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
  2168. let Predicates = [HasAVX1Only] in {
  2169. def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
  2170. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2171. def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
  2172. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2173. def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
  2174. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2175. def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
  2176. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2177. def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
  2178. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2179. def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
  2180. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2181. def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
  2182. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2183. def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
  2184. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2185. def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
  2186. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2187. def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
  2188. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2189. def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
  2190. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2191. def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
  2192. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2193. def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
  2194. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2195. def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
  2196. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2197. def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
  2198. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2199. def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
  2200. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2201. def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
  2202. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2203. def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
  2204. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2205. def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
  2206. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2207. def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
  2208. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2209. def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
  2210. (VORPSYrm VR256:$src1, addr:$src2)>;
  2211. def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
  2212. (VORPSYrm VR256:$src1, addr:$src2)>;
  2213. def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
  2214. (VORPSYrm VR256:$src1, addr:$src2)>;
  2215. def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
  2216. (VORPSYrm VR256:$src1, addr:$src2)>;
  2217. def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
  2218. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2219. def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
  2220. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2221. def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
  2222. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2223. def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
  2224. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2225. def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
  2226. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2227. def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
  2228. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2229. def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
  2230. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2231. def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
  2232. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2233. }
  2234. let Predicates = [HasAVX, NoVLX] in {
  2235. def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
  2236. (VPANDrr VR128:$src1, VR128:$src2)>;
  2237. def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
  2238. (VPANDrr VR128:$src1, VR128:$src2)>;
  2239. def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
  2240. (VPANDrr VR128:$src1, VR128:$src2)>;
  2241. def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
  2242. (VPORrr VR128:$src1, VR128:$src2)>;
  2243. def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
  2244. (VPORrr VR128:$src1, VR128:$src2)>;
  2245. def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
  2246. (VPORrr VR128:$src1, VR128:$src2)>;
  2247. def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
  2248. (VPXORrr VR128:$src1, VR128:$src2)>;
  2249. def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
  2250. (VPXORrr VR128:$src1, VR128:$src2)>;
  2251. def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
  2252. (VPXORrr VR128:$src1, VR128:$src2)>;
  2253. def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
  2254. (VPANDNrr VR128:$src1, VR128:$src2)>;
  2255. def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
  2256. (VPANDNrr VR128:$src1, VR128:$src2)>;
  2257. def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
  2258. (VPANDNrr VR128:$src1, VR128:$src2)>;
  2259. def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
  2260. (VPANDrm VR128:$src1, addr:$src2)>;
  2261. def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
  2262. (VPANDrm VR128:$src1, addr:$src2)>;
  2263. def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
  2264. (VPANDrm VR128:$src1, addr:$src2)>;
  2265. def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
  2266. (VPORrm VR128:$src1, addr:$src2)>;
  2267. def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
  2268. (VPORrm VR128:$src1, addr:$src2)>;
  2269. def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
  2270. (VPORrm VR128:$src1, addr:$src2)>;
  2271. def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
  2272. (VPXORrm VR128:$src1, addr:$src2)>;
  2273. def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
  2274. (VPXORrm VR128:$src1, addr:$src2)>;
  2275. def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
  2276. (VPXORrm VR128:$src1, addr:$src2)>;
  2277. def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
  2278. (VPANDNrm VR128:$src1, addr:$src2)>;
  2279. def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
  2280. (VPANDNrm VR128:$src1, addr:$src2)>;
  2281. def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
  2282. (VPANDNrm VR128:$src1, addr:$src2)>;
  2283. }
  2284. let Predicates = [UseSSE2] in {
  2285. def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
  2286. (PANDrr VR128:$src1, VR128:$src2)>;
  2287. def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
  2288. (PANDrr VR128:$src1, VR128:$src2)>;
  2289. def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
  2290. (PANDrr VR128:$src1, VR128:$src2)>;
  2291. def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
  2292. (PORrr VR128:$src1, VR128:$src2)>;
  2293. def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
  2294. (PORrr VR128:$src1, VR128:$src2)>;
  2295. def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
  2296. (PORrr VR128:$src1, VR128:$src2)>;
  2297. def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
  2298. (PXORrr VR128:$src1, VR128:$src2)>;
  2299. def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
  2300. (PXORrr VR128:$src1, VR128:$src2)>;
  2301. def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
  2302. (PXORrr VR128:$src1, VR128:$src2)>;
  2303. def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
  2304. (PANDNrr VR128:$src1, VR128:$src2)>;
  2305. def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
  2306. (PANDNrr VR128:$src1, VR128:$src2)>;
  2307. def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
  2308. (PANDNrr VR128:$src1, VR128:$src2)>;
  2309. def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
  2310. (PANDrm VR128:$src1, addr:$src2)>;
  2311. def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
  2312. (PANDrm VR128:$src1, addr:$src2)>;
  2313. def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
  2314. (PANDrm VR128:$src1, addr:$src2)>;
  2315. def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
  2316. (PORrm VR128:$src1, addr:$src2)>;
  2317. def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
  2318. (PORrm VR128:$src1, addr:$src2)>;
  2319. def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
  2320. (PORrm VR128:$src1, addr:$src2)>;
  2321. def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
  2322. (PXORrm VR128:$src1, addr:$src2)>;
  2323. def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
  2324. (PXORrm VR128:$src1, addr:$src2)>;
  2325. def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
  2326. (PXORrm VR128:$src1, addr:$src2)>;
  2327. def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
  2328. (PANDNrm VR128:$src1, addr:$src2)>;
  2329. def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
  2330. (PANDNrm VR128:$src1, addr:$src2)>;
  2331. def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
  2332. (PANDNrm VR128:$src1, addr:$src2)>;
  2333. }
// Patterns for packed operations when we don't have integer type available.
  2335. def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
  2336. (ANDPSrr VR128:$src1, VR128:$src2)>;
  2337. def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
  2338. (ORPSrr VR128:$src1, VR128:$src2)>;
  2339. def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
  2340. (XORPSrr VR128:$src1, VR128:$src2)>;
  2341. def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
  2342. (ANDNPSrr VR128:$src1, VR128:$src2)>;
  2343. def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
  2344. (ANDPSrm VR128:$src1, addr:$src2)>;
  2345. def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
  2346. (ORPSrm VR128:$src1, addr:$src2)>;
  2347. def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
  2348. (XORPSrm VR128:$src1, addr:$src2)>;
  2349. def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
  2350. (ANDNPSrm VR128:$src1, addr:$src2)>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//
/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
  2366. multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
  2367. SDPatternOperator OpNode, X86SchedWriteSizes sched> {
  2368. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2369. let Predicates = [HasAVX, NoVLX] in {
  2370. defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
  2371. VR128, v4f32, f128mem, loadv4f32,
  2372. SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  2373. defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
  2374. VR128, v2f64, f128mem, loadv2f64,
  2375. SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
  2376. defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
  2377. OpNode, VR256, v8f32, f256mem, loadv8f32,
  2378. SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  2379. defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
  2380. OpNode, VR256, v4f64, f256mem, loadv4f64,
  2381. SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  2382. }
  2383. let Constraints = "$src1 = $dst" in {
  2384. defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
  2385. v4f32, f128mem, memopv4f32, SSEPackedSingle,
  2386. sched.PS.XMM>, PS;
  2387. defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
  2388. v2f64, f128mem, memopv2f64, SSEPackedDouble,
  2389. sched.PD.XMM>, PD;
  2390. }
  2391. }
  2392. }
  2393. multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
  2394. X86SchedWriteSizes sched> {
  2395. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2396. defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
  2397. OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
  2398. XS, VEX_4V, VEX_LIG, VEX_WIG;
  2399. defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
  2400. OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
  2401. XD, VEX_4V, VEX_LIG, VEX_WIG;
  2402. let Constraints = "$src1 = $dst" in {
  2403. defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
  2404. OpNode, FR32, f32mem, SSEPackedSingle,
  2405. sched.PS.Scl>, XS;
  2406. defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
  2407. OpNode, FR64, f64mem, SSEPackedDouble,
  2408. sched.PD.Scl>, XD;
  2409. }
  2410. }
  2411. }
  2412. multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
  2413. SDPatternOperator OpNode,
  2414. X86SchedWriteSizes sched> {
  2415. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2416. defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
  2417. !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
  2418. SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  2419. defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
  2420. !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
  2421. SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
  2422. let Constraints = "$src1 = $dst" in {
  2423. defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
  2424. !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
  2425. SSEPackedSingle, sched.PS.Scl>, XS;
  2426. defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
  2427. !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
  2428. SSEPackedDouble, sched.PD.Scl>, XD;
  2429. }
  2430. }
  2431. }
// Binary Arithmetic instructions
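// The *_s_int instantiations that receive null_frag get no selection pattern
// attached here; the corresponding _Int forms for add/sub/mul/div are instead
// matched by the scalar_math_patterns further below.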
  2433. defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
  2434. basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
  2435. basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
  2436. defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
  2437. basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
  2438. basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
  2439. let isCommutable = 0 in {
  2440. defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
  2441. basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
  2442. basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  2443. defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
  2444. basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
  2445. basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  2446. defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
  2447. basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
  2448. basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  2449. defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
  2450. basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
  2451. basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
  2452. }
  2453. let isCodeGenOnly = 1 in {
  2454. defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
  2455. basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  2456. defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
  2457. basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
  2458. }
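// X86fmaxc/X86fminc are the commutative variants, used when either operand
// may be returned for NaN or -0.0 inputs, so the operands can be swapped
// freely to enable load folding.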
// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
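// For example, with Op = any_fadd, Move = X86Movss and VT = v4f32, the first
// pattern below matches
//   (X86Movss $dst, (scalar_to_vector (fadd (extractelt $dst, 0), FR32:$src)))
// and selects ADDSSrr_Int, leaving the upper elements of $dst untouched.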
  2500. multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
  2501. ValueType VT, ValueType EltTy,
  2502. RegisterClass RC, PatFrag ld_frag,
  2503. Predicate BasePredicate> {
  2504. let Predicates = [BasePredicate] in {
  2505. // extracted scalar math op with insert via movss/movsd
  2506. def : Pat<(VT (Move (VT VR128:$dst),
  2507. (VT (scalar_to_vector
  2508. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2509. RC:$src))))),
  2510. (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
  2511. (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  2512. def : Pat<(VT (Move (VT VR128:$dst),
  2513. (VT (scalar_to_vector
  2514. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2515. (ld_frag addr:$src)))))),
  2516. (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  2517. }
  2518. // Repeat for AVX versions of the instructions.
  2519. let Predicates = [UseAVX] in {
  2520. // extracted scalar math op with insert via movss/movsd
  2521. def : Pat<(VT (Move (VT VR128:$dst),
  2522. (VT (scalar_to_vector
  2523. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2524. RC:$src))))),
  2525. (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
  2526. (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  2527. def : Pat<(VT (Move (VT VR128:$dst),
  2528. (VT (scalar_to_vector
  2529. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2530. (ld_frag addr:$src)))))),
  2531. (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  2532. }
  2533. }
  2534. defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
  2535. defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
  2536. defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
  2537. defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
  2538. defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
  2539. defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
  2540. defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
  2541. defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
  2552. multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
  2553. X86MemOperand x86memop, Operand intmemop,
  2554. SDPatternOperator OpNode, Domain d,
  2555. X86FoldableSchedWrite sched, Predicate target> {
  2556. let isCodeGenOnly = 1, hasSideEffects = 0 in {
  2557. def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
  2558. !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
  2559. [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
  2560. Requires<[target]>;
  2561. let mayLoad = 1 in
  2562. def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
  2563. !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
  2564. [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
  2565. Sched<[sched.Folded]>,
  2566. Requires<[target, OptForSize]>;
  2567. }
  2568. let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
  2569. def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
  2570. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
  2571. Sched<[sched]>;
  2572. let mayLoad = 1 in
  2573. def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
  2574. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
  2575. Sched<[sched.Folded, sched.ReadAfterFold]>;
  2576. }
  2577. }
  2578. multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
  2579. Intrinsic Intr, Predicate target> {
  2580. let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
  2583. def : Pat<(Intr VR128:$src),
  2584. (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  2585. }
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
// partial register update, while the unfolded sequence will not, e.g.
//   movss mem, %xmm0
//   rcpss %xmm0, %xmm0
// which has a clobber before the rcp, vs.
//   rcpss mem, %xmm0
  2593. let Predicates = [target, OptForSize] in {
  2594. def : Pat<(Intr (mem_frags addr:$src2)),
  2595. (!cast<Instruction>(NAME#m_Int)
  2596. (vt (IMPLICIT_DEF)), addr:$src2)>;
  2597. }
  2598. }
  2599. multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
  2600. Intrinsic Intr, Predicate target> {
  2601. let Predicates = [target] in {
  2602. def : Pat<(Intr VR128:$src),
  2603. (!cast<Instruction>(NAME#r_Int) VR128:$src,
  2604. VR128:$src)>;
  2605. }
  2606. let Predicates = [target, OptForSize] in {
  2607. def : Pat<(Intr (mem_frags addr:$src2)),
  2608. (!cast<Instruction>(NAME#m_Int)
  2609. (vt (IMPLICIT_DEF)), addr:$src2)>;
  2610. }
  2611. }
  2612. multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
  2613. ValueType ScalarVT, X86MemOperand x86memop,
  2614. Operand intmemop, SDPatternOperator OpNode, Domain d,
  2615. X86FoldableSchedWrite sched, Predicate target> {
  2616. let isCodeGenOnly = 1, hasSideEffects = 0 in {
  2617. def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
  2618. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  2619. [], d>, Sched<[sched]>;
  2620. let mayLoad = 1 in
  2621. def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  2622. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  2623. [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  2624. }
  2625. let hasSideEffects = 0, ExeDomain = d in {
  2626. def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
  2627. (ins VR128:$src1, VR128:$src2),
  2628. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  2629. []>, Sched<[sched]>;
  2630. let mayLoad = 1 in
  2631. def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
  2632. (ins VR128:$src1, intmemop:$src2),
  2633. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  2634. []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  2635. }
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
// partial register update, while the unfolded sequence will not, e.g.
//   vmovss mem, %xmm0
//   vrcpss %xmm0, %xmm0, %xmm0
// which has a clobber before the rcp, vs.
//   vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
// the partial register store, either in BreakFalseDeps or with smarter RA.
  2645. let Predicates = [target] in {
  2646. def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
  2647. (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  2648. }
  2649. let Predicates = [target, OptForSize] in {
  2650. def : Pat<(ScalarVT (OpNode (load addr:$src))),
  2651. (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
  2652. addr:$src)>;
  2653. }
  2654. }
/// sse1_fp_unop_p - SSE1 unops in packed form.
  2656. multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
  2657. X86SchedWriteWidths sched, list<Predicate> prds> {
  2658. let Predicates = prds in {
  2659. def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2660. !strconcat("v", OpcodeStr,
  2661. "ps\t{$src, $dst|$dst, $src}"),
  2662. [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
  2663. VEX, Sched<[sched.XMM]>, VEX_WIG;
  2664. def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  2665. !strconcat("v", OpcodeStr,
  2666. "ps\t{$src, $dst|$dst, $src}"),
  2667. [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
  2668. VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  2669. def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  2670. !strconcat("v", OpcodeStr,
  2671. "ps\t{$src, $dst|$dst, $src}"),
  2672. [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
  2673. VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  2674. def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
  2675. !strconcat("v", OpcodeStr,
  2676. "ps\t{$src, $dst|$dst, $src}"),
  2677. [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
  2678. VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  2679. }
  2680. def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2681. !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
  2682. [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
  2683. Sched<[sched.XMM]>;
  2684. def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  2685. !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
  2686. [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
  2687. Sched<[sched.XMM.Folded]>;
  2688. }
  2689. /// sse2_fp_unop_p - SSE2 unops in vector forms.
  2690. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
  2691. SDPatternOperator OpNode, X86SchedWriteWidths sched> {
  2692. let Predicates = [HasAVX, NoVLX] in {
  2693. def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2694. !strconcat("v", OpcodeStr,
  2695. "pd\t{$src, $dst|$dst, $src}"),
  2696. [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
  2697. VEX, Sched<[sched.XMM]>, VEX_WIG;
  2698. def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  2699. !strconcat("v", OpcodeStr,
  2700. "pd\t{$src, $dst|$dst, $src}"),
  2701. [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
  2702. VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  2703. def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  2704. !strconcat("v", OpcodeStr,
  2705. "pd\t{$src, $dst|$dst, $src}"),
  2706. [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
  2707. VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  2708. def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
  2709. !strconcat("v", OpcodeStr,
  2710. "pd\t{$src, $dst|$dst, $src}"),
  2711. [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
  2712. VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  2713. }
  2714. def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2715. !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
  2716. [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
  2717. Sched<[sched.XMM]>;
  2718. def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  2719. !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
  2720. [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
  2721. Sched<[sched.XMM.Folded]>;
  2722. }
  2723. multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
  2724. defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32,
  2725. !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
  2726. UseSSE1>, XS;
  2727. defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
  2728. !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
  2729. AVXTarget>,
  2730. XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
  2731. }
  2732. multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
  2733. X86SchedWriteWidths sched, Predicate AVXTarget> {
  2734. defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
  2735. ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  2736. defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
  2737. f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
  2738. XS, VEX_4V, VEX_LIG, VEX_WIG;
  2739. }
  2740. multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
  2741. X86SchedWriteWidths sched, Predicate AVXTarget> {
  2742. defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
  2743. sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  2744. defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
  2745. f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
  2746. XD, VEX_4V, VEX_LIG, VEX_WIG;
  2747. }
  2748. // Square root.
  2749. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
  2750. sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
  2751. sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
  2752. sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
  2753. // Reciprocal approximations. Note that these typically require refinement
  2754. // in order to obtain suitable precision.
  2755. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
  2756. sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
  2757. sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
  2758. defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
  2759. sse1_fp_unop_s_intr<"rcp", HasAVX>,
  2760. sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
  2761. // There is no f64 version of the reciprocal approximation instructions.
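// For reference, the refinement mentioned above is normally one Newton-Raphson
// step, which roughly doubles the number of correct bits:
//   rcpss:   x1 = x0 * (2 - a * x0)             with x0 ~= 1/a
//   rsqrtss: x1 = x0 * (1.5 - 0.5 * a * x0*x0)  with x0 ~= 1/sqrt(a)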
  2762. multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
  2763. ValueType VT, Predicate BasePredicate> {
  2764. let Predicates = [BasePredicate] in {
  2765. def : Pat<(VT (Move VT:$dst, (scalar_to_vector
  2766. (OpNode (extractelt VT:$src, 0))))),
  2767. (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  2768. }
  2769. // Repeat for AVX versions of the instructions.
  2770. let Predicates = [UseAVX] in {
  2771. def : Pat<(VT (Move VT:$dst, (scalar_to_vector
  2772. (OpNode (extractelt VT:$src, 0))))),
  2773. (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  2774. }
  2775. }
  2776. defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
  2777. defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
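// The patterns above match the common "compute a scalar unop on element 0,
// then reinsert it with movss/movsd" idiom and select the _Int (XMM-to-XMM)
// form directly, avoiding an extra extract/insert sequence.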
  2778. multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
  2779. SDNode Move, ValueType VT,
  2780. Predicate BasePredicate> {
  2781. let Predicates = [BasePredicate] in {
  2782. def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
  2783. (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  2784. }
  2785. // Repeat for AVX versions of the instructions.
  2786. let Predicates = [HasAVX] in {
  2787. def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
  2788. (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  2789. }
  2790. }
  2791. defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
  2792. v4f32, UseSSE1>;
  2793. defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
  2794. v4f32, UseSSE1>;
  2795. //===----------------------------------------------------------------------===//
  2796. // SSE 1 & 2 - Non-temporal stores
  2797. //===----------------------------------------------------------------------===//
  2798. let AddedComplexity = 400 in { // Prefer non-temporal versions
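// The ordinary aligned-store patterns would also match a store node that
// merely carries the nontemporal hint; the high AddedComplexity makes
// instruction selection prefer the movnt* patterns in that case.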
  2799. let Predicates = [HasAVX, NoVLX] in {
  2800. let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
  2801. def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
  2802. (ins f128mem:$dst, VR128:$src),
  2803. "movntps\t{$src, $dst|$dst, $src}",
  2804. [(alignednontemporalstore (v4f32 VR128:$src),
  2805. addr:$dst)]>, VEX, VEX_WIG;
  2806. def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
  2807. (ins f128mem:$dst, VR128:$src),
  2808. "movntpd\t{$src, $dst|$dst, $src}",
  2809. [(alignednontemporalstore (v2f64 VR128:$src),
  2810. addr:$dst)]>, VEX, VEX_WIG;
  2811. } // SchedRW
  2812. let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
  2813. def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
  2814. (ins f256mem:$dst, VR256:$src),
  2815. "movntps\t{$src, $dst|$dst, $src}",
  2816. [(alignednontemporalstore (v8f32 VR256:$src),
  2817. addr:$dst)]>, VEX, VEX_L, VEX_WIG;
  2818. def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
  2819. (ins f256mem:$dst, VR256:$src),
  2820. "movntpd\t{$src, $dst|$dst, $src}",
  2821. [(alignednontemporalstore (v4f64 VR256:$src),
  2822. addr:$dst)]>, VEX, VEX_L, VEX_WIG;
  2823. } // SchedRW
  2824. let ExeDomain = SSEPackedInt in {
  2825. def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
  2826. (ins i128mem:$dst, VR128:$src),
  2827. "movntdq\t{$src, $dst|$dst, $src}",
  2828. [(alignednontemporalstore (v2i64 VR128:$src),
  2829. addr:$dst)]>, VEX, VEX_WIG,
  2830. Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
  2831. def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
  2832. (ins i256mem:$dst, VR256:$src),
  2833. "movntdq\t{$src, $dst|$dst, $src}",
  2834. [(alignednontemporalstore (v4i64 VR256:$src),
  2835. addr:$dst)]>, VEX, VEX_L, VEX_WIG,
  2836. Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
  2837. } // ExeDomain
  2838. } // Predicates
  2839. let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
  2840. def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
  2841. "movntps\t{$src, $dst|$dst, $src}",
  2842. [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
  2843. def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
  2844. "movntpd\t{$src, $dst|$dst, $src}",
2845. [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
  2846. } // SchedRW
  2847. let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
  2848. def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
  2849. "movntdq\t{$src, $dst|$dst, $src}",
  2850. [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
  2851. let SchedRW = [WriteStoreNT] in {
  2852. // There is no AVX form for instructions below this point
  2853. def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
  2854. "movnti{l}\t{$src, $dst|$dst, $src}",
  2855. [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
  2856. PS, Requires<[HasSSE2]>;
  2857. def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
  2858. "movnti{q}\t{$src, $dst|$dst, $src}",
  2859. [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
  2860. PS, Requires<[HasSSE2]>;
  2861. } // SchedRW = [WriteStoreNT]
  2862. let Predicates = [HasAVX, NoVLX] in {
  2863. def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
  2864. (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  2865. def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
  2866. (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  2867. def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
  2868. (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  2869. def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
  2870. (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  2871. def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
  2872. (VMOVNTDQmr addr:$dst, VR128:$src)>;
  2873. def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
  2874. (VMOVNTDQmr addr:$dst, VR128:$src)>;
  2875. def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
  2876. (VMOVNTDQmr addr:$dst, VR128:$src)>;
  2877. def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
  2878. (VMOVNTDQmr addr:$dst, VR128:$src)>;
  2879. }
  2880. let Predicates = [UseSSE2] in {
  2881. def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
  2882. (MOVNTDQmr addr:$dst, VR128:$src)>;
  2883. def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
  2884. (MOVNTDQmr addr:$dst, VR128:$src)>;
  2885. def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
  2886. (MOVNTDQmr addr:$dst, VR128:$src)>;
  2887. def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
  2888. (MOVNTDQmr addr:$dst, VR128:$src)>;
  2889. }
  2890. } // AddedComplexity
  2891. //===----------------------------------------------------------------------===//
  2892. // SSE 1 & 2 - Prefetch and memory fence
  2893. //===----------------------------------------------------------------------===//
  2894. // Prefetch intrinsic.
  2895. let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
  2896. def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
  2897. "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
  2898. def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
  2899. "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
  2900. def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
  2901. "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
  2902. def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
  2903. "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
  2904. }
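// The third pattern operand is the locality hint of llvm.prefetch (and of
// __builtin_prefetch): 3 selects prefetcht0, 2 prefetcht1, 1 prefetcht2 and
// 0 prefetchnta. The trailing (i32 1) restricts these patterns to data (not
// instruction) prefetches.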
  2905. // FIXME: How should flush instruction be modeled?
  2906. let SchedRW = [WriteLoad] in {
  2907. // Flush cache
  2908. def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
  2909. "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
  2910. PS, Requires<[HasCLFLUSH]>;
  2911. }
  2912. let SchedRW = [WriteNop] in {
  2913. // Pause. This "instruction" is encoded as "rep; nop", so even though it
  2914. // was introduced with SSE2, it's backward compatible.
  2915. def PAUSE : I<0x90, RawFrm, (outs), (ins),
  2916. "pause", [(int_x86_sse2_pause)]>, OBXS;
  2917. }
  2918. let SchedRW = [WriteFence] in {
  2919. // Load, store, and memory fence
  2920. // TODO: As with mfence, we may want to ease the availability of sfence/lfence
  2921. // to include any 64-bit target.
  2922. def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
  2923. PS, Requires<[HasSSE1]>;
  2924. def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
  2925. PS, Requires<[HasSSE2]>;
  2926. def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
  2927. PS, Requires<[HasMFence]>;
  2928. } // SchedRW
  2929. def : Pat<(X86MFence), (MFENCE)>;
  2930. //===----------------------------------------------------------------------===//
  2931. // SSE 1 & 2 - Load/Store XCSR register
  2932. //===----------------------------------------------------------------------===//
  2933. let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
  2934. def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
  2935. "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
  2936. VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
  2937. let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
  2938. def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
  2939. "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
  2940. VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
  2941. let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
  2942. def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
  2943. "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
  2944. PS, Sched<[WriteLDMXCSR]>;
  2945. let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
  2946. def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
  2947. "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
  2948. PS, Sched<[WriteSTMXCSR]>;
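// Modelling note: ldmxcsr writes the MXCSR control/status register and
// stmxcsr reads it, so the Defs/Uses of MXCSR above (together with
// hasSideEffects) keep these ordered with respect to other MXCSR readers and
// writers.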
  2949. //===---------------------------------------------------------------------===//
  2950. // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
  2951. //===---------------------------------------------------------------------===//
  2952. let ExeDomain = SSEPackedInt in { // SSE integer instructions
  2953. let hasSideEffects = 0 in {
  2954. def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2955. "movdqa\t{$src, $dst|$dst, $src}", []>,
  2956. Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
  2957. def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  2958. "movdqu\t{$src, $dst|$dst, $src}", []>,
  2959. Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
  2960. def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  2961. "movdqa\t{$src, $dst|$dst, $src}", []>,
  2962. Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
  2963. def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  2964. "movdqu\t{$src, $dst|$dst, $src}", []>,
  2965. Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
  2966. }
  2967. // For Disassembler
  2968. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
  2969. def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  2970. "movdqa\t{$src, $dst|$dst, $src}", []>,
  2971. Sched<[SchedWriteVecMoveLS.XMM.RR]>,
  2972. VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
  2973. def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
  2974. "movdqa\t{$src, $dst|$dst, $src}", []>,
  2975. Sched<[SchedWriteVecMoveLS.YMM.RR]>,
  2976. VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
  2977. def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  2978. "movdqu\t{$src, $dst|$dst, $src}", []>,
  2979. Sched<[SchedWriteVecMoveLS.XMM.RR]>,
  2980. VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
  2981. def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
  2982. "movdqu\t{$src, $dst|$dst, $src}", []>,
  2983. Sched<[SchedWriteVecMoveLS.YMM.RR]>,
  2984. VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
  2985. }
  2986. let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
  2987. hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
  2988. def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  2989. "movdqa\t{$src, $dst|$dst, $src}",
  2990. [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
  2991. Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  2992. def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
  2993. "movdqa\t{$src, $dst|$dst, $src}", []>,
  2994. Sched<[SchedWriteVecMoveLS.YMM.RM]>,
  2995. VEX, VEX_L, VEX_WIG;
  2996. def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  2997. "vmovdqu\t{$src, $dst|$dst, $src}",
  2998. [(set VR128:$dst, (loadv2i64 addr:$src))]>,
  2999. Sched<[SchedWriteVecMoveLS.XMM.RM]>,
  3000. XS, VEX, VEX_WIG;
  3001. def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
  3002. "vmovdqu\t{$src, $dst|$dst, $src}", []>,
  3003. Sched<[SchedWriteVecMoveLS.YMM.RM]>,
  3004. XS, VEX, VEX_L, VEX_WIG;
  3005. }
  3006. let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
  3007. def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
  3008. (ins i128mem:$dst, VR128:$src),
  3009. "movdqa\t{$src, $dst|$dst, $src}",
  3010. [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
  3011. Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
  3012. def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
  3013. (ins i256mem:$dst, VR256:$src),
  3014. "movdqa\t{$src, $dst|$dst, $src}", []>,
  3015. Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
  3016. def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
  3017. "vmovdqu\t{$src, $dst|$dst, $src}",
  3018. [(store (v2i64 VR128:$src), addr:$dst)]>,
  3019. Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
  3020. def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
  3021. "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
  3022. Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
  3023. }
  3024. let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
  3025. let hasSideEffects = 0 in {
  3026. def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  3027. "movdqa\t{$src, $dst|$dst, $src}", []>;
  3028. def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  3029. "movdqu\t{$src, $dst|$dst, $src}", []>,
  3030. XS, Requires<[UseSSE2]>;
  3031. }
  3032. // For Disassembler
  3033. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
  3034. def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  3035. "movdqa\t{$src, $dst|$dst, $src}", []>,
  3036. FoldGenData<"MOVDQArr">;
  3037. def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  3038. "movdqu\t{$src, $dst|$dst, $src}", []>,
  3039. XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
  3040. }
  3041. } // SchedRW
  3042. let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
  3043. hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
  3044. def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  3045. "movdqa\t{$src, $dst|$dst, $src}",
  3046. [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
  3047. def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  3048. "movdqu\t{$src, $dst|$dst, $src}",
  3049. [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
  3050. XS, Requires<[UseSSE2]>;
  3051. }
  3052. let mayStore = 1, hasSideEffects = 0,
  3053. SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
  3054. def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
  3055. "movdqa\t{$src, $dst|$dst, $src}",
  3056. [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
  3057. def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
  3058. "movdqu\t{$src, $dst|$dst, $src}",
  3059. [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
  3060. XS, Requires<[UseSSE2]>;
  3061. }
  3062. } // ExeDomain = SSEPackedInt
  3063. // Reversed version with ".s" suffix for GAS compatibility.
  3064. def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
  3065. (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
  3066. def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
  3067. (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
  3068. def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
  3069. (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
  3070. def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
  3071. (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
  3072. // Reversed version with ".s" suffix for GAS compatibility.
  3073. def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
  3074. (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
  3075. def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
  3076. (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
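// Background on the _REV forms and ".s" aliases above: movdqa/movdqu
// register-to-register moves have two valid encodings (opcode 0x6F with the
// destination in ModRM.reg, or 0x7F with the operands swapped). The *_REV
// defs describe the 0x7F encoding so the disassembler can represent it, and
// the ".s" mnemonic suffix lets assembly writers request that alternate
// encoding, matching GAS behaviour.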
  3077. let Predicates = [HasAVX, NoVLX] in {
  3078. // Additional patterns for other integer sizes.
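// The MOVDQA/MOVDQU defs above are written with v2i64 patterns; since these
// are plain bit moves, the patterns below reuse the same instructions for the
// remaining 128-bit element types (v4i32, v8i16, v8f16, v16i8).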
  3079. def : Pat<(alignedloadv4i32 addr:$src),
  3080. (VMOVDQArm addr:$src)>;
  3081. def : Pat<(alignedloadv8i16 addr:$src),
  3082. (VMOVDQArm addr:$src)>;
  3083. def : Pat<(alignedloadv8f16 addr:$src),
  3084. (VMOVDQArm addr:$src)>;
  3085. def : Pat<(alignedloadv16i8 addr:$src),
  3086. (VMOVDQArm addr:$src)>;
  3087. def : Pat<(loadv4i32 addr:$src),
  3088. (VMOVDQUrm addr:$src)>;
  3089. def : Pat<(loadv8i16 addr:$src),
  3090. (VMOVDQUrm addr:$src)>;
  3091. def : Pat<(loadv8f16 addr:$src),
  3092. (VMOVDQUrm addr:$src)>;
  3093. def : Pat<(loadv16i8 addr:$src),
  3094. (VMOVDQUrm addr:$src)>;
  3095. def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
  3096. (VMOVDQAmr addr:$dst, VR128:$src)>;
  3097. def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
  3098. (VMOVDQAmr addr:$dst, VR128:$src)>;
  3099. def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
  3100. (VMOVDQAmr addr:$dst, VR128:$src)>;
  3101. def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
  3102. (VMOVDQAmr addr:$dst, VR128:$src)>;
  3103. def : Pat<(store (v4i32 VR128:$src), addr:$dst),
  3104. (VMOVDQUmr addr:$dst, VR128:$src)>;
  3105. def : Pat<(store (v8i16 VR128:$src), addr:$dst),
  3106. (VMOVDQUmr addr:$dst, VR128:$src)>;
  3107. def : Pat<(store (v8f16 VR128:$src), addr:$dst),
  3108. (VMOVDQUmr addr:$dst, VR128:$src)>;
  3109. def : Pat<(store (v16i8 VR128:$src), addr:$dst),
  3110. (VMOVDQUmr addr:$dst, VR128:$src)>;
  3111. }
  3112. //===---------------------------------------------------------------------===//
  3113. // SSE2 - Packed Integer Arithmetic Instructions
  3114. //===---------------------------------------------------------------------===//
  3115. let ExeDomain = SSEPackedInt in { // SSE integer instructions
  3116. /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
  3117. multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
  3118. ValueType DstVT, ValueType SrcVT, RegisterClass RC,
  3119. PatFrag memop_frag, X86MemOperand x86memop,
  3120. X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  3121. let isCommutable = 1 in
  3122. def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
  3123. (ins RC:$src1, RC:$src2),
  3124. !if(Is2Addr,
  3125. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3126. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3127. [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
  3128. Sched<[sched]>;
  3129. def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
  3130. (ins RC:$src1, x86memop:$src2),
  3131. !if(Is2Addr,
  3132. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3133. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3134. [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
  3135. (memop_frag addr:$src2))))]>,
  3136. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3137. }
  3138. } // ExeDomain = SSEPackedInt
  3139. defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
  3140. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3141. defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
  3142. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3143. defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
  3144. SchedWriteVecALU, 1, NoVLX>;
  3145. defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
  3146. SchedWriteVecALU, 1, NoVLX>;
  3147. defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
  3148. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3149. defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
  3150. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3151. defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
  3152. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3153. defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
  3154. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3155. defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
  3156. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  3157. defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
  3158. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  3159. defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
  3160. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  3161. defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
  3162. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3163. defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
  3164. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3165. defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
  3166. SchedWriteVecALU, 0, NoVLX>;
  3167. defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
  3168. SchedWriteVecALU, 0, NoVLX>;
  3169. defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
  3170. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3171. defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
  3172. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3173. defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
  3174. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3175. defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
  3176. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3177. defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
  3178. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3179. defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
  3180. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3181. defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
  3182. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3183. defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
  3184. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3185. defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
  3186. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3187. defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
  3188. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3189. defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
  3190. SchedWriteVecIMul, 1, NoVLX>;
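// Reminder (informal): PDI_binop_all, defined earlier in this file, expands
// each defm above into the legacy SSE form (e.g. PADDBrr/PADDBrm, with
// $src1 = $dst), the AVX 128-bit form (VPADDBrr/rm) and the AVX2 256-bit form
// (VPADDBYrr/rm), gated by the predicate given as the last argument.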
  3191. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3192. defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
  3193. load, i128mem, SchedWriteVecIMul.XMM, 0>,
  3194. VEX_4V, VEX_WIG;
  3195. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3196. defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
  3197. VR256, load, i256mem, SchedWriteVecIMul.YMM,
  3198. 0>, VEX_4V, VEX_L, VEX_WIG;
  3199. let Constraints = "$src1 = $dst" in
  3200. defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
  3201. memop, i128mem, SchedWriteVecIMul.XMM>;
  3202. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3203. defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
  3204. load, i128mem, SchedWritePSADBW.XMM, 0>,
  3205. VEX_4V, VEX_WIG;
  3206. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3207. defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
  3208. load, i256mem, SchedWritePSADBW.YMM, 0>,
  3209. VEX_4V, VEX_L, VEX_WIG;
  3210. let Constraints = "$src1 = $dst" in
  3211. defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
  3212. memop, i128mem, SchedWritePSADBW.XMM>;
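// pmaddwd and psadbw widen: pmaddwd multiplies adjacent i16 pairs and sums
// each pair into an i32 lane (v8i16 -> v4i32), and psadbw accumulates sums of
// absolute byte differences into the i64 lanes (v16i8 -> v2i64). That is why
// they use PDI_binop_rm2, which allows distinct source and destination types.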
  3213. //===---------------------------------------------------------------------===//
  3214. // SSE2 - Packed Integer Logical Instructions
  3215. //===---------------------------------------------------------------------===//
  3216. multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
  3217. string OpcodeStr, SDNode OpNode,
  3218. SDNode OpNode2, RegisterClass RC,
  3219. X86FoldableSchedWrite sched,
  3220. X86FoldableSchedWrite schedImm,
  3221. ValueType DstVT, ValueType SrcVT,
  3222. PatFrag ld_frag, bit Is2Addr = 1> {
  3223. // src2 is always 128-bit
  3224. def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
  3225. (ins RC:$src1, VR128:$src2),
  3226. !if(Is2Addr,
  3227. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3228. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3229. [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
  3230. Sched<[sched]>;
  3231. def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
  3232. (ins RC:$src1, i128mem:$src2),
  3233. !if(Is2Addr,
  3234. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3235. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3236. [(set RC:$dst, (DstVT (OpNode RC:$src1,
  3237. (SrcVT (ld_frag addr:$src2)))))]>,
  3238. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3239. def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
  3240. (ins RC:$src1, u8imm:$src2),
  3241. !if(Is2Addr,
  3242. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3243. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3244. [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
  3245. Sched<[schedImm]>;
  3246. }
  3247. multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
  3248. string OpcodeStr, SDNode OpNode,
  3249. SDNode OpNode2, ValueType DstVT128,
  3250. ValueType DstVT256, ValueType SrcVT,
  3251. X86SchedWriteWidths sched,
  3252. X86SchedWriteWidths schedImm, Predicate prd> {
  3253. let Predicates = [HasAVX, prd] in
  3254. defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
  3255. OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
  3256. DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
  3257. let Predicates = [HasAVX2, prd] in
  3258. defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
  3259. OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
  3260. DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
  3261. VEX_WIG;
  3262. let Constraints = "$src1 = $dst" in
  3263. defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
  3264. VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
  3265. memop>;
  3266. }
  3267. multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
  3268. SDNode OpNode, RegisterClass RC, ValueType VT,
  3269. X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  3270. def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
  3271. !if(Is2Addr,
  3272. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3273. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3274. [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
  3275. Sched<[sched]>;
  3276. }
  3277. multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
  3278. SDNode OpNode, X86SchedWriteWidths sched> {
  3279. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3280. defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
  3281. VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
  3282. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3283. defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
  3284. VR256, v32i8, sched.YMM, 0>,
  3285. VEX_4V, VEX_L, VEX_WIG;
  3286. let Constraints = "$src1 = $dst" in
  3287. defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
  3288. sched.XMM>;
  3289. }
  3290. let ExeDomain = SSEPackedInt in {
  3291. defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
  3292. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3293. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3294. defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
  3295. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3296. SchedWriteVecShiftImm, NoVLX>;
  3297. defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
  3298. v2i64, v4i64, v2i64, SchedWriteVecShift,
  3299. SchedWriteVecShiftImm, NoVLX>;
  3300. defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
  3301. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3302. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3303. defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
  3304. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3305. SchedWriteVecShiftImm, NoVLX>;
  3306. defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
  3307. v2i64, v4i64, v2i64, SchedWriteVecShift,
  3308. SchedWriteVecShiftImm, NoVLX>;
  3309. defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
  3310. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3311. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3312. defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
  3313. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3314. SchedWriteVecShiftImm, NoVLX>;
  3315. defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
  3316. SchedWriteShuffle>;
  3317. defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
  3318. SchedWriteShuffle>;
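// The rr/rm forms above take the shift count from the low 64 bits of an XMM
// register or memory operand (hence "src2 is always 128-bit"); the ri forms
// shift by an immediate. pslldq/psrldq are whole-register byte shifts and
// exist only in immediate form.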
  3319. } // ExeDomain = SSEPackedInt
  3320. //===---------------------------------------------------------------------===//
  3321. // SSE2 - Packed Integer Comparison Instructions
  3322. //===---------------------------------------------------------------------===//
  3323. defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
  3324. SchedWriteVecALU, 1, TruePredicate>;
  3325. defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
  3326. SchedWriteVecALU, 1, TruePredicate>;
  3327. defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
  3328. SchedWriteVecALU, 1, TruePredicate>;
  3329. defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
  3330. SchedWriteVecALU, 0, TruePredicate>;
  3331. defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
  3332. SchedWriteVecALU, 0, TruePredicate>;
  3333. defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
  3334. SchedWriteVecALU, 0, TruePredicate>;
  3335. //===---------------------------------------------------------------------===//
  3336. // SSE2 - Packed Integer Shuffle Instructions
  3337. //===---------------------------------------------------------------------===//
  3338. let ExeDomain = SSEPackedInt in {
  3339. multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
  3340. SDNode OpNode, X86SchedWriteWidths sched,
  3341. Predicate prd> {
  3342. let Predicates = [HasAVX, prd] in {
  3343. def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
  3344. (ins VR128:$src1, u8imm:$src2),
  3345. !strconcat("v", OpcodeStr,
  3346. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3347. [(set VR128:$dst,
  3348. (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
  3349. VEX, Sched<[sched.XMM]>, VEX_WIG;
  3350. def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
  3351. (ins i128mem:$src1, u8imm:$src2),
  3352. !strconcat("v", OpcodeStr,
  3353. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3354. [(set VR128:$dst,
  3355. (vt128 (OpNode (load addr:$src1),
  3356. (i8 timm:$src2))))]>, VEX,
  3357. Sched<[sched.XMM.Folded]>, VEX_WIG;
  3358. }
  3359. let Predicates = [HasAVX2, prd] in {
  3360. def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
  3361. (ins VR256:$src1, u8imm:$src2),
  3362. !strconcat("v", OpcodeStr,
  3363. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3364. [(set VR256:$dst,
  3365. (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
  3366. VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  3367. def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
  3368. (ins i256mem:$src1, u8imm:$src2),
  3369. !strconcat("v", OpcodeStr,
  3370. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3371. [(set VR256:$dst,
  3372. (vt256 (OpNode (load addr:$src1),
  3373. (i8 timm:$src2))))]>, VEX, VEX_L,
  3374. Sched<[sched.YMM.Folded]>, VEX_WIG;
  3375. }
  3376. let Predicates = [UseSSE2] in {
  3377. def ri : Ii8<0x70, MRMSrcReg,
  3378. (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
  3379. !strconcat(OpcodeStr,
  3380. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3381. [(set VR128:$dst,
  3382. (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
  3383. Sched<[sched.XMM]>;
  3384. def mi : Ii8<0x70, MRMSrcMem,
  3385. (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
  3386. !strconcat(OpcodeStr,
  3387. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3388. [(set VR128:$dst,
  3389. (vt128 (OpNode (memop addr:$src1),
  3390. (i8 timm:$src2))))]>,
  3391. Sched<[sched.XMM.Folded]>;
  3392. }
  3393. }
  3394. } // ExeDomain = SSEPackedInt
  3395. defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
  3396. SchedWriteShuffle, NoVLX>, PD;
  3397. defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
  3398. SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
  3399. defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
  3400. SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
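// The shuffle immediate holds four 2-bit source indices, one per destination
// element; for example "pshufd $0x1b, %xmm0, %xmm1" reverses the dword order
// and $0x00 broadcasts element 0. pshufhw/pshuflw shuffle only the high or
// low four words and copy the other half unchanged.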
  3401. //===---------------------------------------------------------------------===//
  3402. // Packed Integer Pack Instructions (SSE & AVX)
  3403. //===---------------------------------------------------------------------===//
  3404. let ExeDomain = SSEPackedInt in {
  3405. multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
  3406. ValueType ArgVT, SDNode OpNode, RegisterClass RC,
  3407. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  3408. PatFrag ld_frag, bit Is2Addr = 1> {
  3409. def rr : PDI<opc, MRMSrcReg,
  3410. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3411. !if(Is2Addr,
  3412. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3413. !strconcat(OpcodeStr,
  3414. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3415. [(set RC:$dst,
  3416. (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
  3417. Sched<[sched]>;
  3418. def rm : PDI<opc, MRMSrcMem,
  3419. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3420. !if(Is2Addr,
  3421. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3422. !strconcat(OpcodeStr,
  3423. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3424. [(set RC:$dst,
  3425. (OutVT (OpNode (ArgVT RC:$src1),
  3426. (ld_frag addr:$src2))))]>,
  3427. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3428. }
  3429. multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
  3430. ValueType ArgVT, SDNode OpNode, RegisterClass RC,
  3431. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  3432. PatFrag ld_frag, bit Is2Addr = 1> {
  3433. def rr : SS48I<opc, MRMSrcReg,
  3434. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3435. !if(Is2Addr,
  3436. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3437. !strconcat(OpcodeStr,
  3438. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3439. [(set RC:$dst,
  3440. (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
  3441. Sched<[sched]>;
  3442. def rm : SS48I<opc, MRMSrcMem,
  3443. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3444. !if(Is2Addr,
  3445. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3446. !strconcat(OpcodeStr,
  3447. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3448. [(set RC:$dst,
  3449. (OutVT (OpNode (ArgVT RC:$src1),
  3450. (ld_frag addr:$src2))))]>,
  3451. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3452. }
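// Pack semantics for the defms below: packsswb/packssdw narrow each source
// element with signed saturation, packuswb/packusdw with unsigned saturation;
// the low half of the result comes from $src1 and the high half from $src2.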
  3453. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  3454. defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
  3455. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3456. VEX_4V, VEX_WIG;
  3457. defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
  3458. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3459. VEX_4V, VEX_WIG;
  3460. defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
  3461. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3462. VEX_4V, VEX_WIG;
  3463. defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
  3464. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3465. VEX_4V, VEX_WIG;
  3466. }
  3467. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  3468. defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
  3469. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3470. VEX_4V, VEX_L, VEX_WIG;
  3471. defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
  3472. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3473. VEX_4V, VEX_L, VEX_WIG;
  3474. defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
  3475. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3476. VEX_4V, VEX_L, VEX_WIG;
  3477. defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
  3478. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3479. VEX_4V, VEX_L, VEX_WIG;
  3480. }
  3481. let Constraints = "$src1 = $dst" in {
  3482. defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
  3483. i128mem, SchedWriteShuffle.XMM, memop>;
  3484. defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
  3485. i128mem, SchedWriteShuffle.XMM, memop>;
  3486. defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
  3487. i128mem, SchedWriteShuffle.XMM, memop>;
  3488. defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
  3489. i128mem, SchedWriteShuffle.XMM, memop>;
  3490. }
  3491. } // ExeDomain = SSEPackedInt
  3492. //===---------------------------------------------------------------------===//
  3493. // SSE2 - Packed Integer Unpack Instructions
  3494. //===---------------------------------------------------------------------===//
  3495. let ExeDomain = SSEPackedInt in {
  3496. multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
  3497. SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
  3498. X86FoldableSchedWrite sched, PatFrag ld_frag,
  3499. bit Is2Addr = 1> {
  3500. def rr : PDI<opc, MRMSrcReg,
  3501. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3502. !if(Is2Addr,
  3503. !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
  3504. !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3505. [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
  3506. Sched<[sched]>;
  3507. def rm : PDI<opc, MRMSrcMem,
  3508. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3509. !if(Is2Addr,
  3510. !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
  3511. !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3512. [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
  3513. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3514. }
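// Unpack semantics for the defms below: punpckl* interleave the low halves of
// the two sources element by element (e.g. punpcklbw gives
// {a0,b0,a1,b1,...}), while punpckh* do the same with the high halves.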
  3515. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  3516. defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
  3517. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3518. VEX_4V, VEX_WIG;
  3519. defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
  3520. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3521. VEX_4V, VEX_WIG;
  3522. defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
  3523. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3524. VEX_4V, VEX_WIG;
  3525. defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
  3526. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3527. VEX_4V, VEX_WIG;
  3528. }
  3529. let Predicates = [HasAVX, NoVLX] in {
  3530. defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
  3531. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3532. VEX_4V, VEX_WIG;
  3533. defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
  3534. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3535. VEX_4V, VEX_WIG;
  3536. defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
  3537. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3538. VEX_4V, VEX_WIG;
  3539. defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
  3540. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3541. VEX_4V, VEX_WIG;
  3542. }
  3543. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  3544. defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
  3545. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3546. VEX_4V, VEX_L, VEX_WIG;
  3547. defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
  3548. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3549. VEX_4V, VEX_L, VEX_WIG;
  3550. defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
  3551. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3552. VEX_4V, VEX_L, VEX_WIG;
  3553. defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
  3554. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3555. VEX_4V, VEX_L, VEX_WIG;
  3556. }
  3557. let Predicates = [HasAVX2, NoVLX] in {
  3558. defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
  3559. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3560. VEX_4V, VEX_L, VEX_WIG;
  3561. defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
  3562. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3563. VEX_4V, VEX_L, VEX_WIG;
  3564. defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
  3565. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3566. VEX_4V, VEX_L, VEX_WIG;
  3567. defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
  3568. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3569. VEX_4V, VEX_L, VEX_WIG;
  3570. }
  3571. let Constraints = "$src1 = $dst" in {
  3572. defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
  3573. i128mem, SchedWriteShuffle.XMM, memop>;
  3574. defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
  3575. i128mem, SchedWriteShuffle.XMM, memop>;
  3576. defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
  3577. i128mem, SchedWriteShuffle.XMM, memop>;
  3578. defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
  3579. i128mem, SchedWriteShuffle.XMM, memop>;
  3580. defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
  3581. i128mem, SchedWriteShuffle.XMM, memop>;
  3582. defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
  3583. i128mem, SchedWriteShuffle.XMM, memop>;
  3584. defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
  3585. i128mem, SchedWriteShuffle.XMM, memop>;
  3586. defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
  3587. i128mem, SchedWriteShuffle.XMM, memop>;
  3588. }
  3589. } // ExeDomain = SSEPackedInt
  3590. //===---------------------------------------------------------------------===//
  3591. // SSE2 - Packed Integer Extract and Insert
  3592. //===---------------------------------------------------------------------===//
  3593. let ExeDomain = SSEPackedInt in {
  3594. multiclass sse2_pinsrw<bit Is2Addr = 1> {
  3595. def rr : Ii8<0xC4, MRMSrcReg,
  3596. (outs VR128:$dst), (ins VR128:$src1,
  3597. GR32orGR64:$src2, u8imm:$src3),
  3598. !if(Is2Addr,
  3599. "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  3600. "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  3601. [(set VR128:$dst,
  3602. (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
  3603. Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  3604. def rm : Ii8<0xC4, MRMSrcMem,
  3605. (outs VR128:$dst), (ins VR128:$src1,
  3606. i16mem:$src2, u8imm:$src3),
  3607. !if(Is2Addr,
  3608. "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  3609. "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  3610. [(set VR128:$dst,
  3611. (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
  3612. timm:$src3))]>,
  3613. Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  3614. }
  3615. // Extract
  3616. let Predicates = [HasAVX, NoBWI] in
  3617. def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
  3618. (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
  3619. "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  3620. [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
  3621. timm:$src2))]>,
  3622. PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
  3623. def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
  3624. (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
  3625. "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  3626. [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
  3627. timm:$src2))]>,
  3628. Sched<[WriteVecExtract]>;
  3629. // Insert
  3630. let Predicates = [HasAVX, NoBWI] in
  3631. defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
  3632. let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
  3633. defm PINSRW : sse2_pinsrw, PD;
  3634. } // ExeDomain = SSEPackedInt
  3635. // Always select FP16 instructions if available.
  3636. let Predicates = [UseSSE2], AddedComplexity = -10 in {
  3637. def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  3638. def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
  3639. def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
  3640. def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
  3641. }
  3642. let Predicates = [HasAVX, NoBWI] in {
  3643. def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  3644. def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
  3645. def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
  3646. }
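// Without dedicated FP16 instructions, scalar f16 loads, stores and bitcasts
// are matched to pinsrw/pextrw on word 0 as above; the AddedComplexity of -10
// on the SSE2 block makes these patterns lose to the real FP16 patterns when
// AVX512-FP16 is available, as the comment above notes.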
  3647. //===---------------------------------------------------------------------===//
  3648. // SSE2 - Packed Mask Creation
  3649. //===---------------------------------------------------------------------===//
  3650. let ExeDomain = SSEPackedInt in {
  3651. def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
  3652. (ins VR128:$src),
  3653. "pmovmskb\t{$src, $dst|$dst, $src}",
  3654. [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
  3655. Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
  3656. let Predicates = [HasAVX2] in {
  3657. def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
  3658. (ins VR256:$src),
  3659. "pmovmskb\t{$src, $dst|$dst, $src}",
  3660. [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
  3661. Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
  3662. }
  3663. def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
  3664. "pmovmskb\t{$src, $dst|$dst, $src}",
  3665. [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
  3666. Sched<[WriteVecMOVMSK]>;
  3667. } // ExeDomain = SSEPackedInt
  3668. //===---------------------------------------------------------------------===//
  3669. // SSE2 - Conditional Store
  3670. //===---------------------------------------------------------------------===//
  3671. let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
  3672. // As VEX does not have separate instruction contexts for address size
  3673. // overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
3674. // Prefer VMASKMOVDQU64.
  3675. let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
  3676. def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
  3677. (ins VR128:$src, VR128:$mask),
  3678. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3679. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
  3680. VEX, VEX_WIG;
  3681. let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
  3682. def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
  3683. (ins VR128:$src, VR128:$mask),
  3684. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3685. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
  3686. VEX, VEX_WIG;
  3687. let Uses = [EDI], Predicates = [UseSSE2] in
  3688. def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
  3689. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3690. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
  3691. let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
  3692. def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
  3693. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3694. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
  3695. } // ExeDomain = SSEPackedInt
  3696. //===---------------------------------------------------------------------===//
  3697. // SSE2 - Move Doubleword/Quadword
  3698. //===---------------------------------------------------------------------===//
  3699. //===---------------------------------------------------------------------===//
  3700. // Move Int Doubleword to Packed Double Int
  3701. //
  3702. let ExeDomain = SSEPackedInt in {
  3703. def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
  3704. "movd\t{$src, $dst|$dst, $src}",
  3705. [(set VR128:$dst,
  3706. (v4i32 (scalar_to_vector GR32:$src)))]>,
  3707. VEX, Sched<[WriteVecMoveFromGpr]>;
  3708. def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
  3709. "movd\t{$src, $dst|$dst, $src}",
  3710. [(set VR128:$dst,
  3711. (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
  3712. VEX, Sched<[WriteVecLoad]>;
  3713. def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
  3714. "movq\t{$src, $dst|$dst, $src}",
  3715. [(set VR128:$dst,
  3716. (v2i64 (scalar_to_vector GR64:$src)))]>,
  3717. VEX, Sched<[WriteVecMoveFromGpr]>;
  3718. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
  3719. def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3720. "movq\t{$src, $dst|$dst, $src}", []>,
  3721. VEX, Sched<[WriteVecLoad]>;
  3722. let isCodeGenOnly = 1 in
  3723. def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
  3724. "movq\t{$src, $dst|$dst, $src}",
  3725. [(set FR64:$dst, (bitconvert GR64:$src))]>,
  3726. VEX, Sched<[WriteVecMoveFromGpr]>;
  3727. def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
  3728. "movd\t{$src, $dst|$dst, $src}",
  3729. [(set VR128:$dst,
  3730. (v4i32 (scalar_to_vector GR32:$src)))]>,
  3731. Sched<[WriteVecMoveFromGpr]>;
  3732. def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
  3733. "movd\t{$src, $dst|$dst, $src}",
  3734. [(set VR128:$dst,
  3735. (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
  3736. Sched<[WriteVecLoad]>;
  3737. def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
  3738. "movq\t{$src, $dst|$dst, $src}",
  3739. [(set VR128:$dst,
  3740. (v2i64 (scalar_to_vector GR64:$src)))]>,
  3741. Sched<[WriteVecMoveFromGpr]>;
  3742. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
  3743. def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3744. "movq\t{$src, $dst|$dst, $src}", []>,
  3745. Sched<[WriteVecLoad]>;
  3746. let isCodeGenOnly = 1 in
  3747. def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
  3748. "movq\t{$src, $dst|$dst, $src}",
  3749. [(set FR64:$dst, (bitconvert GR64:$src))]>,
  3750. Sched<[WriteVecMoveFromGpr]>;
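// For reference, the GR32->XMM and GR64->XMM moves above are what the
// _mm_cvtsi32_si128 and _mm_cvtsi64_si128 intrinsics compile to; the hardware
// zeroes the remaining vector elements.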
  3751. } // ExeDomain = SSEPackedInt
  3752. //===---------------------------------------------------------------------===//
  3753. // Move Int Doubleword to Single Scalar
  3754. //
  3755. let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  3756. def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
  3757. "movd\t{$src, $dst|$dst, $src}",
  3758. [(set FR32:$dst, (bitconvert GR32:$src))]>,
  3759. VEX, Sched<[WriteVecMoveFromGpr]>;
  3760. def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
  3761. "movd\t{$src, $dst|$dst, $src}",
  3762. [(set FR32:$dst, (bitconvert GR32:$src))]>,
  3763. Sched<[WriteVecMoveFromGpr]>;
  3764. } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
  3765. //===---------------------------------------------------------------------===//
  3766. // Move Packed Doubleword Int to Packed Double Int
  3767. //
  3768. let ExeDomain = SSEPackedInt in {
  3769. def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
  3770. "movd\t{$src, $dst|$dst, $src}",
  3771. [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
  3772. (iPTR 0)))]>, VEX,
  3773. Sched<[WriteVecMoveToGpr]>;
  3774. def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
  3775. (ins i32mem:$dst, VR128:$src),
  3776. "movd\t{$src, $dst|$dst, $src}",
  3777. [(store (i32 (extractelt (v4i32 VR128:$src),
  3778. (iPTR 0))), addr:$dst)]>,
  3779. VEX, Sched<[WriteVecStore]>;
  3780. def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
  3781. "movd\t{$src, $dst|$dst, $src}",
  3782. [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
  3783. (iPTR 0)))]>,
  3784. Sched<[WriteVecMoveToGpr]>;
  3785. def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
  3786. "movd\t{$src, $dst|$dst, $src}",
  3787. [(store (i32 (extractelt (v4i32 VR128:$src),
  3788. (iPTR 0))), addr:$dst)]>,
  3789. Sched<[WriteVecStore]>;
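// For reference, extracting element 0 to a GPR as done above is the behaviour
// of the _mm_cvtsi128_si32 intrinsic; the memory form stores just the low
// doubleword.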
  3790. } // ExeDomain = SSEPackedInt
  3791. //===---------------------------------------------------------------------===//
  3792. // Move Packed Doubleword Int first element to Doubleword Int
  3793. //
  3794. let ExeDomain = SSEPackedInt in {
  3795. let SchedRW = [WriteVecMoveToGpr] in {
  3796. def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
  3797. "movq\t{$src, $dst|$dst, $src}",
  3798. [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
  3799. (iPTR 0)))]>,
  3800. VEX;
  3801. def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
  3802. "movq\t{$src, $dst|$dst, $src}",
  3803. [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
  3804. (iPTR 0)))]>;
  3805. } //SchedRW
  3806. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
  3807. def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
  3808. (ins i64mem:$dst, VR128:$src),
  3809. "movq\t{$src, $dst|$dst, $src}", []>,
  3810. VEX, Sched<[WriteVecStore]>;
  3811. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
  3812. def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
  3813. "movq\t{$src, $dst|$dst, $src}", []>,
  3814. Sched<[WriteVecStore]>;
  3815. } // ExeDomain = SSEPackedInt
  3816. //===---------------------------------------------------------------------===//
  3817. // Bitcast FR64 <-> GR64
  3818. //
  3819. let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  3820. def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
  3821. "movq\t{$src, $dst|$dst, $src}",
  3822. [(set GR64:$dst, (bitconvert FR64:$src))]>,
  3823. VEX, Sched<[WriteVecMoveToGpr]>;
  3824. def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
  3825. "movq\t{$src, $dst|$dst, $src}",
  3826. [(set GR64:$dst, (bitconvert FR64:$src))]>,
  3827. Sched<[WriteVecMoveToGpr]>;
  3828. } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
  3829. //===---------------------------------------------------------------------===//
  3830. // Move Scalar Single to Double Int
  3831. //
  3832. let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  3833. def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
  3834. "movd\t{$src, $dst|$dst, $src}",
  3835. [(set GR32:$dst, (bitconvert FR32:$src))]>,
  3836. VEX, Sched<[WriteVecMoveToGpr]>;
  3837. def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
  3838. "movd\t{$src, $dst|$dst, $src}",
  3839. [(set GR32:$dst, (bitconvert FR32:$src))]>,
  3840. Sched<[WriteVecMoveToGpr]>;
  3841. } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
  3842. let Predicates = [UseAVX] in {
  3843. def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
  3844. (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  3845. GR8:$src, sub_8bit)))>;
  3846. def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
  3847. (VMOVDI2PDIrr GR32:$src)>;
  3848. def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
  3849. (VMOV64toPQIrr GR64:$src)>;
// AVX 128-bit movd/movq instructions zero the remaining elements of the XMM
// register; they also zero the upper half of the corresponding 256-bit register.
  3852. def : Pat<(v4i32 (X86vzload32 addr:$src)),
  3853. (VMOVDI2PDIrm addr:$src)>;
  3854. def : Pat<(v8i32 (X86vzload32 addr:$src)),
  3855. (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
  3856. }
  3857. let Predicates = [UseSSE2] in {
  3858. def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
  3859. (MOVDI2PDIrr GR32:$src)>;
  3860. def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
  3861. (MOV64toPQIrr GR64:$src)>;
  3862. def : Pat<(v4i32 (X86vzload32 addr:$src)),
  3863. (MOVDI2PDIrm addr:$src)>;
  3864. }
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
// "movq" due to a MacOS parsing limitation. To keep parsing that old assembly,
// we add these aliases.
  3868. def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
  3869. (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
  3870. def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
  3871. (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
  3872. // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
  3873. def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
  3874. (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
  3875. def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
  3876. (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
  3877. //===---------------------------------------------------------------------===//
  3878. // SSE2 - Move Quadword
  3879. //===---------------------------------------------------------------------===//
  3880. //===---------------------------------------------------------------------===//
  3881. // Move Quadword Int to Packed Quadword Int
  3882. //
  3883. let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
  3884. def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3885. "vmovq\t{$src, $dst|$dst, $src}",
  3886. [(set VR128:$dst,
  3887. (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
  3888. VEX, Requires<[UseAVX]>, VEX_WIG;
  3889. def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3890. "movq\t{$src, $dst|$dst, $src}",
  3891. [(set VR128:$dst,
  3892. (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
  3893. XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
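// Illustrative mapping: this zero-extending 64-bit load is what the
// _mm_loadl_epi64 intrinsic compiles to.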
  3894. } // ExeDomain, SchedRW
  3895. //===---------------------------------------------------------------------===//
  3896. // Move Packed Quadword Int to Quadword Int
  3897. //
  3898. let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
  3899. def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
  3900. "movq\t{$src, $dst|$dst, $src}",
  3901. [(store (i64 (extractelt (v2i64 VR128:$src),
  3902. (iPTR 0))), addr:$dst)]>,
  3903. VEX, VEX_WIG;
  3904. def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
  3905. "movq\t{$src, $dst|$dst, $src}",
  3906. [(store (i64 (extractelt (v2i64 VR128:$src),
  3907. (iPTR 0))), addr:$dst)]>;
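// Illustrative mapping: storing the low quadword as done here is the behaviour
// of the _mm_storel_epi64 intrinsic.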
  3908. } // ExeDomain, SchedRW
  3909. // For disassembler only
  3910. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
  3911. SchedRW = [SchedWriteVecLogic.XMM] in {
  3912. def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  3913. "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
  3914. def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
  3915. "movq\t{$src, $dst|$dst, $src}", []>;
  3916. }
  3917. def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
  3918. (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
  3919. def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
  3920. (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
  3921. let Predicates = [UseAVX] in {
  3922. def : Pat<(v2i64 (X86vzload64 addr:$src)),
  3923. (VMOVQI2PQIrm addr:$src)>;
  3924. def : Pat<(v4i64 (X86vzload64 addr:$src)),
  3925. (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
  3926. def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
  3927. (VMOVPQI2QImr addr:$dst, VR128:$src)>;
  3928. }
  3929. let Predicates = [UseSSE2] in {
  3930. def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
  3931. def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
  3932. (MOVPQI2QImr addr:$dst, VR128:$src)>;
  3933. }
  3934. //===---------------------------------------------------------------------===//
// Move from XMM to XMM and clear the upper 64 bits. Note: there is a bug in the
// IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
  3937. //
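// Illustrative mapping: this is the behaviour of the _mm_move_epi64 intrinsic,
// e.g. __m128i lo = _mm_move_epi64(x); // copy the low 64 bits, zero the rest.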
  3938. let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
  3939. def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  3940. "vmovq\t{$src, $dst|$dst, $src}",
  3941. [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
  3942. XS, VEX, Requires<[UseAVX]>, VEX_WIG;
  3943. def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  3944. "movq\t{$src, $dst|$dst, $src}",
  3945. [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
  3946. XS, Requires<[UseSSE2]>;
  3947. } // ExeDomain, SchedRW
  3948. let Predicates = [UseAVX] in {
  3949. def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
  3950. (VMOVZPQILo2PQIrr VR128:$src)>;
  3951. }
  3952. let Predicates = [UseSSE2] in {
  3953. def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
  3954. (MOVZPQILo2PQIrr VR128:$src)>;
  3955. }
  3956. let Predicates = [UseAVX] in {
  3957. def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
  3958. (SUBREG_TO_REG (i32 0),
  3959. (v2f64 (VMOVZPQILo2PQIrr
  3960. (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
  3961. sub_xmm)>;
  3962. def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
  3963. (SUBREG_TO_REG (i32 0),
  3964. (v2i64 (VMOVZPQILo2PQIrr
  3965. (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
  3966. sub_xmm)>;
  3967. }
  3968. //===---------------------------------------------------------------------===//
  3969. // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
  3970. //===---------------------------------------------------------------------===//
  3971. multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
  3972. ValueType vt, RegisterClass RC, PatFrag mem_frag,
  3973. X86MemOperand x86memop, X86FoldableSchedWrite sched> {
  3974. def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
  3975. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  3976. [(set RC:$dst, (vt (OpNode RC:$src)))]>,
  3977. Sched<[sched]>;
  3978. def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
  3979. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  3980. [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
  3981. Sched<[sched.Folded]>;
  3982. }
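// Semantics (for reference): movshdup duplicates the odd-indexed single
// elements (result = {s1,s1,s3,s3,...}) and movsldup duplicates the
// even-indexed ones (result = {s0,s0,s2,s2,...}); these back the
// _mm_movehdup_ps and _mm_moveldup_ps intrinsics.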
  3983. let Predicates = [HasAVX, NoVLX] in {
  3984. defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
  3985. v4f32, VR128, loadv4f32, f128mem,
  3986. SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  3987. defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
  3988. v4f32, VR128, loadv4f32, f128mem,
  3989. SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  3990. defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
  3991. v8f32, VR256, loadv8f32, f256mem,
  3992. SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  3993. defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
  3994. v8f32, VR256, loadv8f32, f256mem,
  3995. SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  3996. }
  3997. defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
  3998. memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
  3999. defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
  4000. memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
  4001. let Predicates = [HasAVX, NoVLX] in {
  4002. def : Pat<(v4i32 (X86Movshdup VR128:$src)),
  4003. (VMOVSHDUPrr VR128:$src)>;
  4004. def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
  4005. (VMOVSHDUPrm addr:$src)>;
  4006. def : Pat<(v4i32 (X86Movsldup VR128:$src)),
  4007. (VMOVSLDUPrr VR128:$src)>;
  4008. def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
  4009. (VMOVSLDUPrm addr:$src)>;
  4010. def : Pat<(v8i32 (X86Movshdup VR256:$src)),
  4011. (VMOVSHDUPYrr VR256:$src)>;
  4012. def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
  4013. (VMOVSHDUPYrm addr:$src)>;
  4014. def : Pat<(v8i32 (X86Movsldup VR256:$src)),
  4015. (VMOVSLDUPYrr VR256:$src)>;
  4016. def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
  4017. (VMOVSLDUPYrm addr:$src)>;
  4018. }
  4019. let Predicates = [UseSSE3] in {
  4020. def : Pat<(v4i32 (X86Movshdup VR128:$src)),
  4021. (MOVSHDUPrr VR128:$src)>;
  4022. def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
  4023. (MOVSHDUPrm addr:$src)>;
  4024. def : Pat<(v4i32 (X86Movsldup VR128:$src)),
  4025. (MOVSLDUPrr VR128:$src)>;
  4026. def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
  4027. (MOVSLDUPrm addr:$src)>;
  4028. }
  4029. //===---------------------------------------------------------------------===//
  4030. // SSE3 - Replicate Double FP - MOVDDUP
  4031. //===---------------------------------------------------------------------===//
  4032. multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
  4033. def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  4034. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4035. [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
  4036. Sched<[sched.XMM]>;
  4037. def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
  4038. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4039. [(set VR128:$dst,
  4040. (v2f64 (X86Movddup
  4041. (scalar_to_vector (loadf64 addr:$src)))))]>,
  4042. Sched<[sched.XMM.Folded]>;
  4043. }
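// Semantics (for reference): movddup broadcasts the low double-precision
// element, i.e. result = {s0,s0}; this is the _mm_movedup_pd / _mm_loaddup_pd
// behaviour.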
  4044. // FIXME: Merge with above classes when there are patterns for the ymm version
  4045. multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
  4046. def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  4047. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4048. [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
  4049. Sched<[sched.YMM]>;
  4050. def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
  4051. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4052. [(set VR256:$dst,
  4053. (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
  4054. Sched<[sched.YMM.Folded]>;
  4055. }
  4056. let Predicates = [HasAVX, NoVLX] in {
  4057. defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
  4058. VEX, VEX_WIG;
  4059. defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
  4060. VEX, VEX_L, VEX_WIG;
  4061. }
  4062. defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
  4063. let Predicates = [HasAVX, NoVLX] in {
  4064. def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
  4065. (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  4066. }
  4067. let Predicates = [UseSSE3] in {
  4068. def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
  4069. (MOVDDUPrm addr:$src)>;
  4070. }
  4071. //===---------------------------------------------------------------------===//
  4072. // SSE3 - Move Unaligned Integer
  4073. //===---------------------------------------------------------------------===//
  4074. let Predicates = [HasAVX] in {
  4075. def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  4076. "vlddqu\t{$src, $dst|$dst, $src}",
  4077. [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
  4078. Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  4079. def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
  4080. "vlddqu\t{$src, $dst|$dst, $src}",
  4081. [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
  4082. Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
  4083. } // Predicates
  4084. def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
  4085. "lddqu\t{$src, $dst|$dst, $src}",
  4086. [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
  4087. Sched<[SchedWriteVecMoveLS.XMM.RM]>;
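// lddqu performs an unaligned 128-bit integer load (the _mm_lddqu_si128
// intrinsic); on some older microarchitectures it was faster than movdqu for
// loads that cross a cache-line boundary.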
  4088. //===---------------------------------------------------------------------===//
  4089. // SSE3 - Arithmetic
  4090. //===---------------------------------------------------------------------===//
  4091. multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
  4092. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  4093. PatFrag ld_frag, bit Is2Addr = 1> {
  4094. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  4095. def rr : I<0xD0, MRMSrcReg,
  4096. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  4097. !if(Is2Addr,
  4098. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4099. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4100. [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
  4101. Sched<[sched]>;
  4102. def rm : I<0xD0, MRMSrcMem,
  4103. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  4104. !if(Is2Addr,
  4105. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4106. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4107. [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
  4108. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4109. }
  4110. }
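// Semantics (for reference): addsubps/addsubpd subtract in the even lanes and
// add in the odd lanes, e.g. for v4f32: {a0-b0, a1+b1, a2-b2, a3+b3}; this is
// the _mm_addsub_ps / _mm_addsub_pd behaviour.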
  4111. let Predicates = [HasAVX] in {
  4112. let ExeDomain = SSEPackedSingle in {
  4113. defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
  4114. SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
  4115. XD, VEX_4V, VEX_WIG;
  4116. defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
  4117. SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
  4118. XD, VEX_4V, VEX_L, VEX_WIG;
  4119. }
  4120. let ExeDomain = SSEPackedDouble in {
  4121. defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
  4122. SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
  4123. PD, VEX_4V, VEX_WIG;
  4124. defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
  4125. SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
  4126. PD, VEX_4V, VEX_L, VEX_WIG;
  4127. }
  4128. }
  4129. let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  4130. let ExeDomain = SSEPackedSingle in
  4131. defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
  4132. SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  4133. let ExeDomain = SSEPackedDouble in
  4134. defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
  4135. SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
  4136. }
  4137. //===---------------------------------------------------------------------===//
  4138. // SSE3 Instructions
  4139. //===---------------------------------------------------------------------===//
  4140. // Horizontal ops
  4141. multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
  4142. X86MemOperand x86memop, SDNode OpNode,
  4143. X86FoldableSchedWrite sched, PatFrag ld_frag,
  4144. bit Is2Addr = 1> {
  4145. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  4146. def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
  4147. !if(Is2Addr,
  4148. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4149. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4150. [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
  4151. Sched<[sched]>;
  4152. def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  4153. !if(Is2Addr,
  4154. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4155. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4156. [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
  4157. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4158. }
  4159. }
  4160. multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
  4161. X86MemOperand x86memop, SDNode OpNode,
  4162. X86FoldableSchedWrite sched, PatFrag ld_frag,
  4163. bit Is2Addr = 1> {
  4164. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  4165. def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
  4166. !if(Is2Addr,
  4167. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4168. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4169. [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
  4170. Sched<[sched]>;
  4171. def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  4172. !if(Is2Addr,
  4173. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4174. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4175. [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
  4176. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4177. }
  4178. }
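// Semantics (for reference): haddps produces {a0+a1, a2+a3, b0+b1, b2+b3} and
// hsubps the analogous differences; these back _mm_hadd_ps / _mm_hsub_ps and
// the double-precision variants.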
  4179. let Predicates = [HasAVX] in {
  4180. let ExeDomain = SSEPackedSingle in {
  4181. defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
  4182. X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
  4183. defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
  4184. X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
  4185. defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
  4186. X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  4187. defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
  4188. X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  4189. }
  4190. let ExeDomain = SSEPackedDouble in {
  4191. defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
  4192. X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
  4193. defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
  4194. X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
  4195. defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
  4196. X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  4197. defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
  4198. X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  4199. }
  4200. }
  4201. let Constraints = "$src1 = $dst" in {
  4202. let ExeDomain = SSEPackedSingle in {
  4203. defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
  4204. WriteFHAdd, memopv4f32>;
  4205. defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
  4206. WriteFHAdd, memopv4f32>;
  4207. }
  4208. let ExeDomain = SSEPackedDouble in {
  4209. defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
  4210. WriteFHAdd, memopv2f64>;
  4211. defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
  4212. WriteFHAdd, memopv2f64>;
  4213. }
  4214. }
  4215. //===---------------------------------------------------------------------===//
  4216. // SSSE3 - Packed Absolute Instructions
  4217. //===---------------------------------------------------------------------===//
/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
  4219. multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
  4220. SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  4221. def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
  4222. (ins VR128:$src),
  4223. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4224. [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
  4225. Sched<[sched.XMM]>;
  4226. def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
  4227. (ins i128mem:$src),
  4228. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4229. [(set VR128:$dst,
  4230. (vt (OpNode (ld_frag addr:$src))))]>,
  4231. Sched<[sched.XMM.Folded]>;
  4232. }
/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
  4234. multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
  4235. SDNode OpNode, X86SchedWriteWidths sched> {
  4236. def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
  4237. (ins VR256:$src),
  4238. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4239. [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
  4240. Sched<[sched.YMM]>;
  4241. def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
  4242. (ins i256mem:$src),
  4243. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  4244. [(set VR256:$dst,
  4245. (vt (OpNode (load addr:$src))))]>,
  4246. Sched<[sched.YMM.Folded]>;
  4247. }
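// Semantics (for reference): pabsb/pabsw/pabsd compute the per-element
// absolute value (the _mm_abs_epi8/16/32 intrinsics).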
  4248. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4249. defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
  4250. load>, VEX, VEX_WIG;
  4251. defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
  4252. load>, VEX, VEX_WIG;
  4253. }
  4254. let Predicates = [HasAVX, NoVLX] in {
  4255. defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
  4256. load>, VEX, VEX_WIG;
  4257. }
  4258. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4259. defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
  4260. VEX, VEX_L, VEX_WIG;
  4261. defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
  4262. VEX, VEX_L, VEX_WIG;
  4263. }
  4264. let Predicates = [HasAVX2, NoVLX] in {
  4265. defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
  4266. VEX, VEX_L, VEX_WIG;
  4267. }
  4268. defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
  4269. memop>;
  4270. defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
  4271. memop>;
  4272. defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
  4273. memop>;
  4274. //===---------------------------------------------------------------------===//
  4275. // SSSE3 - Packed Binary Operator Instructions
  4276. //===---------------------------------------------------------------------===//
  4277. /// SS3I_binop_rm - Simple SSSE3 bin op
  4278. multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
  4279. ValueType DstVT, ValueType OpVT, RegisterClass RC,
  4280. PatFrag memop_frag, X86MemOperand x86memop,
  4281. X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  4282. let isCommutable = 1 in
  4283. def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
  4284. (ins RC:$src1, RC:$src2),
  4285. !if(Is2Addr,
  4286. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4287. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4288. [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
  4289. Sched<[sched]>;
  4290. def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
  4291. (ins RC:$src1, x86memop:$src2),
  4292. !if(Is2Addr,
  4293. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4294. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4295. [(set RC:$dst,
  4296. (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
  4297. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4298. }
  4299. /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
  4300. multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
  4301. Intrinsic IntId128, X86FoldableSchedWrite sched,
  4302. PatFrag ld_frag, bit Is2Addr = 1> {
  4303. let isCommutable = 1 in
  4304. def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
  4305. (ins VR128:$src1, VR128:$src2),
  4306. !if(Is2Addr,
  4307. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4308. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4309. [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
  4310. Sched<[sched]>;
  4311. def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
  4312. (ins VR128:$src1, i128mem:$src2),
  4313. !if(Is2Addr,
  4314. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  4315. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  4316. [(set VR128:$dst,
  4317. (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
  4318. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4319. }
  4320. multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
  4321. Intrinsic IntId256,
  4322. X86FoldableSchedWrite sched> {
  4323. let isCommutable = 1 in
  4324. def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
  4325. (ins VR256:$src1, VR256:$src2),
  4326. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4327. [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
  4328. Sched<[sched]>;
  4329. def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
  4330. (ins VR256:$src1, i256mem:$src2),
  4331. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4332. [(set VR256:$dst,
  4333. (IntId256 VR256:$src1, (load addr:$src2)))]>,
  4334. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4335. }
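// Notes on operations instantiated below (for reference): pmulhrsw computes
// ((a*b + 0x4000) >> 15) per i16 lane, and psignb/w/d returns a, 0 or -a
// depending on whether the corresponding element of the second operand is
// positive, zero or negative.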
  4336. let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4337. let isCommutable = 0 in {
  4338. defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
  4339. VR128, load, i128mem,
  4340. SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  4341. defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
  4342. v16i8, VR128, load, i128mem,
  4343. SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
  4344. }
  4345. defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
  4346. VR128, load, i128mem,
  4347. SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
  4348. }
  4349. let ImmT = NoImm, Predicates = [HasAVX] in {
  4350. let isCommutable = 0 in {
  4351. defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
  4352. load, i128mem,
  4353. SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  4354. defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
  4355. load, i128mem,
  4356. SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  4357. defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
  4358. load, i128mem,
  4359. SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  4360. defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
  4361. load, i128mem,
  4362. SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  4363. defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
  4364. int_x86_ssse3_psign_b_128,
  4365. SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  4366. defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
  4367. int_x86_ssse3_psign_w_128,
  4368. SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  4369. defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
  4370. int_x86_ssse3_psign_d_128,
  4371. SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  4372. defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
  4373. int_x86_ssse3_phadd_sw_128,
  4374. SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  4375. defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
  4376. int_x86_ssse3_phsub_sw_128,
  4377. SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  4378. }
  4379. }
  4380. let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4381. let isCommutable = 0 in {
  4382. defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
  4383. VR256, load, i256mem,
  4384. SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4385. defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
  4386. v32i8, VR256, load, i256mem,
  4387. SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4388. }
  4389. defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
  4390. VR256, load, i256mem,
  4391. SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4392. }
  4393. let ImmT = NoImm, Predicates = [HasAVX2] in {
  4394. let isCommutable = 0 in {
  4395. defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
  4396. VR256, load, i256mem,
  4397. SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4398. defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
  4399. load, i256mem,
  4400. SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4401. defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
  4402. VR256, load, i256mem,
  4403. SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4404. defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
  4405. load, i256mem,
  4406. SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4407. defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
  4408. SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  4409. defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
  4410. SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  4411. defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
  4412. SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  4413. defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
  4414. int_x86_avx2_phadd_sw,
  4415. SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  4416. defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
  4417. int_x86_avx2_phsub_sw,
  4418. SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  4419. }
  4420. }
  4421. // None of these have i8 immediate fields.
  4422. let ImmT = NoImm, Constraints = "$src1 = $dst" in {
  4423. let isCommutable = 0 in {
  4424. defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
  4425. memop, i128mem, SchedWritePHAdd.XMM>;
  4426. defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
  4427. memop, i128mem, SchedWritePHAdd.XMM>;
  4428. defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
  4429. memop, i128mem, SchedWritePHAdd.XMM>;
  4430. defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
  4431. memop, i128mem, SchedWritePHAdd.XMM>;
  4432. defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
  4433. SchedWriteVecALU.XMM, memop>;
  4434. defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
  4435. SchedWriteVecALU.XMM, memop>;
  4436. defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
  4437. SchedWriteVecALU.XMM, memop>;
  4438. defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
  4439. memop, i128mem, SchedWriteVarShuffle.XMM>;
  4440. defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
  4441. int_x86_ssse3_phadd_sw_128,
  4442. SchedWritePHAdd.XMM, memop>;
  4443. defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
  4444. int_x86_ssse3_phsub_sw_128,
  4445. SchedWritePHAdd.XMM, memop>;
  4446. defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
  4447. v16i8, VR128, memop, i128mem,
  4448. SchedWriteVecIMul.XMM>;
  4449. }
  4450. defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
  4451. VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
  4452. }
  4453. //===---------------------------------------------------------------------===//
  4454. // SSSE3 - Packed Align Instruction Patterns
  4455. //===---------------------------------------------------------------------===//
  4456. multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
  4457. PatFrag memop_frag, X86MemOperand x86memop,
  4458. X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  4459. let hasSideEffects = 0 in {
  4460. def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
  4461. (ins RC:$src1, RC:$src2, u8imm:$src3),
  4462. !if(Is2Addr,
  4463. !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  4464. !strconcat(asm,
  4465. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
  4466. [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
  4467. Sched<[sched]>;
  4468. let mayLoad = 1 in
  4469. def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
  4470. (ins RC:$src1, x86memop:$src2, u8imm:$src3),
  4471. !if(Is2Addr,
  4472. !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  4473. !strconcat(asm,
  4474. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
  4475. [(set RC:$dst, (VT (X86PAlignr RC:$src1,
  4476. (memop_frag addr:$src2),
  4477. (i8 timm:$src3))))]>,
  4478. Sched<[sched.Folded, sched.ReadAfterFold]>;
  4479. }
  4480. }
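// Semantics (for reference): palignr concatenates the two sources (first
// operand in the upper half), shifts the 32-byte value right by imm bytes and
// keeps the low 16 bytes (per 128-bit lane for the YMM form); this is the
// _mm_alignr_epi8 behaviour.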
  4481. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  4482. defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
  4483. SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  4484. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  4485. defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
  4486. SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  4487. let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  4488. defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
  4489. SchedWriteShuffle.XMM>;
  4490. //===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
  4492. //===---------------------------------------------------------------------===//
  4493. let SchedRW = [WriteSystem] in {
  4494. let Uses = [EAX, ECX, EDX] in
  4495. def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
  4496. TB, Requires<[HasSSE3, Not64BitMode]>;
  4497. let Uses = [RAX, ECX, EDX] in
  4498. def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
  4499. TB, Requires<[HasSSE3, In64BitMode]>;
  4500. let Uses = [ECX, EAX] in
  4501. def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
  4502. [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
  4503. } // SchedRW
  4504. def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
  4505. def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
  4506. def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
  4507. Requires<[Not64BitMode]>;
  4508. def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
  4509. Requires<[In64BitMode]>;
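// For reference, these are exposed as the _mm_monitor(ptr, extensions, hints)
// and _mm_mwait(extensions, hints) intrinsics; monitor takes the address in
// EAX/RAX with ECX/EDX as extensions/hints, mwait takes ECX/EAX.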
  4510. //===----------------------------------------------------------------------===//
  4511. // SSE4.1 - Packed Move with Sign/Zero Extend
  4512. // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
  4513. //===----------------------------------------------------------------------===//
  4514. multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
  4515. RegisterClass OutRC, RegisterClass InRC,
  4516. X86FoldableSchedWrite sched> {
  4517. def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
  4518. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
  4519. Sched<[sched]>;
  4520. def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
  4521. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
  4522. Sched<[sched.Folded]>;
  4523. }
  4524. multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
  4525. X86MemOperand MemOp, X86MemOperand MemYOp,
  4526. Predicate prd> {
  4527. defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
  4528. SchedWriteShuffle.XMM>;
  4529. let Predicates = [HasAVX, prd] in
  4530. defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
  4531. VR128, VR128, SchedWriteVecExtend.XMM>,
  4532. VEX, VEX_WIG;
  4533. let Predicates = [HasAVX2, prd] in
  4534. defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
  4535. VR256, VR128, SchedWriteVecExtend.YMM>,
  4536. VEX, VEX_L, VEX_WIG;
  4537. }
  4538. multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
  4539. X86MemOperand MemYOp, Predicate prd> {
  4540. defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
  4541. MemOp, MemYOp, prd>;
  4542. defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
  4543. !strconcat("pmovzx", OpcodeStr),
  4544. MemOp, MemYOp, prd>;
  4545. }
  4546. defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
  4547. defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
  4548. defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
  4549. defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
  4550. defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
  4551. defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
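// Example (for reference): PMOVSXBW sign-extends the low 8 bytes of the source
// to 8 words (_mm_cvtepi8_epi16), while PMOVZXBW zero-extends them
// (_mm_cvtepu8_epi16); the other widths follow the same pattern.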
  4552. // AVX2 Patterns
  4553. multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
  4554. SDNode ExtOp, SDNode InVecOp> {
  4555. // Register-Register patterns
  4556. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4557. def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
  4558. (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  4559. }
  4560. let Predicates = [HasAVX2, NoVLX] in {
  4561. def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
  4562. (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  4563. def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
  4564. (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
  4565. def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
  4566. (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  4567. def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
  4568. (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
  4569. def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
  4570. (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  4571. }
  4572. // Simple Register-Memory patterns
  4573. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4574. def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4575. (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  4576. def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
  4577. (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  4578. }
  4579. let Predicates = [HasAVX2, NoVLX] in {
  4580. def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4581. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4582. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4583. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4584. def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4585. (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  4586. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4587. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4588. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
  4589. (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  4590. }
  4591. // AVX2 Register-Memory patterns
  4592. let Predicates = [HasAVX2, NoVLX] in {
  4593. def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
  4594. (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  4595. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4596. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4597. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4598. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4599. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
  4600. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4601. def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
  4602. (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  4603. def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4604. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4605. def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
  4606. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4607. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4608. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4609. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4610. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4611. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
  4612. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4613. }
  4614. }
  4615. defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
  4616. defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
  4617. // SSE4.1/AVX patterns.
  4618. multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
  4619. SDNode ExtOp> {
  4620. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4621. def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
  4622. (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  4623. }
  4624. let Predicates = [HasAVX, NoVLX] in {
  4625. def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
  4626. (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  4627. def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
  4628. (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
  4629. def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
  4630. (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  4631. def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
  4632. (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
  4633. def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
  4634. (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  4635. }
  4636. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4637. def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4638. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4639. }
  4640. let Predicates = [HasAVX, NoVLX] in {
  4641. def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4642. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4643. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4644. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4645. def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4646. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4647. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4648. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4649. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
  4650. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4651. }
  4652. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4653. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4654. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4655. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4656. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4657. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
  4658. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4659. def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
  4660. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4661. }
  4662. let Predicates = [HasAVX, NoVLX] in {
  4663. def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4664. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4665. def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
  4666. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4667. def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
  4668. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4669. def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
  4670. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4671. def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
  4672. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4673. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4674. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4675. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4676. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4677. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
  4678. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4679. def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
  4680. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4681. def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4682. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4683. def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
  4684. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4685. def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
  4686. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4687. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4688. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4689. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4690. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4691. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  4692. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4693. def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
  4694. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4695. }
  4696. }
  4697. defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
  4698. defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
  4699. let Predicates = [UseSSE41] in {
  4700. defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  4701. defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
  4702. }
  4703. //===----------------------------------------------------------------------===//
  4704. // SSE4.1 - Extract Instructions
  4705. //===----------------------------------------------------------------------===//
/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
  4707. multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  4708. def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
  4709. (ins VR128:$src1, u8imm:$src2),
  4710. !strconcat(OpcodeStr,
  4711. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4712. [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
  4713. timm:$src2))]>,
  4714. Sched<[WriteVecExtract]>;
  4715. let hasSideEffects = 0, mayStore = 1 in
  4716. def mr : SS4AIi8<opc, MRMDestMem, (outs),
  4717. (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
  4718. !strconcat(OpcodeStr,
  4719. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4720. [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
  4721. addr:$dst)]>, Sched<[WriteVecExtractSt]>;
  4722. }
  4723. let Predicates = [HasAVX, NoBWI] in
  4724. defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
  4725. defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
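// For reference, pextrb is the _mm_extract_epi8 intrinsic; the selected byte
// is zero-extended into the 32-bit destination register.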
  4726. /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
  4727. multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  4728. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  4729. def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
  4730. (ins VR128:$src1, u8imm:$src2),
  4731. !strconcat(OpcodeStr,
  4732. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
  4733. Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
  4734. let hasSideEffects = 0, mayStore = 1 in
  4735. def mr : SS4AIi8<opc, MRMDestMem, (outs),
  4736. (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
  4737. !strconcat(OpcodeStr,
  4738. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4739. [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
  4740. addr:$dst)]>, Sched<[WriteVecExtractSt]>;
  4741. }
  4742. let Predicates = [HasAVX, NoBWI] in
  4743. defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
  4744. defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
  4745. let Predicates = [UseSSE41] in
  4746. def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
  4747. let Predicates = [HasAVX, NoBWI] in
  4748. def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
  4749. /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
  4750. multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  4751. def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
  4752. (ins VR128:$src1, u8imm:$src2),
  4753. !strconcat(OpcodeStr,
  4754. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4755. [(set GR32:$dst,
  4756. (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
  4757. Sched<[WriteVecExtract]>;
  4758. def mr : SS4AIi8<opc, MRMDestMem, (outs),
  4759. (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
  4760. !strconcat(OpcodeStr,
  4761. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4762. [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
  4763. addr:$dst)]>, Sched<[WriteVecExtractSt]>;
  4764. }
  4765. let Predicates = [HasAVX, NoDQI] in
  4766. defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
  4767. defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
  4769. multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  4770. def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
  4771. (ins VR128:$src1, u8imm:$src2),
  4772. !strconcat(OpcodeStr,
  4773. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4774. [(set GR64:$dst,
  4775. (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
  4776. Sched<[WriteVecExtract]>;
  4777. def mr : SS4AIi8<opc, MRMDestMem, (outs),
  4778. (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
  4779. !strconcat(OpcodeStr,
  4780. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4781. [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
  4782. addr:$dst)]>, Sched<[WriteVecExtractSt]>;
  4783. }
  4784. let Predicates = [HasAVX, NoDQI] in
  4785. defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
  4786. defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or memory
/// destination
  4789. multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  4790. def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
  4791. (ins VR128:$src1, u8imm:$src2),
  4792. !strconcat(OpcodeStr,
  4793. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4794. [(set GR32orGR64:$dst,
  4795. (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
  4796. Sched<[WriteVecExtract]>;
  4797. def mr : SS4AIi8<opc, MRMDestMem, (outs),
  4798. (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
  4799. !strconcat(OpcodeStr,
  4800. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  4801. [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
  4802. addr:$dst)]>, Sched<[WriteVecExtractSt]>;
  4803. }
  4804. let ExeDomain = SSEPackedSingle in {
  4805. let Predicates = [UseAVX] in
  4806. defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  4807. defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
  4808. }
  4809. //===----------------------------------------------------------------------===//
  4810. // SSE4.1 - Insert Instructions
  4811. //===----------------------------------------------------------------------===//
  4812. multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  4813. def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
  4814. (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
  4815. !if(Is2Addr,
  4816. !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  4817. !strconcat(asm,
  4818. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
  4819. [(set VR128:$dst,
  4820. (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
  4821. Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  4822. def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
  4823. (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
  4824. !if(Is2Addr,
  4825. !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  4826. !strconcat(asm,
  4827. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
  4828. [(set VR128:$dst,
  4829. (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
  4830. Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  4831. }
  4832. let Predicates = [HasAVX, NoBWI] in {
  4833. defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  4834. def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
  4835. (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  4836. GR8:$src2, sub_8bit), timm:$src3)>;
  4837. }
  4838. let Constraints = "$src1 = $dst" in
  4839. defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
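// For reference, pinsrb is the _mm_insert_epi8 intrinsic: it copies $src1 and
// replaces the byte selected by the immediate with the low byte of $src2.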
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR32:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR64:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
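// (For reference, not modeled here: the insertps immediate packs three
// fields -- bits [7:6] pick the source element for the register form,
// bits [5:4] pick the destination element, and bits [3:0] are a zero mask.
// E.g. an immediate of 0x1D copies src2[0] into dst[1] and zeroes dst
// elements 0, 2 and 3.)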
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
                   Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                     (X86insertps VR128:$src1,
                                  (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                                  timm:$src3))]>,
                   Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
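// Note (ISA reference): the round immediate uses bits [1:0] for the rounding
// mode (00 nearest, 01 down, 10 up, 11 truncate), bit 2 to select MXCSR.RC
// instead of the immediate mode, and bit 3 to suppress the precision
// (inexact) exception. E.g. imm 0x9 rounds toward -infinity with the inexact
// exception suppressed, which is what a floor lowering typically uses.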
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                    (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
                    !strconcat(OpcodeStr,
                               "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
                    !strconcat(OpcodeStr,
                               "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
                    !strconcat(OpcodeStr,
                               "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
                    !strconcat(OpcodeStr,
                               "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
                        !if(Is2Addr,
                            !strconcat(OpcodeStr,
                                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                            !strconcat(OpcodeStr,
                                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
                        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
                        !if(Is2Addr,
                            !strconcat(OpcodeStr,
                                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                            !strconcat(OpcodeStr,
                                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                        [(set VR128:$dst,
                          (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
                        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
                        !if(Is2Addr,
                            !strconcat(OpcodeStr,
                                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                            !strconcat(OpcodeStr,
                                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
                        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
                        !if(Is2Addr,
                            !strconcat(OpcodeStr,
                                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                            !strconcat(OpcodeStr,
                                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                        [(set VR128:$dst,
                          (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
                        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }
}

let Predicates = [UseAVX] in {
  defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                 v4f32, v2f64, X86RndScales, 0>,
                                 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                               VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                               memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                               memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                              v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// The PTEST instruction; we lower to this in X86ISelLowering, primarily from
// the Intel intrinsic that corresponds to it.
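// For reference: ptest sets ZF when (src1 AND src2) is all zeros and CF when
// ((NOT src1) AND src2) is all zeros, so the testz/testc/testnzc intrinsics
// become plain reads of these flags.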
let Defs = [EFLAGS], Predicates = [HasAVX] in {
  def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                        Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
  def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS, (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                        Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                        VEX, VEX_WIG;
  def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                        Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
  def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS, (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                        Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                        VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
  def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                      "ptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                      Sched<[SchedWriteVecTest.XMM]>;
  def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                      "ptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                      Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
                 Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                                SchedWriteFTest.XMM>;
    defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                                SchedWriteFTest.YMM>, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                                SchedWriteFTest.XMM>;
    defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                                SchedWriteFTest.YMM>, VEX_L;
  }
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;
  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD  : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMINUD  : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXSD  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXUD  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMULDQ  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                 VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB  : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMINUW  : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXSB  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXUW  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                               memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD   : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                  load, i128mem, SchedWritePMULLD.XMM, 0>,
                                  VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ  : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                               X86MemOperand x86memop, bit Is2Addr,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;
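// Worked example: a 4-bit blendps/blendi immediate of 0b0101 (take elements
// 0 and 2 from the second source) scales to the pblendw immediate 0x33
// (0b00110011), since each 32-bit lane covers two 16-bit lanes.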
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
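// Commuting a blend swaps which source each lane is taken from, so the mask
// is simply inverted; e.g. a 4-bit immediate 0x5 becomes 0xA (0x5 ^ 0xf) when
// the operands are exchanged, which is what the Commute transforms above
// compute (optionally combined with the scaling).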
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, load, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    let ExeDomain = SSEPackedSingle in
      defm VDPPS  : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                        VR128, load, f128mem, 0,
                                        SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
    let ExeDomain = SSEPackedDouble in
      defm VDPPD  : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                        VR128, load, f128mem, 0,
                                        SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
    let ExeDomain = SSEPackedSingle in
      defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                        VR256, load, i256mem, 0,
                                        SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                         VR256, load, i256mem, 0,
                                         SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
    defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                       VR128, memop, i128mem, 1,
                                       SchedWriteMPSAD.XMM>;
  }
  let ExeDomain = SSEPackedSingle in
    defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                    VR128, memop, f128mem, 1,
                                    SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
    defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                    VR128, memop, f128mem, 1,
                                    SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}
let Predicates = [HasAVX] in {
  defm VBLENDPS  : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                   VR128, load, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD  : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                   VR128, load, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW  : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                   VR128, load, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX1Only] in {
  def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
            (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
  def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
            (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
  def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
            (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

  // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
  // it from becoming movsd via commuting under optsize.
  def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
            (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

  def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
            (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
  def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
            (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
  def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
            (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

  // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
  // it from becoming movss via commuting under optsize.
  def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
            (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
  // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
  // it from becoming movss via commuting under optsize.
  def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
            (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

  def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
            (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
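// E.g. for v4f64 the immediate 0x3 takes elements 0-1 from the second blend
// operand (the widened XMM value) and keeps elements 2-3 of the original YMM
// register; the folded-load forms below use the complementary masks
// (0xc / 0xf0) because their operands are swapped so the load sits in the
// second position.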
let Predicates = [HasAVX] in {
  def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
            (VBLENDPDYrri VR256:$src1,
                          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                         VR128:$src2, sub_xmm), 0x3)>;
  def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
            (VBLENDPSYrri VR256:$src1,
                          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                         VR128:$src2, sub_xmm), 0xf)>;
  def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
            (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                         VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
  def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
            (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                         VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;
  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                    (OpNode RC:$src3, (mem_frag addr:$src2),
                            RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC::$src3
                         sched.ReadAfterFold]>;
}
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedDouble in {
    defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           v2f64, loadv2f64, X86Blendv,
                                           SchedWriteFVarBlend.XMM>;
    defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                           v4f64, loadv4f64, X86Blendv,
                                           SchedWriteFVarBlend.YMM>, VEX_L;
  } // ExeDomain = SSEPackedDouble
  let ExeDomain = SSEPackedSingle in {
    defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                           v4f32, loadv4f32, X86Blendv,
                                           SchedWriteFVarBlend.XMM>;
    defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                           v8f32, loadv8f32, X86Blendv,
                                           SchedWriteFVarBlend.YMM>, VEX_L;
  } // ExeDomain = SSEPackedSingle
  defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                        v16i8, loadv16i8, X86Blendv,
                                        SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                         v32i8, loadv32i8, X86Blendv,
                                         SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
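// A blend immediate of 1 rewrites only element 0, which is exactly the
// movss/movsd semantics; the pblendw forms use immediate 3 (two 16-bit lanes,
// i.e. the low 32 bits) for the same effect in the integer domain.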
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                     (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                     (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
}
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}
/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

let AddedComplexity = 400 in { // Prefer non-temporal versions
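  // Note: movntdqa is a streaming (non-temporal) load and requires an aligned
  // source, which is why only the alignednontemporalload patterns below
  // select these instructions.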
  let Predicates = [HasAVX, NoVLX] in
  def VMOVNTDQArm  : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                           "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
  let Predicates = [HasAVX2, NoVLX] in
  def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                           "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
  def MOVNTDQArm   : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                           "movntdqa\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8f32 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v4f64 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v4i64 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v8i32 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v16i16 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v16f16 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
    def : Pat<(v32i8 (alignednontemporalload addr:$src)),
              (VMOVNTDQAYrm addr:$src)>;
  }

  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4f32 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v2f64 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v2i64 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v4i32 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v8i16 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v8f16 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
    def : Pat<(v16i8 (alignednontemporalload addr:$src)),
              (VMOVNTDQArm addr:$src)>;
  }

  let Predicates = [UseSSE41] in {
    def : Pat<(v4f32 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v2f64 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v2i64 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v4i32 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v8i16 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v8f16 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
    def : Pat<(v16i8 (alignednontemporalload addr:$src)),
              (MOVNTDQArm addr:$src)>;
  }
} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                  Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                    (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ  : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
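// For reference: the implicit-length forms (pcmpistr*) take string lengths
// from the data itself, while the explicit-length forms (pcmpestr*) read
// lengths from EAX/EDX; the *m variants return a mask in XMM0 and the *i
// variants an index in ECX, matching the Defs/Uses lists below. These defs
// carry no ISel patterns; the corresponding intrinsics are matched elsewhere.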
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
    defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
    defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
    defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
    defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}
  5998. //===----------------------------------------------------------------------===//
  5999. // SSE4.2 - CRC Instructions
  6000. //===----------------------------------------------------------------------===//
  6001. // No CRC instructions have AVX equivalents
// CRC intrinsic instructions.
// These instructions have only reg/reg and reg/mem forms; the variants differ
// only in the size of r and m.
  6005. class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
  6006. RegisterClass RCIn, SDPatternOperator Int> :
  6007. CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
  6008. !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
  6009. [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
  6010. Sched<[WriteCRC32]>;
  6011. class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
  6012. X86MemOperand x86memop, SDPatternOperator Int> :
  6013. CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
  6014. !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
  6015. [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
  6016. Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
  6017. let Constraints = "$src1 = $dst" in {
  6018. def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
  6019. int_x86_sse42_crc32_32_8>;
  6020. def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
  6021. int_x86_sse42_crc32_32_8>;
  6022. def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
  6023. int_x86_sse42_crc32_32_16>, OpSize16;
  6024. def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
  6025. int_x86_sse42_crc32_32_16>, OpSize16;
  6026. def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
  6027. int_x86_sse42_crc32_32_32>, OpSize32;
  6028. def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
  6029. int_x86_sse42_crc32_32_32>, OpSize32;
  6030. def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
  6031. int_x86_sse42_crc32_64_64>, REX_W;
  6032. def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
  6033. int_x86_sse42_crc32_64_64>, REX_W;
  6034. let hasSideEffects = 0 in {
  6035. let mayLoad = 1 in
  6036. def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
  6037. null_frag>, REX_W;
  6038. def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
  6039. null_frag>, REX_W;
  6040. }
  6041. }
  6042. //===----------------------------------------------------------------------===//
  6043. // SHA-NI Instructions
  6044. //===----------------------------------------------------------------------===//
  6045. // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
  6046. multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
  6047. X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  6048. def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
  6049. (ins VR128:$src1, VR128:$src2),
  6050. !if(UsesXMM0,
  6051. !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
  6052. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
  6053. [!if(UsesXMM0,
  6054. (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
  6055. (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
  6056. T8PS, Sched<[sched]>;
  6057. def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
  6058. (ins VR128:$src1, i128mem:$src2),
  6059. !if(UsesXMM0,
  6060. !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
  6061. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
  6062. [!if(UsesXMM0,
  6063. (set VR128:$dst, (IntId VR128:$src1,
  6064. (memop addr:$src2), XMM0)),
  6065. (set VR128:$dst, (IntId VR128:$src1,
  6066. (memop addr:$src2))))]>, T8PS,
  6067. Sched<[sched.Folded, sched.ReadAfterFold]>;
  6068. }
  6069. let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  6070. def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
  6071. (ins VR128:$src1, VR128:$src2, u8imm:$src3),
  6072. "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  6073. [(set VR128:$dst,
  6074. (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
  6075. (i8 timm:$src3)))]>, TAPS,
  6076. Sched<[SchedWriteVecIMul.XMM]>;
  6077. def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
  6078. (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
  6079. "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  6080. [(set VR128:$dst,
  6081. (int_x86_sha1rnds4 VR128:$src1,
  6082. (memop addr:$src2),
  6083. (i8 timm:$src3)))]>, TAPS,
  6084. Sched<[SchedWriteVecIMul.XMM.Folded,
  6085. SchedWriteVecIMul.XMM.ReadAfterFold]>;
  6086. defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
  6087. SchedWriteVecIMul.XMM>;
  6088. defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
  6089. SchedWriteVecIMul.XMM>;
  6090. defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
  6091. SchedWriteVecIMul.XMM>;
  6092. let Uses=[XMM0] in
  6093. defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
  6094. SchedWriteVecIMul.XMM, 1>;
  6095. defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
  6096. SchedWriteVecIMul.XMM>;
  6097. defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
  6098. SchedWriteVecIMul.XMM>;
  6099. }
// Aliases that allow omitting the explicit %xmm0 operand.
  6101. def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
  6102. (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
  6103. def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
  6104. (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
  6105. //===----------------------------------------------------------------------===//
  6106. // AES-NI Instructions
  6107. //===----------------------------------------------------------------------===//
  6108. multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
  6109. Intrinsic IntId, PatFrag ld_frag,
  6110. bit Is2Addr = 0, RegisterClass RC = VR128,
  6111. X86MemOperand MemOp = i128mem> {
  6112. let AsmString = OpcodeStr#
  6113. !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
  6114. "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
  6115. def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
  6116. (ins RC:$src1, RC:$src2), "",
  6117. [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
  6118. Sched<[WriteAESDecEnc]>;
  6119. def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
  6120. (ins RC:$src1, MemOp:$src2), "",
  6121. [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
  6122. Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  6123. }
  6124. }
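// With the defaults (Is2Addr = 0, RC = VR128, MemOp = i128mem) this produces
// the VEX three-operand 128-bit forms; the VAES instantiations below override
// RC/MemOp for 256 bits, and the legacy SSE forms pass Is2Addr = 1 under a
// "$src1 = $dst" constraint.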
  6125. // Perform One Round of an AES Encryption/Decryption Flow
  6126. let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  6127. defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
  6128. int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
  6129. defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
  6130. int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
  6131. defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
  6132. int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
  6133. defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
  6134. int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
  6135. }
  6136. let Predicates = [NoVLX, HasVAES] in {
  6137. defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
  6138. int_x86_aesni_aesenc_256, load, 0, VR256,
  6139. i256mem>, VEX_4V, VEX_L, VEX_WIG;
  6140. defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
  6141. int_x86_aesni_aesenclast_256, load, 0, VR256,
  6142. i256mem>, VEX_4V, VEX_L, VEX_WIG;
  6143. defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
  6144. int_x86_aesni_aesdec_256, load, 0, VR256,
  6145. i256mem>, VEX_4V, VEX_L, VEX_WIG;
  6146. defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
  6147. int_x86_aesni_aesdeclast_256, load, 0, VR256,
  6148. i256mem>, VEX_4V, VEX_L, VEX_WIG;
  6149. }
  6150. let Constraints = "$src1 = $dst" in {
  6151. defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
  6152. int_x86_aesni_aesenc, memop, 1>;
  6153. defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
  6154. int_x86_aesni_aesenclast, memop, 1>;
  6155. defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
  6156. int_x86_aesni_aesdec, memop, 1>;
  6157. defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
  6158. int_x86_aesni_aesdeclast, memop, 1>;
  6159. }
  6160. // Perform the AES InvMixColumn Transformation
  6161. let Predicates = [HasAVX, HasAES] in {
  6162. def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  6163. (ins VR128:$src1),
  6164. "vaesimc\t{$src1, $dst|$dst, $src1}",
  6165. [(set VR128:$dst,
  6166. (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
  6167. VEX, VEX_WIG;
  6168. def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  6169. (ins i128mem:$src1),
  6170. "vaesimc\t{$src1, $dst|$dst, $src1}",
  6171. [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
  6172. Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
  6173. }
  6174. def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  6175. (ins VR128:$src1),
  6176. "aesimc\t{$src1, $dst|$dst, $src1}",
  6177. [(set VR128:$dst,
  6178. (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
  6179. def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  6180. (ins i128mem:$src1),
  6181. "aesimc\t{$src1, $dst|$dst, $src1}",
  6182. [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
  6183. Sched<[WriteAESIMC.Folded]>;
  6184. // AES Round Key Generation Assist
  6185. let Predicates = [HasAVX, HasAES] in {
  6186. def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  6187. (ins VR128:$src1, u8imm:$src2),
  6188. "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6189. [(set VR128:$dst,
  6190. (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
  6191. Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  6192. def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  6193. (ins i128mem:$src1, u8imm:$src2),
  6194. "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6195. [(set VR128:$dst,
  6196. (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
  6197. Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
  6198. }
  6199. def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  6200. (ins VR128:$src1, u8imm:$src2),
  6201. "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6202. [(set VR128:$dst,
  6203. (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
  6204. Sched<[WriteAESKeyGen]>;
  6205. def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  6206. (ins i128mem:$src1, u8imm:$src2),
  6207. "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6208. [(set VR128:$dst,
  6209. (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
  6210. Sched<[WriteAESKeyGen.Folded]>;
  6211. //===----------------------------------------------------------------------===//
  6212. // PCLMUL Instructions
  6213. //===----------------------------------------------------------------------===//
  6214. // Immediate transform to help with commuting.
  6215. def PCLMULCommuteImm : SDNodeXForm<timm, [{
  6216. uint8_t Imm = N->getZExtValue();
  6217. return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
  6218. }]>;
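// Swapping the two nibbles of the immediate exchanges which quadword is taken
// from each source, e.g. 0x01 <-> 0x10, while 0x00 and 0x11 are unchanged.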
  6219. // SSE carry-less Multiplication instructions
  6220. let Predicates = [NoAVX, HasPCLMUL] in {
  6221. let Constraints = "$src1 = $dst" in {
  6222. let isCommutable = 1 in
  6223. def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
  6224. (ins VR128:$src1, VR128:$src2, u8imm:$src3),
  6225. "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  6226. [(set VR128:$dst,
  6227. (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
  6228. Sched<[WriteCLMul]>;
  6229. def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
  6230. (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
  6231. "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  6232. [(set VR128:$dst,
  6233. (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
  6234. timm:$src3))]>,
  6235. Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  6236. } // Constraints = "$src1 = $dst"
  6237. def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
  6238. (i8 timm:$src3)),
  6239. (PCLMULQDQrm VR128:$src1, addr:$src2,
  6240. (PCLMULCommuteImm timm:$src3))>;
  6241. } // Predicates = [NoAVX, HasPCLMUL]
  6242. // SSE aliases
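// The alias immediate is computed from the mnemonic: an "hq" in the first
// selector contributes bit 0 and an "hq" in the second contributes bit 4,
// e.g. pclmullqlqdq -> 0x00, pclmulhqlqdq -> 0x01, pclmullqhqdq -> 0x10,
// pclmulhqhqdq -> 0x11.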
  6243. foreach HI = ["hq","lq"] in
  6244. foreach LO = ["hq","lq"] in {
  6245. def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
  6246. (PCLMULQDQrr VR128:$dst, VR128:$src,
  6247. !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  6248. def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
  6249. (PCLMULQDQrm VR128:$dst, i128mem:$src,
  6250. !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  6251. }
  6252. // AVX carry-less Multiplication instructions
  6253. multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
  6254. PatFrag LdFrag, Intrinsic IntId> {
  6255. let isCommutable = 1 in
  6256. def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
  6257. (ins RC:$src1, RC:$src2, u8imm:$src3),
  6258. "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  6259. [(set RC:$dst,
  6260. (IntId RC:$src1, RC:$src2, timm:$src3))]>,
  6261. Sched<[WriteCLMul]>;
  6262. def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
  6263. (ins RC:$src1, MemOp:$src2, u8imm:$src3),
  6264. "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  6265. [(set RC:$dst,
  6266. (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
  6267. Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  6268. // We can commute a load in the first operand by swapping the sources and
  6269. // rotating the immediate.
  6270. def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
  6271. (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
  6272. (PCLMULCommuteImm timm:$src3))>;
  6273. }
  6274. let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
  6275. defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
  6276. int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
  6277. let Predicates = [NoVLX, HasVPCLMULQDQ] in
  6278. defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
  6279. int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
  6280. multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
  6281. X86MemOperand MemOp, string Hi, string Lo> {
  6282. def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6283. (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
  6284. !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  6285. def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6286. (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
  6287. !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  6288. }
  6289. multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
  6290. X86MemOperand MemOp> {
  6291. defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  6292. defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  6293. defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  6294. defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
  6295. }
  6296. // AVX aliases
  6297. defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
  6298. defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
  6299. //===----------------------------------------------------------------------===//
  6300. // SSE4A Instructions
  6301. //===----------------------------------------------------------------------===//
  6302. let Predicates = [HasSSE4A] in {
  6303. let ExeDomain = SSEPackedInt in {
  6304. let Constraints = "$src = $dst" in {
  6305. def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
  6306. (ins VR128:$src, u8imm:$len, u8imm:$idx),
  6307. "extrq\t{$idx, $len, $src|$src, $len, $idx}",
  6308. [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
  6309. timm:$idx))]>,
  6310. PD, Sched<[SchedWriteVecALU.XMM]>;
  6311. def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
  6312. (ins VR128:$src, VR128:$mask),
  6313. "extrq\t{$mask, $src|$src, $mask}",
  6314. [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
  6315. VR128:$mask))]>,
  6316. PD, Sched<[SchedWriteVecALU.XMM]>;
  6317. def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
  6318. (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
  6319. "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
  6320. [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
  6321. timm:$len, timm:$idx))]>,
  6322. XD, Sched<[SchedWriteVecALU.XMM]>;
  6323. def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
  6324. (ins VR128:$src, VR128:$mask),
  6325. "insertq\t{$mask, $src|$src, $mask}",
  6326. [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
  6327. VR128:$mask))]>,
  6328. XD, Sched<[SchedWriteVecALU.XMM]>;
  6329. }
  6330. } // ExeDomain = SSEPackedInt
  6331. // Non-temporal (unaligned) scalar stores.
  6332. let AddedComplexity = 400 in { // Prefer non-temporal versions
  6333. let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
  6334. def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
  6335. "movntss\t{$src, $dst|$dst, $src}", []>, XS;
  6336. def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
  6337. "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
  6338. } // SchedRW
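// MOVNTSS/MOVNTSD store the low scalar element of an XMM register, so the
// FR32/FR64 value is first copied into VR128 in the patterns below.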
  6339. def : Pat<(nontemporalstore FR32:$src, addr:$dst),
  6340. (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  6341. def : Pat<(nontemporalstore FR64:$src, addr:$dst),
  6342. (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  6343. } // AddedComplexity
  6344. } // HasSSE4A
  6345. //===----------------------------------------------------------------------===//
  6346. // AVX Instructions
  6347. //===----------------------------------------------------------------------===//
  6348. //===----------------------------------------------------------------------===//
  6349. // VBROADCAST - Load from memory and broadcast to all elements of the
  6350. // destination operand
  6351. //
  6352. class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
  6353. X86MemOperand x86memop, ValueType VT,
  6354. PatFrag bcast_frag, SchedWrite Sched> :
  6355. AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
  6356. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6357. [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
  6358. Sched<[Sched]>, VEX;
  6359. // AVX2 adds register forms
  6360. class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
  6361. ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  6362. AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
  6363. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6364. [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
  6365. Sched<[Sched]>, VEX;
  6366. let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  6367. def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
  6368. f32mem, v4f32, X86VBroadcastld32,
  6369. SchedWriteFShuffle.XMM.Folded>;
  6370. def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
  6371. f32mem, v8f32, X86VBroadcastld32,
  6372. SchedWriteFShuffle.XMM.Folded>, VEX_L;
  6373. }
  6374. let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
  6375. def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
  6376. v4f64, X86VBroadcastld64,
  6377. SchedWriteFShuffle.XMM.Folded>, VEX_L;
  6378. let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  6379. def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
  6380. v4f32, v4f32, SchedWriteFShuffle.XMM>;
  6381. def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
  6382. v8f32, v4f32, WriteFShuffle256>, VEX_L;
  6383. }
  6384. let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
  6385. def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
  6386. v4f64, v2f64, WriteFShuffle256>, VEX_L;
  6387. //===----------------------------------------------------------------------===//
  6388. // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
  6389. // halves of a 256-bit vector.
  6390. //
  6391. let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
  6392. def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
  6393. (ins i128mem:$src),
  6394. "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
  6395. Sched<[WriteShuffleLd]>, VEX, VEX_L;
  6396. let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
  6397. ExeDomain = SSEPackedSingle in
  6398. def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
  6399. (ins f128mem:$src),
  6400. "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
  6401. Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
  6402. let Predicates = [HasAVX, NoVLX] in {
  6403. def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
  6404. (VBROADCASTF128 addr:$src)>;
  6405. def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
  6406. (VBROADCASTF128 addr:$src)>;
  6407. // NOTE: We're using FP instructions here, but execution domain fixing can
  6408. // convert to integer when profitable.
  6409. def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
  6410. (VBROADCASTF128 addr:$src)>;
  6411. def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
  6412. (VBROADCASTF128 addr:$src)>;
  6413. def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
  6414. (VBROADCASTF128 addr:$src)>;
  6415. def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
  6416. (VBROADCASTF128 addr:$src)>;
  6417. def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
  6418. (VBROADCASTF128 addr:$src)>;
  6419. }
  6420. //===----------------------------------------------------------------------===//
  6421. // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
  6422. //
  6423. let ExeDomain = SSEPackedSingle in {
  6424. let isCommutable = 1 in
  6425. def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
  6426. (ins VR256:$src1, VR256:$src2, u8imm:$src3),
  6427. "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
  6428. VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
  6429. def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
  6430. (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
  6431. "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
  6432. VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
  6433. }
  6434. // Immediate transform to help with commuting.
  6435. def Perm2XCommuteImm : SDNodeXForm<timm, [{
  6436. return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
  6437. }]>;
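// XORing the immediate with 0x22 flips the source-select bit for each 128-bit
// half (bits 1 and 5), e.g. 0x20 -> 0x02 and 0x31 -> 0x13, which is what lets
// the two source operands be swapped.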
  6438. multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  6439. def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
  6440. (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  6441. def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
  6442. (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  6443. // Pattern with load in other operand.
  6444. def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
  6445. (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
  6446. (Perm2XCommuteImm timm:$imm))>;
  6447. }
  6448. let Predicates = [HasAVX] in {
  6449. defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  6450. defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
  6451. }
  6452. let Predicates = [HasAVX1Only] in {
  6453. defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
  6454. defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
  6455. defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  6456. defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  6457. defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
  6458. }
  6459. //===----------------------------------------------------------------------===//
  6460. // VINSERTF128 - Insert packed floating-point values
  6461. //
  6462. let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
  6463. def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
  6464. (ins VR256:$src1, VR128:$src2, u8imm:$src3),
  6465. "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  6466. []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
  6467. let mayLoad = 1 in
  6468. def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
  6469. (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
  6470. "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  6471. []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
  6472. }
// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
// with a YMM register containing zero.
  6475. // FIXME: Avoid producing vxorps to clear the fake inputs.
  6476. let Predicates = [HasAVX1Only] in {
  6477. def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
  6478. }
  6479. multiclass vinsert_lowering<string InstrStr, string PermStr,
  6480. ValueType From, ValueType To,
  6481. PatFrag frommemop_frag, PatFrag tomemop_frag> {
  6482. def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
  6483. (iPTR imm)),
  6484. (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
  6485. (INSERT_get_vinsert128_imm VR256:$ins))>;
  6486. def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
  6487. (From (frommemop_frag addr:$src2)),
  6488. (iPTR imm)),
  6489. (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
  6490. (INSERT_get_vinsert128_imm VR256:$ins))>;
  6491. // Folding "To" vector - convert to perm2x128 and commute inputs.
  6492. def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
  6493. (From VR128:$src2),
  6494. (iPTR imm)),
  6495. (!cast<Instruction>(PermStr#rm)
  6496. (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
  6497. addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
  6498. }
  6499. let Predicates = [HasAVX, NoVLX] in {
  6500. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  6501. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
  6502. }
  6503. let Predicates = [HasAVX1Only] in {
  6504. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
  6505. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
  6506. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  6507. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  6508. defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
  6510. }
  6511. //===----------------------------------------------------------------------===//
  6512. // VEXTRACTF128 - Extract packed floating-point values
  6513. //
  6514. let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
  6515. def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
  6516. (ins VR256:$src1, u8imm:$src2),
  6517. "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6518. []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
  6519. let mayStore = 1 in
  6520. def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
  6521. (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
  6522. "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6523. []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
  6524. }
  6525. multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  6526. def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
  6527. (To (!cast<Instruction>(InstrStr#rr)
  6528. (From VR256:$src1),
  6529. (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  6530. def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
  6531. (iPTR imm))), addr:$dst),
  6532. (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
  6533. (EXTRACT_get_vextract128_imm VR128:$ext))>;
  6534. }
  6535. // AVX1 patterns
  6536. let Predicates = [HasAVX, NoVLX] in {
  6537. defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  6538. defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
  6539. }
  6540. let Predicates = [HasAVX1Only] in {
  6541. defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  6542. defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  6543. defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  6544. defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  6545. defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
  6547. }
  6548. //===----------------------------------------------------------------------===//
  6549. // VMASKMOV - Conditional SIMD Packed Loads and Stores
  6550. //
  6551. multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
  6552. Intrinsic IntLd, Intrinsic IntLd256,
  6553. Intrinsic IntSt, Intrinsic IntSt256,
  6554. X86SchedWriteMaskMove schedX,
  6555. X86SchedWriteMaskMove schedY> {
  6556. def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
  6557. (ins VR128:$src1, f128mem:$src2),
  6558. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6559. [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
  6560. VEX_4V, Sched<[schedX.RM]>;
  6561. def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
  6562. (ins VR256:$src1, f256mem:$src2),
  6563. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6564. [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
  6565. VEX_4V, VEX_L, Sched<[schedY.RM]>;
  6566. def mr : AVX8I<opc_mr, MRMDestMem, (outs),
  6567. (ins f128mem:$dst, VR128:$src1, VR128:$src2),
  6568. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6569. [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
  6570. VEX_4V, Sched<[schedX.MR]>;
  6571. def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
  6572. (ins f256mem:$dst, VR256:$src1, VR256:$src2),
  6573. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6574. [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
  6575. VEX_4V, VEX_L, Sched<[schedY.MR]>;
  6576. }
  6577. let ExeDomain = SSEPackedSingle in
  6578. defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
  6579. int_x86_avx_maskload_ps,
  6580. int_x86_avx_maskload_ps_256,
  6581. int_x86_avx_maskstore_ps,
  6582. int_x86_avx_maskstore_ps_256,
  6583. WriteFMaskMove32, WriteFMaskMove32Y>;
  6584. let ExeDomain = SSEPackedDouble in
  6585. defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
  6586. int_x86_avx_maskload_pd,
  6587. int_x86_avx_maskload_pd_256,
  6588. int_x86_avx_maskstore_pd,
  6589. int_x86_avx_maskstore_pd_256,
  6590. WriteFMaskMove64, WriteFMaskMove64Y>;
  6591. //===----------------------------------------------------------------------===//
  6592. // AVX_VNNI
  6593. //===----------------------------------------------------------------------===//
  6594. let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
  6595. ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
  6596. multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
  6597. bit IsCommutable> {
  6598. let isCommutable = IsCommutable in
  6599. def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
  6600. (ins VR128:$src1, VR128:$src2, VR128:$src3),
  6601. !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  6602. [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
  6603. VR128:$src2, VR128:$src3)))]>,
  6604. VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  6605. def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
  6606. (ins VR128:$src1, VR128:$src2, i128mem:$src3),
  6607. !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  6608. [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
  6609. (loadv4i32 addr:$src3))))]>,
  6610. VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  6611. let isCommutable = IsCommutable in
  6612. def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
  6613. (ins VR256:$src1, VR256:$src2, VR256:$src3),
  6614. !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  6615. [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
  6616. VR256:$src2, VR256:$src3)))]>,
  6617. VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
  6618. def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
  6619. (ins VR256:$src1, VR256:$src2, i256mem:$src3),
  6620. !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
  6621. [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
  6622. (loadv8i32 addr:$src3))))]>,
  6623. VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
  6624. }
  6625. defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
  6626. defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
  6627. defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>;
  6628. defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
  6629. def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
  6630. (X86vpmaddwd node:$lhs, node:$rhs), [{
  6631. return N->hasOneUse();
  6632. }]>;
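// Only fold vpmaddwd into vpdpwssd when the multiply has a single use;
// otherwise the vpmaddwd result would still have to be computed separately.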
  6633. let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  6634. def : Pat<(v8i32 (add VR256:$src1,
  6635. (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
  6636. (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  6637. def : Pat<(v8i32 (add VR256:$src1,
  6638. (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
  6639. (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  6640. def : Pat<(v4i32 (add VR128:$src1,
  6641. (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
  6642. (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  6643. def : Pat<(v4i32 (add VR128:$src1,
  6644. (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
  6645. (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
  6646. }
  6647. //===----------------------------------------------------------------------===//
  6648. // VPERMIL - Permute Single and Double Floating-Point Values
  6649. //
  6650. multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
  6651. RegisterClass RC, X86MemOperand x86memop_f,
  6652. X86MemOperand x86memop_i,
  6653. ValueType f_vt, ValueType i_vt,
  6654. X86FoldableSchedWrite sched,
  6655. X86FoldableSchedWrite varsched> {
  6656. let Predicates = [HasAVX, NoVLX] in {
  6657. def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
  6658. (ins RC:$src1, RC:$src2),
  6659. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6660. [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
  6661. Sched<[varsched]>;
  6662. def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
  6663. (ins RC:$src1, x86memop_i:$src2),
  6664. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6665. [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
  6666. (i_vt (load addr:$src2)))))]>, VEX_4V,
  6667. Sched<[varsched.Folded, sched.ReadAfterFold]>;
  6668. def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
  6669. (ins RC:$src1, u8imm:$src2),
  6670. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6671. [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
  6672. Sched<[sched]>;
  6673. def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
  6674. (ins x86memop_f:$src1, u8imm:$src2),
  6675. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  6676. [(set RC:$dst,
  6677. (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
  6678. Sched<[sched.Folded]>;
  6679. }// Predicates = [HasAVX, NoVLX]
  6680. }
  6681. let ExeDomain = SSEPackedSingle in {
  6682. defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
  6683. v4f32, v4i32, SchedWriteFShuffle.XMM,
  6684. SchedWriteFVarShuffle.XMM>;
  6685. defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
  6686. v8f32, v8i32, SchedWriteFShuffle.YMM,
  6687. SchedWriteFVarShuffle.YMM>, VEX_L;
  6688. }
  6689. let ExeDomain = SSEPackedDouble in {
  6690. defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
  6691. v2f64, v2i64, SchedWriteFShuffle.XMM,
  6692. SchedWriteFVarShuffle.XMM>;
  6693. defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
  6694. v4f64, v4i64, SchedWriteFShuffle.YMM,
  6695. SchedWriteFVarShuffle.YMM>, VEX_L;
  6696. }
  6697. //===----------------------------------------------------------------------===//
  6698. // VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
  6700. //
  6701. let SchedRW = [WriteSystem] in {
  6702. let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
  6703. YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  6704. // Zero All YMM registers
  6705. def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
  6706. [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
  6707. Requires<[HasAVX]>, VEX_WIG;
  6708. // Zero Upper bits of YMM registers
  6709. def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
  6710. [(int_x86_avx_vzeroupper)]>, PS, VEX,
  6711. Requires<[HasAVX]>, VEX_WIG;
  6712. } // Defs
  6713. } // SchedRW
  6714. //===----------------------------------------------------------------------===//
  6715. // Half precision conversion instructions
  6716. //
  6717. multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
  6718. X86FoldableSchedWrite sched> {
  6719. def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
  6720. "vcvtph2ps\t{$src, $dst|$dst, $src}",
  6721. [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
  6722. T8PD, VEX, Sched<[sched]>;
  6723. let hasSideEffects = 0, mayLoad = 1 in
  6724. def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
  6725. "vcvtph2ps\t{$src, $dst|$dst, $src}",
  6726. []>, T8PD, VEX, Sched<[sched.Folded]>;
  6727. }
  6728. multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
  6729. SchedWrite RR, SchedWrite MR> {
  6730. def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
  6731. (ins RC:$src1, i32u8imm:$src2),
  6732. "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  6733. [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
  6734. TAPD, VEX, Sched<[RR]>;
  6735. let hasSideEffects = 0, mayStore = 1 in
  6736. def mr : Ii8<0x1D, MRMDestMem, (outs),
  6737. (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
  6738. "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
  6739. TAPD, VEX, Sched<[MR]>;
  6740. }
  6741. let Predicates = [HasF16C, NoVLX] in {
  6742. defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  6743. defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  6744. defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
  6745. WriteCvtPS2PHSt>, SIMD_EXC;
  6746. defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
  6747. WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
  6748. // Pattern match vcvtph2ps of a scalar i64 load.
  6749. def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
  6750. (VCVTPH2PSrm addr:$src)>;
  6751. def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
  6752. (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  6753. (VCVTPH2PSrm addr:$src)>;
  6754. def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
  6755. (VCVTPH2PSYrm addr:$src)>;
  6756. def : Pat<(store (f64 (extractelt
  6757. (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
  6758. (iPTR 0))), addr:$dst),
  6759. (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  6760. def : Pat<(store (i64 (extractelt
  6761. (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
  6762. (iPTR 0))), addr:$dst),
  6763. (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  6764. def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
  6765. (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
  6766. }
  6767. //===----------------------------------------------------------------------===//
  6768. // AVX2 Instructions
  6769. //===----------------------------------------------------------------------===//
  6770. /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
  6771. multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
  6772. ValueType OpVT, X86FoldableSchedWrite sched,
  6773. RegisterClass RC,
  6774. X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  6775. let isCommutable = 1 in
  6776. def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
  6777. (ins RC:$src1, RC:$src2, u8imm:$src3),
  6778. !strconcat(OpcodeStr,
  6779. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  6780. [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
  6781. Sched<[sched]>, VEX_4V;
  6782. def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
  6783. (ins RC:$src1, x86memop:$src2, u8imm:$src3),
  6784. !strconcat(OpcodeStr,
  6785. "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  6786. [(set RC:$dst,
  6787. (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
  6788. Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
  6789. // Pattern to commute if load is in first source.
  6790. def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
  6791. (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
  6792. (commuteXForm timm:$src3))>;
  6793. }
  6794. let Predicates = [HasAVX2] in {
  6795. defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
  6796. SchedWriteBlend.XMM, VR128, i128mem,
  6797. BlendCommuteImm4>;
  6798. defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
  6799. SchedWriteBlend.YMM, VR256, i256mem,
  6800. BlendCommuteImm8>, VEX_L;
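// vpblendd selects 32-bit elements, so the v2i64/v4i64 blends below are
// handled by rescaling the immediate; BlendScaleImm* (defined earlier in this
// file) is expected to expand each 64-bit selector bit into two adjacent
// 32-bit selector bits.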
  6801. def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
  6802. (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
  6803. def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
  6804. (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
  6805. def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
  6806. (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
  6807. def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
  6808. (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
  6809. def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
  6810. (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
  6811. def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
  6812. (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
  6813. }
  6814. // For insertion into the zero index (low half) of a 256-bit vector, it is
  6815. // more efficient to generate a blend with immediate instead of an insert*128.
  6816. // NOTE: We're using FP instructions here, but execution domain fixing should
  6817. // take care of using integer instructions when profitable.
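// In the blends below, immediate 0xf takes the low four 32-bit elements from
// the widened XMM value and 0xf0 takes the high four elements from the loaded
// 256-bit operand, so the inserted value always lands in the low half.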
  6818. let Predicates = [HasAVX] in {
  6819. def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
  6820. (VBLENDPSYrri VR256:$src1,
  6821. (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6822. VR128:$src2, sub_xmm), 0xf)>;
  6823. def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
  6824. (VBLENDPSYrri VR256:$src1,
  6825. (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6826. VR128:$src2, sub_xmm), 0xf)>;
  6827. def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
  6828. (VBLENDPSYrri VR256:$src1,
  6829. (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6830. VR128:$src2, sub_xmm), 0xf)>;
  6831. def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
  6832. (VBLENDPSYrri VR256:$src1,
  6833. (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6834. VR128:$src2, sub_xmm), 0xf)>;
  6835. def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
  6836. (VBLENDPSYrri VR256:$src1,
  6837. (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6838. VR128:$src2, sub_xmm), 0xf)>;
  6839. def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
  6840. (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6841. VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
  6842. def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
  6843. (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6844. VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
  6845. def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
  6846. (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6847. VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
  6848. def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
  6849. (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6850. VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
  6851. def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
  6852. (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6853. VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
  6854. }
  6855. //===----------------------------------------------------------------------===//
  6856. // VPBROADCAST - Load from memory and broadcast to all elements of the
  6857. // destination operand
  6858. //
  6859. multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
  6860. X86MemOperand x86memop, PatFrag bcast_frag,
  6861. ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  6862. let Predicates = [HasAVX2, prd] in {
  6863. def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  6864. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6865. [(set VR128:$dst,
  6866. (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
  6867. Sched<[SchedWriteShuffle.XMM]>, VEX;
  6868. def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
  6869. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6870. [(set VR128:$dst,
  6871. (OpVT128 (bcast_frag addr:$src)))]>,
  6872. Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
  6873. def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
  6874. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6875. [(set VR256:$dst,
  6876. (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
  6877. Sched<[WriteShuffle256]>, VEX, VEX_L;
  6878. def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
  6879. !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
  6880. [(set VR256:$dst,
  6881. (OpVT256 (bcast_frag addr:$src)))]>,
  6882. Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
// Provide patterns for broadcast from the same register class; they
// automatically extract the low 128-bit subregister.
  6885. def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
  6886. (!cast<Instruction>(NAME#"Yrr")
  6887. (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
  6888. }
  6889. }
  6890. defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
  6891. v16i8, v32i8, NoVLX_Or_NoBWI>;
  6892. defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
  6893. v8i16, v16i16, NoVLX_Or_NoBWI>;
  6894. defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
  6895. v4i32, v8i32, NoVLX>;
  6896. defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
  6897. v2i64, v4i64, NoVLX>;
  6898. let Predicates = [HasAVX2, NoVLX] in {
  6899. // Provide fallback in case the load node that is used in the patterns above
  6900. // is used by additional users, which prevents the pattern selection.
  6901. def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
  6902. (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  6903. def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
  6904. (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  6905. def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
  6906. (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  6907. }
  6908. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  6909. def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
  6910. (VPBROADCASTBrr (VMOVDI2PDIrr
  6911. (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  6912. GR8:$src, sub_8bit))))>;
  6913. def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
  6914. (VPBROADCASTBYrr (VMOVDI2PDIrr
  6915. (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  6916. GR8:$src, sub_8bit))))>;
  6917. def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
  6918. (VPBROADCASTWrr (VMOVDI2PDIrr
  6919. (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  6920. GR16:$src, sub_16bit))))>;
  6921. def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
  6922. (VPBROADCASTWYrr (VMOVDI2PDIrr
  6923. (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
  6924. GR16:$src, sub_16bit))))>;
  6925. def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
  6926. (VPBROADCASTWrm addr:$src)>;
  6927. def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
  6928. (VPBROADCASTWYrm addr:$src)>;
  6929. def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
  6930. (VPBROADCASTWrr VR128:$src)>;
  6931. def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
  6932. (VPBROADCASTWYrr VR128:$src)>;
  6933. def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
  6934. (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
  6935. def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
  6936. (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
  6937. }
  6938. let Predicates = [HasAVX2, NoVLX] in {
  6939. def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
  6940. (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
  6941. def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
  6942. (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  6943. def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
  6944. (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
  6945. def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
  6946. (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
  6947. }
  6948. // AVX1 broadcast patterns
  6949. let Predicates = [HasAVX1Only] in {
  6950. def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
  6951. (VBROADCASTSSYrm addr:$src)>;
  6952. def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
  6953. (VBROADCASTSDYrm addr:$src)>;
  6954. def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
  6955. (VBROADCASTSSrm addr:$src)>;
  6956. }
  6957. // Provide fallback in case the load node that is used in the patterns above
  6958. // is used by additional users, which prevents the pattern selection.
  6959. let Predicates = [HasAVX, NoVLX] in {
  6960. // 128bit broadcasts:
  6961. def : Pat<(v2f64 (X86VBroadcast f64:$src)),
  6962. (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  6963. def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
  6964. (VMOVDDUPrm addr:$src)>;
  6965. def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
  6966. (VMOVDDUPrr VR128:$src)>;
  6967. }
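// Without AVX2 there are no register-source vbroadcastss/sd or vpbroadcast*
// forms, so these broadcasts are synthesized from VPERMILPS/VMOVDDUP/VPSHUFD
// plus VINSERTF128; e.g. the 0x44 shuffle immediate replicates the low 64-bit
// element into both halves of the XMM register.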
  6968. let Predicates = [HasAVX1Only] in {
  6969. def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
  6970. (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  6971. def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
  6972. (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
  6973. (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
  6974. (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  6975. def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
  6976. (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
  6977. (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
  6978. (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
  6979. def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
  6980. (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
  6981. (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
  6982. (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
  6983. def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
  6984. (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
  6985. (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
  6986. (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
  6987. def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
  6988. (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  6989. def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
  6990. (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
  6991. (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
  6992. (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
  6993. def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
  6994. (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
  6995. (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
  6996. (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
  6997. def : Pat<(v2i64 (X86VBroadcast i64:$src)),
  6998. (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  6999. def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
  7000. (VMOVDDUPrm addr:$src)>;
  7001. }
  7002. //===----------------------------------------------------------------------===//
  7003. // VPERM - Permute instructions
  7004. //
  7005. multiclass avx2_perm<bits<8> opc, string OpcodeStr,
  7006. ValueType OpVT, X86FoldableSchedWrite Sched,
  7007. X86MemOperand memOp> {
  7008. let Predicates = [HasAVX2, NoVLX] in {
  7009. def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
  7010. (ins VR256:$src1, VR256:$src2),
  7011. !strconcat(OpcodeStr,
  7012. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  7013. [(set VR256:$dst,
  7014. (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
  7015. Sched<[Sched]>, VEX_4V, VEX_L;
  7016. def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
  7017. (ins VR256:$src1, memOp:$src2),
  7018. !strconcat(OpcodeStr,
  7019. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  7020. [(set VR256:$dst,
  7021. (OpVT (X86VPermv VR256:$src1,
  7022. (load addr:$src2))))]>,
  7023. Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
  7024. }
  7025. }
  7026. defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
  7027. let ExeDomain = SSEPackedSingle in
  7028. defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                          (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                          (OpVT (X86VPermi (mem_frag addr:$src1),
                                 (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;
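// For illustration (AT&T syntax): vpermq $0x1b, %ymm1, %ymm0 reverses the four
// qwords of %ymm1, since imm8 bits [2*i+1 : 2*i] select the source qword
// written to destination lane i.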
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//

let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
                            Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
                            (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
                            Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}
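// For illustration (AT&T syntax): vperm2i128 $0x31, %ymm2, %ymm1, %ymm0
// assembles the result from 128-bit halves of the two sources; imm bits [1:0]
// pick the low half and bits [5:4] the high half (values 0-1 name the halves
// of $src1, 2-3 the halves of $src2).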
//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//

let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                             (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                             (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8, loadv32i8>;
}
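// For illustration (AT&T syntax): vinserti128 $1, %xmm1, %ymm2, %ymm0 copies
// %ymm2 and replaces its upper 128 bits with %xmm1; an immediate of 0 replaces
// the lower 128 bits instead.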
//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//

def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
                              (ins VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
                              (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
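// For illustration (AT&T syntax): vextracti128 $1, %ymm1, %xmm0 extracts the
// upper 128 bits of %ymm1; the memory form stores the selected half directly.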
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//

multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
                   VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                   VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
                   VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                   VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                            (VT immAllZerosV))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}

let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // Integer i32/i64 masked load/store is not supported; use the ps/pd versions.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}
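// Rough illustration (arbitrary names): a masked load such as
//   %v = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 4, <8 x i1> %m, <8 x i32> undef)
// can be selected by the patterns above to (AT&T syntax)
//   vpmaskmovd (%rdi), %ymm1, %ymm0
// once the i1 mask has been widened to a v8i32 whose elements carry the
// predicate in their sign bit; only lanes with the MSB set are loaded/stored.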
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//

multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1,
                              (vt128 (load addr:$src2)))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                  SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1,
                              (vt256 (load addr:$src2)))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                         SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
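// For illustration (AT&T syntax): vpsllvd %ymm2, %ymm1, %ymm0 shifts each dword
// of %ymm1 left by the count held in the corresponding dword of %ymm2; counts
// of 32 or more yield zero (vpsravd instead fills with the sign bit).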
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
//

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  let mayLoad = 1, hasSideEffects = 0 in {
    def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
                     (ins VR128:$src1, memop128:$src2, VR128:$mask),
                     !strconcat(OpcodeStr,
                                "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                     []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
    def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
                     (ins RC256:$src1, memop256:$src2, RC256:$mask),
                     !strconcat(OpcodeStr,
                                "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                     []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  }
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0,
      Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;
    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, VEX_W;
    }
    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}
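// For illustration (AT&T syntax): vpgatherdd %ymm2, (%rdi,%ymm1,4), %ymm0 loads
// a dword from rdi + 4*index for every index in %ymm1 whose mask element in
// %ymm2 has its MSB set; the mask is also written back (cleared per completed
// element), which is why $dst/$mask_wb are tied and marked @earlyclobber above.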
//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
                 Sched<[sched]>, T8PD;
    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
                      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[sched]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME   : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                    load, i128mem, SchedWriteVecIMul.XMM>,
                                    VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem, SchedWriteVecIMul.YMM>,
                                    VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
}
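// For illustration (AT&T syntax): vgf2p8mulb %xmm2, %xmm1, %xmm0 multiplies
// corresponding bytes of %xmm1 and %xmm2 as elements of GF(2^8), reducing
// modulo the polynomial x^8 + x^4 + x^3 + x + 1.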
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
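// Rough illustration (AT&T syntax): vgf2p8affineqb $0x00, %xmm2, %xmm1, %xmm0
// applies a per-qword affine transform A*x + b over GF(2): one source supplies
// the 8x8 bit matrices, the other the data bytes, and imm8 is the constant b
// XORed into every result byte (the INV form first inverts x in GF(2^8)).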
// AVX-IFMA
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode has the multiply operands first, with the addend last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                             VR128:$src3, VR128:$src1)))]>,
                   VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
  def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                           (loadv2i64 addr:$src3), VR128:$src1)))]>,
                 VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  let isCommutable = 1 in {
    def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                              VR256:$src3, VR256:$src1)))]>,
                    VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                            (loadv4i64 addr:$src3), VR256:$src1)))]>,
                  VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
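// For illustration (AT&T syntax): vpmadd52luq %xmm3, %xmm2, %xmm1 multiplies the
// low 52 bits of each qword of %xmm2 and %xmm3 and adds the low 52 bits of each
// 104-bit product to the accumulator %xmm1 (tied $src1 = $dst); the HUQ form
// adds the high 52 bits of the product instead.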
// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
                                   (MemOpFrag addr:$src3))))]>,
             VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}

let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
}
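// For illustration (AT&T syntax): vpdpbssd %xmm3, %xmm2, %xmm1 multiplies
// corresponding signed bytes of %xmm2 and %xmm3, sums each group of four
// adjacent products, and accumulates the dword sums into %xmm1; the SU/UU
// variants mix operand signedness and the ...S forms saturate the accumulation.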
// AVX-NE-CONVERT
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                               X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm  : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
              Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}

multiclass VCVTNEPS2BF16_BASE {
  def rr  : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
              Sched<[WriteCvtPH2PS]>;
  def rm  : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
              "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PS]>;
  def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
}

let Predicates = [HasAVXNECONVERT] in {
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
                                            f16mem>, T8XS;
  defm VBCSTNESH2PS   : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>,
                                            T8PD;
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
                                            f256mem>, T8XS;
  defm VCVTNEEPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
                                            f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
                                            f256mem>, T8XD;
  defm VCVTNEOPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
                                            f256mem>, T8PS;
  let checkVEXPredicate = 1 in
  defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
}

def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;
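// For illustration (AT&T syntax): vcvtneps2bf16 %ymm1, %xmm0 narrows eight
// packed single-precision values to BF16 using round-to-nearest-even; the
// x/y mnemonic suffixes handled by the aliases above disambiguate the 128-bit
// and 256-bit source forms when the operand size is not otherwise evident.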