X86InstrSSE.td 388 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
7777877797780778177827783778477857786778777887789779077917792779377947795779677977798779978007801780278037804780578067807780878097810781178127813781478157816781778187819782078217822782378247825782678277828782978307831783278337834783578367837783878397840784178427843784478457846784778487849785078517852785378547855785678577858785978607861786278637864786578667867786878697870787178727873787478757876787778787879788078817882788378847885788678877888788978907891789278937894789578967897789878997900790179027903790479057906790779087909791079117912791379147915791679177918791979207921792279237924792579267927792879297930793179327933793479357936793779387939794079417942794379447945794679477948794979507951795279537954795579567957795879597960796179627963796479657966796779687969797079717972797379747975797679777978797979807981798279837984798579867987798879897990799179927993799479957996799779987999800080018002800380048005800680078008800980108011801280138014801580168017801880198020802180228023
  1. //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file describes the X86 SSE instruction set, defining the instructions,
  10. // and properties of the instructions which are needed for code generation,
  11. // machine code emission, and analysis.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. //===----------------------------------------------------------------------===//
  15. // SSE 1 & 2 Instruction Classes
  16. //===----------------------------------------------------------------------===//
  /// sse12_fp_scalar - Emits the register-register (rr) and register-memory
  /// (rm) records for a scalar SSE 1 & 2 binary operation.
  ///
  ///   opc       - 8-bit opcode forwarded to SI.
  ///   OpcodeStr - mnemonic placed in front of the operand list.
  ///   OpNode    - pattern operator used in both selection patterns.
  ///   RC        - register class of $dst, $src1 and the rr form's $src2.
  ///   x86memop  - memory operand type of the rm form's $src2.
  ///   d         - domain forwarded to SI.
  ///   sched     - scheduling info; rr uses it directly, rm uses its Folded
  ///               and ReadAfterFold members.
  ///   Is2Addr   - when set (the default) the asm string lists two operands
  ///               ($src2, $dst); otherwise three ($src2, $src1, $dst).
  ///
  /// Both records are isCodeGenOnly; only rr is also marked isCommutable.
  multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr,
                             SDPatternOperator OpNode, RegisterClass RC,
                             X86MemOperand x86memop, Domain d,
                             X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    // Register-register form.
    let isCodeGenOnly = 1, isCommutable = 1 in
    def rr : SI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !strconcat(OpcodeStr,
                           !if(Is2Addr,
                               "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
             Sched<[sched]>;

    // Register-memory form: the second source is loaded from addr:$src2.
    let isCodeGenOnly = 1 in
    def rm : SI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !strconcat(OpcodeStr,
                           !if(Is2Addr,
                               "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  /// sse12_fp_scalar_int - Multiclass for the intrinsic ("_Int") flavour of the
  /// SSE1/SSE2 scalar FP operations.
  ///
  /// Both records are declared with hasSideEffects = 0:
  ///   rr_Int - register/register form; the result is typed as VT.
  ///   rm_Int - register/memory form (mayLoad = 1); the memory operand is
  ///            matched through mem_frags.
  /// Is2Addr chooses between the two- and three-operand asm strings.
  multiclass sse12_fp_scalar_int<bits<8> opc,
                                 SDPatternOperator OpNode, RegisterClass RC,
                                 ValueType VT, string asm, Operand memopr,
                                 PatFrags mem_frags, Domain d,
                                 X86FoldableSchedWrite sched,
                                 bit Is2Addr = 1> {
    let hasSideEffects = 0 in {
      def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst),
                          (ins RC:$src1, RC:$src2),
                          !if(Is2Addr,
                              asm # "\t{$src2, $dst|$dst, $src2}",
                              asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                          [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                   Sched<[sched]>;

      let mayLoad = 1 in
      def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst),
                          (ins RC:$src1, memopr:$src2),
                          !if(Is2Addr,
                              asm # "\t{$src2, $dst|$dst, $src2}",
                              asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                          [(set RC:$dst,
                                (VT (OpNode RC:$src1, (mem_frags addr:$src2))))],
                          d>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  /// sse12_fp_packed - Multiclass for the SSE1/SSE2 packed FP operations.
  ///
  ///   rr - register/register form, marked isCommutable; result typed as vt.
  ///   rm - register/memory form (mayLoad = 1); the second source is loaded
  ///        through mem_frag.
  /// Is2Addr chooses between the two- and three-operand asm strings.
  multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr,
                             SDPatternOperator OpNode, RegisterClass RC,
                             ValueType vt, X86MemOperand x86memop,
                             PatFrag mem_frag, Domain d,
                             X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    let isCommutable = 1 in {
      def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;
    }

    let mayLoad = 1 in {
      def rm : PI<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
                  d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  /// sse12_fp_packed_logical_rm - SSE1/SSE2 packed instruction multiclass whose
  /// selection patterns are supplied by the caller.
  ///
  ///   rr - register/register form; commutable, no side effects, uses pat_rr.
  ///   rm - register/memory form; no side effects, mayLoad, uses pat_rm.
  /// Is2Addr chooses between the two- and three-operand asm strings.
  multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                        string OpcodeStr,
                                        X86MemOperand x86memop,
                                        X86FoldableSchedWrite sched,
                                        list<dag> pat_rr, list<dag> pat_rm,
                                        bit Is2Addr = 1> {
    let isCommutable = 1, hasSideEffects = 0 in {
      def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  pat_rr, d>,
               Sched<[sched]>;
    }

    let hasSideEffects = 0, mayLoad = 1 in {
      def rm : PI<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  pat_rm, d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  // Pseudo instructions producing floating-point zero (fld0). They stand in for
  // xorps on SSE or vxorps on AVX and are expanded by ExpandPostRAPseudos.
  // One pseudo exists per result class: FR32, FR64 and VR128 (fp128 zero).
  let isPseudo = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
      canFoldAsLoad = 1, SchedRW = [WriteZero] in {
    def FsFLD0SS   : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                       [(set FR32:$dst, fp32imm0)]>,
                     Requires<[HasSSE1, NoAVX512]>;

    def FsFLD0SD   : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                       [(set FR64:$dst, fp64imm0)]>,
                     Requires<[HasSSE2, NoAVX512]>;

    def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, fp128imm0)]>,
                     Requires<[HasSSE1, NoAVX512]>;
  }
  115. //===----------------------------------------------------------------------===//
  116. // AVX & SSE - Zero/One Vectors
  117. //===----------------------------------------------------------------------===//
  // V_SET0 - pseudo that materializes a 128-bit all-zeros vector.
  // It maps the zero vector to pxor / xorp* for SSE: ExpandPostRAPseudos turns
  // it into an xorps / vxorps, which ExecutionDomainFix may then swizzle to
  // pxor.
  // canFoldAsLoad is set because the pseudo can be converted to a constant-pool
  // load of an all-zeros value if folding it would be beneficial.
  let isPseudo = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
      canFoldAsLoad = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in
  def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;

  // The instruction itself carries the v4f32 pattern; every other 128-bit
  // all-zeros vector type is selected to the same pseudo.
  let Predicates = [NoAVX512] in
  foreach ZeroVT = [v16i8, v8i16, v4i32, v2i64, v2f64] in
    def : Pat<(ZeroVT immAllZerosV), (V_SET0)>;
  // AVX_SET0 - the 256-bit counterpart of the zero-vector pseudo above.
  // The 256-bit AVX1 ISA doesn't support packed integer (PI) operations, and
  // doesn't need them here: on Sandy Bridge the register is set to zero at the
  // rename stage without using any execution unit, so one zeroing pseudo can
  // serve vector integer types without penalty.
  // NOTE(review): the original comment named SET0PSY and SET0PDY, which are not
  // defined in this section; AVX_SET0 appears to be their replacement - confirm.
  let isPseudo = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
      canFoldAsLoad = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in
  def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                   [(set VR256:$dst, (v8i32 immAllZerosV))]>;

  // The instruction itself carries the v8i32 pattern; every other 256-bit
  // all-zeros vector type is selected to the same pseudo.
  let Predicates = [NoAVX512] in
  foreach ZeroVT = [v32i8, v16i16, v4i64, v8f32, v4f64] in
    def : Pat<(ZeroVT immAllZerosV), (AVX_SET0)>;
  // All-ones vector pseudos.
  // canFoldAsLoad is set because these can be converted to a constant-pool
  // load of an all-ones value if folding it would be beneficial.
  let isPseudo = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
      canFoldAsLoad = 1, SchedRW = [WriteZero] in {
    // 128-bit form; no predicate is attached in this scope.
    def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;

    // 256-bit form for AVX1-only targets, selected only under OptForMinSize.
    let Predicates = [HasAVX1Only, OptForMinSize] in
    def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                            [(set VR256:$dst, (v8i32 immAllOnesV))]>;

    // 256-bit form for AVX2 targets.
    let Predicates = [HasAVX2] in {
      def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                              [(set VR256:$dst, (v8i32 immAllOnesV))]>;
    }
  }
  //===----------------------------------------------------------------------===//
  // SSE 1 & 2 - Move FP Scalar Instructions
  //
  // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
  // register copies because it's a partial register update. Register-to-register
  // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
  // that the insert be implementable in terms of a copy and, as just mentioned,
  // we don't use movss/movsd for copies.
  //===----------------------------------------------------------------------===//
  /// sse12_move_rr - Register-to-register scalar move forms.
  ///
  ///   rr     - opcode 0x10, MRMSrcReg; carries the OpNode selection pattern and
  ///            is marked isCommutable.
  ///   rr_REV - opcode 0x11, MRMDestReg; pattern-less alternate encoding kept
  ///            for the disassembler and linked back to Name#rr through
  ///            FoldGenData.
  /// The asm string is base_opc immediately followed by asm_opr.
  multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                           string asm_opr, Domain d, string Name> {
    let isCommutable = 1 in {
      def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  base_opc # asm_opr,
                  [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
                  d>,
               Sched<[SchedWriteFShuffle.XMM]>;
    }

    // For the disassembler
    let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
      def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      base_opc # asm_opr, []>,
                   Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
    }
  }
  /// sse12_move - Scalar move family (register forms, stores and ".s" aliases).
  ///
  /// For a given NAME this emits:
  ///   V#NAME rr/rr_REV - VEX three-operand register forms, selected only under
  ///                      [UseAVX, OptForSize].
  ///   V#NAME#mr        - VEX store of RC to x86memop.
  ///   NAME rr/rr_REV   - legacy two-operand register forms with $src1 tied to
  ///                      $dst, selected only under [pred, NoSSE41_Or_OptForSize].
  ///   NAME#mr          - legacy store of RC to x86memop.
  /// plus two InstAliases that map the ".s" mnemonic spelling onto the rr_REV
  /// encodings.
  multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                        X86MemOperand x86memop, string OpcodeStr,
                        Domain d, string Name, Predicate pred> {
    // AVX
    let Predicates = [UseAVX, OptForSize] in {
      defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                  d, "V"#Name>,
                    VEX_4V, VEX_LIG, VEX_WIG;
    }

    def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                       OpcodeStr # "\t{$src, $dst|$dst, $src}",
                       [(store RC:$src, addr:$dst)], d>,
                    VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;

    // SSE1 & 2
    let Constraints = "$src1 = $dst",
        Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;

    def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     OpcodeStr # "\t{$src, $dst|$dst, $src}",
                     [(store RC:$src, addr:$dst)], d>,
                  Sched<[WriteFStore]>;

    // ".s" suffixed spellings resolve to the reversed (rr_REV) encodings.
    def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    (!cast<Instruction>("V"#NAME#"rr_REV")
                     VR128:$dst, VR128:$src1, VR128:$src2), 0>;
    def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                    (!cast<Instruction>(NAME#"rr_REV")
                     VR128:$dst, VR128:$src2), 0>;
  }
  /// sse12_move_rm - Scalar loads (opcode 0x10, MRMSrcMem).
  ///
  /// Loading from memory automatically zeroes the upper bits, so the VR128
  /// forms are matched through vzloadfrag:
  ///   V#NAME#rm / NAME#rm         - VEX / legacy load into VR128.
  ///   V#NAME#rm_alt / NAME#rm_alt - codegen-only variants that load into the
  ///                                 scalar class RC (FR32/FR64) via mem_pat.
  multiclass sse12_move_rm<RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_pat,
                           PatFrag vzloadfrag, string OpcodeStr, Domain d> {
    def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                       OpcodeStr # "\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                    VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;

    def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     OpcodeStr # "\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                  Sched<[WriteFLoad]>;

    // _alt version uses FR32/FR64 register class.
    let isCodeGenOnly = 1 in {
      def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst),
                             (ins x86memop:$src),
                             OpcodeStr # "\t{$src, $dst|$dst, $src}",
                             [(set RC:$dst, (mem_pat addr:$src))], d>,
                          VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;

      def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst),
                           (ins x86memop:$src),
                           OpcodeStr # "\t{$src, $dst|$dst, $src}",
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                        Sched<[WriteFLoad]>;
    }
  }
  // Register forms, stores and ".s" aliases for movss (XS) and movsd (XD).
  defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                          SSEPackedSingle, "MOVSS", UseSSE1>,
               XS;
  defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                          SSEPackedDouble, "MOVSD", UseSSE2>,
               XD;

  // Load forms; flagged as foldable and rematerializable.
  let canFoldAsLoad = 1, isReMaterializable = 1 in {
    defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32,
                               "movss", SSEPackedSingle>,
                 XS;
    defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64,
                               "movsd", SSEPackedDouble>,
                 XD;
  }
  // Patterns
  let Predicates = [UseAVX] in {
    // scalar_to_vector of a scalar FP load selects the VEX scalar load.
    def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
              (VMOVSSrm addr:$src)>;
    def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
              (VMOVSDrm addr:$src)>;

    // Represent the same patterns above but in the form they appear for
    // 256-bit types: the 128-bit load result is placed in sub_xmm.
    def : Pat<(v8f32 (X86vzload32 addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
    def : Pat<(v4f64 (X86vzload64 addr:$src)),
              (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }
  // X86vzmovl selection through VMOVSSrr, used only under OptForSize.
  let Predicates = [UseAVX, OptForSize] in {
    // Move scalar to XMM zero-extended, zeroing a VR128 then do a
    // MOVSS to the lower bits.
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
              (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
    def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
              (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

    // Move low f32 and clear high bits.
    // The 256-bit forms work on the low xmm half and re-wrap the result.
    def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
              (SUBREG_TO_REG
                (i32 0),
                (v4f32 (VMOVSSrr
                         (v4f32 (V_SET0)),
                         (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))),
                sub_xmm)>;
    def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
              (SUBREG_TO_REG
                (i32 0),
                (v4i32 (VMOVSSrr
                         (v4i32 (V_SET0)),
                         (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))),
                sub_xmm)>;
  }
  // Legacy-SSE counterparts of the scalar move patterns.
  let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
    // Move scalar to XMM zero-extended, zeroing a VR128 then do a
    // MOVSS to the lower bits.
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
              (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
    def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
              (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
  }

  // scalar_to_vector of a scalar FP load selects the plain scalar load.
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
              (MOVSDrm addr:$src)>;
  }

  let Predicates = [UseSSE1] in {
    def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
              (MOVSSrm addr:$src)>;
  }
  297. //===----------------------------------------------------------------------===//
  298. // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
  299. //===----------------------------------------------------------------------===//
  /// sse12_mov_packed - Packed FP move multiclass.
  ///
  ///   rr - register-to-register move; pattern-less, flagged isMoveReg and
  ///        hasSideEffects = 0.
  ///   rm - load through ld_frag; flagged canFoldAsLoad and isReMaterializable.
  /// Scheduling comes from the RR and RM members of `sched`.
  multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                              X86MemOperand x86memop, PatFrag ld_frag,
                              string asm, Domain d,
                              X86SchedWriteMoveLS sched> {
    let hasSideEffects = 0, isMoveReg = 1 in {
      def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                  asm # "\t{$src, $dst|$dst, $src}", [], d>,
               Sched<[sched.RR]>;
    }

    let canFoldAsLoad = 1, isReMaterializable = 1 in {
      def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                  asm # "\t{$src, $dst|$dst, $src}",
                  [(set RC:$dst, (ld_frag addr:$src))], d>,
               Sched<[sched.RM]>;
    }
  }
  // VEX-encoded packed moves. Aligned forms use opcode 0x28 and an aligned-load
  // fragment; unaligned forms use opcode 0x10 and a plain load fragment.
  let Predicates = [HasAVX, NoVLX] in {
    // 128-bit (VR128 / f128mem)
    defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                    "movaps", SSEPackedSingle,
                                    SchedWriteFMoveLS.XMM>,
                   PS, VEX, VEX_WIG;
    defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                    "movapd", SSEPackedDouble,
                                    SchedWriteFMoveLS.XMM>,
                   PD, VEX, VEX_WIG;
    defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                    "movups", SSEPackedSingle,
                                    SchedWriteFMoveLS.XMM>,
                   PS, VEX, VEX_WIG;
    defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                    "movupd", SSEPackedDouble,
                                    SchedWriteFMoveLS.XMM>,
                   PD, VEX, VEX_WIG;

    // 256-bit (VR256 / f256mem), additionally tagged VEX_L
    defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                                     "movaps", SSEPackedSingle,
                                     SchedWriteFMoveLS.YMM>,
                    PS, VEX, VEX_L, VEX_WIG;
    defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                                     "movapd", SSEPackedDouble,
                                     SchedWriteFMoveLS.YMM>,
                    PD, VEX, VEX_L, VEX_WIG;
    defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                                     "movups", SSEPackedSingle,
                                     SchedWriteFMoveLS.YMM>,
                    PS, VEX, VEX_L, VEX_WIG;
    defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                                     "movupd", SSEPackedDouble,
                                     SchedWriteFMoveLS.YMM>,
                    PD, VEX, VEX_L, VEX_WIG;
  }

  // Legacy-encoded single-precision packed moves.
  let Predicates = [UseSSE1] in {
    defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                   "movaps", SSEPackedSingle,
                                   SchedWriteFMoveLS.XMM>,
                  PS;
    defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                   "movups", SSEPackedSingle,
                                   SchedWriteFMoveLS.XMM>,
                  PS;
  }

  // Legacy-encoded double-precision packed moves.
  let Predicates = [UseSSE2] in {
    defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                   "movapd", SSEPackedDouble,
                                   SchedWriteFMoveLS.XMM>,
                  PD;
    defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                   "movupd", SSEPackedDouble,
                                   SchedWriteFMoveLS.XMM>,
                  PD;
  }
  // VEX-encoded packed stores. Aligned forms (opcode 0x29) match alignedstore,
  // unaligned forms (opcode 0x11) match a plain store.

  // 128-bit stores.
  let Predicates = [HasAVX, NoVLX],
      SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
    def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}",
                         [(store (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
    def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}",
                         [(store (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, VEX_WIG;
  } // Predicates, SchedRW (XMM)

  // 256-bit stores, additionally tagged VEX_L.
  let Predicates = [HasAVX, NoVLX],
      SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
    def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs),
                          (ins f256mem:$dst, VR256:$src),
                          "movaps\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs),
                          (ins f256mem:$dst, VR256:$src),
                          "movapd\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs),
                          (ins f256mem:$dst, VR256:$src),
                          "movups\t{$src, $dst|$dst, $src}",
                          [(store (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
    def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs),
                          (ins f256mem:$dst, VR256:$src),
                          "movupd\t{$src, $dst|$dst, $src}",
                          [(store (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, VEX_WIG;
  } // Predicates, SchedRW (YMM)
  // For disassembler: pattern-less MRMDestReg encodings of the VEX packed
  // register moves. Each record names its MRMSrcReg counterpart through
  // FoldGenData.

  // 128-bit reversed forms.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
    def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movaps\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
    def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movapd\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
    def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movups\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
    def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movupd\t{$src, $dst|$dst, $src}", []>,
                        VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
  } // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg, SchedRW

  // 256-bit reversed forms, additionally tagged VEX_L.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
    def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movaps\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
    def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movapd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
    def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movups\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
    def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movupd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
  } // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg, SchedRW

  // Reversed version with ".s" suffix for GAS compatibility.
  // 128-bit operands:
  def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                  (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                  (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                  (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                  (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
  // 256-bit operands:
  def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                  (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
  def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                  (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
  def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                  (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
  def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                  (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
  // Legacy-encoded packed stores. Aligned forms (opcode 0x29) match
  // alignedstore, unaligned forms (opcode 0x11) match a plain store.
  let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
    def MOVAPSmr : PSI<0x29, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;

    def MOVAPDmr : PDI<0x29, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;

    def MOVUPSmr : PSI<0x11, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movups\t{$src, $dst|$dst, $src}",
                       [(store (v4f32 VR128:$src), addr:$dst)]>;

    def MOVUPDmr : PDI<0x11, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movupd\t{$src, $dst|$dst, $src}",
                       [(store (v2f64 VR128:$src), addr:$dst)]>;
  } // SchedRW
  // For disassembler: pattern-less MRMDestReg encodings of the legacy packed
  // register moves. Each record names its MRMSrcReg counterpart through
  // FoldGenData.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
    def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVAPSrr">;

    def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVAPDrr">;

    def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVUPSrr">;

    def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVUPDrr">;
  }

  // Reversed version with ".s" suffix for GAS compatibility.
  def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                  (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                  (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                  (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                  (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
  let Predicates = [HasAVX, NoVLX] in {
    // 256-bit load/store need to use floating point load/store in case we don't
    // have AVX2. Execution domain fixing will convert to integer if AVX2 is
    // available and changing the domain is beneficial.

    // Aligned integer vector loads.
    foreach IntLdFrag = [alignedloadv4i64, alignedloadv8i32,
                         alignedloadv16i16, alignedloadv32i8] in
      def : Pat<(IntLdFrag addr:$src), (VMOVAPSYrm addr:$src)>;

    // Unaligned integer vector loads.
    foreach IntLdFrag = [loadv4i64, loadv8i32, loadv16i16, loadv32i8] in
      def : Pat<(IntLdFrag addr:$src), (VMOVUPSYrm addr:$src)>;

    // Aligned integer vector stores.
    foreach IntVT = [v4i64, v8i32, v16i16, v32i8] in
      def : Pat<(alignedstore (IntVT VR256:$src), addr:$dst),
                (VMOVAPSYmr addr:$dst, VR256:$src)>;

    // Unaligned integer vector stores.
    foreach IntVT = [v4i64, v8i32, v16i16, v32i8] in
      def : Pat<(store (IntVT VR256:$src), addr:$dst),
                (VMOVUPSYmr addr:$dst, VR256:$src)>;
  }
  // Use movaps / movups for SSE integer load / store (one byte shorter).
  // The instructions selected below are then converted to MOVDQA/MOVDQU
  // during the SSE domain pass.
  let Predicates = [UseSSE1] in {
    // Aligned integer vector loads.
    foreach IntLdFrag = [alignedloadv2i64, alignedloadv4i32,
                         alignedloadv8i16, alignedloadv16i8] in
      def : Pat<(IntLdFrag addr:$src), (MOVAPSrm addr:$src)>;

    // Unaligned integer vector loads.
    foreach IntLdFrag = [loadv2i64, loadv4i32, loadv8i16, loadv16i8] in
      def : Pat<(IntLdFrag addr:$src), (MOVUPSrm addr:$src)>;

    // Aligned integer vector stores.
    foreach IntVT = [v2i64, v4i32, v8i16, v16i8] in
      def : Pat<(alignedstore (IntVT VR128:$src), addr:$dst),
                (MOVAPSmr addr:$dst, VR128:$src)>;

    // Unaligned integer vector stores.
    foreach IntVT = [v2i64, v4i32, v8i16, v16i8] in
      def : Pat<(store (IntVT VR128:$src), addr:$dst),
                (MOVUPSmr addr:$dst, VR128:$src)>;
  }
  564. //===----------------------------------------------------------------------===//
  565. // SSE 1 & 2 - Move Low packed FP Instructions
  566. //===----------------------------------------------------------------------===//
  /// sse12_mov_hilo_packed_base - Load forms shared by the move-low and
  /// move-high packed instructions.
  ///
  ///   PSrm - single-precision spelling (base_opc # "s"); pattern-less, so it
  ///          is explicitly marked mayLoad with no side effects.
  ///   PDrm - double-precision spelling (base_opc # "d"); matches pdnode applied
  ///          to $src1 and a scalar_to_vector of an f64 load.
  /// Both read a 64-bit memory operand (f64mem).
  multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
                                        string base_opc, string asm_opr> {
    // No pattern as they need to be special cased between high and low.
    let hasSideEffects = 0, mayLoad = 1 in {
      def PSrm : PI<opc, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                    base_opc # "s" # asm_opr,
                    [], SSEPackedSingle>,
                 PS,
                 Sched<[SchedWriteFShuffle.XMM.Folded,
                        SchedWriteFShuffle.XMM.ReadAfterFold]>;
    }

    def PDrm : PI<opc, MRMSrcMem,
                  (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                  base_opc # "d" # asm_opr,
                  [(set VR128:$dst,
                        (v2f64 (pdnode VR128:$src1,
                                 (scalar_to_vector (loadf64 addr:$src2)))))],
                  SSEPackedDouble>,
               PD,
               Sched<[SchedWriteFShuffle.XMM.Folded,
                      SchedWriteFShuffle.XMM.ReadAfterFold]>;
  }
  /// sse12_mov_hilo_packed - Instantiates the hi/lo load forms twice:
  ///   V#NAME - VEX three-operand forms, selected under [UseAVX].
  ///   NAME   - legacy two-operand forms with $src1 tied to $dst.
  multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                   string base_opc> {
    let Predicates = [UseAVX] in {
      defm V#NAME : sse12_mov_hilo_packed_base<
                      opc, pdnode, base_opc,
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                    VEX_4V, VEX_WIG;
    }

    let Constraints = "$src1 = $dst" in {
      defm NAME : sse12_mov_hilo_packed_base<
                    opc, pdnode, base_opc,
                    "\t{$src2, $dst|$dst, $src2}">;
    }
  }
  // movlps / movlpd loads (opcode 0x12); the PD form matches X86Movsd.
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

  // movlps / movlpd stores (opcode 0x13). The PS forms carry no pattern and are
  // therefore explicitly marked mayStore; the PD forms store element 0 of a
  // v2f64 as an f64.
  let SchedRW = [WriteFStore] in {
    let Predicates = [UseAVX] in {
      let mayStore = 1, hasSideEffects = 0 in {
        def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs),
                             (ins f64mem:$dst, VR128:$src),
                             "movlps\t{$src, $dst|$dst, $src}",
                             []>,
                        VEX, VEX_WIG;
      }

      def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movlpd\t{$src, $dst|$dst, $src}",
                           [(store (f64 (extractelt (v2f64 VR128:$src),
                                                    (iPTR 0))),
                                   addr:$dst)]>,
                      VEX, VEX_WIG;
    } // UseAVX

    let mayStore = 1, hasSideEffects = 0 in {
      def MOVLPSmr : PSI<0x13, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movlps\t{$src, $dst|$dst, $src}",
                         []>;
    }

    def MOVLPDmr : PDI<0x13, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movlpd\t{$src, $dst|$dst, $src}",
                       [(store (f64 (extractelt (v2f64 VR128:$src),
                                                (iPTR 0))),
                               addr:$dst)]>;
  } // SchedRW
  let Predicates = [UseSSE1] in {
    // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
    // end up with a movsd or blend instead of shufp.
    // No need for aligned load, we're only loading 64-bits.
    def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                        (i8 -28)),
              (MOVLPSrm VR128:$src1, addr:$src2)>;
    def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1,
                        (i8 -28)),
              (MOVLPSrm VR128:$src1, addr:$src2)>;

    // A zero-extending 64-bit load becomes a MOVLPS into a zeroed register.
    def : Pat<(v4f32 (X86vzload64 addr:$src)),
              (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;

    // Storing the low 64 bits of a v4f32.
    def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
              (MOVLPSmr addr:$dst, VR128:$src)>;
  }
  631. //===----------------------------------------------------------------------===//
  632. // SSE 1 & 2 - Move Hi packed FP Instructions
  633. //===----------------------------------------------------------------------===//
  // movhps / movhpd loads (opcode 0x16); the PD form matches X86Unpckl.
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

  // movhps / movhpd stores (opcode 0x17). The PS forms carry no pattern and are
  // therefore explicitly marked mayStore.
  let SchedRW = [WriteFStore] in {
    // v2f64 extract element 1 is always custom lowered to unpack high to low
    // and extract element 0 so the non-store version isn't too horrible.
    let Predicates = [UseAVX] in {
      let mayStore = 1, hasSideEffects = 0 in {
        def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs),
                             (ins f64mem:$dst, VR128:$src),
                             "movhps\t{$src, $dst|$dst, $src}",
                             []>,
                        VEX, VEX_WIG;
      }

      def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movhpd\t{$src, $dst|$dst, $src}",
                           [(store (f64 (extractelt
                                          (v2f64 (X86Unpckh VR128:$src,
                                                            VR128:$src)),
                                          (iPTR 0))),
                                   addr:$dst)]>,
                      VEX, VEX_WIG;
    } // UseAVX

    let mayStore = 1, hasSideEffects = 0 in {
      def MOVHPSmr : PSI<0x17, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movhps\t{$src, $dst|$dst, $src}",
                         []>;
    }

    def MOVHPDmr : PDI<0x17, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movhpd\t{$src, $dst|$dst, $src}",
                       [(store (f64 (extractelt
                                      (v2f64 (X86Unpckh VR128:$src,
                                                        VR128:$src)),
                                      (iPTR 0))),
                               addr:$dst)]>;
  } // SchedRW
  let Predicates = [UseAVX] in {
    // MOVHPD patterns
    // Load form: unpack-low of a register with a zero-extending 64-bit load.
    def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
              (VMOVHPDrm VR128:$src1, addr:$src2)>;

    // Store form: element 0 of the vector after X86VPermilpi with immediate 1.
    def : Pat<(store (f64 (extractelt
                            (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                            (iPTR 0))),
                     addr:$dst),
              (VMOVHPDmr addr:$dst, VR128:$src)>;

    // MOVLPD patterns
    def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
              (VMOVLPDrm VR128:$src1, addr:$src2)>;
  }
  let Predicates = [UseSSE1] in {
    // These help select MOVHPS on SSE1-only targets. With SSE2 available we
    // end up with a movsd or blend instead of shufp.
    // An aligned load is not required: only 64 bits are loaded.
    def : Pat<(X86Movlhps VR128:$src1,
                          (v4f32 (simple_load addr:$src2))),
              (MOVHPSrm VR128:$src1, addr:$src2)>;
    def : Pat<(X86Movlhps VR128:$src1,
                          (v4f32 (X86vzload64 addr:$src2))),
              (MOVHPSrm VR128:$src1, addr:$src2)>;

    // 64-bit extract-store of X86Movhlps($src, $src) becomes a MOVHPS store.
    def : Pat<(X86vextractstore64
                (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                addr:$dst),
              (MOVHPSmr addr:$dst, VR128:$src)>;
  } // Predicates = [UseSSE1]
  // Selection patterns for the non-VEX MOVHPD / MOVLPD forms.
  let Predicates = [UseSSE2] in {
    // MOVHPD load: X86Unpckl of a register with an X86vzload64 operand.
    def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                                (X86vzload64 addr:$src2))),
              (MOVHPDrm VR128:$src1, addr:$src2)>;

    // MOVHPD store: element 0 of X86Shufp($src, $src, 1) written to memory.
    def : Pat<(store
                (f64 (extractelt
                       (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                       (iPTR 0))),
                addr:$dst),
              (MOVHPDmr addr:$dst, VR128:$src)>;

    // MOVLPD load: X86Movsd of a register with an X86vzload64 operand.
    def : Pat<(v2f64 (X86Movsd VR128:$src1,
                               (X86vzload64 addr:$src2))),
              (MOVLPDrm VR128:$src1, addr:$src2)>;
  } // Predicates = [UseSSE2]
  // Unless BLENDPD can be used, load into the low bits from a full vector
  // with MOVLPD.
  let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in
  def : Pat<(X86Movsd VR128:$src1,
                      (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  701. //===----------------------------------------------------------------------===//
  702. // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
  703. //===----------------------------------------------------------------------===//
  // VEX-encoded register-register forms with a separate destination operand.
  let Predicates = [UseAVX] in {
    // Selected for X86Movlhps.
    def VMOVLHPSrr : VPSI<0x16, MRMSrcReg,
                          (outs VR128:$dst),
                          (ins VR128:$src1, VR128:$src2),
                          "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                          [(set VR128:$dst,
                                (v4f32 (X86Movlhps VR128:$src1,
                                                   VR128:$src2)))]>,
                     VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;

    // Selected for X86Movhlps; declared commutable and not memory foldable.
    let isCommutable = 1 in
    def VMOVHLPSrr : VPSI<0x12, MRMSrcReg,
                          (outs VR128:$dst),
                          (ins VR128:$src1, VR128:$src2),
                          "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                          [(set VR128:$dst,
                                (v4f32 (X86Movhlps VR128:$src1,
                                                   VR128:$src2)))]>,
                     VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                     NotMemoryFoldable;
  } // Predicates = [UseAVX]
  // Non-VEX register-register forms: $src1 is tied to $dst.
  let Constraints = "$src1 = $dst" in {
    // Selected for X86Movlhps.
    def MOVLHPSrr : PSI<0x16, MRMSrcReg,
                        (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movlhps VR128:$src1,
                                                 VR128:$src2)))]>,
                    Sched<[SchedWriteFShuffle.XMM]>;

    // Selected for X86Movhlps; declared commutable and not memory foldable.
    let isCommutable = 1 in
    def MOVHLPSrr : PSI<0x12, MRMSrcReg,
                        (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movhlps VR128:$src1,
                                                 VR128:$src2)))]>,
                    Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
  } // Constraints = "$src1 = $dst"
  735. //===----------------------------------------------------------------------===//
  736. // SSE 1 & 2 - Conversion Instructions
  737. //===----------------------------------------------------------------------===//
  // Scalar conversion with a selection pattern, in two-operand form.
  //
  //   opc       - opcode byte.
  //   SrcRC     - register class of the source operand.
  //   DstRC     - register class of the destination operand.
  //   OpNode    - pattern operator applied to the source.
  //   x86memop  - memory operand type used by the `rm` form.
  //   ld_frag   - load fragment wrapped around addr:$src in the `rm` pattern.
  //   asm       - mnemonic used by the `rr` form.
  //   mem       - mnemonic used by the `rm` form.
  //   sched     - scheduling class; `rm` uses sched.Folded.
  //   d         - execution domain.
  //   Int2Fpu   - extra SchedRead on the `rr` form (defaults to ReadDefault).
  multiclass sse12_cvt_s<bits<8> opc,
                         RegisterClass SrcRC, RegisterClass DstRC,
                         SDPatternOperator OpNode,
                         X86MemOperand x86memop, PatFrag ld_frag,
                         string asm, string mem,
                         X86FoldableSchedWrite sched,
                         Domain d,
                         SchedRead Int2Fpu = ReadDefault> {
    let ExeDomain = d in {
      // Register source.
      def rr : SI<opc, MRMSrcReg,
                  (outs DstRC:$dst), (ins SrcRC:$src),
                  asm#"\t{$src, $dst|$dst, $src}",
                  [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
               Sched<[sched, Int2Fpu]>;

      // Memory source.
      def rm : SI<opc, MRMSrcMem,
                  (outs DstRC:$dst), (ins x86memop:$src),
                  !strconcat(mem, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
               Sched<[sched.Folded]>;
    } // ExeDomain = d
  }
  // Packed signed-integer to floating-point conversion (any_sint_to_fp).
  //
  //   opc       - opcode byte.
  //   RC        - register class used for both source and destination.
  //   x86memop  - memory operand type used by the `rm` form.
  //   DstTy     - result value type.
  //   SrcTy     - source value type.
  //   ld_frag   - load fragment used in the `rm` pattern.
  //   asm       - full assembly string (mnemonic and operands).
  //   d         - execution domain.
  //   sched     - scheduling class; `rm` uses sched.Folded.
  //
  // Both forms read MXCSR and may raise a floating-point exception.
  multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC,
                         X86MemOperand x86memop,
                         ValueType DstTy, ValueType SrcTy,
                         PatFrag ld_frag, string asm,
                         Domain d, X86FoldableSchedWrite sched> {
    let hasSideEffects = 0 in
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      def rr : I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src),
                 asm,
                 [(set RC:$dst,
                       (DstTy (any_sint_to_fp (SrcTy RC:$src))))],
                 d>,
               Sched<[sched]>;

      let mayLoad = 1 in
      def rm : I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins x86memop:$src),
                 asm,
                 [(set RC:$dst,
                       (DstTy (any_sint_to_fp
                                (SrcTy (ld_frag addr:$src)))))],
                 d>,
               Sched<[sched.Folded]>;
    }
  }
  // Three-operand AVX scalar conversion without selection patterns.
  //
  //   opc       - opcode byte.
  //   SrcRC     - register class of the converted source ($src).
  //   DstRC     - register class of $dst and of the extra input $src1.
  //   x86memop  - memory operand type used by the `rm` form.
  //   asm       - mnemonic.
  //   mem       - suffix placed inside "{...}" after the mnemonic in the
  //               `rm` form.
  //   sched     - scheduling class.
  //   d         - execution domain.
  multiclass sse12_vcvt_avx<bits<8> opc,
                            RegisterClass SrcRC, RegisterClass DstRC,
                            X86MemOperand x86memop,
                            string asm, string mem,
                            X86FoldableSchedWrite sched, Domain d> {
    let Predicates = [UseAVX], ExeDomain = d, hasSideEffects = 0 in {
      def rr : SI<opc, MRMSrcReg,
                  (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
                  asm#"\t{$src, $src1, $dst|$dst, $src1, $src}",
                  []>,
               Sched<[sched, ReadDefault, ReadInt2Fpu]>;

      let mayLoad = 1 in
      def rm : SI<opc, MRMSrcMem,
                  (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src),
                  !strconcat(asm, "{", mem, "}",
                             "\t{$src, $src1, $dst|$dst, $src1, $src}"),
                  []>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    } // hasSideEffects = 0
  }
  // Codegen-only VEX scalar fp-to-int conversions on FR32/FR64 registers.
  // Opcode 0x2C entries match any_fp_to_sint; opcode 0x2D entries match
  // lrint (GR32 result) or llrint (GR64 result). All of them read MXCSR and
  // may raise a floating-point exception.
  let isCodeGenOnly = 1, Predicates = [UseAVX] in
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // --- opcode 0x2C, any_fp_to_sint ---
    defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint,
                                    f32mem, loadf32,
                                    "cvttss2si", "cvttss2si",
                                    WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_LIG;
    defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint,
                                    f32mem, loadf32,
                                    "cvttss2si", "cvttss2si",
                                    WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_W, VEX_LIG;
    defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint,
                                    f64mem, loadf64,
                                    "cvttsd2si", "cvttsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_LIG;
    defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint,
                                    f64mem, loadf64,
                                    "cvttsd2si", "cvttsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_W, VEX_LIG;

    // --- opcode 0x2D, lrint / llrint ---
    defm VCVTSS2SI    : sse12_cvt_s<0x2D, FR32, GR32, lrint,
                                    f32mem, loadf32,
                                    "cvtss2si", "cvtss2si",
                                    WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_LIG;
    defm VCVTSS2SI64  : sse12_cvt_s<0x2D, FR32, GR64, llrint,
                                    f32mem, loadf32,
                                    "cvtss2si", "cvtss2si",
                                    WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_W, VEX_LIG;
    defm VCVTSD2SI    : sse12_cvt_s<0x2D, FR64, GR32, lrint,
                                    f64mem, loadf64,
                                    "cvtsd2si", "cvtsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_LIG;
    defm VCVTSD2SI64  : sse12_cvt_s<0x2D, FR64, GR64, llrint,
                                    f64mem, loadf64,
                                    "cvtsd2si", "cvtsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_W, VEX_LIG;
  }
  // For register-register forms the assembler can tell a 64-bit instruction
  // apart by seeing an rxx register. That is not possible when the source
  // is a memory operand, so explicit "l" and "q" assembly forms are provided
  // where appropriate.
  //
  // NOTE: VCVTSI2SD (GR32 source) is the only entry below declared without
  // SIMD_EXC.
  let isCodeGenOnly = 1 in {
    defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem,
                                      "cvtsi2ss", "l",
                                      WriteCvtI2SS, SSEPackedSingle>,
                       XS, VEX_4V, VEX_LIG, SIMD_EXC;
    defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem,
                                      "cvtsi2ss", "q",
                                      WriteCvtI2SS, SSEPackedSingle>,
                       XS, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
    defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem,
                                      "cvtsi2sd", "l",
                                      WriteCvtI2SD, SSEPackedDouble>,
                       XD, VEX_4V, VEX_LIG;
    defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem,
                                      "cvtsi2sd", "q",
                                      WriteCvtI2SD, SSEPackedDouble>,
                       XD, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
  } // isCodeGenOnly = 1
  // Selection patterns for the codegen-only AVX scalar conversions.
  let Predicates = [UseAVX] in {
    // any_sint_to_fp from memory. The extra $src1 input of the
    // three-operand form is fed with IMPLICIT_DEF.
    def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
              (VCVTSI2SSrm   (f32 (IMPLICIT_DEF)), addr:$src)>;
    def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
              (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
    def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
              (VCVTSI2SDrm   (f64 (IMPLICIT_DEF)), addr:$src)>;
    def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
              (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

    // any_sint_to_fp from a general-purpose register.
    def : Pat<(f32 (any_sint_to_fp GR32:$src)),
              (VCVTSI2SSrr   (f32 (IMPLICIT_DEF)), GR32:$src)>;
    def : Pat<(f32 (any_sint_to_fp GR64:$src)),
              (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
    def : Pat<(f64 (any_sint_to_fp GR32:$src)),
              (VCVTSI2SDrr   (f64 (IMPLICIT_DEF)), GR32:$src)>;
    def : Pat<(f64 (any_sint_to_fp GR64:$src)),
              (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

    // lrint with an i64 result maps to the 64-bit conversion forms.
    def : Pat<(i64 (lrint FR32:$src)),
              (VCVTSS2SI64rr FR32:$src)>;
    def : Pat<(i64 (lrint (loadf32 addr:$src))),
              (VCVTSS2SI64rm addr:$src)>;
    def : Pat<(i64 (lrint FR64:$src)),
              (VCVTSD2SI64rr FR64:$src)>;
    def : Pat<(i64 (lrint (loadf64 addr:$src))),
              (VCVTSD2SI64rm addr:$src)>;
  } // Predicates = [UseAVX]
  // Codegen-only non-VEX scalar conversions on FR32/FR64/GR32/GR64.
  //   0x2C - any_fp_to_sint
  //   0x2D - lrint (GR32 result) / llrint (GR64 result)
  //   0x2A - any_sint_to_fp; these pass ReadInt2Fpu as the extra SchedRead
  //          and use "{l}" / "{q}" suffixed mnemonics for the memory form.
  // NOTE: CVTSI2SD (GR32 source) is the only entry declared without SIMD_EXC.
  let isCodeGenOnly = 1 in {
    // --- opcode 0x2C ---
    defm CVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint,
                                   f32mem, loadf32,
                                   "cvttss2si", "cvttss2si",
                                   WriteCvtSS2I, SSEPackedSingle>,
                       XS, SIMD_EXC;
    defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint,
                                   f32mem, loadf32,
                                   "cvttss2si", "cvttss2si",
                                   WriteCvtSS2I, SSEPackedSingle>,
                       XS, REX_W, SIMD_EXC;
    defm CVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint,
                                   f64mem, loadf64,
                                   "cvttsd2si", "cvttsd2si",
                                   WriteCvtSD2I, SSEPackedDouble>,
                       XD, SIMD_EXC;
    defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint,
                                   f64mem, loadf64,
                                   "cvttsd2si", "cvttsd2si",
                                   WriteCvtSD2I, SSEPackedDouble>,
                       XD, REX_W, SIMD_EXC;

    // --- opcode 0x2D ---
    defm CVTSS2SI    : sse12_cvt_s<0x2D, FR32, GR32, lrint,
                                   f32mem, loadf32,
                                   "cvtss2si", "cvtss2si",
                                   WriteCvtSS2I, SSEPackedSingle>,
                       XS, SIMD_EXC;
    defm CVTSS2SI64  : sse12_cvt_s<0x2D, FR32, GR64, llrint,
                                   f32mem, loadf32,
                                   "cvtss2si", "cvtss2si",
                                   WriteCvtSS2I, SSEPackedSingle>,
                       XS, REX_W, SIMD_EXC;
    defm CVTSD2SI    : sse12_cvt_s<0x2D, FR64, GR32, lrint,
                                   f64mem, loadf64,
                                   "cvtsd2si", "cvtsd2si",
                                   WriteCvtSD2I, SSEPackedDouble>,
                       XD, SIMD_EXC;
    defm CVTSD2SI64  : sse12_cvt_s<0x2D, FR64, GR64, llrint,
                                   f64mem, loadf64,
                                   "cvtsd2si", "cvtsd2si",
                                   WriteCvtSD2I, SSEPackedDouble>,
                       XD, REX_W, SIMD_EXC;

    // --- opcode 0x2A ---
    defm CVTSI2SS    : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp,
                                   i32mem, loadi32,
                                   "cvtsi2ss", "cvtsi2ss{l}",
                                   WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
                       XS, SIMD_EXC;
    defm CVTSI642SS  : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp,
                                   i64mem, loadi64,
                                   "cvtsi2ss", "cvtsi2ss{q}",
                                   WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
                       XS, REX_W, SIMD_EXC;
    defm CVTSI2SD    : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp,
                                   i32mem, loadi32,
                                   "cvtsi2sd", "cvtsi2sd{l}",
                                   WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
                       XD;
    defm CVTSI642SD  : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp,
                                   i64mem, loadi64,
                                   "cvtsi2sd", "cvtsi2sd{q}",
                                   WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
                       XD, REX_W, SIMD_EXC;
  } // isCodeGenOnly = 1
  // lrint with an i64 result from a single-precision source (SSE1).
  let Predicates = [UseSSE1] in {
    def : Pat<(i64 (lrint FR32:$src)),
              (CVTSS2SI64rr FR32:$src)>;
    def : Pat<(i64 (lrint (loadf32 addr:$src))),
              (CVTSS2SI64rm addr:$src)>;
  } // Predicates = [UseSSE1]

  // lrint with an i64 result from a double-precision source (SSE2).
  let Predicates = [UseSSE2] in {
    def : Pat<(i64 (lrint FR64:$src)),
              (CVTSD2SI64rr FR64:$src)>;
    def : Pat<(i64 (lrint (loadf64 addr:$src))),
              (CVTSD2SI64rm addr:$src)>;
  } // Predicates = [UseSSE2]
  // Conversion instruction intrinsics: match intrinsics which expect MM
  // and/or XMM operand(s).
  //
  //   opc        - opcode byte.
  //   SrcRC      - register class of the source operand.
  //   DstRC      - register class of the destination operand.
  //   DstVT      - result value type of the matched node.
  //   SrcVT      - source value type of the matched node.
  //   OpNode     - SDNode being matched.
  //   memop      - memory operand type used by the `rm_Int` form.
  //   mem_frags  - load fragments used in the `rm_Int` pattern.
  //   asm        - mnemonic.
  //   sched      - scheduling class; `rm_Int` uses sched.Folded.
  //   d          - execution domain.
  multiclass sse12_cvt_sint<bits<8> opc,
                            RegisterClass SrcRC, RegisterClass DstRC,
                            ValueType DstVT, ValueType SrcVT,
                            SDNode OpNode,
                            Operand memop, PatFrags mem_frags,
                            string asm,
                            X86FoldableSchedWrite sched, Domain d> {
    let ExeDomain = d in {
      def rr_Int : SI<opc, MRMSrcReg,
                      (outs DstRC:$dst), (ins SrcRC:$src),
                      asm#"\t{$src, $dst|$dst, $src}",
                      [(set DstRC:$dst,
                            (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                   Sched<[sched]>;

      def rm_Int : SI<opc, MRMSrcMem,
                      (outs DstRC:$dst), (ins memop:$src),
                      asm#"\t{$src, $dst|$dst, $src}",
                      [(set DstRC:$dst,
                            (DstVT (OpNode
                                     (SrcVT (mem_frags addr:$src)))))]>,
                   Sched<[sched.Folded]>;
    } // ExeDomain = d
  }
  // Pattern-less int-to-fp conversion with an extra $src1 input of the
  // destination register class.
  //
  //   opc       - opcode byte.
  //   SrcRC     - register class of the converted source ($src2).
  //   DstRC     - register class of $dst and $src1.
  //   x86memop  - memory operand type used by the `rm_Int` form.
  //   asm       - mnemonic.
  //   mem       - suffix placed inside "{...}" after the mnemonic in the
  //               `rm_Int` form.
  //   sched     - scheduling class.
  //   d         - execution domain.
  //   Is2Addr   - when set (the default) the assembly string prints only
  //               $src2 and $dst; otherwise it also prints $src1.
  multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                  RegisterClass DstRC,
                                  X86MemOperand x86memop,
                                  string asm, string mem,
                                  X86FoldableSchedWrite sched,
                                  Domain d, bit Is2Addr = 1> {
    let ExeDomain = d, hasSideEffects = 0 in {
      def rr_Int : SI<opc, MRMSrcReg,
                      (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                      !strconcat(asm,
                        !if(Is2Addr,
                            "\t{$src2, $dst|$dst, $src2}",
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      []>,
                   Sched<[sched, ReadDefault, ReadInt2Fpu]>;

      let mayLoad = 1 in
      def rm_Int : SI<opc, MRMSrcMem,
                      (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2),
                      !strconcat(asm, "{", mem, "}",
                        !if(Is2Addr,
                            "\t{$src2, $dst|$dst, $src2}",
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      []>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  // VR128-source forms of cvtsd2si matching X86cvts2si. Both the VEX and the
  // non-VEX variants read MXCSR and may raise a floating-point exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    let Predicates = [UseAVX] in {
      defm VCVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                        X86cvts2si, sdmem, sse_load_f64,
                                        "cvtsd2si",
                                        WriteCvtSD2I, SSEPackedDouble>,
                         XD, VEX, VEX_LIG;
      defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                        X86cvts2si, sdmem, sse_load_f64,
                                        "cvtsd2si",
                                        WriteCvtSD2I, SSEPackedDouble>,
                         XD, VEX, VEX_W, VEX_LIG;
    } // Predicates = [UseAVX]

    defm CVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                     X86cvts2si, sdmem, sse_load_f64,
                                     "cvtsd2si",
                                     WriteCvtSD2I, SSEPackedDouble>,
                      XD;
    defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                     X86cvts2si, sdmem, sse_load_f64,
                                     "cvtsd2si",
                                     WriteCvtSD2I, SSEPackedDouble>,
                      XD, REX_W;
  } // Uses = [MXCSR], mayRaiseFPException = 1
  // VEX int-to-fp forms on VR128. Is2Addr is passed as 0, so the assembly
  // string prints $src1 as well.
  // NOTE: VCVTSI2SD (GR32 source) is the only entry without SIMD_EXC.
  let Predicates = [UseAVX] in {
    defm VCVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                            "cvtsi2ss", "l",
                                            WriteCvtI2SS, SSEPackedSingle, 0>,
                       XS, VEX_4V, VEX_LIG, SIMD_EXC;
    defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                            "cvtsi2ss", "q",
                                            WriteCvtI2SS, SSEPackedSingle, 0>,
                       XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
    defm VCVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                            "cvtsi2sd", "l",
                                            WriteCvtI2SD, SSEPackedDouble, 0>,
                       XD, VEX_4V, VEX_LIG;
    defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                            "cvtsi2sd", "q",
                                            WriteCvtI2SD, SSEPackedDouble, 0>,
                       XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
  } // Predicates = [UseAVX]
  // Non-VEX int-to-fp forms on VR128 with $src1 tied to $dst. Is2Addr keeps
  // its default, so only $src2 and $dst are printed.
  // NOTE: CVTSI2SD (GR32 source) is the only entry without SIMD_EXC.
  let Constraints = "$src1 = $dst" in {
    defm CVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                           "cvtsi2ss", "l",
                                           WriteCvtI2SS, SSEPackedSingle>,
                      XS, SIMD_EXC;
    defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                           "cvtsi2ss", "q",
                                           WriteCvtI2SS, SSEPackedSingle>,
                      XS, REX_W, SIMD_EXC;
    defm CVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                           "cvtsi2sd", "l",
                                           WriteCvtI2SD, SSEPackedDouble>,
                      XD;
    defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                           "cvtsi2sd", "q",
                                           WriteCvtI2SD, SSEPackedDouble>,
                      XD, REX_W, SIMD_EXC;
  } // Constraints = "$src1 = $dst"
  // AT&T-syntax aliases for (v)cvtsi2ss / (v)cvtsi2sd. Every alias is
  // declared with an Emit argument of 0 and the "att" variant.

  // VEX, register source: explicit {l}/{q} suffix.
  def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                  0, "att">;
  def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                  0, "att">;
  def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                  0, "att">;
  def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                  0, "att">;

  // VEX, memory source: unsuffixed mnemonic maps to the i32mem form.
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                  0, "att">;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                  (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                  0, "att">;

  // Non-VEX, register source: explicit {l}/{q} suffix.
  def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                  (CVTSI2SSrr_Int VR128:$dst, GR32:$src),
                  0, "att">;
  def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                  (CVTSI642SSrr_Int VR128:$dst, GR64:$src),
                  0, "att">;
  def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                  (CVTSI2SDrr_Int VR128:$dst, GR32:$src),
                  0, "att">;
  def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                  (CVTSI642SDrr_Int VR128:$dst, GR64:$src),
                  0, "att">;

  // Non-VEX, memory source: unsuffixed mnemonic maps to the i32mem form.
  def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                  (CVTSI2SSrm_Int VR128:$dst, i32mem:$src),
                  0, "att">;
  def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                  (CVTSI2SDrm_Int VR128:$dst, i32mem:$src),
                  0, "att">;
  /// SSE 1 Only

  // Aliases for intrinsics
  //
  // VEX VR128-source truncating conversions (opcode 0x2C) matching
  // X86cvtts2Int. All of them read MXCSR and may raise a floating-point
  // exception.
  //
  // The double-precision entries are scheduled as WriteCvtSD2I, in line with
  // the non-VEX CVTTSD2SI/CVTTSD2SI64 intrinsic forms and the codegen-only
  // VCVTTSD2SI/VCVTTSD2SI64 definitions in this file. They were previously
  // tagged WriteCvtSS2I, the single-precision class.
  let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VCVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32,
                                       X86cvtts2Int, ssmem, sse_load_f32,
                                       "cvttss2si",
                                       WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_LIG;
    defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                       X86cvtts2Int, ssmem, sse_load_f32,
                                       "cvttss2si",
                                       WriteCvtSS2I, SSEPackedSingle>,
                        XS, VEX, VEX_LIG, VEX_W;
    defm VCVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64,
                                       X86cvtts2Int, sdmem, sse_load_f64,
                                       "cvttsd2si",
                                       WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_LIG;
    defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                       X86cvtts2Int, sdmem, sse_load_f64,
                                       "cvttsd2si",
                                       WriteCvtSD2I, SSEPackedDouble>,
                        XD, VEX, VEX_LIG, VEX_W;
  }
  // Non-VEX VR128-source truncating conversions (opcode 0x2C) matching
  // X86cvtts2Int. All of them read MXCSR and may raise a floating-point
  // exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm CVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32,
                                      X86cvtts2Int, ssmem, sse_load_f32,
                                      "cvttss2si",
                                      WriteCvtSS2I, SSEPackedSingle>,
                       XS;
    defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                      X86cvtts2Int, ssmem, sse_load_f32,
                                      "cvttss2si",
                                      WriteCvtSS2I, SSEPackedSingle>,
                       XS, REX_W;
    defm CVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64,
                                      X86cvtts2Int, sdmem, sse_load_f64,
                                      "cvttsd2si",
                                      WriteCvtSD2I, SSEPackedDouble>,
                       XD;
    defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                      X86cvtts2Int, sdmem, sse_load_f64,
                                      "cvttsd2si",
                                      WriteCvtSD2I, SSEPackedDouble>,
                       XD, REX_W;
  } // Uses = [MXCSR], mayRaiseFPException = 1
  // AT&T-syntax {l}/{q} suffixed aliases for the truncating conversions.
  // Every alias is declared with an Emit argument of 0 and the "att"
  // variant; each mnemonic gets a register and a memory flavour.

  // VEX, 32-bit destination.
  def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                  0, "att">;
  def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                  0, "att">;

  // VEX, 64-bit destination.
  def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                  0, "att">;
  def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                  0, "att">;

  // Non-VEX, 32-bit destination.
  def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                  0, "att">;
  def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                  0, "att">;

  // Non-VEX, 64-bit destination.
  def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                  0, "att">;
  def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                  0, "att">;
  // VEX VR128-source forms of cvtss2si (opcode 0x2D) matching X86cvts2si.
  // Both read MXCSR and may raise a floating-point exception.
  let Predicates = [UseAVX] in
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32,
                                      X86cvts2si, ssmem, sse_load_f32,
                                      "cvtss2si",
                                      WriteCvtSS2I, SSEPackedSingle>,
                       XS, VEX, VEX_LIG;
    defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32,
                                      X86cvts2si, ssmem, sse_load_f32,
                                      "cvtss2si",
                                      WriteCvtSS2I, SSEPackedSingle>,
                       XS, VEX, VEX_W, VEX_LIG;
  }
  // Everything in this group reads MXCSR and may raise a floating-point
  // exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Non-VEX VR128-source forms of cvtss2si (opcode 0x2D) matching
    // X86cvts2si.
    defm CVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32,
                                     X86cvts2si, ssmem, sse_load_f32,
                                     "cvtss2si",
                                     WriteCvtSS2I, SSEPackedSingle>,
                      XS;
    defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32,
                                     X86cvts2si, ssmem, sse_load_f32,
                                     "cvtss2si",
                                     WriteCvtSS2I, SSEPackedSingle>,
                      XS, REX_W;

    // Packed i32 -> f32 conversions (opcode 0x5B). The VEX forms require
    // HasAVX and NoVLX; the non-VEX form requires UseSSE2 and uses the
    // memop load fragment instead of load.
    defm VCVTDQ2PS  : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                                  "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                  SSEPackedSingle, WriteCvtI2PS>,
                      PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
    defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                                  "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                  SSEPackedSingle, WriteCvtI2PSY>,
                      PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
    defm CVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                                  "cvtdq2ps\t{$src, $dst|$dst, $src}",
                                  SSEPackedSingle, WriteCvtI2PS>,
                      PS, Requires<[UseSSE2]>;
  } // Uses = [MXCSR], mayRaiseFPException = 1
  // AT&T-syntax {l}/{q} suffixed aliases for (v)cvtss2si / (v)cvtsd2si.
  // Every alias is declared with an Emit argument of 0 and the "att"
  // variant; each mnemonic gets a register and a memory flavour.

  // AVX aliases, 32-bit destination.
  def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src),
                  0, "att">;
  def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src),
                  0, "att">;

  // AVX aliases, 64-bit destination.
  def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src),
                  0, "att">;
  def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src),
                  0, "att">;

  // SSE aliases, 32-bit destination.
  def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTSS2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTSS2SIrm_Int GR32:$dst, ssmem:$src),
                  0, "att">;
  def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTSD2SIrr_Int GR32:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                  (CVTSD2SIrm_Int GR32:$dst, sdmem:$src),
                  0, "att">;

  // SSE aliases, 64-bit destination.
  def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTSS2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src),
                  0, "att">;
  def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTSD2SI64rr_Int GR64:$dst, VR128:$src),
                  0, "att">;
  def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                  (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src),
                  0, "att">;
  /// SSE 2 Only

  // Convert scalar double to scalar single
  //
  // Codegen-only VEX forms on FR32/FR64. Neither carries a selection
  // pattern; the register form is selected through the Pat that follows.
  let isCodeGenOnly = 1, hasSideEffects = 0 in
  let Predicates = [UseAVX], ExeDomain = SSEPackedSingle in {
    // NOTE: declared through VSDI with an unprefixed "cvtsd2ss" string,
    // whereas the memory form below goes through I with "vcvtsd2ss".
    def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg,
                           (outs FR32:$dst),
                           (ins FR32:$src1, FR64:$src2),
                           "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                           []>,
                      VEX_4V, VEX_LIG, VEX_WIG,
                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;

    let mayLoad = 1 in
    def VCVTSD2SSrm : I<0x5A, MRMSrcMem,
                        (outs FR32:$dst),
                        (ins FR32:$src1, f64mem:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        []>,
                      XD, VEX_4V, VEX_LIG, VEX_WIG,
                      Sched<[WriteCvtSD2SS.Folded,
                             WriteCvtSD2SS.ReadAfterFold]>,
                      SIMD_EXC;
  }

  // any_fpround on an FR64 register; $src1 is fed with IMPLICIT_DEF.
  def : Pat<(f32 (any_fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
        Requires<[UseAVX]>;
  // Codegen-only non-VEX forms of cvtsd2ss on FR32/FR64, both matching
  // any_fpround. The memory form additionally requires OptForSize.
  let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
    def CVTSD2SSrr : SDI<0x5A, MRMSrcReg,
                         (outs FR32:$dst), (ins FR64:$src),
                         "cvtsd2ss\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (any_fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>, SIMD_EXC;

    def CVTSD2SSrm : I<0x5A, MRMSrcMem,
                       (outs FR32:$dst), (ins f64mem:$src),
                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst,
                             (any_fpround (loadf64 addr:$src)))]>,
                     XD, Requires<[UseSSE2, OptForSize]>,
                     Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
  }
  // VR128 forms of (v)cvtsd2ss matching X86frounds. All four read MXCSR and
  // may raise a floating-point exception.
  let ExeDomain = SSEPackedSingle in
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX encodings, separate destination operand.
    def VCVTSD2SSrr_Int : I<0x5A, MRMSrcReg,
                            (outs VR128:$dst),
                            (ins VR128:$src1, VR128:$src2),
                            "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            [(set VR128:$dst,
                                  (v4f32 (X86frounds VR128:$src1,
                                                     (v2f64 VR128:$src2))))]>,
                          XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                          Sched<[WriteCvtSD2SS]>;

    def VCVTSD2SSrm_Int : I<0x5A, MRMSrcMem,
                            (outs VR128:$dst),
                            (ins VR128:$src1, sdmem:$src2),
                            "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            [(set VR128:$dst,
                                  (v4f32 (X86frounds
                                           VR128:$src1,
                                           (sse_load_f64 addr:$src2))))]>,
                          XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                          Sched<[WriteCvtSD2SS.Folded,
                                 WriteCvtSD2SS.ReadAfterFold]>;

    // Non-VEX encodings, $src1 tied to $dst.
    let Constraints = "$src1 = $dst" in {
      def CVTSD2SSrr_Int : I<0x5A, MRMSrcReg,
                             (outs VR128:$dst),
                             (ins VR128:$src1, VR128:$src2),
                             "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                             [(set VR128:$dst,
                                   (v4f32 (X86frounds VR128:$src1,
                                                      (v2f64 VR128:$src2))))]>,
                           XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;

      def CVTSD2SSrm_Int : I<0x5A, MRMSrcMem,
                             (outs VR128:$dst),
                             (ins VR128:$src1, sdmem:$src2),
                             "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                             [(set VR128:$dst,
                                   (v4f32 (X86frounds
                                            VR128:$src1,
                                            (sse_load_f64 addr:$src2))))]>,
                           XD, Requires<[UseSSE2]>,
                           Sched<[WriteCvtSD2SS.Folded,
                                  WriteCvtSD2SS.ReadAfterFold]>;
    } // Constraints = "$src1 = $dst"
  }
  // Convert scalar single to scalar double
  // SSE2 instructions with XS prefix
  //
  // Codegen-only VEX forms on FR32/FR64. Neither carries a selection
  // pattern; they are selected through the two Pats that follow. The memory
  // form additionally requires OptForSize.
  let ExeDomain = SSEPackedSingle in
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def VCVTSS2SDrr : I<0x5A, MRMSrcReg,
                        (outs FR64:$dst),
                        (ins FR64:$src1, FR32:$src2),
                        "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        []>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG,
                      Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;

    let mayLoad = 1 in
    def VCVTSS2SDrm : I<0x5A, MRMSrcMem,
                        (outs FR64:$dst),
                        (ins FR64:$src1, f32mem:$src2),
                        "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        []>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG,
                      Sched<[WriteCvtSS2SD.Folded,
                             WriteCvtSS2SD.ReadAfterFold]>,
                      Requires<[UseAVX, OptForSize]>, SIMD_EXC;
  } // isCodeGenOnly = 1, hasSideEffects = 0

  // any_fpextend on an FR32 register; $src1 is fed with IMPLICIT_DEF.
  def : Pat<(f64 (any_fpextend FR32:$src)),
            (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>,
        Requires<[UseAVX]>;

  // any_fpextend of an f32 load; folded only under OptForSize.
  def : Pat<(any_fpextend (loadf32 addr:$src)),
            (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
        Requires<[UseAVX, OptForSize]>;
  // Codegen-only non-VEX forms of cvtss2sd on FR32/FR64, both matching
  // any_fpextend. The memory form additionally requires OptForSize.
  let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
    def CVTSS2SDrr : I<0x5A, MRMSrcReg,
                       (outs FR64:$dst), (ins FR32:$src),
                       "cvtss2sd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                     XS, Requires<[UseSSE2]>,
                     Sched<[WriteCvtSS2SD]>, SIMD_EXC;

    def CVTSS2SDrm : I<0x5A, MRMSrcMem,
                       (outs FR64:$dst), (ins f32mem:$src),
                       "cvtss2sd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst,
                             (any_fpextend (loadf32 addr:$src)))]>,
                     XS, Requires<[UseSSE2, OptForSize]>,
                     Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
  } // isCodeGenOnly = 1
  // VR128 forms of (v)cvtss2sd. None of them carries a selection pattern.
  // All four read MXCSR and may raise a floating-point exception.
  let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX encodings, separate destination operand. These require HasAVX.
    def VCVTSS2SDrr_Int : I<0x5A, MRMSrcReg,
                            (outs VR128:$dst),
                            (ins VR128:$src1, VR128:$src2),
                            "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            []>,
                          XS, VEX_4V, VEX_LIG, VEX_WIG,
                          Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;

    let mayLoad = 1 in
    def VCVTSS2SDrm_Int : I<0x5A, MRMSrcMem,
                            (outs VR128:$dst),
                            (ins VR128:$src1, ssmem:$src2),
                            "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            []>,
                          XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                          Sched<[WriteCvtSS2SD.Folded,
                                 WriteCvtSS2SD.ReadAfterFold]>;

    // SSE2 instructions with XS prefix; $src1 is tied to $dst.
    let Constraints = "$src1 = $dst" in {
      def CVTSS2SDrr_Int : I<0x5A, MRMSrcReg,
                             (outs VR128:$dst),
                             (ins VR128:$src1, VR128:$src2),
                             "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                             []>,
                           XS, Requires<[UseSSE2]>,
                           Sched<[WriteCvtSS2SD]>;

      let mayLoad = 1 in
      def CVTSS2SDrm_Int : I<0x5A, MRMSrcMem,
                             (outs VR128:$dst),
                             (ins VR128:$src1, ssmem:$src2),
                             "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                             []>,
                           XS, Requires<[UseSSE2]>,
                           Sched<[WriteCvtSS2SD.Folded,
                                  WriteCvtSS2SD.ReadAfterFold]>;
    } // Constraints = "$src1 = $dst"
  } // hasSideEffects = 0
  // Patterns for the (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and (v)cvtss2sd
  // intrinsic sequences produced by clang, which would otherwise yield
  // unnecessary vmovs{s,d} instructions. Each one folds
  // "X86Movss/X86Movsd $dst, (scalar_to_vector (convert ...))" into the
  // matching _Int instruction.
  let Predicates = [UseAVX] in {
    // f64 -> f32 on element 0 of $src.
    def : Pat<(v4f32 (X86Movss
                       (v4f32 VR128:$dst),
                       (v4f32 (scalar_to_vector
                                (f32 (any_fpround
                                       (f64 (extractelt VR128:$src,
                                                        (iPTR 0))))))))),
              (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

    // f32 -> f64 on element 0 of $src.
    def : Pat<(v2f64 (X86Movsd
                       (v2f64 VR128:$dst),
                       (v2f64 (scalar_to_vector
                                (f64 (any_fpextend
                                       (f32 (extractelt VR128:$src,
                                                        (iPTR 0))))))))),
              (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

    // i64 -> f32, register and memory source.
    def : Pat<(v4f32 (X86Movss
                       (v4f32 VR128:$dst),
                       (v4f32 (scalar_to_vector
                                (f32 (any_sint_to_fp GR64:$src)))))),
              (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
    def : Pat<(v4f32 (X86Movss
                       (v4f32 VR128:$dst),
                       (v4f32 (scalar_to_vector
                                (f32 (any_sint_to_fp
                                       (loadi64 addr:$src))))))),
              (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

    // i32 -> f32, register and memory source.
    def : Pat<(v4f32 (X86Movss
                       (v4f32 VR128:$dst),
                       (v4f32 (scalar_to_vector
                                (f32 (any_sint_to_fp GR32:$src)))))),
              (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
    def : Pat<(v4f32 (X86Movss
                       (v4f32 VR128:$dst),
                       (v4f32 (scalar_to_vector
                                (f32 (any_sint_to_fp
                                       (loadi32 addr:$src))))))),
              (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

    // i64 -> f64, register and memory source.
    def : Pat<(v2f64 (X86Movsd
                       (v2f64 VR128:$dst),
                       (v2f64 (scalar_to_vector
                                (f64 (any_sint_to_fp GR64:$src)))))),
              (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
    def : Pat<(v2f64 (X86Movsd
                       (v2f64 VR128:$dst),
                       (v2f64 (scalar_to_vector
                                (f64 (any_sint_to_fp
                                       (loadi64 addr:$src))))))),
              (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

    // i32 -> f64, register and memory source.
    def : Pat<(v2f64 (X86Movsd
                       (v2f64 VR128:$dst),
                       (v2f64 (scalar_to_vector
                                (f64 (any_sint_to_fp GR32:$src)))))),
              (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
    def : Pat<(v2f64 (X86Movsd
                       (v2f64 VR128:$dst),
                       (v2f64 (scalar_to_vector
                                (f64 (any_sint_to_fp
                                       (loadi32 addr:$src))))))),
              (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
  } // Predicates = [UseAVX]
  1300. let Predicates = [UseSSE2] in {
  1301. def : Pat<(v4f32 (X86Movss
  1302. (v4f32 VR128:$dst),
  1303. (v4f32 (scalar_to_vector
  1304. (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
  1305. (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
  1306. def : Pat<(v2f64 (X86Movsd
  1307. (v2f64 VR128:$dst),
  1308. (v2f64 (scalar_to_vector
  1309. (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
  1310. (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
  1311. def : Pat<(v2f64 (X86Movsd
  1312. (v2f64 VR128:$dst),
  1313. (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
  1314. (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
  1315. def : Pat<(v2f64 (X86Movsd
  1316. (v2f64 VR128:$dst),
  1317. (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
  1318. (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
  1319. def : Pat<(v2f64 (X86Movsd
  1320. (v2f64 VR128:$dst),
  1321. (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
  1322. (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
  1323. def : Pat<(v2f64 (X86Movsd
  1324. (v2f64 VR128:$dst),
  1325. (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
  1326. (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
  1327. } // Predicates = [UseSSE2]
  1328. let Predicates = [UseSSE1] in {
  1329. def : Pat<(v4f32 (X86Movss
  1330. (v4f32 VR128:$dst),
  1331. (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
  1332. (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
  1333. def : Pat<(v4f32 (X86Movss
  1334. (v4f32 VR128:$dst),
  1335. (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
  1336. (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
  1337. def : Pat<(v4f32 (X86Movss
  1338. (v4f32 VR128:$dst),
  1339. (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
  1340. (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
  1341. def : Pat<(v4f32 (X86Movss
  1342. (v4f32 VR128:$dst),
  1343. (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
  1344. (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
  1345. } // Predicates = [UseSSE1]
  1346. let Predicates = [HasAVX, NoVLX] in {
  1347. // Convert packed single/double fp to doubleword
  1348. def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1349. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1350. [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
  1351. VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
  1352. def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1353. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1354. [(set VR128:$dst,
  1355. (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
  1356. VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
  1357. def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  1358. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1359. [(set VR256:$dst,
  1360. (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
  1361. VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
  1362. def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
  1363. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1364. [(set VR256:$dst,
  1365. (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
  1366. VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
  1367. }
  1368. def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1369. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1370. [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
  1371. Sched<[WriteCvtPS2I]>, SIMD_EXC;
  1372. def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1373. "cvtps2dq\t{$src, $dst|$dst, $src}",
  1374. [(set VR128:$dst,
  1375. (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
  1376. Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
  1377. // Convert Packed Double FP to Packed DW Integers
  1378. let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1379. // The assembler can recognize rr 256-bit instructions by seeing a ymm
  1380. // register, but the same isn't true when using memory operands instead.
  1381. // Provide other assembly rr and rm forms to address this explicitly.
  1382. def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1383. "vcvtpd2dq\t{$src, $dst|$dst, $src}",
  1384. [(set VR128:$dst,
  1385. (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
  1386. VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
  1387. // XMM only
  1388. def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1389. "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
  1390. [(set VR128:$dst,
  1391. (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
  1392. Sched<[WriteCvtPD2ILd]>, VEX_WIG;
  1393. // YMM only
  1394. def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
  1395. "vcvtpd2dq\t{$src, $dst|$dst, $src}",
  1396. [(set VR128:$dst,
  1397. (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
  1398. VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
  1399. def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
  1400. "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
  1401. [(set VR128:$dst,
  1402. (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
  1403. VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
  1404. }
  1405. def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
  1406. (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
  1407. def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
  1408. (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
  1409. def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1410. "cvtpd2dq\t{$src, $dst|$dst, $src}",
  1411. [(set VR128:$dst,
  1412. (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
  1413. Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
  1414. def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1415. "cvtpd2dq\t{$src, $dst|$dst, $src}",
  1416. [(set VR128:$dst,
  1417. (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
  1418. Sched<[WriteCvtPD2I]>, SIMD_EXC;
  1419. // Convert with truncation packed single/double fp to doubleword
  1420. // SSE2 packed instructions with XS prefix
  1421. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  1422. let Predicates = [HasAVX, NoVLX] in {
  1423. def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1424. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1425. [(set VR128:$dst,
  1426. (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
  1427. VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
  1428. def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1429. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1430. [(set VR128:$dst,
  1431. (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
  1432. VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
  1433. def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
  1434. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1435. [(set VR256:$dst,
  1436. (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
  1437. VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
  1438. def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
  1439. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1440. [(set VR256:$dst,
  1441. (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
  1442. VEX, VEX_L,
  1443. Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
  1444. }
  1445. def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1446. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1447. [(set VR128:$dst,
  1448. (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
  1449. Sched<[WriteCvtPS2I]>;
  1450. def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1451. "cvttps2dq\t{$src, $dst|$dst, $src}",
  1452. [(set VR128:$dst,
  1453. (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
  1454. Sched<[WriteCvtPS2ILd]>;
  1455. }
  1456. // The assembler can recognize rr 256-bit instructions by seeing a ymm
  1457. // register, but the same isn't true when using memory operands instead.
  1458. // Provide other assembly rr and rm forms to address this explicitly.
  1459. let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1460. // XMM only
  1461. def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1462. "cvttpd2dq\t{$src, $dst|$dst, $src}",
  1463. [(set VR128:$dst,
  1464. (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
  1465. VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
  1466. def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1467. "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
  1468. [(set VR128:$dst,
  1469. (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
  1470. VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
  1471. // YMM only
  1472. def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
  1473. "cvttpd2dq\t{$src, $dst|$dst, $src}",
  1474. [(set VR128:$dst,
  1475. (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
  1476. VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
  1477. def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
  1478. "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
  1479. [(set VR128:$dst,
  1480. (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
  1481. VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
  1482. } // Predicates = [HasAVX, NoVLX]
  1483. def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
  1484. (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
  1485. def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
  1486. (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
  1487. let Predicates = [HasAVX, NoVLX] in {
  1488. def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
  1489. (VCVTTPD2DQYrr VR256:$src)>;
  1490. def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
  1491. (VCVTTPD2DQYrm addr:$src)>;
  1492. }
  1493. def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1494. "cvttpd2dq\t{$src, $dst|$dst, $src}",
  1495. [(set VR128:$dst,
  1496. (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
  1497. Sched<[WriteCvtPD2I]>, SIMD_EXC;
  1498. def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
  1499. "cvttpd2dq\t{$src, $dst|$dst, $src}",
  1500. [(set VR128:$dst,
  1501. (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
  1502. Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
  1503. // Convert packed single to packed double
  1504. let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1505. // SSE2 instructions without OpSize prefix
  1506. def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1507. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1508. [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
  1509. PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
  1510. def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
  1511. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1512. [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
  1513. PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
  1514. def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
  1515. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1516. [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
  1517. PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
  1518. def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
  1519. "vcvtps2pd\t{$src, $dst|$dst, $src}",
  1520. [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
  1521. PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
  1522. }
  1523. let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1524. def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1525. "cvtps2pd\t{$src, $dst|$dst, $src}",
  1526. [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
  1527. PS, Sched<[WriteCvtPS2PD]>;
  1528. def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
  1529. "cvtps2pd\t{$src, $dst|$dst, $src}",
  1530. [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
  1531. PS, Sched<[WriteCvtPS2PD.Folded]>;
  1532. }
  1533. // Convert Packed DW Integers to Packed Double FP
  1534. let Predicates = [HasAVX, NoVLX] in {
  1535. let hasSideEffects = 0, mayLoad = 1 in
  1536. def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  1537. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1538. [(set VR128:$dst,
  1539. (v2f64 (X86any_VSintToFP
  1540. (bc_v4i32
  1541. (v2i64 (scalar_to_vector
  1542. (loadi64 addr:$src)))))))]>,
  1543. VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
  1544. def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1545. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1546. [(set VR128:$dst,
  1547. (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
  1548. VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
  1549. def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
  1550. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1551. [(set VR256:$dst,
  1552. (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
  1553. VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
  1554. VEX_WIG;
  1555. def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
  1556. "vcvtdq2pd\t{$src, $dst|$dst, $src}",
  1557. [(set VR256:$dst,
  1558. (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
  1559. VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
  1560. }
  1561. let hasSideEffects = 0, mayLoad = 1 in
  1562. def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  1563. "cvtdq2pd\t{$src, $dst|$dst, $src}",
  1564. [(set VR128:$dst,
  1565. (v2f64 (X86any_VSintToFP
  1566. (bc_v4i32
  1567. (v2i64 (scalar_to_vector
  1568. (loadi64 addr:$src)))))))]>,
  1569. Sched<[WriteCvtI2PDLd]>;
  1570. def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1571. "cvtdq2pd\t{$src, $dst|$dst, $src}",
  1572. [(set VR128:$dst,
  1573. (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
  1574. Sched<[WriteCvtI2PD]>;
  1575. // AVX register conversion intrinsics
  1576. let Predicates = [HasAVX, NoVLX] in {
  1577. def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  1578. (VCVTDQ2PDrm addr:$src)>;
  1579. } // Predicates = [HasAVX, NoVLX]
  1580. // SSE2 register conversion intrinsics
  1581. let Predicates = [UseSSE2] in {
  1582. def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  1583. (CVTDQ2PDrm addr:$src)>;
  1584. } // Predicates = [UseSSE2]
  1585. // Convert packed double to packed single
  1586. // The assembler can recognize rr 256-bit instructions by seeing a ymm
  1587. // register, but the same isn't true when using memory operands instead.
  1588. // Provide other assembly rr and rm forms to address this explicitly.
  1589. let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  1590. // XMM only
  1591. def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1592. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1593. [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
  1594. VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
  1595. def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1596. "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
  1597. [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
  1598. VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
  1599. def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
  1600. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1601. [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
  1602. VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
  1603. def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
  1604. "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
  1605. [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
  1606. VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
  1607. } // Predicates = [HasAVX, NoVLX]
  1608. def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
  1609. (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
  1610. def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
  1611. (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
  1612. def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
  1613. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1614. [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
  1615. Sched<[WriteCvtPD2PS]>, SIMD_EXC;
  1616. def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
  1617. "cvtpd2ps\t{$src, $dst|$dst, $src}",
  1618. [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
  1619. Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
  1620. //===----------------------------------------------------------------------===//
  1621. // SSE 1 & 2 - Compare Instructions
  1622. //===----------------------------------------------------------------------===//
  1623. // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
  1624. multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
  1625. Operand memop, SDNode OpNode, ValueType VT,
  1626. PatFrag ld_frag, string asm,
  1627. X86FoldableSchedWrite sched,
  1628. PatFrags mem_frags> {
  1629. def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
  1630. (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
  1631. [(set VR128:$dst, (OpNode (VT VR128:$src1),
  1632. VR128:$src2, timm:$cc))]>,
  1633. Sched<[sched]>, SIMD_EXC;
  1634. let mayLoad = 1 in
  1635. def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
  1636. (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
  1637. [(set VR128:$dst, (OpNode (VT VR128:$src1),
  1638. (mem_frags addr:$src2), timm:$cc))]>,
  1639. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1640. let isCodeGenOnly = 1 in {
  1641. let isCommutable = 1 in
  1642. def rr : SIi8<0xC2, MRMSrcReg,
  1643. (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
  1644. [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
  1645. Sched<[sched]>, SIMD_EXC;
  1646. def rm : SIi8<0xC2, MRMSrcMem,
  1647. (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
  1648. [(set RC:$dst, (OpNode RC:$src1,
  1649. (ld_frag addr:$src2), timm:$cc))]>,
  1650. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1651. }
  1652. }
  1653. let ExeDomain = SSEPackedSingle in
  1654. defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
  1655. "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1656. SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
  1657. XS, VEX_4V, VEX_LIG, VEX_WIG;
  1658. let ExeDomain = SSEPackedDouble in
  1659. defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
  1660. "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1661. SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
  1662. XD, VEX_4V, VEX_LIG, VEX_WIG;
  1663. let Constraints = "$src1 = $dst" in {
  1664. let ExeDomain = SSEPackedSingle in
  1665. defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
  1666. "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1667. SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  1668. let ExeDomain = SSEPackedDouble in
  1669. defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
  1670. "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1671. SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
  1672. }
  1673. // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
  1674. multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
  1675. ValueType vt, X86MemOperand x86memop,
  1676. PatFrag ld_frag, string OpcodeStr, Domain d,
  1677. X86FoldableSchedWrite sched = WriteFComX> {
  1678. let ExeDomain = d in {
  1679. def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
  1680. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1681. [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
  1682. Sched<[sched]>, SIMD_EXC;
  1683. let mayLoad = 1 in
  1684. def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
  1685. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1686. [(set EFLAGS, (OpNode (vt RC:$src1),
  1687. (ld_frag addr:$src2)))]>,
  1688. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1689. }
  1690. }
  1691. // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
  1692. multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
  1693. ValueType vt, Operand memop,
  1694. PatFrags mem_frags, string OpcodeStr,
  1695. Domain d,
  1696. X86FoldableSchedWrite sched = WriteFComX> {
  1697. let ExeDomain = d in {
  1698. def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
  1699. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1700. [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
  1701. Sched<[sched]>, SIMD_EXC;
  1702. let mayLoad = 1 in
  1703. def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
  1704. !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
  1705. [(set EFLAGS, (OpNode (vt RC:$src1),
  1706. (mem_frags addr:$src2)))]>,
  1707. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1708. }
  1709. }
  1710. let Defs = [EFLAGS] in {
  1711. defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
  1712. "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1713. defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
  1714. "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1715. defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
  1716. "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1717. defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
  1718. "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1719. let isCodeGenOnly = 1 in {
  1720. defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
  1721. sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1722. defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
  1723. sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1724. defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
  1725. sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  1726. defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
  1727. sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  1728. }
  1729. defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
  1730. "ucomiss", SSEPackedSingle>, PS;
  1731. defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
  1732. "ucomisd", SSEPackedDouble>, PD;
  1733. defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
  1734. "comiss", SSEPackedSingle>, PS;
  1735. defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
  1736. "comisd", SSEPackedDouble>, PD;
  1737. let isCodeGenOnly = 1 in {
  1738. defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
  1739. sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
  1740. defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
  1741. sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
  1742. defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
  1743. sse_load_f32, "comiss", SSEPackedSingle>, PS;
  1744. defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
  1745. sse_load_f64, "comisd", SSEPackedDouble>, PD;
  1746. }
  1747. } // Defs = [EFLAGS]
  1748. // sse12_cmp_packed - sse 1 & 2 compare packed instructions
  1749. multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
  1750. ValueType VT, string asm,
  1751. X86FoldableSchedWrite sched,
  1752. Domain d, PatFrag ld_frag> {
  1753. let isCommutable = 1 in
  1754. def rri : PIi8<0xC2, MRMSrcReg,
  1755. (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
  1756. [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
  1757. Sched<[sched]>, SIMD_EXC;
  1758. def rmi : PIi8<0xC2, MRMSrcMem,
  1759. (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
  1760. [(set RC:$dst,
  1761. (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
  1762. Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  1763. }
  1764. defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
  1765. "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1766. SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
  1767. defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
  1768. "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1769. SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
  1770. defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
  1771. "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1772. SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
  1773. defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
  1774. "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
  1775. SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
  1776. let Constraints = "$src1 = $dst" in {
  1777. defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
  1778. "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1779. SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  1780. defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
  1781. "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
  1782. SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
  1783. }
  1784. def CommutableCMPCC : PatLeaf<(timm), [{
  1785. uint64_t Imm = N->getZExtValue() & 0x7;
  1786. return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
  1787. }]>;
  1788. // Patterns to select compares with loads in first operand.
  1789. let Predicates = [HasAVX] in {
  1790. def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
  1791. CommutableCMPCC:$cc)),
  1792. (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
  1793. def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
  1794. CommutableCMPCC:$cc)),
  1795. (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
  1796. def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
  1797. CommutableCMPCC:$cc)),
  1798. (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1799. def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
  1800. CommutableCMPCC:$cc)),
  1801. (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1802. def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
  1803. CommutableCMPCC:$cc)),
  1804. (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
  1805. def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
  1806. CommutableCMPCC:$cc)),
  1807. (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
  1808. }
  1809. let Predicates = [UseSSE2] in {
  1810. def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
  1811. CommutableCMPCC:$cc)),
  1812. (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1813. def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
  1814. CommutableCMPCC:$cc)),
  1815. (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
  1816. }
  1817. let Predicates = [UseSSE1] in {
  1818. def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
  1819. CommutableCMPCC:$cc)),
  1820. (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
  1821. def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
  1822. CommutableCMPCC:$cc)),
  1823. (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
  1824. }
  1825. //===----------------------------------------------------------------------===//
  1826. // SSE 1 & 2 - Shuffle Instructions
  1827. //===----------------------------------------------------------------------===//
  1828. /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
  1829. multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
  1830. ValueType vt, string asm, PatFrag mem_frag,
  1831. X86FoldableSchedWrite sched, Domain d,
  1832. bit IsCommutable = 0> {
  1833. def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
  1834. (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
  1835. [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
  1836. (i8 timm:$src3))))], d>,
  1837. Sched<[sched.Folded, sched.ReadAfterFold]>;
  1838. let isCommutable = IsCommutable in
  1839. def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
  1840. (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
  1841. [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
  1842. (i8 timm:$src3))))], d>,
  1843. Sched<[sched]>;
  1844. }
  1845. let Predicates = [HasAVX, NoVLX] in {
  1846. defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
  1847. "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1848. loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
  1849. PS, VEX_4V, VEX_WIG;
  1850. defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
  1851. "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1852. loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
  1853. PS, VEX_4V, VEX_L, VEX_WIG;
  1854. defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
  1855. "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1856. loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
  1857. PD, VEX_4V, VEX_WIG;
  1858. defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
  1859. "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
  1860. loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
  1861. PD, VEX_4V, VEX_L, VEX_WIG;
  1862. }
  1863. let Constraints = "$src1 = $dst" in {
  1864. defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
  1865. "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  1866. memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1867. defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
  1868. "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  1869. memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  1870. }
  1871. //===----------------------------------------------------------------------===//
  1872. // SSE 1 & 2 - Unpack FP Instructions
  1873. //===----------------------------------------------------------------------===//
  1874. /// sse12_unpack_interleave - SSE 1 & 2 FP unpack and interleave.
        ///
        /// Defines two instructions that share opcode `opc`, asm string `asm` and
        /// execution domain `d`:
        ///   rr - both sources are registers of class RC.
        ///   rm - $src2 is an `x86memop` memory operand matched through `mem_frag`.
        /// Both select `(vt (OpNode $src1, $src2))` into RC:$dst.
        ///   sched        - scheduling info; rr uses `sched`, rm uses
        ///                  `sched.Folded` plus `sched.ReadAfterFold`.
        ///   IsCommutable - copied to isCommutable on the rr form only (the `let`
        ///                  below has no braces, so it covers just the next def).
  1875. multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
  1876. PatFrag mem_frag, RegisterClass RC,
  1877. X86MemOperand x86memop, string asm,
  1878. X86FoldableSchedWrite sched, Domain d,
  1879. bit IsCommutable = 0> {
  1880. let isCommutable = IsCommutable in
  1881. def rr : PI<opc, MRMSrcReg,
  1882. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  1883. asm, [(set RC:$dst,
  1884. (vt (OpNode RC:$src1, RC:$src2)))], d>,
  1885. Sched<[sched]>;
  1886. def rm : PI<opc, MRMSrcMem,
  1887. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  1888. asm, [(set RC:$dst,
  1889. (vt (OpNode RC:$src1,
  1890. (mem_frag addr:$src2))))], d>,
  1891. Sched<[sched.Folded, sched.ReadAfterFold]>;
  1892. }
  1893. let Predicates = [HasAVX, NoVLX] in {
        // VEX-encoded unpacks: opcode 0x15 is the high (X86Unpckh) form, 0x14 the
        // low (X86Unpckl) form. All use the generic `load` fragment. Only
        // VUNPCKHPD passes IsCommutable = 1.
        // 128-bit (VR128) variants.
  1894. defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
  1895. VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1896. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
  1897. defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
  1898. VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1899. SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
  1900. defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
  1901. VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1902. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
  1903. defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
  1904. VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1905. SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
        // 256-bit (VR256, VEX_L) variants.
  1906. defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
  1907. VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1908. SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
  1909. defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
  1910. VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1911. SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
  1912. defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
  1913. VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1914. SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
  1915. defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
  1916. VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  1917. SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
  1918. }// Predicates = [HasAVX, NoVLX]
  1919. let Constraints = "$src1 = $dst" in {
        // Legacy (non-VEX) 128-bit unpacks: $src1 is tied to $dst, the asm strings
        // list only $src2 and $dst, and memory operands use the `memop` fragment.
        // Only UNPCKHPD passes IsCommutable = 1.
  1920. defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
  1921. VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
  1922. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1923. defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
  1924. VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
  1925. SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  1926. defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
  1927. VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
  1928. SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  1929. defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
  1930. VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
  1931. SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
  1932. } // Constraints = "$src1 = $dst"
  1933. let Predicates = [HasAVX1Only] in {
        // With AVX1 only, select the 256-bit FP unpack instructions for integer
        // unpack nodes: v8i32 maps to the PS forms and v4i64 to the PD forms.
        // NOTE(review): presumably needed because the 256-bit integer unpack
        // instructions require AVX2 -- confirm.
  1934. def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
  1935. (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  1936. def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
  1937. (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  1938. def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
  1939. (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  1940. def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
  1941. (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
  1942. def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
  1943. (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  1944. def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
  1945. (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  1946. def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
  1947. (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  1948. def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
  1949. (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
  1950. }
  1951. let Predicates = [UseSSE2] in {
  1952. // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
        // The pattern matches a v2f64 X86Unpckl whose second operand is a
        // `simple_load`, whereas the legacy UNPCKLPD rm form is instantiated with
        // the `memop` fragment.
  1953. def : Pat<(v2f64 (X86Unpckl VR128:$src1,
  1954. (v2f64 (simple_load addr:$src2)))),
  1955. (MOVHPDrm VR128:$src1, addr:$src2)>;
  1956. }
  1957. //===----------------------------------------------------------------------===//
  1958. // SSE 1 & 2 - Extract Floating-Point Sign mask
  1959. //===----------------------------------------------------------------------===//
  1960. /// sse12_extr_sign_mask - SSE 1 & 2 extract FP sign mask (movmskps/movmskpd).
        ///
        /// Defines a single register form `rr` with opcode 0x50 that selects
        /// `(X86movmsk (vt RC:$src))` into a GR32orGR64 destination. The operand
        /// list "\t{$src, $dst|$dst, $src}" is appended to the mnemonic `asm`.
  1961. multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
  1962. string asm, Domain d> {
  1963. def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
  1964. !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
  1965. [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
  1966. Sched<[WriteFMOVMSK]>;
  1967. }
  1968. let Predicates = [HasAVX] in {
        // VEX-encoded sign-mask extraction for 128-bit (VR128) and 256-bit
        // (VR256, VEX_L) single- and double-precision vectors.
  1969. defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
  1970. SSEPackedSingle>, PS, VEX, VEX_WIG;
  1971. defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
  1972. SSEPackedDouble>, PD, VEX, VEX_WIG;
  1973. defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
  1974. SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  1975. defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
  1976. SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
  1977. // Also support integer VTs to avoid an int->fp bitcast in the DAG.
        // 32-bit-element vectors use the PS forms, 64-bit-element vectors the PD forms.
  1978. def : Pat<(X86movmsk (v4i32 VR128:$src)),
  1979. (VMOVMSKPSrr VR128:$src)>;
  1980. def : Pat<(X86movmsk (v2i64 VR128:$src)),
  1981. (VMOVMSKPDrr VR128:$src)>;
  1982. def : Pat<(X86movmsk (v8i32 VR256:$src)),
  1983. (VMOVMSKPSYrr VR256:$src)>;
  1984. def : Pat<(X86movmsk (v4i64 VR256:$src)),
  1985. (VMOVMSKPDYrr VR256:$src)>;
  1986. }
  1987. defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", // legacy (non-VEX) encoding
  1988. SSEPackedSingle>, PS;
  1989. defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", // legacy (non-VEX) encoding
  1990. SSEPackedDouble>, PD;
  1991. let Predicates = [UseSSE2] in {
  1992. // Also support integer VTs to avoid an int->fp bitcast in the DAG.
        // v4i32 selects MOVMSKPS and v2i64 selects MOVMSKPD.
  1993. def : Pat<(X86movmsk (v4i32 VR128:$src)),
  1994. (MOVMSKPSrr VR128:$src)>;
  1995. def : Pat<(X86movmsk (v2i64 VR128:$src)),
  1996. (MOVMSKPDrr VR128:$src)>;
  1997. }
  1998. //===---------------------------------------------------------------------===//
  1999. // SSE2 - Packed Integer Logical Instructions
  2000. //===---------------------------------------------------------------------===//
  2001. let ExeDomain = SSEPackedInt in { // SSE integer instructions
  2002. /// PDI_binop_rm - Simple SSE2 binary operator.
        ///
        /// Emits a register-register (rr) and a register-memory (rm) instruction
        /// that select `(OpVT (OpNode $src1, $src2))` into RC:$dst.
        ///   memop_frag   - fragment wrapping the address of the rm form's $src2.
        ///   sched        - rr uses `sched`; rm uses `sched.Folded` plus
        ///                  `sched.ReadAfterFold`.
        ///   IsCommutable - copied to isCommutable on the rr form only.
        ///   Is2Addr      - when set, the asm string lists only $src2 and $dst;
        ///                  otherwise it lists $src2, $src1 and $dst.
  2003. multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
  2004. ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
  2005. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  2006. bit IsCommutable, bit Is2Addr> {
  2007. let isCommutable = IsCommutable in
  2008. def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
  2009. (ins RC:$src1, RC:$src2),
  2010. !if(Is2Addr,
  2011. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  2012. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  2013. [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
  2014. Sched<[sched]>;
  2015. def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
  2016. (ins RC:$src1, x86memop:$src2),
  2017. !if(Is2Addr,
  2018. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  2019. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  2020. [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
  2021. Sched<[sched.Folded, sched.ReadAfterFold]>;
  2022. }
  2023. } // ExeDomain = SSEPackedInt
        /// PDI_binop_all - Instantiates PDI_binop_rm three times for one operation:
        ///   V#NAME   - 128-bit form, "v"-prefixed mnemonic, three-operand asm
        ///              string, guarded by [HasAVX, prd].
        ///   NAME     - 128-bit form with $src1 tied to $dst, two-operand asm
        ///              string, `memop` fragment; no predicate is attached here.
        ///   V#NAME#Y - 256-bit form (VEX_L), guarded by [HasAVX2, prd].
  2024. multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
  2025. ValueType OpVT128, ValueType OpVT256,
  2026. X86SchedWriteWidths sched, bit IsCommutable,
  2027. Predicate prd> {
  2028. let Predicates = [HasAVX, prd] in
  2029. defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
  2030. VR128, load, i128mem, sched.XMM,
  2031. IsCommutable, 0>, VEX_4V, VEX_WIG;
  2032. let Constraints = "$src1 = $dst" in
  2033. defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
  2034. memop, i128mem, sched.XMM, IsCommutable, 1>;
  2035. let Predicates = [HasAVX2, prd] in
  2036. defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
  2037. OpVT256, VR256, load, i256mem, sched.YMM,
  2038. IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
  2039. }
  2040. // These are ordered here for pattern ordering requirements with the fp versions
        // Each op is instantiated on v2i64 (128-bit) and v4i64 (256-bit) with NoVLX
        // as the extra predicate. PAND/POR/PXOR are commutable; PANDN is not.
  2041. defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
  2042. SchedWriteVecLogic, 1, NoVLX>;
  2043. defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
  2044. SchedWriteVecLogic, 1, NoVLX>;
  2045. defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
  2046. SchedWriteVecLogic, 1, NoVLX>;
  2047. defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
  2048. SchedWriteVecLogic, 0, NoVLX>;
  2049. //===----------------------------------------------------------------------===//
  2050. // SSE 1 & 2 - Logical Instructions
  2051. //===----------------------------------------------------------------------===//
  2052. /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
  2053. ///
  2054. /// There are no patterns here because isel prefers integer versions for SSE2
  2055. /// and later. There are SSE1 v4f32 patterns later.
        ///
        /// For mnemonic stem `OpcodeStr` this defines, under [HasAVX, NoVLX], the
        /// VEX forms V#NAME#PSY/PDY (VR256) and V#NAME#PS/PD (VR128), plus the
        /// legacy NAME#PS/PD forms with $src1 tied to $dst. Every instantiation
        /// passes two empty pattern lists (`[], []`).
        /// NOTE(review): the trailing `0` on the VEX forms is an argument of
        /// sse12_fp_packed_logical_rm, which is declared outside this excerpt
        /// (presumably Is2Addr = 0) -- confirm.
  2056. multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
  2057. X86SchedWriteWidths sched> {
  2058. let Predicates = [HasAVX, NoVLX] in {
  2059. defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
  2060. !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
  2061. [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  2062. defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
  2063. !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
  2064. [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  2065. defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
  2066. !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
  2067. [], [], 0>, PS, VEX_4V, VEX_WIG;
  2068. defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
  2069. !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
  2070. [], [], 0>, PD, VEX_4V, VEX_WIG;
  2071. }
  2072. let Constraints = "$src1 = $dst" in {
  2073. defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
  2074. !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
  2075. [], []>, PS;
  2076. defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
  2077. !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
  2078. [], []>, PD;
  2079. }
  2080. }
        // Packed FP logical instructions (and/or/xor/andn, opcodes 0x54-0x57).
  2081. defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
  2082. defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
  2083. defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
        // ANDN is explicitly marked non-commutable.
  2084. let isCommutable = 0 in
  2085. defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
  2086. let Predicates = [HasAVX2, NoVLX] in {
        // Select the 256-bit integer logic instructions for v32i8/v16i16/v8i32.
        // v4i64 is not listed: it is the OpVT256 given to the PDI_binop_all
        // instantiations of PAND/POR/PXOR/PANDN.
        // Register-register forms.
  2087. def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
  2088. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2089. def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
  2090. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2091. def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
  2092. (VPANDYrr VR256:$src1, VR256:$src2)>;
  2093. def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
  2094. (VPORYrr VR256:$src1, VR256:$src2)>;
  2095. def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
  2096. (VPORYrr VR256:$src1, VR256:$src2)>;
  2097. def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
  2098. (VPORYrr VR256:$src1, VR256:$src2)>;
  2099. def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
  2100. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2101. def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
  2102. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2103. def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
  2104. (VPXORYrr VR256:$src1, VR256:$src2)>;
  2105. def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
  2106. (VPANDNYrr VR256:$src1, VR256:$src2)>;
  2107. def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
  2108. (VPANDNYrr VR256:$src1, VR256:$src2)>;
  2109. def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
  2110. (VPANDNYrr VR256:$src1, VR256:$src2)>;
        // Register-memory forms (second operand folded from a load).
  2111. def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
  2112. (VPANDYrm VR256:$src1, addr:$src2)>;
  2113. def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
  2114. (VPANDYrm VR256:$src1, addr:$src2)>;
  2115. def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
  2116. (VPANDYrm VR256:$src1, addr:$src2)>;
  2117. def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
  2118. (VPORYrm VR256:$src1, addr:$src2)>;
  2119. def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
  2120. (VPORYrm VR256:$src1, addr:$src2)>;
  2121. def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
  2122. (VPORYrm VR256:$src1, addr:$src2)>;
  2123. def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
  2124. (VPXORYrm VR256:$src1, addr:$src2)>;
  2125. def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
  2126. (VPXORYrm VR256:$src1, addr:$src2)>;
  2127. def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
  2128. (VPXORYrm VR256:$src1, addr:$src2)>;
  2129. def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
  2130. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2131. def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
  2132. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2133. def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
  2134. (VPANDNYrm VR256:$src1, addr:$src2)>;
  2135. }
  2136. // If only AVX1 is supported, we need to handle integer operations with
  2137. // floating point instructions since the integer versions aren't available.
        // All four 256-bit integer types (v32i8, v16i16, v8i32, v4i64) are mapped
        // onto the packed-single (PS) 256-bit logical instructions.
  2138. let Predicates = [HasAVX1Only] in {
        // Register-register forms.
  2139. def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
  2140. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2141. def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
  2142. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2143. def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
  2144. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2145. def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
  2146. (VANDPSYrr VR256:$src1, VR256:$src2)>;
  2147. def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
  2148. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2149. def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
  2150. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2151. def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
  2152. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2153. def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
  2154. (VORPSYrr VR256:$src1, VR256:$src2)>;
  2155. def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
  2156. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2157. def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
  2158. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2159. def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
  2160. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2161. def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
  2162. (VXORPSYrr VR256:$src1, VR256:$src2)>;
  2163. def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
  2164. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2165. def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
  2166. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2167. def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
  2168. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  2169. def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
  2170. (VANDNPSYrr VR256:$src1, VR256:$src2)>;
        // Register-memory forms (second operand folded from a load).
  2171. def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
  2172. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2173. def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
  2174. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2175. def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
  2176. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2177. def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
  2178. (VANDPSYrm VR256:$src1, addr:$src2)>;
  2179. def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
  2180. (VORPSYrm VR256:$src1, addr:$src2)>;
  2181. def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
  2182. (VORPSYrm VR256:$src1, addr:$src2)>;
  2183. def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
  2184. (VORPSYrm VR256:$src1, addr:$src2)>;
  2185. def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
  2186. (VORPSYrm VR256:$src1, addr:$src2)>;
  2187. def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
  2188. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2189. def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
  2190. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2191. def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
  2192. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2193. def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
  2194. (VXORPSYrm VR256:$src1, addr:$src2)>;
  2195. def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
  2196. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2197. def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
  2198. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2199. def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
  2200. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2201. def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
  2202. (VANDNPSYrm VR256:$src1, addr:$src2)>;
  2203. }
  2204. let Predicates = [HasAVX, NoVLX] in {
        // Select the VEX 128-bit integer logic instructions for v16i8/v8i16/v4i32.
        // v2i64 is not listed: it is the OpVT128 given to the PDI_binop_all
        // instantiations of PAND/POR/PXOR/PANDN.
        // Register-register forms.
  2205. def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
  2206. (VPANDrr VR128:$src1, VR128:$src2)>;
  2207. def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
  2208. (VPANDrr VR128:$src1, VR128:$src2)>;
  2209. def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
  2210. (VPANDrr VR128:$src1, VR128:$src2)>;
  2211. def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
  2212. (VPORrr VR128:$src1, VR128:$src2)>;
  2213. def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
  2214. (VPORrr VR128:$src1, VR128:$src2)>;
  2215. def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
  2216. (VPORrr VR128:$src1, VR128:$src2)>;
  2217. def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
  2218. (VPXORrr VR128:$src1, VR128:$src2)>;
  2219. def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
  2220. (VPXORrr VR128:$src1, VR128:$src2)>;
  2221. def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
  2222. (VPXORrr VR128:$src1, VR128:$src2)>;
  2223. def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
  2224. (VPANDNrr VR128:$src1, VR128:$src2)>;
  2225. def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
  2226. (VPANDNrr VR128:$src1, VR128:$src2)>;
  2227. def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
  2228. (VPANDNrr VR128:$src1, VR128:$src2)>;
        // Register-memory forms (second operand folded from a load).
  2229. def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
  2230. (VPANDrm VR128:$src1, addr:$src2)>;
  2231. def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
  2232. (VPANDrm VR128:$src1, addr:$src2)>;
  2233. def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
  2234. (VPANDrm VR128:$src1, addr:$src2)>;
  2235. def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
  2236. (VPORrm VR128:$src1, addr:$src2)>;
  2237. def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
  2238. (VPORrm VR128:$src1, addr:$src2)>;
  2239. def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
  2240. (VPORrm VR128:$src1, addr:$src2)>;
  2241. def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
  2242. (VPXORrm VR128:$src1, addr:$src2)>;
  2243. def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
  2244. (VPXORrm VR128:$src1, addr:$src2)>;
  2245. def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
  2246. (VPXORrm VR128:$src1, addr:$src2)>;
  2247. def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
  2248. (VPANDNrm VR128:$src1, addr:$src2)>;
  2249. def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
  2250. (VPANDNrm VR128:$src1, addr:$src2)>;
  2251. def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
  2252. (VPANDNrm VR128:$src1, addr:$src2)>;
  2253. }
  2254. let Predicates = [UseSSE2] in {
        // Select the legacy 128-bit integer logic instructions for
        // v16i8/v8i16/v4i32. v2i64 is not listed: it is the OpVT128 given to the
        // PDI_binop_all instantiations of PAND/POR/PXOR/PANDN.
        // Register-register forms.
  2255. def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
  2256. (PANDrr VR128:$src1, VR128:$src2)>;
  2257. def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
  2258. (PANDrr VR128:$src1, VR128:$src2)>;
  2259. def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
  2260. (PANDrr VR128:$src1, VR128:$src2)>;
  2261. def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
  2262. (PORrr VR128:$src1, VR128:$src2)>;
  2263. def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
  2264. (PORrr VR128:$src1, VR128:$src2)>;
  2265. def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
  2266. (PORrr VR128:$src1, VR128:$src2)>;
  2267. def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
  2268. (PXORrr VR128:$src1, VR128:$src2)>;
  2269. def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
  2270. (PXORrr VR128:$src1, VR128:$src2)>;
  2271. def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
  2272. (PXORrr VR128:$src1, VR128:$src2)>;
  2273. def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
  2274. (PANDNrr VR128:$src1, VR128:$src2)>;
  2275. def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
  2276. (PANDNrr VR128:$src1, VR128:$src2)>;
  2277. def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
  2278. (PANDNrr VR128:$src1, VR128:$src2)>;
        // Register-memory forms; these use the memop* fragments rather than load*.
  2279. def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
  2280. (PANDrm VR128:$src1, addr:$src2)>;
  2281. def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
  2282. (PANDrm VR128:$src1, addr:$src2)>;
  2283. def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
  2284. (PANDrm VR128:$src1, addr:$src2)>;
  2285. def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
  2286. (PORrm VR128:$src1, addr:$src2)>;
  2287. def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
  2288. (PORrm VR128:$src1, addr:$src2)>;
  2289. def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
  2290. (PORrm VR128:$src1, addr:$src2)>;
  2291. def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
  2292. (PXORrm VR128:$src1, addr:$src2)>;
  2293. def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
  2294. (PXORrm VR128:$src1, addr:$src2)>;
  2295. def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
  2296. (PXORrm VR128:$src1, addr:$src2)>;
  2297. def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
  2298. (PANDNrm VR128:$src1, addr:$src2)>;
  2299. def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
  2300. (PANDNrm VR128:$src1, addr:$src2)>;
  2301. def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
  2302. (PANDNrm VR128:$src1, addr:$src2)>;
  2303. }
  2304. // Patterns for packed operations when we don't have integer type available.
        // These map the X86fand/X86for/X86fxor/X86fandn nodes on v4f32 to the legacy
        // packed-single logical instructions. No Predicates are attached at this
        // site. Register-register forms first, then memopv4f32 memory forms.
  2305. def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
  2306. (ANDPSrr VR128:$src1, VR128:$src2)>;
  2307. def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
  2308. (ORPSrr VR128:$src1, VR128:$src2)>;
  2309. def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
  2310. (XORPSrr VR128:$src1, VR128:$src2)>;
  2311. def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
  2312. (ANDNPSrr VR128:$src1, VR128:$src2)>;
  2313. def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
  2314. (ANDPSrm VR128:$src1, addr:$src2)>;
  2315. def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
  2316. (ORPSrm VR128:$src1, addr:$src2)>;
  2317. def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
  2318. (XORPSrm VR128:$src1, addr:$src2)>;
  2319. def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
  2320. (ANDNPSrm VR128:$src1, addr:$src2)>;
  2321. //===----------------------------------------------------------------------===//
  2322. // SSE 1 & 2 - Arithmetic Instructions
  2323. //===----------------------------------------------------------------------===//
  2324. /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
  2325. /// vector forms.
  2326. ///
  2327. /// In addition, we also have a special variant of the scalar form here to
  2328. /// represent the associated intrinsic operation. This form is unlike the
  2329. /// plain scalar form, in that it takes an entire vector (instead of a scalar)
  2330. /// and leaves the top elements unmodified (therefore these cannot be commuted).
  2331. ///
  2332. /// These three forms can each be reg+reg or reg+mem.
  2333. ///
  2334. /// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
  2335. /// classes below.
        /// basic_sse12_fp_binop_p - Packed forms of an SSE 1 & 2 FP binary op.
        ///
        /// Every instruction defined here reads MXCSR and is flagged
        /// mayRaiseFPException. Under [HasAVX, NoVLX] it defines the VEX forms
        /// V#NAME#PS/PD (VR128) and V#NAME#PSY/PDY (VR256, VEX_L) using the load*
        /// fragments; it also defines the legacy NAME#PS/PD forms with $src1 tied
        /// to $dst using the memop* fragments.
        /// NOTE(review): the trailing `0` on the VEX forms is an sse12_fp_packed
        /// argument declared outside this excerpt (presumably Is2Addr = 0) --
        /// confirm.
  2336. multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
  2337. SDPatternOperator OpNode, X86SchedWriteSizes sched> {
  2338. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2339. let Predicates = [HasAVX, NoVLX] in {
  2340. defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
  2341. VR128, v4f32, f128mem, loadv4f32,
  2342. SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  2343. defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
  2344. VR128, v2f64, f128mem, loadv2f64,
  2345. SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
  2346. defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
  2347. OpNode, VR256, v8f32, f256mem, loadv8f32,
  2348. SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  2349. defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
  2350. OpNode, VR256, v4f64, f256mem, loadv4f64,
  2351. SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  2352. }
  2353. let Constraints = "$src1 = $dst" in {
  2354. defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
  2355. v4f32, f128mem, memopv4f32, SSEPackedSingle,
  2356. sched.PS.XMM>, PS;
  2357. defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
  2358. v2f64, f128mem, memopv2f64, SSEPackedDouble,
  2359. sched.PD.XMM>, PD;
  2360. }
  2361. }
  2362. }
        /// basic_sse12_fp_binop_s - Plain scalar forms of an SSE 1 & 2 FP binary op.
        ///
        /// Operands are scalar register classes (FR32 for "ss", FR64 for "sd").
        /// Defines the VEX forms V#NAME#SS/SD (VEX_4V, VEX_LIG) and the legacy
        /// NAME#SS/SD forms with $src1 tied to $dst. All read MXCSR and are
        /// flagged mayRaiseFPException. No Predicates are attached at this site.
        /// NOTE(review): the trailing `0` on the VEX forms is an sse12_fp_scalar
        /// argument declared outside this excerpt (presumably Is2Addr = 0) --
        /// confirm.
  2363. multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
  2364. X86SchedWriteSizes sched> {
  2365. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2366. defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
  2367. OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
  2368. XS, VEX_4V, VEX_LIG, VEX_WIG;
  2369. defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
  2370. OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
  2371. XD, VEX_4V, VEX_LIG, VEX_WIG;
  2372. let Constraints = "$src1 = $dst" in {
  2373. defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
  2374. OpNode, FR32, f32mem, SSEPackedSingle,
  2375. sched.PS.Scl>, XS;
  2376. defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
  2377. OpNode, FR64, f64mem, SSEPackedDouble,
  2378. sched.PD.Scl>, XD;
  2379. }
  2380. }
  2381. }
        /// basic_sse12_fp_binop_s_int - Intrinsic-style scalar forms of an SSE 1 & 2
        /// FP binary op.
        ///
        /// Unlike basic_sse12_fp_binop_s, operands are full VR128 vectors (v4f32 /
        /// v2f64) and memory operands are ssmem/sdmem matched via
        /// sse_load_f32/sse_load_f64. Defines the VEX forms V#NAME#SS/SD and the
        /// legacy NAME#SS/SD forms with $src1 tied to $dst. All read MXCSR and are
        /// flagged mayRaiseFPException.
        /// NOTE(review): the trailing `0` on the VEX forms is an
        /// sse12_fp_scalar_int argument declared outside this excerpt (presumably
        /// Is2Addr = 0) -- confirm.
  2382. multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
  2383. SDPatternOperator OpNode,
  2384. X86SchedWriteSizes sched> {
  2385. let Uses = [MXCSR], mayRaiseFPException = 1 in {
  2386. defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
  2387. !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
  2388. SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  2389. defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
  2390. !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
  2391. SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
  2392. let Constraints = "$src1 = $dst" in {
  2393. defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
  2394. !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
  2395. SSEPackedSingle, sched.PS.Scl>, XS;
  2396. defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
  2397. !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
  2398. SSEPackedDouble, sched.PD.Scl>, XD;
  2399. }
  2400. }
  2401. }
  2402. // Binary Arithmetic instructions
        // Each op combines the packed, scalar and intrinsic-scalar multiclasses.
        // For add/mul/sub/div the intrinsic-scalar forms are given null_frag as
        // their node, so those definitions contribute no pattern node here.
  2403. defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
  2404. basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
  2405. basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
  2406. defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
  2407. basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
  2408. basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
        // SUB, DIV, MAX and MIN are explicitly marked non-commutable. MAX/MIN use
        // X86fmaxs/X86fmins for their intrinsic-scalar forms.
  2409. let isCommutable = 0 in {
  2410. defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
  2411. basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
  2412. basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  2413. defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
  2414. basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
  2415. basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  2416. defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
  2417. basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
  2418. basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  2419. defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
  2420. basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
  2421. basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
  2422. }
        // MAXC/MINC reuse the max/min opcodes and mnemonics but match
        // X86fmaxc/X86fminc. They are isCodeGenOnly and have no intrinsic-scalar
        // forms.
  2423. let isCodeGenOnly = 1 in {
  2424. defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
  2425. basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  2426. defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
  2427. basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
  2428. }
  2429. // Patterns used to select SSE scalar fp arithmetic instructions from
  2430. // either:
  2431. //
  2432. // (1) a scalar fp operation followed by a blend
  2433. //
  2434. // The effect is that the backend no longer emits unnecessary vector
  2435. // insert instructions immediately after SSE scalar fp instructions
  2436. // like addss or mulss.
  2437. //
  2438. // For example, given the following code:
  2439. // __m128 foo(__m128 A, __m128 B) {
  2440. // A[0] += B[0];
  2441. // return A;
  2442. // }
  2443. //
  2444. // Previously we generated:
  2445. // addss %xmm0, %xmm1
  2446. // movss %xmm1, %xmm0
  2447. //
  2448. // We now generate:
  2449. // addss %xmm1, %xmm0
  2450. //
  2451. // (2) a vector packed single/double fp operation followed by a vector insert
  2452. //
  2453. // The effect is that the backend converts the packed fp instruction
  2454. // followed by a vector insert into a single SSE scalar fp instruction.
  2455. //
  2456. // For example, given the following code:
  2457. // __m128 foo(__m128 A, __m128 B) {
  2458. // __m128 C = A + B;
  2459. // return (__m128) {C[0], A[1], A[2], A[3]};
  2460. // }
  2461. //
  2462. // Previously we generated:
  2463. // addps %xmm0, %xmm1
  2464. // movss %xmm1, %xmm0
  2465. //
  2466. // We now generate:
  2467. // addss %xmm1, %xmm0
  2468. // TODO: Some canonicalization in lowering would simplify the number of
  2469. // patterns we have to try to match.
        /// scalar_math_patterns - Select the `_Int` scalar math instructions for a
        /// scalar op on element 0 of a vector that is re-inserted with `Move`.
        ///
        /// The matched shape is
        ///   (Move $dst, (scalar_to_vector (Op (extractelt $dst, 0), src)))
        /// where src is either an RC register or a `ld_frag` load.
        ///   OpcPrefix     - instruction name stem; "rr_Int"/"rm_Int" is appended,
        ///                   and "V" is prepended for the AVX patterns.
        ///   BasePredicate - guards the non-VEX patterns; the VEX patterns are
        ///                   guarded by UseAVX.
  2470. multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
  2471. ValueType VT, ValueType EltTy,
  2472. RegisterClass RC, PatFrag ld_frag,
  2473. Predicate BasePredicate> {
  2474. let Predicates = [BasePredicate] in {
  2475. // extracted scalar math op with insert via movss/movsd
        // Register source: RC:$src is copied into VR128 for the vector-typed
        // instruction operand.
  2476. def : Pat<(VT (Move (VT VR128:$dst),
  2477. (VT (scalar_to_vector
  2478. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2479. RC:$src))))),
  2480. (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
  2481. (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
        // Memory source: the scalar load is folded into the rm_Int form.
  2482. def : Pat<(VT (Move (VT VR128:$dst),
  2483. (VT (scalar_to_vector
  2484. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2485. (ld_frag addr:$src)))))),
  2486. (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  2487. }
  2488. // Repeat for AVX versions of the instructions.
  2489. let Predicates = [UseAVX] in {
  2490. // extracted scalar math op with insert via movss/movsd
  2491. def : Pat<(VT (Move (VT VR128:$dst),
  2492. (VT (scalar_to_vector
  2493. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2494. RC:$src))))),
  2495. (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
  2496. (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
  2497. def : Pat<(VT (Move (VT VR128:$dst),
  2498. (VT (scalar_to_vector
  2499. (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
  2500. (ld_frag addr:$src)))))),
  2501. (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  2502. }
  2503. }
  // Single-precision scalar add/sub/mul/div (movss insert, guarded by SSE1).
  defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss,
                              v4f32, f32, FR32, loadf32, UseSSE1>;
  defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss,
                              v4f32, f32, FR32, loadf32, UseSSE1>;
  defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss,
                              v4f32, f32, FR32, loadf32, UseSSE1>;
  defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss,
                              v4f32, f32, FR32, loadf32, UseSSE1>;

  // Double-precision scalar add/sub/mul/div (movsd insert, guarded by SSE2).
  defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd,
                              v2f64, f64, FR64, loadf64, UseSSE2>;
  defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd,
                              v2f64, f64, FR64, loadf64, UseSSE2>;
  defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd,
                              v2f64, f64, FR64, loadf64, UseSSE2>;
  defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd,
                              v2f64, f64, FR64, loadf64, UseSSE2>;
  /// Unop Arithmetic
  ///
  /// Besides the plain scalar form, a special variant of the scalar form is
  /// defined here to represent the associated intrinsic operation. Unlike the
  /// plain scalar form, it takes an entire vector (instead of a scalar) and
  /// leaves the top elements undefined.
  ///
  /// There is also a special variant form for a full-vector intrinsic form.

  /// sse_fp_unop_s - SSE1 unops in scalar form.
  ///
  /// For the non-AVX defs, we need $src1 to be tied to $dst because
  /// the HW instructions are 2 operand / destructive.
  ///
  /// Defines four records per instantiation:
  ///   r, m         - codegen-only forms on RC, selected directly from OpNode
  ///                  (the memory form additionally requires OptForSize).
  ///   r_Int, m_Int - pattern-less VR128 forms with $src1 tied to $dst.
  multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, Operand intmemop,
                           SDPatternOperator OpNode, Domain d,
                           X86FoldableSchedWrite sched, Predicate target> {
    // Plain scalar forms.
    let isCodeGenOnly = 1, hasSideEffects = 0 in {
      def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
                OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
                [(set RC:$dst, (OpNode RC:$src1))], d>,
              Sched<[sched]>,
              Requires<[target]>;

      let mayLoad = 1 in {
        def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
                  OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
                  [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
                Sched<[sched.Folded]>,
                Requires<[target, OptForSize]>;
      }
    }

    // Whole-vector forms; the first source is tied to the destination.
    let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
      def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                  Sched<[sched]>;

      let mayLoad = 1 in {
        def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, intmemop:$src2),
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    }
  }
  /// sse_fp_unop_s_intr - Selection patterns mapping a scalar unop intrinsic
  /// onto the r_Int / m_Int records that share this instantiation's NAME.
  multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                                Intrinsic Intr, Predicate target> {
    // These are unary operations, but they are modeled as having 2 source
    // operands because the high elements of the destination are unchanged in
    // SSE. The register pattern therefore passes $src for both operands.
    let Predicates = [target] in {
      def : Pat<(Intr VR128:$src),
                (!cast<Instruction>(NAME#"r_Int") VR128:$src, VR128:$src)>;
    }

    // We don't want to fold scalar loads into these instructions unless
    // optimizing for size. This is because the folded instruction will have a
    // partial register update, while the unfolded sequence will not, e.g.
    //   movss mem, %xmm0
    //   rcpss %xmm0, %xmm0
    // which has a clobber before the rcp, vs.
    //   rcpss mem, %xmm0
    let Predicates = [target, OptForSize] in {
      def : Pat<(Intr (mem_frags addr:$src2)),
                (!cast<Instruction>(NAME#"m_Int")
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
    }
  }
  /// avx_fp_unop_s_intr - Selection patterns mapping a scalar unop intrinsic
  /// onto the r_Int / m_Int records that share this instantiation's NAME.
  multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                                Intrinsic Intr, Predicate target> {
    // Register source: $src is supplied for both instruction operands.
    let Predicates = [target] in {
      def : Pat<(Intr VR128:$src),
                (!cast<Instruction>(NAME#"r_Int") VR128:$src, VR128:$src)>;
    }

    // Memory source: only selected when optimizing for size; the first
    // instruction operand is left undefined.
    let Predicates = [target, OptForSize] in {
      def : Pat<(Intr (mem_frags addr:$src2)),
                (!cast<Instruction>(NAME#"m_Int")
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
    }
  }
  /// avx_fp_unop_s - Scalar unops in three-operand form.
  ///
  /// Defines four pattern-less records per instantiation:
  ///   r, m         - codegen-only forms on RC.
  ///   r_Int, m_Int - forms on VR128.
  /// followed by the selection patterns that map OpNode onto r / m.
  multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           ValueType ScalarVT, X86MemOperand x86memop,
                           Operand intmemop, SDPatternOperator OpNode, Domain d,
                           X86FoldableSchedWrite sched, Predicate target> {
    // Plain scalar forms.
    let isCodeGenOnly = 1, hasSideEffects = 0 in {
      def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                [], d>,
              Sched<[sched]>;

      let mayLoad = 1 in {
        def m : I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  [], d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    }

    // Whole-vector forms.
    let hasSideEffects = 0, ExeDomain = d in {
      def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>,
                  Sched<[sched]>;

      let mayLoad = 1 in {
        def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, intmemop:$src2),
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      []>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    }

    // We don't want to fold scalar loads into these instructions unless
    // optimizing for size. This is because the folded instruction will have a
    // partial register update, while the unfolded sequence will not, e.g.
    //   vmovss mem, %xmm0
    //   vrcpss %xmm0, %xmm0, %xmm0
    // which has a clobber before the rcp, vs.
    //   vrcpss mem, %xmm0, %xmm0
    // TODO: In theory, we could fold the load, and avoid the stall caused by
    //       the partial register store, either in BreakFalseDeps or with
    //       smarter RA.
    let Predicates = [target] in {
      def : Pat<(OpNode RC:$src),
                (!cast<Instruction>(NAME#"r")
                    (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
    }
    let Predicates = [target, OptForSize] in {
      def : Pat<(ScalarVT (OpNode (load addr:$src))),
                (!cast<Instruction>(NAME#"m")
                    (ScalarVT (IMPLICIT_DEF)), addr:$src)>;
    }
  }
  /// sse1_fp_unop_p - SSE1 unops in packed form.
  ///
  /// Emits the VEX-encoded 128/256-bit register and memory forms (guarded by
  /// `prds`) followed by the non-VEX 128-bit register and memory forms.
  multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr,
                            SDPatternOperator OpNode,
                            X86SchedWriteWidths sched, list<Predicate> prds> {
    // VEX-encoded forms.
    let Predicates = prds in {
      // 128-bit.
      def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
      def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
                           (ins f128mem:$src),
                           "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst,
                                 (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;

      // 256-bit.
      def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                            [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
      def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                            (ins f256mem:$src),
                            "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                            [(set VR256:$dst,
                                  (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
    }

    // Non-VEX 128-bit forms; the memory form matches memopv4f32.
    def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
              Sched<[sched.XMM]>;
    def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                  OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
              Sched<[sched.XMM.Folded]>;
  }
  /// sse2_fp_unop_p - SSE2 unops in vector forms.
  ///
  /// Emits the VEX-encoded 128/256-bit register and memory forms (guarded by
  /// [HasAVX, NoVLX]) followed by the non-VEX 128-bit register and memory
  /// forms.
  multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                            SDPatternOperator OpNode,
                            X86SchedWriteWidths sched> {
    // VEX-encoded forms.
    let Predicates = [HasAVX, NoVLX] in {
      // 128-bit.
      def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
      def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
                           (ins f128mem:$src),
                           "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst,
                                 (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;

      // 256-bit.
      def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                            [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
      def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst),
                            (ins f256mem:$src),
                            "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                            [(set VR256:$dst,
                                  (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
    }

    // Non-VEX 128-bit forms; the memory form matches memopv2f64.
    def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
              Sched<[sched.XMM]>;
    def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                  OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
              Sched<[sched.XMM.Folded]>;
  }
  /// sse1_fp_unop_s_intr - Intrinsic selection patterns for a single-precision
  /// scalar unop. The intrinsic record is looked up by name as
  /// int_x86_sse_<OpcodeStr>_ss for both the SSE and the AVX flavour.
  multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
    // SSE flavour.
    defm SS : sse_fp_unop_s_intr<
                  v4f32, sse_load_f32,
                  !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                  UseSSE1>,
              XS;

    // AVX flavour.
    defm V#NAME#SS : avx_fp_unop_s_intr<
                         v4f32, sse_load_f32,
                         !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                         AVXTarget>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
  }
  /// sse1_fp_unop_s - Single-precision scalar unop: instantiates the SSE
  /// definitions (mnemonic OpcodeStr + "ss") and the AVX definitions
  /// (mnemonic "v" + OpcodeStr + "ss") on FR32.
  multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                            SDPatternOperator OpNode,
                            X86SchedWriteWidths sched, Predicate AVXTarget> {
    // SSE flavour.
    defm SS : sse_fp_unop_s<opc, OpcodeStr#"ss", FR32, f32mem, ssmem, OpNode,
                            SSEPackedSingle, sched.Scl, UseSSE1>,
              XS;

    // AVX flavour.
    defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#"ss", FR32, f32, f32mem,
                                   ssmem, OpNode, SSEPackedSingle, sched.Scl,
                                   AVXTarget>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
  }
  /// sse2_fp_unop_s - Double-precision scalar unop: instantiates the SSE
  /// definitions (mnemonic OpcodeStr + "sd") and the AVX definitions
  /// (mnemonic "v" + OpcodeStr + "sd") on FR64.
  multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                            SDPatternOperator OpNode,
                            X86SchedWriteWidths sched, Predicate AVXTarget> {
    // SSE flavour.
    defm SD : sse_fp_unop_s<opc, OpcodeStr#"sd", FR64, f64mem, sdmem, OpNode,
                            SSEPackedDouble, sched.Scl, UseSSE2>,
              XD;

    // AVX flavour.
    defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#"sd", FR64, f64, f64mem,
                                   sdmem, OpNode, SSEPackedDouble, sched.Scl,
                                   AVXTarget>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;
  }
  // Square root: scalar and packed forms, single and double precision.
  defm SQRT :
      sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
      sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt,
                     [HasAVX, NoVLX]>,
      sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
      sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>,
      SIMD_EXC;

  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision.
  defm RSQRT :
      sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
      sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
      sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;

  defm RCP :
      sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
      sse1_fp_unop_s_intr<"rcp", HasAVX>,
      sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

  // There is no f64 version of the reciprocal approximation instructions.
  /// scalar_unary_math_patterns - Select the r_Int form of a scalar unary
  /// instruction when OpNode is applied to element 0 of $src and the result
  /// is inserted into $dst via Move.
  ///
  ///   OpcPrefix     - instruction name prefix; "r_Int" is appended (and "V"
  ///                   is prepended for the AVX pattern).
  ///   BasePredicate - predicate guarding the non-AVX pattern.
  multiclass scalar_unary_math_patterns<SDPatternOperator OpNode,
                                        string OpcPrefix, SDNode Move,
                                        ValueType VT,
                                        Predicate BasePredicate> {
    let Predicates = [BasePredicate] in {
      def : Pat<(VT (Move VT:$dst,
                          (scalar_to_vector
                              (OpNode (extractelt VT:$src, 0))))),
                (!cast<Instruction>(OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
    }

    // Repeat for AVX versions of the instructions.
    let Predicates = [UseAVX] in {
      def : Pat<(VT (Move VT:$dst,
                          (scalar_to_vector
                              (OpNode (extractelt VT:$src, 0))))),
                (!cast<Instruction>("V"#OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
    }
  }

  // Scalar square root, single and double precision.
  defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss,
                                    v4f32, UseSSE1>;
  defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd,
                                    v2f64, UseSSE2>;
  /// scalar_unary_math_intr_patterns - Select the r_Int form of a scalar
  /// unary instruction when the result of the intrinsic Intr on $src is
  /// inserted into $dst via Move.
  ///
  ///   OpcPrefix     - instruction name prefix; "r_Int" is appended (and "V"
  ///                   is prepended for the AVX pattern).
  ///   BasePredicate - predicate guarding the non-AVX pattern.
  multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                             SDNode Move, ValueType VT,
                                             Predicate BasePredicate> {
    let Predicates = [BasePredicate] in {
      def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
                (!cast<Instruction>(OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
    }

    // Repeat for AVX versions of the instructions.
    let Predicates = [HasAVX] in {
      def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
                (!cast<Instruction>("V"#OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
    }
  }

  // Scalar reciprocal and reciprocal-square-root approximation intrinsics.
  defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS",
                                         X86Movss, v4f32, UseSSE1>;
  defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS",
                                         X86Movss, v4f32, UseSSE1>;
  //===----------------------------------------------------------------------===//
  // SSE 1 & 2 - Non-temporal stores
  //===----------------------------------------------------------------------===//

  let AddedComplexity = 400 in { // Prefer non-temporal versions

    //--- VEX-encoded vector stores ------------------------------------------//
    let Predicates = [HasAVX, NoVLX] in {
      // 128-bit floating-point stores.
      def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                            (ins f128mem:$dst, VR128:$src),
                            "movntps\t{$src, $dst|$dst, $src}",
                            [(alignednontemporalstore (v4f32 VR128:$src),
                                                      addr:$dst)]>,
                       VEX, VEX_WIG, Sched<[SchedWriteFMoveLSNT.XMM.MR]>;
      def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                            (ins f128mem:$dst, VR128:$src),
                            "movntpd\t{$src, $dst|$dst, $src}",
                            [(alignednontemporalstore (v2f64 VR128:$src),
                                                      addr:$dst)]>,
                       VEX, VEX_WIG, Sched<[SchedWriteFMoveLSNT.XMM.MR]>;

      // 256-bit floating-point stores.
      def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                             (ins f256mem:$dst, VR256:$src),
                             "movntps\t{$src, $dst|$dst, $src}",
                             [(alignednontemporalstore (v8f32 VR256:$src),
                                                       addr:$dst)]>,
                        VEX, VEX_L, VEX_WIG,
                        Sched<[SchedWriteFMoveLSNT.YMM.MR]>;
      def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                             (ins f256mem:$dst, VR256:$src),
                             "movntpd\t{$src, $dst|$dst, $src}",
                             [(alignednontemporalstore (v4f64 VR256:$src),
                                                       addr:$dst)]>,
                        VEX, VEX_L, VEX_WIG,
                        Sched<[SchedWriteFMoveLSNT.YMM.MR]>;

      // Integer-domain stores.
      let ExeDomain = SSEPackedInt in {
        def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
                              (ins i128mem:$dst, VR128:$src),
                              "movntdq\t{$src, $dst|$dst, $src}",
                              [(alignednontemporalstore (v2i64 VR128:$src),
                                                        addr:$dst)]>,
                         VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
        def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                               (ins i256mem:$dst, VR256:$src),
                               "movntdq\t{$src, $dst|$dst, $src}",
                               [(alignednontemporalstore (v4i64 VR256:$src),
                                                         addr:$dst)]>,
                          VEX, VEX_L, VEX_WIG,
                          Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
      } // ExeDomain
    } // Predicates

    //--- Non-VEX vector stores ----------------------------------------------//
    let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
      def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs),
                          (ins f128mem:$dst, VR128:$src),
                          "movntps\t{$src, $dst|$dst, $src}",
                          [(alignednontemporalstore (v4f32 VR128:$src),
                                                    addr:$dst)]>;
      def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs),
                          (ins f128mem:$dst, VR128:$src),
                          "movntpd\t{$src, $dst|$dst, $src}",
                          [(alignednontemporalstore (v2f64 VR128:$src),
                                                    addr:$dst)]>;
    } // SchedRW

    // NOTE(review): this integer-domain store uses f128mem whereas VMOVNTDQmr
    // uses i128mem; kept as found - confirm whether the difference is
    // intentional.
    let ExeDomain = SSEPackedInt,
        SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in {
      def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs),
                          (ins f128mem:$dst, VR128:$src),
                          "movntdq\t{$src, $dst|$dst, $src}",
                          [(alignednontemporalstore (v2i64 VR128:$src),
                                                    addr:$dst)]>;
    }

    //--- General-purpose register stores ------------------------------------//
    let SchedRW = [WriteStoreNT] in {
      // There is no AVX form for instructions below this point
      def MOVNTImr : I<0xC3, MRMDestMem, (outs),
                       (ins i32mem:$dst, GR32:$src),
                       "movnti{l}\t{$src, $dst|$dst, $src}",
                       [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
                     PS, Requires<[HasSSE2]>;
      def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs),
                           (ins i64mem:$dst, GR64:$src),
                           "movnti{q}\t{$src, $dst|$dst, $src}",
                           [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                        PS, Requires<[HasSSE2]>;
    } // SchedRW = [WriteStoreNT]

    //--- Remaining integer element types reuse the movntdq definitions ------//
    let Predicates = [HasAVX, NoVLX] in {
      // 256-bit.
      def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
                (VMOVNTDQYmr addr:$dst, VR256:$src)>;
      def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
                (VMOVNTDQYmr addr:$dst, VR256:$src)>;
      def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
                (VMOVNTDQYmr addr:$dst, VR256:$src)>;

      // 128-bit.
      def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
                (VMOVNTDQmr addr:$dst, VR128:$src)>;
      def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
                (VMOVNTDQmr addr:$dst, VR128:$src)>;
      def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
                (VMOVNTDQmr addr:$dst, VR128:$src)>;
    }

    let Predicates = [UseSSE2] in {
      def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
                (MOVNTDQmr addr:$dst, VR128:$src)>;
      def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
                (MOVNTDQmr addr:$dst, VR128:$src)>;
      def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
                (MOVNTDQmr addr:$dst, VR128:$src)>;
    }

  } // AddedComplexity
  //===----------------------------------------------------------------------===//
  // SSE 1 & 2 - Prefetch and memory fence
  //===----------------------------------------------------------------------===//

  // Prefetch intrinsic.
  // The four definitions differ only in the ModRM form and in the third
  // operand of the matched prefetch node (3, 2, 1, 0) - presumably the
  // locality hint; confirm against the prefetch node definition.
  let Predicates = [HasSSEPrefetch] in {
    let SchedRW = [WriteLoad] in {
      def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
                         "prefetcht0\t$src",
                         [(prefetch addr:$src, imm, (i32 3), (i32 1))]>,
                       TB;
      def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
                         "prefetcht1\t$src",
                         [(prefetch addr:$src, imm, (i32 2), (i32 1))]>,
                       TB;
      def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
                         "prefetcht2\t$src",
                         [(prefetch addr:$src, imm, (i32 1), (i32 1))]>,
                       TB;
      def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
                          "prefetchnta\t$src",
                          [(prefetch addr:$src, imm, (i32 0), (i32 1))]>,
                        TB;
    }
  }
  // Flush cache.
  // FIXME: How should flush instruction be modeled? It is currently scheduled
  //        as a plain load.
  def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
                  "clflush\t$src",
                  [(int_x86_sse2_clflush addr:$src)]>,
                PS, Requires<[HasSSE2]>, Sched<[WriteLoad]>;
  // Pause. This "instruction" is encoded as "rep; nop", so even though it
  // was introduced with SSE2, it's backward compatible.
  def PAUSE : I<0x90, RawFrm, (outs), (ins),
                "pause",
                [(int_x86_sse2_pause)]>,
              OBXS, Sched<[WriteNop]>;
  // Load, store, and memory fence.
  // TODO: As with mfence, we may want to ease the availability of
  //       sfence/lfence to include any 64-bit target.
  let SchedRW = [WriteFence] in {
    def SFENCE : I<0xAE, MRM7X, (outs), (ins),
                   "sfence",
                   [(int_x86_sse_sfence)]>,
                 PS, Requires<[HasSSE1]>;

    def LFENCE : I<0xAE, MRM5X, (outs), (ins),
                   "lfence",
                   [(int_x86_sse2_lfence)]>,
                 PS, Requires<[HasSSE2]>;

    def MFENCE : I<0xAE, MRM6X, (outs), (ins),
                   "mfence",
                   [(int_x86_sse2_mfence)]>,
                 PS, Requires<[HasMFence]>;
  } // SchedRW

  // The X86MFence node is selected to the same instruction as the intrinsic.
  def : Pat<(X86MFence), (MFENCE)>;
  //===----------------------------------------------------------------------===//
  // SSE 1 & 2 - Load/Store XCSR register
  //===----------------------------------------------------------------------===//

  let hasSideEffects = 1 in {
    // VEX-encoded forms.
    let mayLoad = 1 in {
      def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                          "ldmxcsr\t$src",
                          [(int_x86_sse_ldmxcsr addr:$src)]>,
                     VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
    }
    let mayStore = 1 in {
      def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                          "stmxcsr\t$dst",
                          [(int_x86_sse_stmxcsr addr:$dst)]>,
                     VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
    }

    // Non-VEX forms.
    let mayLoad = 1 in {
      def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                      "ldmxcsr\t$src",
                      [(int_x86_sse_ldmxcsr addr:$src)]>,
                    PS, Sched<[WriteLDMXCSR]>;
    }
    let mayStore = 1 in {
      def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                      "stmxcsr\t$dst",
                      [(int_x86_sse_stmxcsr addr:$dst)]>,
                    PS, Sched<[WriteSTMXCSR]>;
    }
  }
  //===---------------------------------------------------------------------===//
  // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
  //===---------------------------------------------------------------------===//

  let ExeDomain = SSEPackedInt in { // SSE integer instructions

    //--- VEX-encoded register-to-register moves -----------------------------//
    let hasSideEffects = 0 in {
      def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
      def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
      def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                       VEX, VEX_L, VEX_WIG;
      def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movdqu\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                       VEX, VEX_L, VEX_WIG;
    }

    // For Disassembler: the same moves using opcode 0x7F / MRMDestReg.
    let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
      def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst),
                               (ins VR128:$src),
                               "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
      def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst),
                                (ins VR256:$src),
                                "movdqa\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                           VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
      def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst),
                               (ins VR128:$src),
                               "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
      def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst),
                                (ins VR256:$src),
                                "movdqu\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                           VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
    }

    //--- VEX-encoded loads --------------------------------------------------//
    // Only the 128-bit forms carry a selection pattern here.
    let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
        hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
      def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst),
                           (ins i128mem:$src),
                           "movdqa\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
      def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst),
                            (ins i256mem:$src),
                            "movdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                       VEX, VEX_L, VEX_WIG;
      def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src),
                        "vmovdqu\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                      XS, VEX, VEX_WIG;
      def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst),
                         (ins i256mem:$src),
                         "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                       XS, VEX, VEX_L, VEX_WIG;
    }

    //--- VEX-encoded stores -------------------------------------------------//
    // Only the 128-bit forms carry a selection pattern here.
    let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
      def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
                           (ins i128mem:$dst, VR128:$src),
                           "movdqa\t{$src, $dst|$dst, $src}",
                           [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
      def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                            (ins i256mem:$dst, VR256:$src),
                            "movdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLS.YMM.MR]>,
                       VEX, VEX_L, VEX_WIG;
      def VMOVDQUmr : I<0x7F, MRMDestMem, (outs),
                        (ins i128mem:$dst, VR128:$src),
                        "vmovdqu\t{$src, $dst|$dst, $src}",
                        [(store (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
      def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs),
                         (ins i256mem:$dst, VR256:$src),
                         "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
                       Sched<[SchedWriteVecMoveLS.YMM.MR]>,
                       XS, VEX, VEX_L, VEX_WIG;
    }

    //--- Non-VEX register-to-register moves ---------------------------------//
    let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
      let hasSideEffects = 0 in {
        def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movdqa\t{$src, $dst|$dst, $src}", []>;

        def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", []>,
                       XS, Requires<[UseSSE2]>;
      }

      // For Disassembler: the same moves using opcode 0x7F / MRMDestReg.
      let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
        def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst),
                               (ins VR128:$src),
                               "movdqa\t{$src, $dst|$dst, $src}", []>,
                           FoldGenData<"MOVDQArr">;

        def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movdqu\t{$src, $dst|$dst, $src}", []>,
                           XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
      }
    } // SchedRW

    //--- Non-VEX loads (selection patterns are commented out) ---------------//
    let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
        hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
      def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst),
                         (ins i128mem:$src),
                         "movdqa\t{$src, $dst|$dst, $src}",
                         [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
      def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                     XS, Requires<[UseSSE2]>;
    }

    //--- Non-VEX stores (selection patterns are commented out) --------------//
    let mayStore = 1, hasSideEffects = 0,
        SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
      def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}",
                         [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
      def MOVDQUmr : I<0x7F, MRMDestMem, (outs),
                       (ins i128mem:$dst, VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                     XS, Requires<[UseSSE2]>;
    }

  } // ExeDomain = SSEPackedInt
  // Reversed version with ".s" suffix for GAS compatibility (VEX forms).
  def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                  (VMOVDQArr_REV VR128:$dst, VR128:$src),
                  0>;
  def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                  (VMOVDQAYrr_REV VR256:$dst, VR256:$src),
                  0>;
  def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                  (VMOVDQUrr_REV VR128:$dst, VR128:$src),
                  0>;
  def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                  (VMOVDQUYrr_REV VR256:$dst, VR256:$src),
                  0>;

  // Reversed version with ".s" suffix for GAS compatibility (non-VEX forms).
  def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                  (MOVDQArr_REV VR128:$dst, VR128:$src),
                  0>;
  def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                  (MOVDQUrr_REV VR128:$dst, VR128:$src),
                  0>;
  // Additional patterns for other integer sizes: the v4i32 / v8i16 / v16i8
  // loads and stores are selected to the VEX movdqa/movdqu definitions.
  let Predicates = [HasAVX, NoVLX] in {
    // Aligned loads.
    def : Pat<(alignedloadv4i32 addr:$src), (VMOVDQArm addr:$src)>;
    def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQArm addr:$src)>;
    def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQArm addr:$src)>;

    // Loads without the alignment requirement.
    def : Pat<(loadv4i32 addr:$src), (VMOVDQUrm addr:$src)>;
    def : Pat<(loadv8i16 addr:$src), (VMOVDQUrm addr:$src)>;
    def : Pat<(loadv16i8 addr:$src), (VMOVDQUrm addr:$src)>;

    // Aligned stores.
    def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
              (VMOVDQAmr addr:$dst, VR128:$src)>;
    def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
              (VMOVDQAmr addr:$dst, VR128:$src)>;
    def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
              (VMOVDQAmr addr:$dst, VR128:$src)>;

    // Stores without the alignment requirement.
    def : Pat<(store (v4i32 VR128:$src), addr:$dst),
              (VMOVDQUmr addr:$dst, VR128:$src)>;
    def : Pat<(store (v8i16 VR128:$src), addr:$dst),
              (VMOVDQUmr addr:$dst, VR128:$src)>;
    def : Pat<(store (v16i8 VR128:$src), addr:$dst),
              (VMOVDQUmr addr:$dst, VR128:$src)>;
  }
  //===---------------------------------------------------------------------===//
  // SSE2 - Packed Integer Arithmetic Instructions
  //===---------------------------------------------------------------------===//

  let ExeDomain = SSEPackedInt in { // SSE integer instructions

    /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst
    /// types.
    ///
    /// Defines a register-register form (rr, marked commutable) and a
    /// register-memory form (rm). Is2Addr selects between the two-operand and
    /// the three-operand assembly string.
    multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             ValueType DstVT, ValueType SrcVT,
                             RegisterClass RC, PatFrag memop_frag,
                             X86MemOperand x86memop,
                             X86FoldableSchedWrite sched, bit Is2Addr = 1> {
      let isCommutable = 1 in {
        def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2),
                     !if(Is2Addr,
                         OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
                           (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
      }

      def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2),
                   !if(Is2Addr,
                       OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst,
                         (DstVT (OpNode (SrcVT RC:$src1),
                                        (memop_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }

  } // ExeDomain = SSEPackedInt
  // Packed integer add, saturating add and 16-bit multiply instructions, all
  // instantiated through PDI_binop_all (defined outside this section).
  // Arguments as used here: opcode, mnemonic, SDNode, 128-bit type, 256-bit
  // type, scheduling class, a 0/1 flag and a predicate.
  // NOTE(review): the 0/1 flag is presumably the commutable bit (it is 1 for
  // every entry in this group) - confirm against PDI_binop_all.
  //
  // Wrapping adds on byte/word/dword/qword elements.
  3095. defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
  3096. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3097. defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
  3098. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3099. defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
  3100. SchedWriteVecALU, 1, NoVLX>;
  3101. defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
  3102. SchedWriteVecALU, 1, NoVLX>;
  // Signed (saddsat) and unsigned (uaddsat) saturating adds on bytes/words.
  3103. defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
  3104. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3105. defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
  3106. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3107. defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
  3108. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3109. defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
  3110. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  // 16-bit multiplies: mul, mulhu and mulhs nodes, scheduled as
  // SchedWriteVecIMul.
  3111. defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
  3112. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  3113. defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
  3114. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  3115. defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
  3116. SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
  // Packed integer subtract and saturating subtract instructions, instantiated
  // through PDI_binop_all (defined outside this section).
  // NOTE(review): the 0 passed after the scheduling class is presumably the
  // commutable bit, cleared for subtraction - confirm against PDI_binop_all.
  //
  // Wrapping subtracts on byte/word/dword/qword elements.
  3117. defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
  3118. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3119. defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
  3120. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3121. defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
  3122. SchedWriteVecALU, 0, NoVLX>;
  3123. defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
  3124. SchedWriteVecALU, 0, NoVLX>;
  // Signed (ssubsat) and unsigned (usubsat) saturating subtracts on bytes/words.
  3125. defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
  3126. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3127. defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
  3128. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3129. defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
  3130. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  3131. defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
  3132. SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
  // Packed min/max, average and pmuludq instructions, instantiated through
  // PDI_binop_all (defined outside this section).
  // NOTE(review): the 1 passed after the scheduling class is presumably the
  // commutable bit - confirm against PDI_binop_all.
  //
  // Only the unsigned-byte and signed-word min/max variants are defined here.
  3133. defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
  3134. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3135. defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
  3136. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3137. defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
  3138. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3139. defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
  3140. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  // Byte and word averages, both matched through the X86avg node.
  3141. defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
  3142. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  3143. defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
  3144. SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
  // pmuludq is typed on its 64-bit-element result (v2i64/v4i64) and uses the
  // integer-multiply scheduling class.
  3145. defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
  3146. SchedWriteVecIMul, 1, NoVLX>;
  // pmaddwd family (opcode 0xF5), built from PDI_binop_rm2 because the result
  // type (v4i32/v8i32) differs from the source type (v8i16/v16i16).
  //
  // AVX 128-bit form: three-operand asm string (Is2Addr = 0), `load` fragment.
  3147. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3148. defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
  3149. load, i128mem, SchedWriteVecIMul.XMM, 0>,
  3150. VEX_4V, VEX_WIG;
  // AVX2 256-bit form: VR256/i256mem operands, additionally tagged VEX_L.
  3151. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3152. defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
  3153. VR256, load, i256mem, SchedWriteVecIMul.YMM,
  3154. 0>, VEX_4V, VEX_L, VEX_WIG;
  // Legacy SSE2 form: $dst is tied to $src1, memory operand uses `memop`, and
  // Is2Addr is left at its default of 1.
  3155. let Constraints = "$src1 = $dst" in
  3156. defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
  3157. memop, i128mem, SchedWriteVecIMul.XMM>;
  // psadbw family (opcode 0xF6), built from PDI_binop_rm2 because the result
  // type (v2i64/v4i64) differs from the source type (v16i8/v32i8). All three
  // use the SchedWritePSADBW scheduling classes.
  //
  // AVX 128-bit form: three-operand asm string (Is2Addr = 0), `load` fragment.
  3158. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3159. defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
  3160. load, i128mem, SchedWritePSADBW.XMM, 0>,
  3161. VEX_4V, VEX_WIG;
  // AVX2 256-bit form: VR256/i256mem operands, additionally tagged VEX_L.
  3162. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3163. defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
  3164. load, i256mem, SchedWritePSADBW.YMM, 0>,
  3165. VEX_4V, VEX_L, VEX_WIG;
  // Legacy SSE2 form: $dst is tied to $src1, memory operand uses `memop`.
  3166. let Constraints = "$src1 = $dst" in
  3167. defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
  3168. memop, i128mem, SchedWritePSADBW.XMM>;
  3169. //===---------------------------------------------------------------------===//
  3170. // SSE2 - Packed Integer Logical Instructions
  3171. //===---------------------------------------------------------------------===//
  /// PDI_binop_rmi - Binary operator whose second operand is either a 128-bit
  /// vector (register or memory) or an 8-bit immediate.
  ///
  /// Produces three records per instantiation:
  ///   rr - RC:$src1 with a VR128 $src2 (opcode opc, OpNode).
  ///   rm - RC:$src1 with an i128mem $src2 loaded via ld_frag (opc, OpNode).
  ///   ri - RC:$src1 with a u8imm $src2 (opcode opc2, format ImmForm, OpNode2).
  ///
  /// Parameters:
  ///   opc, opc2  - opcodes of the vector-operand and immediate forms.
  ///   ImmForm    - instruction Format used by the immediate form.
  ///   OpcodeStr  - mnemonic placed in front of the operand string.
  ///   OpNode     - SDNode matched by rr/rm.
  ///   OpNode2    - SDNode matched by ri.
  ///   RC         - register class of $dst and $src1.
  ///   sched      - scheduling class for rr (and, folded, for rm).
  ///   schedImm   - scheduling class for ri.
  ///   DstVT      - value type of the result.
  ///   SrcVT      - value type of the 128-bit $src2 in rr/rm.
  ///   ld_frag    - load fragment used for the rm memory operand.
  ///   Is2Addr    - 1 selects the two-operand asm string, 0 the three-operand
  ///                one (defaults to 1).
  3172. multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
  3173. string OpcodeStr, SDNode OpNode,
  3174. SDNode OpNode2, RegisterClass RC,
  3175. X86FoldableSchedWrite sched,
  3176. X86FoldableSchedWrite schedImm,
  3177. ValueType DstVT, ValueType SrcVT,
  3178. PatFrag ld_frag, bit Is2Addr = 1> {
  3179. // src2 is always 128-bit
  3180. def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
  3181. (ins RC:$src1, VR128:$src2),
  3182. !if(Is2Addr,
  3183. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3184. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3185. [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
  3186. Sched<[sched]>;
  // Memory form: $src2 is an i128mem operand regardless of RC.
  3187. def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
  3188. (ins RC:$src1, i128mem:$src2),
  3189. !if(Is2Addr,
  3190. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3191. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3192. [(set RC:$dst, (DstVT (OpNode RC:$src1,
  3193. (SrcVT (ld_frag addr:$src2)))))]>,
  3194. Sched<[sched.Folded, sched.ReadAfterFold]>;
  // Immediate form: separate opcode/format, matched via OpNode2 with an i8
  // target immediate.
  3195. def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
  3196. (ins RC:$src1, u8imm:$src2),
  3197. !if(Is2Addr,
  3198. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3199. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3200. [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
  3201. Sched<[schedImm]>;
  3202. }
  /// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times for one
  /// mnemonic:
  ///   V#NAME   - AVX 128-bit form (VR128, `load`, Is2Addr = 0), predicated on
  ///              [HasAVX, prd].
  ///   V#NAME#Y - AVX2 256-bit form (VR256, `load`, Is2Addr = 0, VEX_L),
  ///              predicated on [HasAVX2, prd].
  ///   NAME     - legacy form (VR128, `memop`) with $dst tied to $src1; no
  ///              Predicates are set for it inside this multiclass.
  /// The same SrcVT is passed to all three, since PDI_binop_rmi always takes a
  /// 128-bit $src2. The VEX forms prepend "v" to OpcodeStr.
  3203. multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
  3204. string OpcodeStr, SDNode OpNode,
  3205. SDNode OpNode2, ValueType DstVT128,
  3206. ValueType DstVT256, ValueType SrcVT,
  3207. X86SchedWriteWidths sched,
  3208. X86SchedWriteWidths schedImm, Predicate prd> {
  3209. let Predicates = [HasAVX, prd] in
  3210. defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
  3211. OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
  3212. DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
  3213. let Predicates = [HasAVX2, prd] in
  3214. defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
  3215. OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
  3216. DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
  3217. VEX_WIG;
  3218. let Constraints = "$src1 = $dst" in
  3219. defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
  3220. VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
  3221. memop>;
  3222. }
  /// PDI_binop_ri - Binary operator that exists only in register/immediate
  /// form. Produces a single `ri` record taking RC:$src1 and a u8imm $src2,
  /// matched as (VT (OpNode RC:$src1, (i8 timm:$src2))).
  ///
  /// Parameters:
  ///   opc       - opcode byte passed to PDIi8.
  ///   ImmForm   - instruction Format of the immediate form.
  ///   OpcodeStr - mnemonic placed in front of the operand string.
  ///   OpNode    - SDNode matched by the selection pattern.
  ///   RC        - register class of $dst and $src1.
  ///   VT        - value type of the result.
  ///   sched     - scheduling class.
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  ///               one (defaults to 1).
  3223. multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
  3224. SDNode OpNode, RegisterClass RC, ValueType VT,
  3225. X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  3226. def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
  3227. !if(Is2Addr,
  3228. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3229. !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3230. [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
  3231. Sched<[sched]>;
  3232. }
  /// PDI_binop_ri_all - Instantiates PDI_binop_ri three times for one mnemonic,
  /// with the value types fixed to byte vectors:
  ///   V#NAME   - AVX 128-bit form (VR128, v16i8, Is2Addr = 0), predicated on
  ///              [HasAVX, NoVLX_Or_NoBWI].
  ///   V#NAME#Y - AVX2 256-bit form (VR256, v32i8, Is2Addr = 0, VEX_L),
  ///              predicated on [HasAVX2, NoVLX_Or_NoBWI].
  ///   NAME     - legacy form (VR128, v16i8) with $dst tied to $src1; no
  ///              Predicates are set for it inside this multiclass.
  /// The VEX forms prepend "v" to OpcodeStr.
  3233. multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
  3234. SDNode OpNode, X86SchedWriteWidths sched> {
  3235. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  3236. defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
  3237. VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
  3238. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  3239. defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
  3240. VR256, v32i8, sched.YMM, 0>,
  3241. VEX_4V, VEX_L, VEX_WIG;
  3242. let Constraints = "$src1 = $dst" in
  3243. defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
  3244. sched.XMM>;
  3245. }
  // Packed integer shift instructions.
  //
  // PDI_binop_rmi_all arguments, in order: opcode of the vector-count form,
  // opcode of the immediate form, Format of the immediate form, mnemonic,
  // SDNode for the vector-count form, SDNode for the immediate form, 128-bit
  // result type, 256-bit result type, type of the 128-bit count operand,
  // scheduling classes for the two forms, and the predicate.
  // The immediate forms of each element width share one opcode (0x71 word,
  // 0x72 dword, 0x73 qword) and differ only in the Format (MRM6r for shift
  // left, MRM2r for logical right, MRM4r for arithmetic right).
  3246. let ExeDomain = SSEPackedInt in {
  // Shift left (X86vshl / X86vshli).
  3247. defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
  3248. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3249. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3250. defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
  3251. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3252. SchedWriteVecShiftImm, NoVLX>;
  3253. defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
  3254. v2i64, v4i64, v2i64, SchedWriteVecShift,
  3255. SchedWriteVecShiftImm, NoVLX>;
  // Logical shift right (X86vsrl / X86vsrli).
  3256. defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
  3257. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3258. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3259. defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
  3260. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3261. SchedWriteVecShiftImm, NoVLX>;
  3262. defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
  3263. v2i64, v4i64, v2i64, SchedWriteVecShift,
  3264. SchedWriteVecShiftImm, NoVLX>;
  // Arithmetic shift right (X86vsra / X86vsrai); only word and dword forms are
  // defined here.
  3265. defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
  3266. v8i16, v16i16, v8i16, SchedWriteVecShift,
  3267. SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  3268. defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
  3269. v4i32, v8i32, v4i32, SchedWriteVecShift,
  3270. SchedWriteVecShiftImm, NoVLX>;
  // Whole-register byte shifts: immediate-only (PDI_binop_ri_all), sharing
  // opcode 0x73 with Formats MRM7r/MRM3r, and scheduled as shuffles.
  3271. defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
  3272. SchedWriteShuffle>;
  3273. defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
  3274. SchedWriteShuffle>;
  3275. } // ExeDomain = SSEPackedInt
  3276. //===---------------------------------------------------------------------===//
  3277. // SSE2 - Packed Integer Comparison Instructions
  3278. //===---------------------------------------------------------------------===//
  // Packed integer compare instructions, instantiated through PDI_binop_all
  // (defined outside this section) with TruePredicate as the predicate.
  // NOTE(review): the 0/1 argument after the scheduling class is presumably
  // the commutable bit (1 for the equality compares, 0 for greater-than) -
  // confirm against PDI_binop_all.
  //
  // Equality compares on byte/word/dword elements.
  3279. defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
  3280. SchedWriteVecALU, 1, TruePredicate>;
  3281. defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
  3282. SchedWriteVecALU, 1, TruePredicate>;
  3283. defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
  3284. SchedWriteVecALU, 1, TruePredicate>;
  // Greater-than compares on byte/word/dword elements.
  3285. defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
  3286. SchedWriteVecALU, 0, TruePredicate>;
  3287. defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
  3288. SchedWriteVecALU, 0, TruePredicate>;
  3289. defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
  3290. SchedWriteVecALU, 0, TruePredicate>;
  3291. //===---------------------------------------------------------------------===//
  3292. // SSE2 - Packed Integer Shuffle Instructions
  3293. //===---------------------------------------------------------------------===//
  3294. let ExeDomain = SSEPackedInt in {
  /// sse2_pshuffle - Immediate-controlled shuffle of a single source vector.
  /// Every record uses opcode 0x70 and takes one vector source ($src1, in a
  /// register or in memory) plus a u8imm ($src2); the pattern is
  /// (OpNode src, (i8 timm:$src2)).
  ///
  /// Records produced:
  ///   V#NAME#ri / V#NAME#mi   - AVX 128-bit forms, [HasAVX, prd].
  ///   V#NAME#Yri / V#NAME#Ymi - AVX2 256-bit forms, [HasAVX2, prd], VEX_L.
  ///   ri / mi                 - legacy forms, [UseSSE2].
  ///
  /// Parameters:
  ///   OpcodeStr    - mnemonic; the VEX forms prepend "v".
  ///   vt128, vt256 - result types of the 128-bit and 256-bit forms.
  ///   OpNode       - SDNode matched by the selection patterns.
  ///   sched        - scheduling classes (.XMM / .YMM; memory forms use
  ///                  .Folded).
  ///   prd          - extra predicate applied to the VEX forms.
  /// No prefix class is applied here; the defm sites add one (PD, XS or XD).
  3295. multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
  3296. SDNode OpNode, X86SchedWriteWidths sched,
  3297. Predicate prd> {
  // AVX 128-bit register and memory forms; the memory form uses `load`.
  3298. let Predicates = [HasAVX, prd] in {
  3299. def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
  3300. (ins VR128:$src1, u8imm:$src2),
  3301. !strconcat("v", OpcodeStr,
  3302. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3303. [(set VR128:$dst,
  3304. (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
  3305. VEX, Sched<[sched.XMM]>, VEX_WIG;
  3306. def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
  3307. (ins i128mem:$src1, u8imm:$src2),
  3308. !strconcat("v", OpcodeStr,
  3309. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3310. [(set VR128:$dst,
  3311. (vt128 (OpNode (load addr:$src1),
  3312. (i8 timm:$src2))))]>, VEX,
  3313. Sched<[sched.XMM.Folded]>, VEX_WIG;
  3314. }
  // AVX2 256-bit register and memory forms.
  3315. let Predicates = [HasAVX2, prd] in {
  3316. def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
  3317. (ins VR256:$src1, u8imm:$src2),
  3318. !strconcat("v", OpcodeStr,
  3319. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3320. [(set VR256:$dst,
  3321. (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
  3322. VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  3323. def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
  3324. (ins i256mem:$src1, u8imm:$src2),
  3325. !strconcat("v", OpcodeStr,
  3326. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3327. [(set VR256:$dst,
  3328. (vt256 (OpNode (load addr:$src1),
  3329. (i8 timm:$src2))))]>, VEX, VEX_L,
  3330. Sched<[sched.YMM.Folded]>, VEX_WIG;
  3331. }
  // Legacy SSE2 forms; the memory form uses `memop` instead of `load`. No
  // tied-operand constraint is needed because $dst is not also a source.
  3332. let Predicates = [UseSSE2] in {
  3333. def ri : Ii8<0x70, MRMSrcReg,
  3334. (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
  3335. !strconcat(OpcodeStr,
  3336. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3337. [(set VR128:$dst,
  3338. (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
  3339. Sched<[sched.XMM]>;
  3340. def mi : Ii8<0x70, MRMSrcMem,
  3341. (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
  3342. !strconcat(OpcodeStr,
  3343. "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
  3344. [(set VR128:$dst,
  3345. (vt128 (OpNode (memop addr:$src1),
  3346. (i8 timm:$src2))))]>,
  3347. Sched<[sched.XMM.Folded]>;
  3348. }
  3349. }
  3350. } // ExeDomain = SSEPackedInt
  // The three immediate shuffles share opcode 0x70 inside sse2_pshuffle and
  // are told apart by the prefix class appended here (PD, XS, XD).
  3351. defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
  3352. SchedWriteShuffle, NoVLX>, PD;
  3353. defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
  3354. SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
  3355. defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
  3356. SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
  3357. //===---------------------------------------------------------------------===//
  3358. // Packed Integer Pack Instructions (SSE & AVX)
  3359. //===---------------------------------------------------------------------===//
  3360. let ExeDomain = SSEPackedInt in {
  /// sse2_pack - Two-source pack operator built on the PDI instruction class.
  /// Produces an rr (MRMSrcReg) and an rm (MRMSrcMem) record whose pattern is
  /// (OutVT (OpNode (ArgVT $src1), $src2)).
  ///
  /// Parameters:
  ///   opc       - opcode byte passed to PDI.
  ///   OpcodeStr - mnemonic placed in front of the operand string.
  ///   OutVT     - value type of the result.
  ///   ArgVT     - value type applied to $src1 in the pattern.
  ///   OpNode    - SDNode matched by the selection pattern.
  ///   RC        - register class of $dst, $src1 and (in rr) $src2.
  ///   x86memop  - memory operand class of $src2 in the rm form.
  ///   sched     - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
  ///   ld_frag   - load fragment used for the rm memory operand.
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  ///               one (defaults to 1).
  3361. multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
  3362. ValueType ArgVT, SDNode OpNode, RegisterClass RC,
  3363. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  3364. PatFrag ld_frag, bit Is2Addr = 1> {
  3365. def rr : PDI<opc, MRMSrcReg,
  3366. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3367. !if(Is2Addr,
  3368. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3369. !strconcat(OpcodeStr,
  3370. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3371. [(set RC:$dst,
  3372. (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
  3373. Sched<[sched]>;
  // Memory form: second source folded from memory via ld_frag.
  3374. def rm : PDI<opc, MRMSrcMem,
  3375. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3376. !if(Is2Addr,
  3377. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3378. !strconcat(OpcodeStr,
  3379. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3380. [(set RC:$dst,
  3381. (OutVT (OpNode (ArgVT RC:$src1),
  3382. (ld_frag addr:$src2))))]>,
  3383. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3384. }
  /// sse4_pack - Same shape as sse2_pack (identical parameters, operands, asm
  /// strings, patterns and scheduling) but built on the SS48I instruction
  /// class instead of PDI. In this section it is used only for the packusdw
  /// variants.
  ///
  /// Parameters:
  ///   opc       - opcode byte passed to SS48I.
  ///   OpcodeStr - mnemonic placed in front of the operand string.
  ///   OutVT     - value type of the result.
  ///   ArgVT     - value type applied to $src1 in the pattern.
  ///   OpNode    - SDNode matched by the selection pattern.
  ///   RC        - register class of $dst, $src1 and (in rr) $src2.
  ///   x86memop  - memory operand class of $src2 in the rm form.
  ///   sched     - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
  ///   ld_frag   - load fragment used for the rm memory operand.
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  ///               one (defaults to 1).
  3385. multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
  3386. ValueType ArgVT, SDNode OpNode, RegisterClass RC,
  3387. X86MemOperand x86memop, X86FoldableSchedWrite sched,
  3388. PatFrag ld_frag, bit Is2Addr = 1> {
  3389. def rr : SS48I<opc, MRMSrcReg,
  3390. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3391. !if(Is2Addr,
  3392. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3393. !strconcat(OpcodeStr,
  3394. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3395. [(set RC:$dst,
  3396. (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
  3397. Sched<[sched]>;
  // Memory form: second source folded from memory via ld_frag.
  3398. def rm : SS48I<opc, MRMSrcMem,
  3399. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3400. !if(Is2Addr,
  3401. !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
  3402. !strconcat(OpcodeStr,
  3403. "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3404. [(set RC:$dst,
  3405. (OutVT (OpNode (ArgVT RC:$src1),
  3406. (ld_frag addr:$src2))))]>,
  3407. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3408. }
  // AVX 128-bit pack instructions: three-operand asm strings (Is2Addr = 0),
  // `load` fragment, VR128/i128mem operands. X86Packss is used for the
  // "packss*" mnemonics and X86Packus for the "packus*" ones; vpackusdw goes
  // through sse4_pack (SS48I) while the others use sse2_pack (PDI).
  3409. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  3410. defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
  3411. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3412. VEX_4V, VEX_WIG;
  3413. defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
  3414. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3415. VEX_4V, VEX_WIG;
  3416. defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
  3417. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3418. VEX_4V, VEX_WIG;
  3419. defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
  3420. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3421. VEX_4V, VEX_WIG;
  3422. }
  // AVX2 256-bit pack instructions: same opcodes and mnemonics as the 128-bit
  // VEX forms, but with VR256/i256mem operands, the .YMM scheduling class,
  // 256-bit value types and the extra VEX_L tag.
  3423. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  3424. defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
  3425. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3426. VEX_4V, VEX_L, VEX_WIG;
  3427. defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
  3428. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3429. VEX_4V, VEX_L, VEX_WIG;
  3430. defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
  3431. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3432. VEX_4V, VEX_L, VEX_WIG;
  3433. defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
  3434. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3435. VEX_4V, VEX_L, VEX_WIG;
  3436. }
  // Legacy (non-VEX) pack instructions: $dst is tied to $src1, the memory
  // operand uses `memop`, and Is2Addr is left at its default of 1 so the
  // two-operand asm string is used. No Predicates are set in this block.
  3437. let Constraints = "$src1 = $dst" in {
  3438. defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
  3439. i128mem, SchedWriteShuffle.XMM, memop>;
  3440. defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
  3441. i128mem, SchedWriteShuffle.XMM, memop>;
  3442. defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
  3443. i128mem, SchedWriteShuffle.XMM, memop>;
  // packusdw is the only one instantiated through sse4_pack (SS48I).
  3444. defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
  3445. i128mem, SchedWriteShuffle.XMM, memop>;
  3446. }
  3447. } // ExeDomain = SSEPackedInt
  3448. //===---------------------------------------------------------------------===//
  3449. // SSE2 - Packed Integer Unpack Instructions
  3450. //===---------------------------------------------------------------------===//
  3451. let ExeDomain = SSEPackedInt in {
  /// sse2_unpack - Two-source unpack operator with a single value type.
  /// Produces an rr (MRMSrcReg) and an rm (MRMSrcMem) record whose pattern is
  /// (vt (OpNode $src1, $src2)).
  ///
  /// Parameters:
  ///   opc       - opcode byte passed to PDI.
  ///   OpcodeStr - mnemonic placed in front of the operand string.
  ///   vt        - value type of the result.
  ///   OpNode    - SDNode matched by the selection pattern.
  ///   RC        - register class of $dst, $src1 and (in rr) $src2.
  ///   x86memop  - memory operand class of $src2 in the rm form.
  ///   sched     - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
  ///   ld_frag   - load fragment used for the rm memory operand.
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  ///               one (defaults to 1).
  3452. multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
  3453. SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
  3454. X86FoldableSchedWrite sched, PatFrag ld_frag,
  3455. bit Is2Addr = 1> {
  3456. def rr : PDI<opc, MRMSrcReg,
  3457. (outs RC:$dst), (ins RC:$src1, RC:$src2),
  3458. !if(Is2Addr,
  3459. !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
  3460. !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3461. [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
  3462. Sched<[sched]>;
  // Memory form: second source folded from memory via ld_frag.
  3463. def rm : PDI<opc, MRMSrcMem,
  3464. (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
  3465. !if(Is2Addr,
  3466. !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
  3467. !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
  3468. [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
  3469. Sched<[sched.Folded, sched.ReadAfterFold]>;
  3470. }
  // AVX 128-bit byte/word unpacks (v16i8, v8i16), predicated on
  // NoVLX_Or_NoBWI. X86Unpckl is used for the "punpckl*" mnemonics and
  // X86Unpckh for the "punpckh*" ones.
  3471. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  3472. defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
  3473. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3474. VEX_4V, VEX_WIG;
  3475. defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
  3476. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3477. VEX_4V, VEX_WIG;
  3478. defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
  3479. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3480. VEX_4V, VEX_WIG;
  3481. defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
  3482. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3483. VEX_4V, VEX_WIG;
  3484. }
  // AVX 128-bit dword/qword unpacks (v4i32, v2i64). These are predicated on
  // NoVLX rather than NoVLX_Or_NoBWI, unlike the byte/word forms.
  3485. let Predicates = [HasAVX, NoVLX] in {
  3486. defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
  3487. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3488. VEX_4V, VEX_WIG;
  3489. defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
  3490. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3491. VEX_4V, VEX_WIG;
  3492. defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
  3493. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3494. VEX_4V, VEX_WIG;
  3495. defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
  3496. i128mem, SchedWriteShuffle.XMM, load, 0>,
  3497. VEX_4V, VEX_WIG;
  3498. }
  // AVX2 256-bit byte/word unpacks (v32i8, v16i16): same opcodes and mnemonics
  // as the 128-bit VEX forms, with VR256/i256mem operands, the .YMM scheduling
  // class and the extra VEX_L tag.
  3499. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  3500. defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
  3501. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3502. VEX_4V, VEX_L, VEX_WIG;
  3503. defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
  3504. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3505. VEX_4V, VEX_L, VEX_WIG;
  3506. defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
  3507. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3508. VEX_4V, VEX_L, VEX_WIG;
  3509. defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
  3510. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3511. VEX_4V, VEX_L, VEX_WIG;
  3512. }
  // AVX2 256-bit dword/qword unpacks (v8i32, v4i64), predicated on NoVLX.
  3513. let Predicates = [HasAVX2, NoVLX] in {
  3514. defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
  3515. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3516. VEX_4V, VEX_L, VEX_WIG;
  3517. defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
  3518. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3519. VEX_4V, VEX_L, VEX_WIG;
  3520. defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
  3521. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3522. VEX_4V, VEX_L, VEX_WIG;
  3523. defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
  3524. i256mem, SchedWriteShuffle.YMM, load, 0>,
  3525. VEX_4V, VEX_L, VEX_WIG;
  3526. }
  // Legacy (non-VEX) unpack instructions: $dst is tied to $src1, the memory
  // operand uses `memop`, and Is2Addr is left at its default of 1. No
  // Predicates are set in this block.
  3527. let Constraints = "$src1 = $dst" in {
  // Low-half interleaves (X86Unpckl) for byte/word/dword/qword elements.
  3528. defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
  3529. i128mem, SchedWriteShuffle.XMM, memop>;
  3530. defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
  3531. i128mem, SchedWriteShuffle.XMM, memop>;
  3532. defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
  3533. i128mem, SchedWriteShuffle.XMM, memop>;
  3534. defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
  3535. i128mem, SchedWriteShuffle.XMM, memop>;
  // High-half interleaves (X86Unpckh) for the same element widths.
  3536. defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
  3537. i128mem, SchedWriteShuffle.XMM, memop>;
  3538. defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
  3539. i128mem, SchedWriteShuffle.XMM, memop>;
  3540. defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
  3541. i128mem, SchedWriteShuffle.XMM, memop>;
  3542. defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
  3543. i128mem, SchedWriteShuffle.XMM, memop>;
  3544. }
  3545. } // ExeDomain = SSEPackedInt
  3546. //===---------------------------------------------------------------------===//
  3547. // SSE2 - Packed Integer Extract and Insert
  3548. //===---------------------------------------------------------------------===//
  3549. let ExeDomain = SSEPackedInt in {
  /// sse2_pinsrw - Word insert into a 128-bit vector (opcode 0xC4).
  /// Produces:
  ///   rr - value to insert comes from a GR32orGR64 register.
  ///   rm - value to insert is loaded from an i16mem operand via extloadi16.
  /// Both match (X86pinsrw $src1, value, timm:$src3).
  ///
  /// Unlike the other multiclasses in this section the mnemonic is not a
  /// parameter: Is2Addr picks between the hard-coded "pinsrw" two-operand
  /// string (1, the default) and the "vpinsrw" three-operand string (0).
  3550. multiclass sse2_pinsrw<bit Is2Addr = 1> {
  3551. def rr : Ii8<0xC4, MRMSrcReg,
  3552. (outs VR128:$dst), (ins VR128:$src1,
  3553. GR32orGR64:$src2, u8imm:$src3),
  3554. !if(Is2Addr,
  3555. "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  3556. "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  3557. [(set VR128:$dst,
  3558. (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
  3559. Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory form: the inserted word is read from memory.
  3560. def rm : Ii8<0xC4, MRMSrcMem,
  3561. (outs VR128:$dst), (ins VR128:$src1,
  3562. i16mem:$src2, u8imm:$src3),
  3563. !if(Is2Addr,
  3564. "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
  3565. "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
  3566. [(set VR128:$dst,
  3567. (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
  3568. timm:$src3))]>,
  3569. Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  3570. }
  3571. // Extract
  // Word extract from a v8i16 register into a GR32orGR64 register, with the
  // element selected by an immediate (opcode 0xC5, X86pextrw). Only
  // register-destination forms are defined here.
  //
  // VEX form. The brace-less `let` applies to VPEXTRWrr only.
  3572. let Predicates = [HasAVX, NoBWI] in
  3573. def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
  3574. (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
  3575. "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  3576. [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
  3577. timm:$src2))]>,
  3578. PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
  // Legacy form, built on PDIi8; no Predicates are set on it here.
  3579. def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
  3580. (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
  3581. "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  3582. [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
  3583. timm:$src2))]>,
  3584. Sched<[WriteVecExtract]>;
  3585. // Insert
  // VEX form: Is2Addr = 0 selects the "vpinsrw" three-operand asm string.
  3586. let Predicates = [HasAVX, NoBWI] in
  3587. defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
  // Legacy form: default Is2Addr = 1 with $dst tied to $src1.
  3588. let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
  3589. defm PINSRW : sse2_pinsrw, PD;
  3590. } // ExeDomain = SSEPackedInt
  3591. //===---------------------------------------------------------------------===//
  3592. // SSE2 - Packed Mask Creation
  3593. //===---------------------------------------------------------------------===//
  // Byte-mask extraction (opcode 0xD7): matches X86movmsk on a byte vector and
  // writes the result to a GR32orGR64 register.
  3594. let ExeDomain = SSEPackedInt in {
  // VEX 128-bit form. The asm string is given without a leading "v".
  // NOTE(review): presumably the VPDI class (defined elsewhere) prepends the
  // "v" and supplies the AVX predicate - confirm.
  3595. def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
  3596. (ins VR128:$src),
  3597. "pmovmskb\t{$src, $dst|$dst, $src}",
  3598. [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
  3599. Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
  // VEX 256-bit form: v32i8 source in VR256, its own scheduling class
  // (WriteVecMOVMSKY) and the extra VEX_L tag; requires HasAVX2.
  3600. let Predicates = [HasAVX2] in {
  3601. def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
  3602. (ins VR256:$src),
  3603. "pmovmskb\t{$src, $dst|$dst, $src}",
  3604. [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
  3605. Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
  3606. }
  // Legacy form, built on PDI.
  3607. def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
  3608. "pmovmskb\t{$src, $dst|$dst, $src}",
  3609. [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
  3610. Sched<[WriteVecMOVMSK]>;
  3611. } // ExeDomain = SSEPackedInt
  3612. //===---------------------------------------------------------------------===//
  3613. // SSE2 - Conditional Store
  3614. //===---------------------------------------------------------------------===//
  // maskmovdqu (opcode 0xF7): matched from the int_x86_sse2_maskmov_dqu
  // intrinsic, which takes the source vector, the mask vector and an address
  // register. The address register is not an explicit operand; it is listed in
  // `Uses` (EDI or RDI) and referenced directly in the pattern. There are no
  // outputs. Every definition shares the SchedRW set on the enclosing `let`.
  //
  // Three variants exist for each of the VEX and legacy encodings:
  //   <name>    - EDI, outside 64-bit mode.
  //   <name>64  - RDI, 64-bit mode, AdSize64.
  //   <name>X32 - EDI, 64-bit mode, AdSize32; asm string carries an "addr32"
  //               prefix and AsmVariantName is "NonParsable".
  3615. let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
  // VEX-encoded variants.
  3616. let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
  3617. def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
  3618. (ins VR128:$src, VR128:$mask),
  3619. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3620. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
  3621. VEX, VEX_WIG;
  3622. let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
  3623. def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
  3624. (ins VR128:$src, VR128:$mask),
  3625. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3626. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
  3627. VEX, VEX_WIG, AdSize64;
  // The asm string argument is empty and AsmString is overridden in the body
  // with the full "addr32 vmaskmovdqu" text.
  // NOTE(review): presumably this is because VPDI would otherwise prepend "v"
  // in front of "addr32" - confirm against the VPDI class definition.
  3628. let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in
  3629. def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs),
  3630. (ins VR128:$src, VR128:$mask), "",
  3631. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
  3632. VEX, VEX_WIG, AdSize32 {
  3633. let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}";
  3634. let AsmVariantName = "NonParsable";
  3635. }
  // Legacy (non-VEX) variants, built on PDI and predicated on UseSSE2.
  3636. let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
  3637. def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
  3638. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3639. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
  3640. let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
  3641. def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
  3642. "maskmovdqu\t{$mask, $src|$src, $mask}",
  3643. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
  3644. AdSize64;
  // Here the "addr32" text is passed directly as the asm string.
  3645. let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in
  3646. def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
  3647. "addr32 maskmovdqu\t{$mask, $src|$src, $mask}",
  3648. [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
  3649. AdSize32 {
  3650. let AsmVariantName = "NonParsable";
  3651. }
  3652. } // ExeDomain = SSEPackedInt
  3653. //===---------------------------------------------------------------------===//
  3654. // SSE2 - Move Doubleword/Quadword
  3655. //===---------------------------------------------------------------------===//
  3656. //===---------------------------------------------------------------------===//
  3657. // Move Int Doubleword to Packed Double Int
  3658. //
  3659. let ExeDomain = SSEPackedInt in {
  // VEX-encoded moves of a 32-bit or 64-bit integer into a vector register
  // (opcode 0x6E).
  // NOTE(review): the asm strings omit the leading "v"; presumably the VS2I /
  // VRS2I classes (defined elsewhere) prepend it - confirm.
  //
  // GR32 -> VR128, matched as (v4i32 (scalar_to_vector GR32)).
  3660. def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
  3661. "movd\t{$src, $dst|$dst, $src}",
  3662. [(set VR128:$dst,
  3663. (v4i32 (scalar_to_vector GR32:$src)))]>,
  3664. VEX, Sched<[WriteVecMoveFromGpr]>;
  // i32 memory -> VR128, matched as scalar_to_vector of a loadi32.
  3665. def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
  3666. "movd\t{$src, $dst|$dst, $src}",
  3667. [(set VR128:$dst,
  3668. (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
  3669. VEX, Sched<[WriteVecLoad]>;
  // GR64 -> VR128, matched as (v2i64 (scalar_to_vector GR64)).
  3670. def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
  3671. "movq\t{$src, $dst|$dst, $src}",
  3672. [(set VR128:$dst,
  3673. (v2i64 (scalar_to_vector GR64:$src)))]>,
  3674. VEX, Sched<[WriteVecMoveFromGpr]>;
  // i64 memory -> VR128. This record has no selection pattern, so mayLoad and
  // hasSideEffects are stated explicitly; it is codegen-only but still forced
  // into the disassembler tables.
  3675. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
  3676. def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3677. "movq\t{$src, $dst|$dst, $src}", []>,
  3678. VEX, Sched<[WriteVecLoad]>;
  // GR64 -> FR64 bit-for-bit move (bitconvert); codegen-only.
  3679. let isCodeGenOnly = 1 in
  3680. def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
  3681. "movq\t{$src, $dst|$dst, $src}",
  3682. [(set FR64:$dst, (bitconvert GR64:$src))]>,
  3683. VEX, Sched<[WriteVecMoveFromGpr]>;
  // Legacy (non-VEX) counterparts of the moves above, built on S2I / RS2I with
  // the same opcode (0x6E), operands, patterns and scheduling classes.
  //
  // GR32 -> VR128, matched as (v4i32 (scalar_to_vector GR32)).
  3684. def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
  3685. "movd\t{$src, $dst|$dst, $src}",
  3686. [(set VR128:$dst,
  3687. (v4i32 (scalar_to_vector GR32:$src)))]>,
  3688. Sched<[WriteVecMoveFromGpr]>;
  // i32 memory -> VR128, matched as scalar_to_vector of a loadi32.
  3689. def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
  3690. "movd\t{$src, $dst|$dst, $src}",
  3691. [(set VR128:$dst,
  3692. (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
  3693. Sched<[WriteVecLoad]>;
  // GR64 -> VR128, matched as (v2i64 (scalar_to_vector GR64)).
  3694. def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
  3695. "movq\t{$src, $dst|$dst, $src}",
  3696. [(set VR128:$dst,
  3697. (v2i64 (scalar_to_vector GR64:$src)))]>,
  3698. Sched<[WriteVecMoveFromGpr]>;
  // i64 memory -> VR128. This record has no selection pattern, so mayLoad and
  // hasSideEffects are stated explicitly; it is codegen-only but still forced
  // into the disassembler tables.
  3699. let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
  3700. def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
  3701. "movq\t{$src, $dst|$dst, $src}", []>,
  3702. Sched<[WriteVecLoad]>;
  // GR64 -> FR64 bit-for-bit move (bitconvert); codegen-only.
  3703. let isCodeGenOnly = 1 in
  3704. def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
  3705. "movq\t{$src, $dst|$dst, $src}",
  3706. [(set FR64:$dst, (bitconvert GR64:$src))]>,
  3707. Sched<[WriteVecMoveFromGpr]>;
  3708. } // ExeDomain = SSEPackedInt
  3709. //===---------------------------------------------------------------------===//
  3710. // Move Int Doubleword to Single Scalar
  3711. //
  // GR32 -> FR32 bitconverts on opcode 0x6E. Both records are codegen-only;
  // VMOVDI2SSrr is the VEX form.
  let isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
    def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg,
                           (outs FR32:$dst), (ins GR32:$src),
                           "movd\t{$src, $dst|$dst, $src}",
                           [(set FR32:$dst, (bitconvert GR32:$src))]>,
                      VEX, Sched<[WriteVecMoveFromGpr]>;

    def MOVDI2SSrr : S2I<0x6E, MRMSrcReg,
                         (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                     Sched<[WriteVecMoveFromGpr]>;
  } // isCodeGenOnly = 1, ExeDomain = SSEPackedInt
  3722. //===---------------------------------------------------------------------===//
  // Move Packed Doubleword Int first element to Doubleword Int (GR32 or memory)
  3724. //
  // Opcode 0x7E: element 0 of a v4i32 XMM value is moved to a GR32 (rr) or
  // stored to 32-bit memory (mr). The V-prefixed records are the VEX forms.
  let ExeDomain = SSEPackedInt in {
    def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg,
                            (outs GR32:$dst), (ins VR128:$src),
                            "movd\t{$src, $dst|$dst, $src}",
                            [(set GR32:$dst,
                                  (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
                       VEX, Sched<[WriteVecMoveToGpr]>;

    def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem,
                            (outs), (ins i32mem:$dst, VR128:$src),
                            "movd\t{$src, $dst|$dst, $src}",
                            [(store (i32 (extractelt (v4i32 VR128:$src),
                                                     (iPTR 0))),
                                    addr:$dst)]>,
                       VEX, Sched<[WriteVecStore]>;

    def MOVPDI2DIrr : S2I<0x7E, MRMDestReg,
                          (outs GR32:$dst), (ins VR128:$src),
                          "movd\t{$src, $dst|$dst, $src}",
                          [(set GR32:$dst,
                                (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
                      Sched<[WriteVecMoveToGpr]>;

    def MOVPDI2DImr : S2I<0x7E, MRMDestMem,
                          (outs), (ins i32mem:$dst, VR128:$src),
                          "movd\t{$src, $dst|$dst, $src}",
                          [(store (i32 (extractelt (v4i32 VR128:$src),
                                                   (iPTR 0))),
                                  addr:$dst)]>,
                      Sched<[WriteVecStore]>;
  } // ExeDomain = SSEPackedInt
  3748. //===---------------------------------------------------------------------===//
  // Move Packed Quadword Int first element to Quadword Int
  3750. //
  let ExeDomain = SSEPackedInt in {
    // Register forms: element 0 of a v2i64 XMM value is moved to a GR64.
    let SchedRW = [WriteVecMoveToGpr] in {
      def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg,
                                (outs GR64:$dst), (ins VR128:$src),
                                "movq\t{$src, $dst|$dst, $src}",
                                [(set GR64:$dst,
                                      (extractelt (v2i64 VR128:$src),
                                                  (iPTR 0)))]>,
                          VEX;

      def MOVPQIto64rr : RS2I<0x7E, MRMDestReg,
                              (outs GR64:$dst), (ins VR128:$src),
                              "movq\t{$src, $dst|$dst, $src}",
                              [(set GR64:$dst,
                                    (extractelt (v2i64 VR128:$src),
                                                (iPTR 0)))]>;
    } // SchedRW

    // Store forms: no selection pattern, codegen-only, ForceDisassemble set.
    let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
        mayStore = 1 in {
      def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem,
                                (outs), (ins i64mem:$dst, VR128:$src),
                                "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;

      def MOVPQIto64mr : RS2I<0x7E, MRMDestMem,
                              (outs), (ins i64mem:$dst, VR128:$src),
                              "movq\t{$src, $dst|$dst, $src}", []>,
                         Sched<[WriteVecStore]>;
    }
  } // ExeDomain = SSEPackedInt
  3773. //===---------------------------------------------------------------------===//
  3774. // Bitcast FR64 <-> GR64
  3775. //
  // FR64 -> GR64 bitconverts on opcode 0x7E. Both records are codegen-only;
  // VMOVSDto64rr is the VEX form.
  let isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
    def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg,
                             (outs GR64:$dst), (ins FR64:$src),
                             "movq\t{$src, $dst|$dst, $src}",
                             [(set GR64:$dst, (bitconvert FR64:$src))]>,
                       VEX, Sched<[WriteVecMoveToGpr]>;

    def MOVSDto64rr : RS2I<0x7E, MRMDestReg,
                           (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                      Sched<[WriteVecMoveToGpr]>;
  } // isCodeGenOnly = 1, ExeDomain = SSEPackedInt
  3786. //===---------------------------------------------------------------------===//
  // Move Scalar Single to Doubleword Int
  3788. //
  // FR32 -> GR32 bitconverts on opcode 0x7E. Both records are codegen-only;
  // VMOVSS2DIrr is the VEX form.
  let isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
    def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg,
                           (outs GR32:$dst), (ins FR32:$src),
                           "movd\t{$src, $dst|$dst, $src}",
                           [(set GR32:$dst, (bitconvert FR32:$src))]>,
                      VEX, Sched<[WriteVecMoveToGpr]>;

    def MOVSS2DIrr : S2I<0x7E, MRMDestReg,
                         (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                     Sched<[WriteVecMoveToGpr]>;
  } // isCodeGenOnly = 1, ExeDomain = SSEPackedInt
  let Predicates = [UseAVX] in {
    // X86vzmovl of a scalar_to_vector GPR selects the plain GPR -> XMM move.
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIrr GR64:$src)>;

    // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
    // These instructions also write zeros in the high part of a 256-bit
    // register, so the v8i32 case only needs a SUBREG_TO_REG wrapper.
    def : Pat<(v4i32 (X86vzload32 addr:$src)),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v8i32 (X86vzload32 addr:$src)),
              (SUBREG_TO_REG (i64 0),
                             (v4i32 (VMOVDI2PDIrm addr:$src)),
                             sub_xmm)>;
  }
  // SSE2 counterparts of the zero-extending GPR/load -> XMM patterns.
  let Predicates = [UseSSE2] in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;
    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (MOV64toPQIrr GR64:$src)>;

    def : Pat<(v4i32 (X86vzload32 addr:$src)),
              (MOVDI2PDIrm addr:$src)>;
  }
  // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
  // of "movq" due to a MacOS parsing limitation. These aliases let that old
  // assembly still be parsed.
  def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                  (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
  def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                  (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;

  // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
  def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                  (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
  def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                  (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
  3831. //===---------------------------------------------------------------------===//
  3832. // SSE2 - Move Quadword
  3833. //===---------------------------------------------------------------------===//
  3834. //===---------------------------------------------------------------------===//
  3835. // Move Quadword Int to Packed Quadword Int
  3836. //
  // Opcode 0x7E with the XS prefix: scalar_to_vector of a loaded i64 into
  // v2i64. The AVX record spells its mnemonic "vmovq" explicitly.
  let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
    def VMOVQI2PQIrm : I<0x7E, MRMSrcMem,
                         (outs VR128:$dst), (ins i64mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                               (v2i64 (scalar_to_vector
                                          (loadi64 addr:$src))))]>,
                       XS, VEX, Requires<[UseAVX]>, VEX_WIG;

    // SSE2 instruction with XS Prefix
    def MOVQI2PQIrm : I<0x7E, MRMSrcMem,
                        (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v2i64 (scalar_to_vector
                                         (loadi64 addr:$src))))]>,
                      XS, Requires<[UseSSE2]>;
  } // ExeDomain, SchedRW
  3849. //===---------------------------------------------------------------------===//
  3850. // Move Packed Quadword Int to Quadword Int
  3851. //
  // Opcode 0xD6: store element 0 of a v2i64 XMM value to 64-bit memory.
  let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
    def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem,
                            (outs), (ins i64mem:$dst, VR128:$src),
                            "movq\t{$src, $dst|$dst, $src}",
                            [(store (i64 (extractelt (v2i64 VR128:$src),
                                                     (iPTR 0))),
                                    addr:$dst)]>,
                       VEX, VEX_WIG;

    def MOVPQI2QImr : S2I<0xD6, MRMDestMem,
                          (outs), (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (extractelt (v2i64 VR128:$src),
                                                   (iPTR 0))),
                                  addr:$dst)]>;
  } // ExeDomain, SchedRW
  // For disassembler only: register-to-register forms of opcode 0xD6, defined
  // without selection patterns.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      SchedRW = [SchedWriteVecLogic.XMM] in {
    def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg,
                            (outs VR128:$dst), (ins VR128:$src),
                            "movq\t{$src, $dst|$dst, $src}", []>,
                       VEX, VEX_WIG;

    def MOVPQI2QIrr : S2I<0xD6, MRMDestReg,
                          (outs VR128:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>;
  }
  // The ".s" mnemonic suffix maps onto the MRMDestReg (0xD6) register forms.
  def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                  (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
  def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                  (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
  // AVX selection of zero-extending 64-bit loads and 64-bit extract-stores.
  let Predicates = [UseAVX] in {
    def : Pat<(v2i64 (X86vzload64 addr:$src)),
              (VMOVQI2PQIrm addr:$src)>;
    // The 256-bit result reuses the 128-bit load through SUBREG_TO_REG.
    def : Pat<(v4i64 (X86vzload64 addr:$src)),
              (SUBREG_TO_REG (i64 0),
                             (v2i64 (VMOVQI2PQIrm addr:$src)),
                             sub_xmm)>;

    def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
              (VMOVPQI2QImr addr:$dst, VR128:$src)>;
  }
  // SSE2 selection of zero-extending 64-bit loads and 64-bit extract-stores.
  let Predicates = [UseSSE2] in {
    def : Pat<(v2i64 (X86vzload64 addr:$src)),
              (MOVQI2PQIrm addr:$src)>;

    def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
              (MOVPQI2QImr addr:$dst, VR128:$src)>;
  }
  3888. //===---------------------------------------------------------------------===//
  3889. // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
  3890. // IA32 document. movq xmm1, xmm2 does clear the high bits.
  3891. //
  // XMM -> XMM forms of opcode 0x7E (XS prefix), selected for X86vzmovl on
  // v2i64.
  let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
    def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg,
                             (outs VR128:$dst), (ins VR128:$src),
                             "vmovq\t{$src, $dst|$dst, $src}",
                             [(set VR128:$dst,
                                   (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                           XS, VEX, Requires<[UseAVX]>, VEX_WIG;

    def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg,
                            (outs VR128:$dst), (ins VR128:$src),
                            "movq\t{$src, $dst|$dst, $src}",
                            [(set VR128:$dst,
                                  (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                          XS, Requires<[UseSSE2]>;
  } // ExeDomain, SchedRW
  // X86vzmovl on v2f64 reuses the integer XMM -> XMM movq records.
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }

  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
  // 256-bit X86vzmovl: extract the low XMM subregister, run it through
  // VMOVZPQILo2PQIrr, and wrap the result back up with SUBREG_TO_REG.
  let Predicates = [UseAVX] in {
    def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
              (SUBREG_TO_REG
                  (i32 0),
                  (v2f64 (VMOVZPQILo2PQIrr
                             (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))),
                  sub_xmm)>;

    def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
              (SUBREG_TO_REG
                  (i32 0),
                  (v2i64 (VMOVZPQILo2PQIrr
                             (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
                                                    sub_xmm)))),
                  sub_xmm)>;
  }
  3922. //===---------------------------------------------------------------------===//
  3923. // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
  3924. //===---------------------------------------------------------------------===//
  /// sse3_replicate_sfp - Register (rr) and folded-load (rm) forms of a unary
  /// single-precision replicate node.
  ///   op        - opcode byte shared by both forms
  ///   OpNode    - SDNode matched by the patterns
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   vt / RC   - result value type and register class
  ///   mem_frag  - load fragment used by the rm pattern
  ///   x86memop  - memory operand type of the rm form
  ///   sched     - scheduling class; rm uses its Folded variant
  multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                                ValueType vt, RegisterClass RC,
                                PatFrag mem_frag, X86MemOperand x86memop,
                                X86FoldableSchedWrite sched> {
    def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set RC:$dst, (vt (OpNode RC:$src)))]>,
             Sched<[sched]>;

    def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
             Sched<[sched.Folded]>;
  }
  // AVX MOVSHDUP (0x16) / MOVSLDUP (0x12), 128-bit then 256-bit (VEX_L).
  let Predicates = [HasAVX, NoVLX] in {
    defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                         v4f32, VR128, loadv4f32, f128mem,
                                         SchedWriteFShuffle.XMM>,
                      VEX, VEX_WIG;
    defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                         v4f32, VR128, loadv4f32, f128mem,
                                         SchedWriteFShuffle.XMM>,
                      VEX, VEX_WIG;

    defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                         v8f32, VR256, loadv8f32, f256mem,
                                         SchedWriteFShuffle.YMM>,
                      VEX, VEX_L, VEX_WIG;
    defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                         v8f32, VR256, loadv8f32, f256mem,
                                         SchedWriteFShuffle.YMM>,
                      VEX, VEX_L, VEX_WIG;
  }
  // Non-VEX MOVSHDUP / MOVSLDUP; the rm forms fold loads through memopv4f32.
  defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup",
                                     v4f32, VR128, memopv4f32, f128mem,
                                     SchedWriteFShuffle.XMM>;
  defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup",
                                     v4f32, VR128, memopv4f32, f128mem,
                                     SchedWriteFShuffle.XMM>;
  // Integer-typed X86Movshdup / X86Movsldup select the same AVX records as the
  // floating-point types.
  let Predicates = [HasAVX, NoVLX] in {
    // 128-bit
    def : Pat<(v4i32 (X86Movshdup VR128:$src)),
              (VMOVSHDUPrr VR128:$src)>;
    def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
              (VMOVSHDUPrm addr:$src)>;
    def : Pat<(v4i32 (X86Movsldup VR128:$src)),
              (VMOVSLDUPrr VR128:$src)>;
    def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
              (VMOVSLDUPrm addr:$src)>;

    // 256-bit
    def : Pat<(v8i32 (X86Movshdup VR256:$src)),
              (VMOVSHDUPYrr VR256:$src)>;
    def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
              (VMOVSHDUPYrm addr:$src)>;
    def : Pat<(v8i32 (X86Movsldup VR256:$src)),
              (VMOVSLDUPYrr VR256:$src)>;
    def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
              (VMOVSLDUPYrm addr:$src)>;
  }
  // Integer-typed X86Movshdup / X86Movsldup for SSE3; loads fold via memop.
  let Predicates = [UseSSE3] in {
    def : Pat<(v4i32 (X86Movshdup VR128:$src)),
              (MOVSHDUPrr VR128:$src)>;
    def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
              (MOVSHDUPrm addr:$src)>;

    def : Pat<(v4i32 (X86Movsldup VR128:$src)),
              (MOVSLDUPrr VR128:$src)>;
    def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
              (MOVSLDUPrm addr:$src)>;
  }
  3983. //===---------------------------------------------------------------------===//
  3984. // SSE3 - Replicate Double FP - MOVDDUP
  3985. //===---------------------------------------------------------------------===//
  /// sse3_replicate_dfp - 128-bit X86Movddup on opcode 0x12. The rm form takes
  /// an f64mem operand and matches a scalar_to_vector of an f64 load.
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   sched     - scheduling widths; the XMM entry is used
  multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
    def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
             Sched<[sched.XMM]>;

    def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst,
                        (v2f64 (X86Movddup
                                   (scalar_to_vector
                                       (loadf64 addr:$src)))))]>,
             Sched<[sched.XMM.Folded]>;
  }
  // FIXME: Merge with sse3_replicate_dfp when there are patterns for the ymm
  // version.
  /// sse3_replicate_dfp_y - 256-bit X86Movddup on opcode 0x12. The rm form
  /// takes an f256mem operand and matches a full loadv4f64.
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   sched     - scheduling widths; the YMM entry is used
  multiclass sse3_replicate_dfp_y<string OpcodeStr,
                                  X86SchedWriteWidths sched> {
    def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
             Sched<[sched.YMM]>;

    def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR256:$dst,
                        (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
             Sched<[sched.YMM.Folded]>;
  }
  // AVX MOVDDUP, 128-bit and 256-bit (VEX_L).
  let Predicates = [HasAVX, NoVLX] in {
    defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                     VEX, VEX_WIG;
    defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                     VEX, VEX_L, VEX_WIG;
  }

  // Non-VEX MOVDDUP.
  defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
  // Fold a zero-extending 64-bit load into the memory form of VMOVDDUP.
  // The enclosing `let Predicates` supplies the complete predicate list: a
  // top-level `let` overrides field values inherited from parent classes, so
  // a per-pattern Requires<[HasAVX]> would have no effect and is not attached.
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
              (VMOVDDUPrm addr:$src)>;
  }
  // Fold a zero-extending 64-bit load into the memory form of MOVDDUP.
  let Predicates = [UseSSE3] in {
    def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
              (MOVDDUPrm addr:$src)>;
  }
  4025. //===---------------------------------------------------------------------===//
  4026. // SSE3 - Move Unaligned Integer
  4027. //===---------------------------------------------------------------------===//
  // AVX LDDQU (opcode 0xF0), selected from the ldu_dq intrinsics.
  let Predicates = [HasAVX] in {
    def VLDDQUrm : S3DI<0xF0, MRMSrcMem,
                        (outs VR128:$dst), (ins i128mem:$src),
                        "vlddqu\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;

    def VLDDQUYrm : S3DI<0xF0, MRMSrcMem,
                         (outs VR256:$dst), (ins i256mem:$src),
                         "vlddqu\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                               (int_x86_avx_ldu_dq_256 addr:$src))]>,
                    Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                    VEX, VEX_L, VEX_WIG;
  } // Predicates
  // Non-VEX LDDQU (opcode 0xF0), selected from int_x86_sse3_ldu_dq.
  def LDDQUrm : S3DI<0xF0, MRMSrcMem,
                     (outs VR128:$dst), (ins i128mem:$src),
                     "lddqu\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                Sched<[SchedWriteVecMoveLS.XMM.RM]>;
  4042. //===---------------------------------------------------------------------===//
  4043. // SSE3 - Arithmetic
  4044. //===---------------------------------------------------------------------===//
  /// sse3_addsub - rr and rm forms of X86Addsub on opcode 0xD0. Both forms
  /// read MXCSR and are marked mayRaiseFPException.
  ///   OpcodeStr - mnemonic
  ///   vt / RC   - value type and register class of result and sources
  ///   x86memop  - memory operand type of the rm form
  ///   sched     - scheduling class (Folded + ReadAfterFold for rm)
  ///   ld_frag   - load fragment folded by the rm pattern
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                         X86MemOperand x86memop,
                         X86FoldableSchedWrite sched,
                         PatFrag ld_frag, bit Is2Addr = 1> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      def rr : I<0xD0, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;

      def rm : I<0xD0, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  // AVX ADDSUBPS/ADDSUBPD in three-operand form (Is2Addr = 0), 128- and
  // 256-bit.
  let Predicates = [HasAVX] in {
    let ExeDomain = SSEPackedSingle in {
      defm VADDSUBPS  : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                    SchedWriteFAddSizes.PS.XMM,
                                    loadv4f32, 0>,
                        XD, VEX_4V, VEX_WIG;
      defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                    SchedWriteFAddSizes.PS.YMM,
                                    loadv8f32, 0>,
                        XD, VEX_4V, VEX_L, VEX_WIG;
    }

    let ExeDomain = SSEPackedDouble in {
      defm VADDSUBPD  : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                    SchedWriteFAddSizes.PD.XMM,
                                    loadv2f64, 0>,
                        PD, VEX_4V, VEX_WIG;
      defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                    SchedWriteFAddSizes.PD.YMM,
                                    loadv4f64, 0>,
                        PD, VEX_4V, VEX_L, VEX_WIG;
    }
  }
  // SSE3 ADDSUBPS/ADDSUBPD: two-address form with $src1 tied to $dst.
  let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
    let ExeDomain = SSEPackedSingle in
    defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                                SchedWriteFAddSizes.PS.XMM, memopv4f32>,
                    XD;

    let ExeDomain = SSEPackedDouble in
    defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                                SchedWriteFAddSizes.PD.XMM, memopv2f64>,
                    PD;
  }
  4091. //===---------------------------------------------------------------------===//
  4092. // SSE3 Instructions
  4093. //===---------------------------------------------------------------------===//
  4094. // Horizontal ops
  /// S3D_Int - rr and rm forms of a binary node built on the S3DI class.
  /// Both forms read MXCSR and are marked mayRaiseFPException.
  ///   o         - opcode byte
  ///   OpcodeStr - mnemonic
  ///   vt / RC   - value type and register class of result and sources
  ///   x86memop  - memory operand type of the rm form
  ///   OpNode    - SDNode matched by the patterns
  ///   sched     - scheduling class (Folded + ReadAfterFold for rm)
  ///   ld_frag   - load fragment folded by the rm pattern
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt,
                     RegisterClass RC, X86MemOperand x86memop,
                     SDNode OpNode, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      def rr : S3DI<o, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, RC:$src2),
                    !strconcat(OpcodeStr,
                               !if(Is2Addr,
                                   "\t{$src2, $dst|$dst, $src2}",
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;

      def rm : S3DI<o, MRMSrcMem,
                    (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                               !if(Is2Addr,
                                   "\t{$src2, $dst|$dst, $src2}",
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    [(set RC:$dst,
                          (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  /// S3_Int - rr and rm forms of a binary node built on the S3I class.
  /// Both forms read MXCSR and are marked mayRaiseFPException.
  ///   o         - opcode byte
  ///   OpcodeStr - mnemonic
  ///   vt / RC   - value type and register class of result and sources
  ///   x86memop  - memory operand type of the rm form
  ///   OpNode    - SDNode matched by the patterns
  ///   sched     - scheduling class (Folded + ReadAfterFold for rm)
  ///   ld_frag   - load fragment folded by the rm pattern
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt,
                    RegisterClass RC, X86MemOperand x86memop,
                    SDNode OpNode, X86FoldableSchedWrite sched,
                    PatFrag ld_frag, bit Is2Addr = 1> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      def rr : S3I<o, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;

      def rm : S3I<o, MRMSrcMem,
                   (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set RC:$dst,
                         (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  // AVX horizontal add/sub (0x7C / 0x7D) in three-operand form. The PS
  // variants use S3D_Int and the PD variants use S3_Int.
  let Predicates = [HasAVX] in {
    let ExeDomain = SSEPackedSingle in {
      defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                              X86fhadd, WriteFHAdd, loadv4f32, 0>,
                      VEX_4V, VEX_WIG;
      defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                              X86fhsub, WriteFHAdd, loadv4f32, 0>,
                      VEX_4V, VEX_WIG;
      defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                              X86fhadd, WriteFHAddY, loadv8f32, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
      defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                              X86fhsub, WriteFHAddY, loadv8f32, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
    }

    let ExeDomain = SSEPackedDouble in {
      defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                             X86fhadd, WriteFHAdd, loadv2f64, 0>,
                      VEX_4V, VEX_WIG;
      defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                             X86fhsub, WriteFHAdd, loadv2f64, 0>,
                      VEX_4V, VEX_WIG;
      defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                             X86fhadd, WriteFHAddY, loadv4f64, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
      defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                             X86fhsub, WriteFHAddY, loadv4f64, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
    }
  }
  // Non-VEX horizontal add/sub: two-address form with $src1 tied to $dst.
  let Constraints = "$src1 = $dst" in {
    let ExeDomain = SSEPackedSingle in {
      defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, memopv4f32>;
      defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, memopv4f32>;
    }

    let ExeDomain = SSEPackedDouble in {
      defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, memopv2f64>;
      defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, memopv2f64>;
    }
  }
  4169. //===---------------------------------------------------------------------===//
  4170. // SSSE3 - Packed Absolute Instructions
  4171. //===---------------------------------------------------------------------===//
  /// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
  /// Produces 128-bit rr and rm records.
  ///   opc       - opcode byte
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   vt        - result value type
  ///   OpNode    - SDNode matched by the patterns
  ///   sched     - scheduling widths; the XMM entry is used
  ///   ld_frag   - load fragment folded by the rm pattern
  multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched,
                          PatFrag ld_frag> {
    def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
             Sched<[sched.XMM]>;

    def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst,
                         (vt (OpNode (ld_frag addr:$src))))]>,
             Sched<[sched.XMM.Folded]>;
  }
  /// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
  /// Produces 256-bit Yrr and Yrm records; the Yrm pattern folds a plain
  /// `load`.
  ///   opc       - opcode byte
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   vt        - result value type
  ///   OpNode    - SDNode matched by the patterns
  ///   sched     - scheduling widths; the YMM entry is used
  multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                            SDNode OpNode, X86SchedWriteWidths sched> {
    def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    OpcodeStr # "\t{$src, $dst|$dst, $src}",
                    [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
              Sched<[sched.YMM]>;

    def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    OpcodeStr # "\t{$src, $dst|$dst, $src}",
                    [(set VR256:$dst,
                          (vt (OpNode (load addr:$src))))]>,
              Sched<[sched.YMM.Folded]>;
  }
  // AVX 128-bit PABS. The byte/word forms use NoVLX_Or_NoBWI and the dword
  // form uses NoVLX.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs,
                               SchedWriteVecALU, load>,
                  VEX, VEX_WIG;
    defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs,
                               SchedWriteVecALU, load>,
                  VEX, VEX_WIG;
  }
  let Predicates = [HasAVX, NoVLX] in {
    defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs,
                               SchedWriteVecALU, load>,
                  VEX, VEX_WIG;
  }

  // AVX2 256-bit PABS (Yrr / Yrm records under the same defm names).
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs,
                                 SchedWriteVecALU>,
                  VEX, VEX_L, VEX_WIG;
    defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs,
                                 SchedWriteVecALU>,
                  VEX, VEX_L, VEX_WIG;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs,
                                 SchedWriteVecALU>,
                  VEX, VEX_L, VEX_WIG;
  }
  // Non-VEX PABSB/PABSW/PABSD; the rm forms fold loads through memop.
  defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs,
                            SchedWriteVecALU, memop>;
  defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs,
                            SchedWriteVecALU, memop>;
  defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs,
                            SchedWriteVecALU, memop>;
  4228. //===---------------------------------------------------------------------===//
  4229. // SSSE3 - Packed Binary Operator Instructions
  4230. //===---------------------------------------------------------------------===//
  /// SS3I_binop_rm - Simple SSSE3 bin op
  ///   opc        - opcode byte
  ///   OpcodeStr  - mnemonic
  ///   OpNode     - SDNode matched by the patterns
  ///   DstVT      - result value type
  ///   OpVT       - value type applied to the first source operand
  ///   RC         - register class of result and sources
  ///   memop_frag - load fragment folded by the rm pattern
  ///   x86memop   - memory operand type of the rm form
  ///   sched      - scheduling class (Folded + ReadAfterFold for rm)
  ///   Is2Addr    - 1 selects the two-operand asm string, 0 the three-operand
  /// Only the rr record is marked isCommutable here.
  multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType DstVT, ValueType OpVT,
                           RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop,
                           X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    let isCommutable = 1 in
    def rr : SS38I<opc, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set RC:$dst,
                         (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
             Sched<[sched]>;

    def rm : SS38I<opc, MRMSrcMem,
                   (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set RC:$dst,
                         (DstVT (OpNode (OpVT RC:$src1),
                                        (memop_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
  /// 128-bit records selected from an intrinsic.
  ///   opc       - opcode byte
  ///   OpcodeStr - mnemonic
  ///   IntId128  - intrinsic matched by the patterns
  ///   sched     - scheduling class (Folded + ReadAfterFold for rm)
  ///   ld_frag   - load fragment folded by the rm pattern
  ///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand
  /// Only the rr record is marked isCommutable here.
  multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId128,
                               X86FoldableSchedWrite sched,
                               PatFrag ld_frag, bit Is2Addr = 1> {
    let isCommutable = 1 in
    def rr : SS38I<opc, MRMSrcReg,
                   (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set VR128:$dst,
                         (IntId128 VR128:$src1, VR128:$src2))]>,
             Sched<[sched]>;

    def rm : SS38I<opc, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr,
                              !if(Is2Addr,
                                  "\t{$src2, $dst|$dst, $src2}",
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set VR128:$dst,
                         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  /// SS3I_binop_rm_int_y - 256-bit intrinsic-selected SSSE3 bin op. Always
  /// uses the three-operand asm string; the Yrm pattern folds a plain `load`.
  ///   opc       - opcode byte
  ///   OpcodeStr - mnemonic; the operand string is appended here
  ///   IntId256  - intrinsic matched by the patterns
  ///   sched     - scheduling class (Folded + ReadAfterFold for Yrm)
  /// Only the Yrr record is marked isCommutable here.
  multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId256,
                                 X86FoldableSchedWrite sched> {
    let isCommutable = 1 in
    def Yrr : SS38I<opc, MRMSrcReg,
                    (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                          (IntId256 VR256:$src1, VR256:$src2))]>,
              Sched<[sched]>;

    def Yrm : SS38I<opc, MRMSrcMem,
                    (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                          (IntId256 VR256:$src1, (load addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  // AVX 128-bit VPSHUFB / VPMADDUBSW / VPMULHRSW in three-operand form.
  let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    // Defined with isCommutable forced to 0.
    let isCommutable = 0 in {
      defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb,
                                      v16i8, v16i8, VR128, load, i128mem,
                                      SchedWriteVarShuffle.XMM, 0>,
                        VEX_4V, VEX_WIG;
      defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw,
                                      v8i16, v16i8, VR128, load, i128mem,
                                      SchedWriteVecIMul.XMM, 0>,
                        VEX_4V, VEX_WIG;
    }

    defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs,
                                   v8i16, v8i16, VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>,
                     VEX_4V, VEX_WIG;
  }
  // AVX 128-bit horizontal add/sub and PSIGN, all with isCommutable forced
  // to 0 and using the three-operand asm string.
  let ImmT = NoImm, Predicates = [HasAVX] in {
    let isCommutable = 0 in {
      // SDNode-selected horizontal ops.
      defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd,
                                   v8i16, v8i16, VR128, load, i128mem,
                                   SchedWritePHAdd.XMM, 0>,
                     VEX_4V, VEX_WIG;
      defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd,
                                   v4i32, v4i32, VR128, load, i128mem,
                                   SchedWritePHAdd.XMM, 0>,
                     VEX_4V, VEX_WIG;
      defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub,
                                   v8i16, v8i16, VR128, load, i128mem,
                                   SchedWritePHAdd.XMM, 0>,
                     VEX_4V, VEX_WIG;
      defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub,
                                   v4i32, v4i32, VR128, load, i128mem,
                                   SchedWritePHAdd.XMM, 0>,
                     VEX_4V, VEX_WIG;

      // Intrinsic-selected ops.
      defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
                                       SchedWriteVecALU.XMM, load, 0>,
                     VEX_4V, VEX_WIG;
      defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
                                       SchedWriteVecALU.XMM, load, 0>,
                     VEX_4V, VEX_WIG;
      defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
                                       SchedWriteVecALU.XMM, load, 0>,
                     VEX_4V, VEX_WIG;
      defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                        int_x86_ssse3_phadd_sw_128,
                                        SchedWritePHAdd.XMM, load, 0>,
                      VEX_4V, VEX_WIG;
      defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                        int_x86_ssse3_phsub_sw_128,
                                        SchedWritePHAdd.XMM, load, 0>,
                      VEX_4V, VEX_WIG;
    }
  }
  // AVX2 256-bit VPSHUFB / VPMADDUBSW / VPMULHRSW in three-operand form.
  let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    // Defined with isCommutable forced to 0.
    let isCommutable = 0 in {
      defm VPSHUFBY    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb,
                                       v32i8, v32i8, VR256, load, i256mem,
                                       SchedWriteVarShuffle.YMM, 0>,
                         VEX_4V, VEX_L, VEX_WIG;
      defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw,
                                       v16i16, v32i8, VR256, load, i256mem,
                                       SchedWriteVecIMul.YMM, 0>,
                         VEX_4V, VEX_L, VEX_WIG;
    }

    defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs,
                                    v16i16, v16i16, VR256, load, i256mem,
                                    SchedWriteVecIMul.YMM, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  }
  // 256-bit (VEX_L) horizontal add/subtract and sign instructions, guarded by
  // HasAVX2. Every record in this group gets ImmT = NoImm and
  // isCommutable = 0, so both settings are applied by a single let.
  let ImmT = NoImm, Predicates = [HasAVX2], isCommutable = 0 in {
    defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;

    // Intrinsic-based forms, instantiated through the _y multiclass.
    defm VPSIGNB  : SS3I_binop_rm_int_y<0x08, "vpsignb",
                                        int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPSIGNW  : SS3I_binop_rm_int_y<0x09, "vpsignw",
                                        int_x86_avx2_psign_w,
                                        SchedWriteVecALU.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPSIGND  : SS3I_binop_rm_int_y<0x0A, "vpsignd",
                                        int_x86_avx2_psign_d,
                                        SchedWriteVecALU.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        SchedWritePHAdd.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        SchedWritePHAdd.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
  }
  // Legacy-encoded (non-VEX) SSSE3 instructions. The destination is tied to
  // the first source via Constraints, and memory operands use the memop
  // fragment. PMULHRSW is the only record defined outside the
  // isCommutable = 0 region.
  4375. // None of these have i8 immediate fields.
  4376. let ImmT = NoImm, Constraints = "$src1 = $dst" in {
  4377. let isCommutable = 0 in {
  4378. defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
  4379. memop, i128mem, SchedWritePHAdd.XMM>;
  4380. defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
  4381. memop, i128mem, SchedWritePHAdd.XMM>;
  4382. defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
  4383. memop, i128mem, SchedWritePHAdd.XMM>;
  4384. defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
  4385. memop, i128mem, SchedWritePHAdd.XMM>;
  4386. defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
  4387. SchedWriteVecALU.XMM, memop>;
  4388. defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
  4389. SchedWriteVecALU.XMM, memop>;
  4390. defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
  4391. SchedWriteVecALU.XMM, memop>;
  4392. defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
  4393. memop, i128mem, SchedWriteVarShuffle.XMM>;
  4394. defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
  4395. int_x86_ssse3_phadd_sw_128,
  4396. SchedWritePHAdd.XMM, memop>;
  4397. defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
  4398. int_x86_ssse3_phsub_sw_128,
  4399. SchedWritePHAdd.XMM, memop>;
  4400. defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
  4401. v16i8, VR128, memop, i128mem,
  4402. SchedWriteVecIMul.XMM>;
  4403. }
  4404. defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
  4405. VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
  4406. }
  4407. //===---------------------------------------------------------------------===//
  4408. // SSSE3 - Packed Align Instruction Patterns
  4409. //===---------------------------------------------------------------------===//
  /// ssse3_palignr - Register (rri) and memory (rmi) forms of a packed-align
  /// instruction matched through X86PAlignr with an i8 target immediate.
  ///   asm        - mnemonic placed in front of the operand string.
  ///   VT / RC    - result value type and register class.
  ///   memop_frag - load fragment used to fold the second source.
  ///   x86memop   - memory operand class of the rmi form.
  ///   sched      - scheduling class; the rmi form uses its Folded and
  ///                ReadAfterFold members.
  ///   Is2Addr    - when set, the assembly string omits $src1.
  multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                           PatFrag memop_frag, X86MemOperand x86memop,
                           X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    let hasSideEffects = 0 in {
      def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, u8imm:$src3),
                      !if(Is2Addr,
                          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                      [(set RC:$dst,
                            (VT (X86PAlignr RC:$src1, RC:$src2,
                                            (i8 timm:$src3))))]>,
                Sched<[sched]>;

      let mayLoad = 1 in {
        def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
                        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                        !if(Is2Addr,
                            asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                            asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                        [(set RC:$dst,
                              (VT (X86PAlignr RC:$src1,
                                              (memop_frag addr:$src2),
                                              (i8 timm:$src3))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    }
  }
  // 128-bit VEX form: three-operand assembly (Is2Addr = 0), plain load.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    defm VPALIGNR  : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                   SchedWriteShuffle.XMM, 0>,
                     VEX_4V, VEX_WIG;
  }

  // 256-bit VEX form.
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                   SchedWriteShuffle.YMM, 0>,
                     VEX_4V, VEX_L, VEX_WIG;
  }

  // Legacy form: $src1 is tied to $dst and the memory operand uses memop.
  let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in {
    defm PALIGNR   : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                                   SchedWriteShuffle.XMM>;
  }
  4444. //===---------------------------------------------------------------------===//
  4445. // SSE3 - Thread synchronization (MONITOR/MWAIT)
  4446. //===---------------------------------------------------------------------===//
  // MONITOR and MWAIT take no explicit operands; the registers they read are
  // declared through Uses. MONITOR has one record per mode, differing only in
  // EAX vs. RAX and in the mode predicate.
  let SchedRW = [WriteSystem] in {
    let Uses = [EAX, ECX, EDX] in {
      def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                         TB, Requires<[HasSSE3, Not64BitMode]>;
    }

    let Uses = [RAX, ECX, EDX] in {
      def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                         TB, Requires<[HasSSE3, In64BitMode]>;
    }

    let Uses = [ECX, EAX] in {
      def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                      [(int_x86_sse3_mwait ECX, EAX)]>,
                    TB, Requires<[HasSSE3]>;
    }
  } // SchedRW
  // Assembler aliases that accept mwait/monitor written with explicit register
  // operands and map them onto the operand-less records. The 32-bit register
  // spellings are accepted outside 64-bit mode, the 64-bit spellings in 64-bit
  // mode.
  4458. def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
  4459. def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
  4460. def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
  4461. Requires<[Not64BitMode]>;
  4462. def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
  4463. Requires<[In64BitMode]>;
  4464. //===----------------------------------------------------------------------===//
  4465. // SSE4.1 - Packed Move with Sign/Zero Extend
  4466. // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
  4467. //===----------------------------------------------------------------------===//
  /// SS41I_pmovx_rrrm - Pattern-less register (rr) and memory (rm) forms of a
  /// packed move-with-extend instruction.
  ///   MemOp - memory operand class of the rm form.
  ///   OutRC - destination register class.
  ///   InRC  - source register class of the rr form.
  ///   sched - scheduling class; the rm form uses sched.Folded.
  multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp,
                              RegisterClass OutRC, RegisterClass InRC,
                              X86FoldableSchedWrite sched> {
    def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                   OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
             Sched<[sched]>;

    def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                   OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
             Sched<[sched.Folded]>;
  }
  /// SS41I_pmovx_rm_all - Instantiates SS41I_pmovx_rrrm three times for one
  /// opcode: NAME (legacy), V#NAME (VEX, 128-bit destination) and V#NAME#Y
  /// (VEX_L, VR256 destination with a VR128 source). The VEX mnemonics are the
  /// legacy mnemonic prefixed with "v". prd is added to the predicate list of
  /// both VEX variants.
  multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                                X86MemOperand MemOp, X86MemOperand MemYOp,
                                Predicate prd> {
    defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                                 SchedWriteShuffle.XMM>;

    let Predicates = [HasAVX, prd] in {
      defm V#NAME : SS41I_pmovx_rrrm<opc, "v"#OpcodeStr, MemOp,
                                     VR128, VR128, SchedWriteShuffle.XMM>,
                    VEX, VEX_WIG;
    }

    let Predicates = [HasAVX2, prd] in {
      defm V#NAME#Y : SS41I_pmovx_rrrm<opc, "v"#OpcodeStr, MemYOp,
                                       VR256, VR128, WriteVPMOV256>,
                      VEX, VEX_L, VEX_WIG;
    }
  }
  /// SS41I_pmovx_rm - Creates the sign-extending (PMOVSX#NAME, opcode opc) and
  /// zero-extending (PMOVZX#NAME, opcode opc + 0x10) families for one element
  /// size pair. OpcodeStr is the mnemonic suffix appended to "pmovsx" and
  /// "pmovzx".
  multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr,
                            X86MemOperand MemOp, X86MemOperand MemYOp,
                            Predicate prd> {
    defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, "pmovsx"#OpcodeStr,
                                          MemOp, MemYOp, prd>;
    defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                          "pmovzx"#OpcodeStr,
                                          MemOp, MemYOp, prd>;
  }
  // One instantiation per source/destination element pair. Arguments are:
  // sign-extend opcode, mnemonic suffix, memory operand of the 128-bit form,
  // memory operand of the 256-bit form, and the extra predicate for the VEX
  // variants. Only BW uses NoVLX_Or_NoBWI; the rest use NoVLX.
  4500. defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
  4501. defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
  4502. defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
  4503. defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
  4504. defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
  4505. defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
  // SS41I_pmovx_avx2_patterns - Selection patterns that map extension nodes
  // and extending loads onto the 256-bit <OpcPrefix>*Y instructions.
  //   OpcPrefix - instruction name prefix pasted in front of e.g. BWYrr.
  //   ExtTy     - string pasted in front of "extloadvi8/16/32" to name the
  //               extending-load fragment.
  //   ExtOp     - node used when the source vector has as many elements as
  //               the result.
  //   InVecOp   - node used when only part of a 128-bit source is extended.
  // Byte-to-word patterns are guarded by NoVLX_Or_NoBWI, all others by NoVLX.
  4506. // AVX2 Patterns
  4507. multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
  4508. SDNode ExtOp, SDNode InVecOp> {
  4509. // Register-Register patterns
  4510. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4511. def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
  4512. (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  4513. }
  4514. let Predicates = [HasAVX2, NoVLX] in {
  4515. def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
  4516. (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  4517. def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
  4518. (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
  4519. def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
  4520. (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  4521. def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
  4522. (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
  4523. def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
  4524. (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  4525. }
  4526. // Simple Register-Memory patterns
  4527. let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  4528. def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4529. (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  4530. def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
  4531. (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  4532. }
  4533. let Predicates = [HasAVX2, NoVLX] in {
  4534. def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4535. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4536. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4537. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4538. def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4539. (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  4540. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4541. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4542. def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
  4543. (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  4544. }
  4545. // AVX2 Register-Memory patterns
  // These fold full-vector loads, scalar loads placed in a vector via
  // scalar_to_vector, and X86vzload nodes into the rm instruction forms.
  4546. let Predicates = [HasAVX2, NoVLX] in {
  4547. def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
  4548. (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  4549. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4550. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4551. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4552. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4553. def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
  4554. (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  4555. def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
  4556. (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  4557. def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4558. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4559. def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
  4560. (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  4561. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4562. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4563. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4564. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4565. def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
  4566. (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  4567. }
  4568. }
  // Instantiate the 256-bit patterns once for sign extension (sext /
  // sext_invec, "s" extending loads) and once for zero extension (zext /
  // zext_invec, "z" extending loads).
  4569. defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
  4570. defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
  // SS41I_pmovx_patterns - Selection patterns that map ExtOp on 128-bit
  // sources, and the matching extending loads, onto the 128-bit
  // <OpcPrefix>* instructions.
  //   OpcPrefix - instruction name prefix pasted in front of e.g. BWrr.
  //   ExtTy     - string pasted in front of "extloadvi8/16/32" to name the
  //               extending-load fragment.
  //   ExtOp     - extension node to match.
  // Byte-to-word patterns are guarded by NoVLX_Or_NoBWI, all others by NoVLX.
  4571. // SSE4.1/AVX patterns.
  4572. multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
  4573. SDNode ExtOp> {
  // Register-register patterns.
  4574. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4575. def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
  4576. (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  4577. }
  4578. let Predicates = [HasAVX, NoVLX] in {
  4579. def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
  4580. (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  4581. def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
  4582. (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
  4583. def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
  4584. (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  4585. def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
  4586. (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
  4587. def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
  4588. (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  4589. }
  // Extending-load fragment patterns.
  4590. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4591. def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4592. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4593. }
  4594. let Predicates = [HasAVX, NoVLX] in {
  4595. def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4596. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4597. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
  4598. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4599. def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4600. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4601. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
  4602. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4603. def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
  4604. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4605. }
  // Patterns folding scalar loads (via scalar_to_vector), X86vzload nodes and
  // full-vector loads into the rm instruction forms.
  4606. let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  4607. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4608. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4609. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4610. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4611. def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
  4612. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4613. def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
  4614. (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  4615. }
  4616. let Predicates = [HasAVX, NoVLX] in {
  4617. def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4618. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4619. def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
  4620. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4621. def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
  4622. (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  4623. def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
  4624. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4625. def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
  4626. (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  4627. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4628. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4629. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4630. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4631. def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
  4632. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4633. def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
  4634. (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  4635. def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
  4636. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4637. def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
  4638. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4639. def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
  4640. (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  4641. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
  4642. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4643. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
  4644. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4645. def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
  4646. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4647. def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
  4648. (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  4649. }
  4650. }
  // Instantiate the 128-bit patterns for the VEX-encoded instructions, then
  // again for the legacy instructions inside a UseSSE41 predicate scope.
  // NOTE(review): presumably the enclosing let Predicates = [UseSSE41] replaces
  // the HasAVX-based predicates written inside the multiclass - confirm.
  4651. defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
  4652. defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
  4653. let Predicates = [UseSSE41] in {
  4654. defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  4655. defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
  4656. }
  4657. //===----------------------------------------------------------------------===//
  4658. // SSE4.1 - Extract Instructions
  4659. //===----------------------------------------------------------------------===//
  /// SS41I_extract8 - SSE 4.1 extract 8 bits to a GR32/GR64 register (rr) or
  /// to an 8-bit memory location (mr). Both forms match X86pextrb on a v16i8
  /// source with a target-immediate index.
  multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                     (ins VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst,
                           (X86pextrb (v16i8 VR128:$src1), timm:$src2))]>,
             Sched<[WriteVecExtract]>;

    // Store form: the extracted value is truncated to i8 before the store.
    let hasSideEffects = 0, mayStore = 1 in {
      def mr : SS4AIi8<opc, MRMDestMem, (outs),
                       (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1),
                                                     timm:$src2))),
                               addr:$dst)]>,
               Sched<[WriteVecExtractSt]>;
    }
  }
  // VEX form is limited to HasAVX with NoBWI; the legacy form carries no
  // predicate of its own here.
  let Predicates = [HasAVX, NoBWI] in {
    defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
  }

  defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
  /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination.
  /// rr_REV is a pattern-less register form marked isCodeGenOnly and
  /// ForceDisassemble, linked to NAME#rr through FoldGenData. mr stores the
  /// X86pextrw result truncated to i16.
  multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
    let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
      def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                           (ins VR128:$src1, u8imm:$src2),
                           OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                           []>,
                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
    }

    let hasSideEffects = 0, mayStore = 1 in {
      def mr : SS4AIi8<opc, MRMDestMem, (outs),
                       (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1),
                                                      timm:$src2))),
                               addr:$dst)]>,
               Sched<[WriteVecExtractSt]>;
    }
  }
  // VEX form is limited to HasAVX with NoBWI; the legacy form carries no
  // predicate of its own here.
  let Predicates = [HasAVX, NoBWI] in {
    defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
  }

  defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
  /// SS41I_extract32 - SSE 4.1 extract 32 bits to a GR32 register (rr) or to
  /// a 32-bit memory location (mr). Both forms match extractelt on a v4i32
  /// source with an immediate index.
  multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                     (ins VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32:$dst,
                           (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
             Sched<[WriteVecExtract]>;

    def mr : SS4AIi8<opc, MRMDestMem, (outs),
                     (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                             addr:$dst)]>,
             Sched<[WriteVecExtractSt]>;
  }
  // VEX form is limited to HasAVX with NoDQI; the legacy form carries no
  // predicate of its own here.
  let Predicates = [HasAVX, NoDQI] in {
    defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
  }

  defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
  /// SS41I_extract64 - SSE 4.1 extract 64 bits to a GR64 register (rr) or to
  /// a 64-bit memory location (mr). Both forms match extractelt on a v2i64
  /// source with an immediate index.
  multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                     (ins VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR64:$dst,
                           (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
             Sched<[WriteVecExtract]>;

    def mr : SS4AIi8<opc, MRMDestMem, (outs),
                     (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                             addr:$dst)]>,
             Sched<[WriteVecExtractSt]>;
  }
  // Same opcode (0x16) as the 32-bit extract; the VEX form adds VEX_W and the
  // legacy form adds REX_W.
  let Predicates = [HasAVX, NoDQI] in {
    defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
  }

  defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
  /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
  /// destination. The v4f32 source is bitcast to v4i32 (bc_v4i32) and the
  /// selected element is matched with extractelt.
  multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                     (ins VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst,
                           (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                       imm:$src2))]>,
             Sched<[WriteVecExtract]>;

    def mr : SS4AIi8<opc, MRMDestMem, (outs),
                     (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                         imm:$src2),
                             addr:$dst)]>,
             Sched<[WriteVecExtractSt]>;
  }
  // Both encodings are placed in the SSEPackedSingle execution domain; only
  // the VEX form is given a predicate (UseAVX) here.
  let ExeDomain = SSEPackedSingle in {
    let Predicates = [UseAVX] in {
      defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
    }

    defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
  }
  4759. //===----------------------------------------------------------------------===//
  4760. // SSE4.1 - Insert Instructions
  4761. //===----------------------------------------------------------------------===//
  /// SS41I_insert8 - Insert a byte taken from a GR32/GR64 register (rr) or
  /// from an 8-bit memory location loaded with extloadi8 (rm), matched
  /// through X86pinsrb. Is2Addr selects the assembly string without $src1.
  multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (X86pinsrb VR128:$src1, GR32orGR64:$src2,
                                      timm:$src3))]>,
             Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                                      timm:$src3))]>,
             Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  }
  // VEX form: three-operand assembly (Is2Addr = 0), HasAVX with NoBWI.
  let Predicates = [HasAVX, NoBWI] in {
    defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  }

  // Legacy form: $src1 is tied to $dst.
  let Constraints = "$src1 = $dst" in {
    defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
  }
  /// SS41I_insert32 - Insert a 32-bit element taken from a GR32 register (rr)
  /// or loaded with loadi32 (rm) into a v4i32, matched through insertelt.
  /// Is2Addr selects the assembly string without $src1.
  multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (v4i32 (insertelt VR128:$src1, GR32:$src2,
                                             imm:$src3)))]>,
             Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (v4i32 (insertelt VR128:$src1,
                                             (loadi32 addr:$src2),
                                             imm:$src3)))]>,
             Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  }
  // VEX form: three-operand assembly (Is2Addr = 0), HasAVX with NoDQI.
  let Predicates = [HasAVX, NoDQI] in {
    defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
  }

  // Legacy form: $src1 is tied to $dst.
  let Constraints = "$src1 = $dst" in {
    defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
  }
  /// SS41I_insert64 - Insert a 64-bit element taken from a GR64 register (rr)
  /// or loaded with loadi64 (rm) into a v2i64, matched through insertelt.
  /// Is2Addr selects the assembly string without $src1.
  multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR64:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (v2i64 (insertelt VR128:$src1, GR64:$src2,
                                             imm:$src3)))]>,
             Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (v2i64 (insertelt VR128:$src1,
                                             (loadi64 addr:$src2),
                                             imm:$src3)))]>,
             Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
  }
  // Same opcode (0x22) as the 32-bit insert; the VEX form adds VEX_W and the
  // legacy form adds REX_W.
  let Predicates = [HasAVX, NoDQI] in {
    defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
  }

  // Legacy form: $src1 is tied to $dst.
  let Constraints = "$src1 = $dst" in {
    defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
  }
  /// SS41I_insertf32 - insertps register (rr) and memory (rm) forms, both
  /// matched through X86insertps with a target-immediate control byte. The rr
  /// form is marked commutable. The rm form folds a scalar f32 load that has
  /// been placed in a v4f32 via scalar_to_vector. Is2Addr selects the assembly
  /// string without $src1.
  multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
    let isCommutable = 1 in {
      def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                       !if(Is2Addr,
                           asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                       [(set VR128:$dst,
                             (X86insertps VR128:$src1, VR128:$src2,
                                          timm:$src3))]>,
               Sched<[SchedWriteFShuffle.XMM]>;
    }

    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
                     !if(Is2Addr,
                         asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set VR128:$dst,
                           (X86insertps VR128:$src1,
                                        (v4f32 (scalar_to_vector
                                                 (loadf32 addr:$src2))),
                                        timm:$src3))]>,
             Sched<[SchedWriteFShuffle.XMM.Folded,
                    SchedWriteFShuffle.XMM.ReadAfterFold]>;
  }
  // Both encodings are placed in the SSEPackedSingle execution domain. The
  // VEX form uses three-operand assembly under UseAVX; the legacy form ties
  // $src1 to $dst.
  let ExeDomain = SSEPackedSingle in {
    let Predicates = [UseAVX] in {
      defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                       VEX_4V, VEX_WIG;
    }

    let Constraints = "$src1 = $dst" in {
      defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
    }
  }
  4868. //===----------------------------------------------------------------------===//
  4869. // SSE4.1 - Round Instructions
  4870. //===----------------------------------------------------------------------===//
  /// sse41_fp_unop_p - Packed unary FP operation taking an immediate: a
  /// register form (r) and a memory form (m), both matching OpNode on VT with
  /// a target immediate. Both records list MXCSR in Uses and set
  /// mayRaiseFPException.
  ///   x86memop / mem_frag - memory operand class and load fragment of m.
  ///   RC / VT             - register class and value type.
  ///   sched               - scheduling class; m uses sched.Folded.
  multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                             X86MemOperand x86memop, RegisterClass RC,
                             ValueType VT, PatFrag mem_frag,
                             SDPatternOperator OpNode,
                             X86FoldableSchedWrite sched> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      // Register source.
      def r : SS4AIi8<opc, MRMSrcReg,
                      (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
              Sched<[sched]>;

      // Memory source.
      def m : SS4AIi8<opc, MRMSrcMem,
                      (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set RC:$dst,
                            (VT (OpNode (mem_frag addr:$src1),
                                        timm:$src2)))]>,
              Sched<[sched.Folded]>;
    }
  }
  /// avx_fp_unop_rm - Pattern-less, codegen-only scalar forms with three
  /// operands: SSr/SSm on FR32 (opcode opcss, "ss" suffix) and SDr/SDm on FR64
  /// (opcode opcsd, "sd" suffix). The memory forms are marked mayLoad and use
  /// sched.Folded / sched.ReadAfterFold.
  multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched> {
    let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
      def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
                        OpcodeStr#"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                        []>,
                Sched<[sched]>;

      let mayLoad = 1 in {
        def SSm : SS4AIi8<opcss, MRMSrcMem, (outs FR32:$dst),
                          (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
                          OpcodeStr#"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                          []>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    } // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1

    let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
      def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst),
                        (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
                        OpcodeStr#"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                        []>,
                Sched<[sched]>;

      let mayLoad = 1 in {
        def SDm : SS4AIi8<opcsd, MRMSrcMem, (outs FR64:$dst),
                          (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
                          OpcodeStr#"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                          []>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
      }
    } // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
  }
  /// sse41_fp_unop_s - Pattern-less, codegen-only scalar forms with two
  /// operands: SSr/SSm on FR32 (opcode opcss, "ss" suffix) and SDr/SDm on FR64
  /// (opcode opcsd, "sd" suffix). All four records list MXCSR in Uses and set
  /// mayRaiseFPException; the memory forms are marked mayLoad.
  multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                             string OpcodeStr, X86FoldableSchedWrite sched> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      let ExeDomain = SSEPackedSingle, hasSideEffects = 0,
          isCodeGenOnly = 1 in {
        def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst),
                          (ins FR32:$src1, i32u8imm:$src2),
                          OpcodeStr#"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                          []>,
                  Sched<[sched]>;

        let mayLoad = 1 in {
          def SSm : SS4AIi8<opcss, MRMSrcMem, (outs FR32:$dst),
                            (ins f32mem:$src1, i32u8imm:$src2),
                            OpcodeStr#"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            []>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
        }
      } // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1

      let ExeDomain = SSEPackedDouble, hasSideEffects = 0,
          isCodeGenOnly = 1 in {
        def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst),
                          (ins FR64:$src1, i32u8imm:$src2),
                          OpcodeStr#"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                          []>,
                  Sched<[sched]>;

        let mayLoad = 1 in {
          def SDm : SS4AIi8<opcsd, MRMSrcMem, (outs FR64:$dst),
                            (ins f64mem:$src1, i32u8imm:$src2),
                            OpcodeStr#"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            []>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
        }
      } // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
    } // Uses = [MXCSR], mayRaiseFPException = 1
  }
  /// sse41_fp_binop_s - Scalar SSE4.1 binary FP operation with an 8-bit
  /// immediate, operating on whole VR128 registers ("_Int" forms).
  ///
  ///   opcss, opcsd - opcodes of the "ss" and "sd" forms.
  ///   OpcodeStr    - mnemonic stem; "ss" or "sd" is appended to it.
  ///   sched        - scheduling class; the load forms use sched.Folded and
  ///                  sched.ReadAfterFold.
  ///   VT32, VT64   - result types used in the register-form patterns.
  ///   OpNode       - SelectionDAG node matched by every pattern.
  ///   Is2Addr      - when set (the default) the asm string omits $src1.
  ///
  /// Every record reads MXCSR and may raise an FP exception.
  multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                              string OpcodeStr, X86FoldableSchedWrite sched,
                              ValueType VT32, ValueType VT64,
                              SDNode OpNode, bit Is2Addr = 1> {
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      // Single-precision forms.
      let ExeDomain = SSEPackedSingle in {
        def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
              !strconcat(OpcodeStr,
                !if(Is2Addr,
                    "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
              [(set VR128:$dst,
                    (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
            Sched<[sched]>;

        def SSm_Int : SS4AIi8<opcss, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
              !strconcat(OpcodeStr,
                !if(Is2Addr,
                    "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
              [(set VR128:$dst,
                    (OpNode VR128:$src1, (sse_load_f32 addr:$src2),
                            timm:$src3))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
      } // ExeDomain = SSEPackedSingle

      // Double-precision forms.
      let ExeDomain = SSEPackedDouble in {
        def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
              !strconcat(OpcodeStr,
                !if(Is2Addr,
                    "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
              [(set VR128:$dst,
                    (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
            Sched<[sched]>;

        def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
              !strconcat(OpcodeStr,
                !if(Is2Addr,
                    "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
              [(set VR128:$dst,
                    (OpNode VR128:$src1, (sse_load_f64 addr:$src2),
                            timm:$src3))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
      } // ExeDomain = SSEPackedDouble
    } // Uses = [MXCSR], mayRaiseFPException = 1
  }
  // FP round - roundss, roundps, roundsd, roundpd
  //
  // VEX-encoded packed forms. The *Y variants are the VR256 versions and add
  // VEX_L. All of them read MXCSR and may raise an FP exception.
  let Predicates = [HasAVX, NoVLX], Uses = [MXCSR],
      mayRaiseFPException = 1 in {
    let ExeDomain = SSEPackedSingle in {
      // Intrinsic form
      defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                       loadv4f32, X86any_VRndScale,
                                       SchedWriteFRnd.XMM>,
                       VEX, VEX_WIG;
      defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                       loadv8f32, X86any_VRndScale,
                                       SchedWriteFRnd.YMM>,
                       VEX, VEX_L, VEX_WIG;
    } // ExeDomain = SSEPackedSingle

    let ExeDomain = SSEPackedDouble in {
      defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                       loadv2f64, X86any_VRndScale,
                                       SchedWriteFRnd.XMM>,
                       VEX, VEX_WIG;
      defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                       loadv4f64, X86any_VRndScale,
                                       SchedWriteFRnd.YMM>,
                       VEX, VEX_L, VEX_WIG;
    } // ExeDomain = SSEPackedDouble
  } // Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1
  // Scalar VEX-encoded round: instruction definitions followed by the
  // selection patterns for the FR32 / FR64 forms.
  let Predicates = [UseAVX] in {
    // VR128 "_Int" forms; Is2Addr = 0 gives the three-operand asm string.
    defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                   v4f32, v2f64, X86RndScales, 0>,
                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
    defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;

    // Register sources. The first instruction operand is left undefined
    // (IMPLICIT_DEF).
    def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
              (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
    def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
              (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
  } // Predicates = [UseAVX]

  // The load-folding forms are only selected when optimizing for size.
  let Predicates = [UseAVX, OptForSize] in {
    def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
              (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
    def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
              (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  } // Predicates = [UseAVX, OptForSize]
  // Legacy-encoded (non-VEX) round instructions.
  let ExeDomain = SSEPackedSingle in {
    defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                   memopv4f32, X86any_VRndScale,
                                   SchedWriteFRnd.XMM>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                   memopv2f64, X86any_VRndScale,
                                   SchedWriteFRnd.XMM>;
  }

  // Scalar FR32 / FR64 forms.
  defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

  // Scalar VR128 "_Int" forms; the destination is tied to $src1.
  let Constraints = "$src1 = $dst" in {
    defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales>;
  }

  // Selection patterns for register sources.
  let Predicates = [UseSSE41] in {
    def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
              (ROUNDSSr FR32:$src1, timm:$src2)>;
    def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
              (ROUNDSDr FR64:$src1, timm:$src2)>;
  } // Predicates = [UseSSE41]

  // The load-folding forms are only selected when optimizing for size.
  let Predicates = [UseSSE41, OptForSize] in {
    def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
              (ROUNDSSm addr:$src1, timm:$src2)>;
    def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
              (ROUNDSDm addr:$src1, timm:$src2)>;
  } // Predicates = [UseSSE41, OptForSize]
  5064. //===----------------------------------------------------------------------===//
  5065. // SSE4.1 - Packed Bit Test
  5066. //===----------------------------------------------------------------------===//
  // X86ISelLowering lowers to the X86ptest node matched below, primarily from
  // the corresponding Intel intrinsic.
  //
  // VEX-encoded forms. They have no register result: each one only defines
  // EFLAGS. The memory operands use the integer operand classes (i128mem /
  // i256mem), in line with the v2i64 / v4i64 loads in the patterns.
  let Defs = [EFLAGS], Predicates = [HasAVX] in {
    def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs),
                         (ins VR128:$src1, VR128:$src2),
                         "vptest\t{$src2, $src1|$src1, $src2}",
                         [(set EFLAGS,
                               (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                   Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
    def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs),
                         (ins VR128:$src1, i128mem:$src2),
                         "vptest\t{$src2, $src1|$src1, $src2}",
                         [(set EFLAGS,
                               (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                   Sched<[SchedWriteVecTest.XMM.Folded,
                          SchedWriteVecTest.XMM.ReadAfterFold]>,
                   VEX, VEX_WIG;

    // 256-bit forms.
    def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs),
                          (ins VR256:$src1, VR256:$src2),
                          "vptest\t{$src2, $src1|$src1, $src2}",
                          [(set EFLAGS,
                                (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                    Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
    def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs),
                          (ins VR256:$src1, i256mem:$src2),
                          "vptest\t{$src2, $src1|$src1, $src2}",
                          [(set EFLAGS,
                                (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                    Sched<[SchedWriteVecTest.YMM.Folded,
                           SchedWriteVecTest.YMM.ReadAfterFold]>,
                    VEX, VEX_L, VEX_WIG;
  } // Defs = [EFLAGS], Predicates = [HasAVX]
  // Legacy-encoded ptest. Neither form has a register result; both only define
  // EFLAGS. The memory form uses the integer operand class i128mem, in line
  // with the v2i64 load in its pattern.
  let Defs = [EFLAGS] in {
    def PTESTrr : SS48I<0x17, MRMSrcReg, (outs),
                        (ins VR128:$src1, VR128:$src2),
                        "ptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                  Sched<[SchedWriteVecTest.XMM]>;
    def PTESTrm : SS48I<0x17, MRMSrcMem, (outs),
                        (ins VR128:$src1, i128mem:$src2),
                        "ptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                  Sched<[SchedWriteVecTest.XMM.Folded,
                         SchedWriteVecTest.XMM.ReadAfterFold]>;
  } // Defs = [EFLAGS]
  // The bit test instructions below are AVX only
  //
  // avx_bittest - register/register and register/memory forms of a VEX-encoded
  // test whose only result is EFLAGS (matched through X86testp).
  //
  //   opc       - opcode byte.
  //   OpcodeStr - full mnemonic.
  //   RC        - register class of both sources.
  //   x86memop  - memory operand class of the folded second source.
  //   mem_frag  - load fragment used in the memory pattern.
  //   vt        - value type given to the second register source.
  //   sched     - scheduling class; "rm" uses sched.Folded / sched.ReadAfterFold.
  //
  // The multiclass does not set Defs itself; the instantiation is expected to
  // wrap it in "let Defs = [EFLAGS]".
  multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                         X86FoldableSchedWrite sched> {
    def rr : SS48I<opc, MRMSrcReg, (outs),
                   (ins RC:$src1, RC:$src2),
                   OpcodeStr # "\t{$src2, $src1|$src1, $src2}",
                   [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
             Sched<[sched]>, VEX;
    def rm : SS48I<opc, MRMSrcMem, (outs),
                   (ins RC:$src1, x86memop:$src2),
                   OpcodeStr # "\t{$src2, $src1|$src1, $src2}",
                   [(set EFLAGS,
                         (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
  }
  // vtestps / vtestpd in 128-bit and 256-bit (VEX_L) flavours. All of them
  // define EFLAGS.
  let Predicates = [HasAVX], Defs = [EFLAGS] in {
    let ExeDomain = SSEPackedSingle in {
      defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem,
                                  loadv4f32, v4f32, SchedWriteFTest.XMM>;
      defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem,
                                  loadv8f32, v8f32, SchedWriteFTest.YMM>,
                      VEX_L;
    } // ExeDomain = SSEPackedSingle

    let ExeDomain = SSEPackedDouble in {
      defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem,
                                  loadv2f64, v2f64, SchedWriteFTest.XMM>;
      defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem,
                                  loadv4f64, v4f64, SchedWriteFTest.YMM>,
                      VEX_L;
    } // ExeDomain = SSEPackedDouble
  } // Predicates = [HasAVX], Defs = [EFLAGS]
  5126. //===----------------------------------------------------------------------===//
  5127. // SSE4.1 - Misc Instructions
  5128. //===----------------------------------------------------------------------===//
  // popcnt for 16-, 32- and 64-bit GPRs, each with a register and a memory
  // source. Every form selects ctpop and also defines EFLAGS (declared through
  // Defs and the "implicit" entry in the pattern).
  let Predicates = [HasPOPCNT], Defs = [EFLAGS] in {
    // 16-bit
    def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                       "popcnt{w}\t{$src, $dst|$dst, $src}",
                       [(set GR16:$dst, (ctpop GR16:$src)),
                        (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
    def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                       "popcnt{w}\t{$src, $dst|$dst, $src}",
                       [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                        (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

    // 32-bit
    def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                       "popcnt{l}\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (ctpop GR32:$src)),
                        (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;
    def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                       "popcnt{l}\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                        (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

    // 64-bit (RI instead of I, no OpSize modifier)
    def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                        "popcnt{q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (ctpop GR64:$src)),
                         (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, XS;
    def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                        "popcnt{q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                         (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, XS;
  } // Predicates = [HasPOPCNT], Defs = [EFLAGS]
  // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
  //
  //   opc       - opcode byte.
  //   OpcodeStr - full mnemonic.
  //   OpNode    - SelectionDAG node matched by both forms.
  //   ld_frag   - load fragment used by the memory form.
  //   Sched     - scheduling class; the memory form uses Sched.Folded. Inside
  //               the body "Sched<[...]>" is the class and the bracketed
  //               "Sched" is this parameter.
  multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, PatFrag ld_frag,
                                   X86FoldableSchedWrite Sched> {
    def rr : SS48I<opc, MRMSrcReg,
                   (outs VR128:$dst), (ins VR128:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst,
                         (v8i16 (OpNode (v8i16 VR128:$src))))]>,
             Sched<[Sched]>;
    def rm : SS48I<opc, MRMSrcMem,
                   (outs VR128:$dst), (ins i128mem:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst,
                         (v8i16 (OpNode (ld_frag addr:$src))))]>,
             Sched<[Sched.Folded]>;
  }
  // PHMIN is described as having the same profile as PSAD and therefore sharing
  // its scheduling model, although the naming is misleading.
  // NOTE(review): both instantiations below reference WritePHMINPOS; confirm
  // how that class relates to the PSAD scheduling model.

  // VEX-encoded form; the HasAVX predicate applies to this defm only.
  let Predicates = [HasAVX] in {
    defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                             X86phminpos, load,
                                             WritePHMINPOS>,
                       VEX, VEX_WIG;
  }

  // Legacy-encoded form (uses memop instead of load).
  defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                          X86phminpos, memop,
                                          WritePHMINPOS>;
  /// SS48I_binop_rm - Simple SSE41 binary operator.
  ///
  ///   opc        - opcode byte.
  ///   OpcodeStr  - full mnemonic.
  ///   OpNode     - SelectionDAG node matched by both forms.
  ///   OpVT       - result value type.
  ///   RC         - register class of the result and register sources.
  ///   memop_frag - load fragment used by the "rm" form.
  ///   x86memop   - memory operand class of the "rm" form.
  ///   sched      - scheduling class; "rm" uses sched.Folded /
  ///                sched.ReadAfterFold.
  ///   Is2Addr    - when set (the default) the asm string omits $src1.
  ///
  /// Only the register form is marked commutable.
  multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            ValueType OpVT, RegisterClass RC,
                            PatFrag memop_frag, X86MemOperand x86memop,
                            X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    let isCommutable = 1 in {
      def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr,
                       !if(Is2Addr,
                           "\t{$src2, $dst|$dst, $src2}",
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                     [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;
    }

    def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2),
                   !strconcat(OpcodeStr,
                     !if(Is2Addr,
                         "\t{$src2, $dst|$dst, $src2}",
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                   [(set RC:$dst,
                         (OpVT (OpNode RC:$src1,
                                       (memop_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  // 128-bit VEX-encoded integer min/max and pmuldq. Is2Addr = 0 gives the
  // three-operand asm string.

  // Dword element operations and pmuldq.
  let Predicates = [HasAVX, NoVLX] in {
    defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                   VEX_4V, VEX_WIG;
  } // Predicates = [HasAVX, NoVLX]

  // Byte and word element operations.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
    defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;
  } // Predicates = [HasAVX, NoVLX_Or_NoBWI]
  // 256-bit (VEX_L) integer min/max and pmuldq. Is2Addr = 0 gives the
  // three-operand asm string.

  // Dword element operations and pmuldq.
  let Predicates = [HasAVX2, NoVLX] in {
    defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                   load, i256mem, SchedWriteVecIMul.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  } // Predicates = [HasAVX2, NoVLX]

  // Byte and word element operations.
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  } // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
  // Legacy-encoded integer min/max and pmuldq. The destination is tied to
  // $src1 and Is2Addr = 1 selects the two-operand asm string.
  let Constraints = "$src1 = $dst" in {
    // Signed / unsigned minimum.
    defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;

    // Signed / unsigned maximum.
    defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;
    defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, memop,
                                 i128mem, SchedWriteVecALU.XMM, 1>;

    defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
  } // Constraints = "$src1 = $dst"
  // pmulld / pcmpeqq: VEX-encoded 128-bit and 256-bit forms, then the
  // legacy-encoded forms. Note that the vpcmpeqq predicates carry no NoVLX.
  let Predicates = [HasAVX, NoVLX] in {
    defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, load,
                                  i128mem, SchedWritePMULLD.XMM, 0>,
                   VEX_4V, VEX_WIG;
  }
  let Predicates = [HasAVX] in {
    defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                   load, i128mem, SchedWriteVecALU.XMM, 0>,
                    VEX_4V, VEX_WIG;
  }

  let Predicates = [HasAVX2, NoVLX] in {
    defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, load,
                                   i256mem, SchedWritePMULLD.YMM, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  }
  let Predicates = [HasAVX2] in {
    defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                    load, i256mem, SchedWriteVecALU.YMM, 0>,
                     VEX_4V, VEX_L, VEX_WIG;
  }

  // Legacy encodings: destination tied to $src1, two-operand asm string.
  let Constraints = "$src1 = $dst" in {
    defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, memop,
                                  i128mem, SchedWritePMULLD.XMM, 1>;
    defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                  memop, i128mem, SchedWriteVecALU.XMM, 1>;
  } // Constraints = "$src1 = $dst"
  /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
  ///
  /// Intrinsic-based variant: both patterns match IntId directly.
  ///
  ///   opc        - opcode byte.
  ///   OpcodeStr  - full mnemonic.
  ///   IntId      - intrinsic matched by both forms.
  ///   RC         - register class of the result and register sources.
  ///   memop_frag - load fragment used by the "rmi" form.
  ///   x86memop   - memory operand class of the "rmi" form.
  ///   Is2Addr    - when set the asm string omits $src1.
  ///   sched      - scheduling class; "rmi" uses sched.Folded /
  ///                sched.ReadAfterFold.
  ///
  /// "rri" is marked commutable here; an enclosing let can override that.
  multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, RegisterClass RC,
                                 PatFrag memop_frag, X86MemOperand x86memop,
                                 bit Is2Addr, X86FoldableSchedWrite sched> {
    let isCommutable = 1 in {
      def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            !strconcat(OpcodeStr,
              !if(Is2Addr,
                  "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
            [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
          Sched<[sched]>;
    }

    def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
          (ins RC:$src1, x86memop:$src2, u8imm:$src3),
          !strconcat(OpcodeStr,
            !if(Is2Addr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set RC:$dst,
                (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
  ///
  /// SDNode-based variant: both patterns match OpNode and type the result as
  /// OpVT.
  ///
  ///   opc        - opcode byte.
  ///   OpcodeStr  - full mnemonic.
  ///   OpNode     - SelectionDAG node matched by both forms.
  ///   OpVT       - result value type.
  ///   RC         - register class of the result and register sources.
  ///   memop_frag - load fragment used by the "rmi" form.
  ///   x86memop   - memory operand class of the "rmi" form.
  ///   Is2Addr    - when set the asm string omits $src1.
  ///   sched      - scheduling class; "rmi" uses sched.Folded /
  ///                sched.ReadAfterFold.
  multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             ValueType OpVT, RegisterClass RC,
                             PatFrag memop_frag, X86MemOperand x86memop,
                             bit Is2Addr, X86FoldableSchedWrite sched> {
    let isCommutable = 1 in {
      def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            !strconcat(OpcodeStr,
              !if(Is2Addr,
                  "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
            [(set RC:$dst,
                  (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
          Sched<[sched]>;
    }

    def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
          (ins RC:$src1, x86memop:$src2, u8imm:$src3),
          !strconcat(OpcodeStr,
            !if(Is2Addr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set RC:$dst,
                (OpVT (OpNode RC:$src1, (memop_frag addr:$src2),
                              timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  // Immediate transforms used when the two blend sources are swapped: keep the
  // low 2 / 4 / 8 bits of the immediate and invert them.
  def BlendCommuteImm2 : SDNodeXForm<timm, [{
    // Low two bits, inverted.
    uint8_t Mask = N->getZExtValue() & 0x03;
    uint8_t Flipped = Mask ^ 0x03;
    return getI8Imm(Flipped, SDLoc(N));
  }]>;

  def BlendCommuteImm4 : SDNodeXForm<timm, [{
    // Low four bits, inverted.
    uint8_t Mask = N->getZExtValue() & 0x0f;
    uint8_t Flipped = Mask ^ 0x0f;
    return getI8Imm(Flipped, SDLoc(N));
  }]>;

  def BlendCommuteImm8 : SDNodeXForm<timm, [{
    // All eight bits, inverted.
    uint8_t Mask = N->getZExtValue() & 0xff;
    uint8_t Flipped = Mask ^ 0xff;
    return getI8Imm(Flipped, SDLoc(N));
  }]>;
  // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
  // Each set input bit i becomes the bit pair 2*i, 2*i+1 in the result.
  def BlendScaleImm4 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 4; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0x3 << (2 * Lane);
    return getI8Imm(Widened, SDLoc(N));
  }]>;

  // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
  // Each set input bit i becomes the nibble 4*i .. 4*i+3 in the result.
  def BlendScaleImm2 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 2; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0xf << (4 * Lane);
    return getI8Imm(Widened, SDLoc(N));
  }]>;

  // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
  // Each set input bit i becomes the bit pair 2*i, 2*i+1 in the result.
  def BlendScaleImm2to4 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 2; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0x3 << (2 * Lane);
    return getI8Imm(Widened, SDLoc(N));
  }]>;
  // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
  // Each set input bit i becomes the bit pair 2*i, 2*i+1; the widened mask is
  // then XORed with 0xff.
  def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 4; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0x3 << (2 * Lane);
    return getI8Imm(Widened ^ 0xff, SDLoc(N));
  }]>;

  // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
  // Each set input bit i becomes the nibble 4*i .. 4*i+3; the widened mask is
  // then XORed with 0xff.
  def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 2; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0xf << (4 * Lane);
    return getI8Imm(Widened ^ 0xff, SDLoc(N));
  }]>;

  // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
  // Each set input bit i becomes the bit pair 2*i, 2*i+1; the widened mask is
  // then XORed with 0xf.
  def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
    uint8_t Mask = N->getZExtValue();
    uint8_t Widened = 0;
    for (unsigned Lane = 0; Lane < 2; ++Lane)
      if ((Mask >> Lane) & 1)
        Widened |= 0x3 << (2 * Lane);
    return getI8Imm(Widened ^ 0xf, SDLoc(N));
  }]>;
  // VEX-encoded mpsadbw / dpps / dppd. Is2Addr = 0 gives the three-operand asm
  // string. The FP dot products take the FP memory operand classes (f128mem /
  // f256mem); mpsadbw takes the integer ones.
  let Predicates = [HasAVX] in {
    // The multiclass marks "rri" commutable; mpsadbw overrides that.
    let isCommutable = 0 in {
      defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw",
                                          int_x86_sse41_mpsadbw,
                                          VR128, load, i128mem, 0,
                                          SchedWriteMPSAD.XMM>,
                      VEX_4V, VEX_WIG;
    }

    // The dot products read MXCSR and may raise an FP exception.
    let Uses = [MXCSR], mayRaiseFPException = 1 in {
      let ExeDomain = SSEPackedSingle in
      defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                       VR128, load, f128mem, 0,
                                       SchedWriteDPPS.XMM>,
                   VEX_4V, VEX_WIG;
      let ExeDomain = SSEPackedDouble in
      defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                       VR128, load, f128mem, 0,
                                       SchedWriteDPPD.XMM>,
                   VEX_4V, VEX_WIG;
      let ExeDomain = SSEPackedSingle in
      defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                        VR256, load, f256mem, 0,
                                        SchedWriteDPPS.YMM>,
                    VEX_4V, VEX_L, VEX_WIG;
    } // Uses = [MXCSR], mayRaiseFPException = 1
  } // Predicates = [HasAVX]

  let Predicates = [HasAVX2] in {
    // The multiclass marks "rri" commutable; mpsadbw overrides that.
    let isCommutable = 0 in {
      defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw",
                                           int_x86_avx2_mpsadbw,
                                           VR256, load, i256mem, 0,
                                           SchedWriteMPSAD.YMM>,
                       VEX_4V, VEX_L, VEX_WIG;
    }
  } // Predicates = [HasAVX2]
  // Legacy-encoded mpsadbw / dpps / dppd: destination tied to $src1 and the
  // two-operand asm string (Is2Addr = 1).
  let Constraints = "$src1 = $dst" in {
    // The multiclass marks "rri" commutable; mpsadbw overrides that.
    let isCommutable = 0 in {
      defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
                                         int_x86_sse41_mpsadbw,
                                         VR128, memop, i128mem, 1,
                                         SchedWriteMPSAD.XMM>;
    }

    // The dot products additionally carry SIMD_EXC.
    let ExeDomain = SSEPackedSingle in {
      defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                      VR128, memop, f128mem, 1,
                                      SchedWriteDPPS.XMM>,
                  SIMD_EXC;
    }
    let ExeDomain = SSEPackedDouble in {
      defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                      VR128, memop, f128mem, 1,
                                      SchedWriteDPPD.XMM>,
                  SIMD_EXC;
    }
  } // Constraints = "$src1 = $dst"
  /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
  ///
  ///   opc          - opcode byte.
  ///   OpcodeStr    - full mnemonic.
  ///   OpNode       - SelectionDAG node matched by the patterns.
  ///   OpVT         - result value type.
  ///   RC           - register class of the result and register sources.
  ///   memop_frag   - load fragment used by the "rmi" form.
  ///   x86memop     - memory operand class of the "rmi" form.
  ///   Is2Addr      - when set, $src1 is tied to $dst and the asm string omits
  ///                  $src1.
  ///   d            - execution domain of both instructions.
  ///   sched        - scheduling class; "rmi" uses sched.Folded /
  ///                  sched.ReadAfterFold.
  ///   commuteXForm - immediate transform applied when the load is the first
  ///                  source and the operands have to be swapped.
  multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             ValueType OpVT, RegisterClass RC,
                             PatFrag memop_frag, X86MemOperand x86memop,
                             bit Is2Addr, Domain d,
                             X86FoldableSchedWrite sched,
                             SDNodeXForm commuteXForm> {
    let Constraints = !if(Is2Addr, "$src1 = $dst", ""), ExeDomain = d in {
      let isCommutable = 1 in {
        def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3),
              !strconcat(OpcodeStr,
                !if(Is2Addr,
                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
              [(set RC:$dst,
                    (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
            Sched<[sched]>;
      }

      def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, x86memop:$src2, u8imm:$src3),
            !strconcat(OpcodeStr,
              !if(Is2Addr,
                  "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
            [(set RC:$dst,
                  (OpVT (OpNode RC:$src1, (memop_frag addr:$src2),
                                timm:$src3)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
    } // Constraints, ExeDomain = d

    // Pattern to commute if load is in first source.
    def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
              (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                              (commuteXForm timm:$src3))>;
  }
  // VEX-encoded immediate blends. The commute transform passed to each
  // instantiation is the BlendCommuteImm variant whose width equals the
  // element count of the value type, except for the v16i16 vpblendw, which
  // uses the 8-bit one.
  let Predicates = [HasAVX] in {
    defm VBLENDPS  : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                     VR128, load, f128mem, 0, SSEPackedSingle,
                                     SchedWriteFBlend.XMM, BlendCommuteImm4>,
                     VEX_4V, VEX_WIG;
    defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                     VR256, load, f256mem, 0, SSEPackedSingle,
                                     SchedWriteFBlend.YMM, BlendCommuteImm8>,
                     VEX_4V, VEX_L, VEX_WIG;
    defm VBLENDPD  : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                     VR128, load, f128mem, 0, SSEPackedDouble,
                                     SchedWriteFBlend.XMM, BlendCommuteImm2>,
                     VEX_4V, VEX_WIG;
    defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                     VR256, load, f256mem, 0, SSEPackedDouble,
                                     SchedWriteFBlend.YMM, BlendCommuteImm4>,
                     VEX_4V, VEX_L, VEX_WIG;
    defm VPBLENDW  : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                     VR128, load, i128mem, 0, SSEPackedInt,
                                     SchedWriteBlend.XMM, BlendCommuteImm8>,
                     VEX_4V, VEX_WIG;
  } // Predicates = [HasAVX]

  let Predicates = [HasAVX2] in {
    defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                     VR256, load, i256mem, 0, SSEPackedInt,
                                     SchedWriteBlend.YMM, BlendCommuteImm8>,
                     VEX_4V, VEX_L, VEX_WIG;
  } // Predicates = [HasAVX2]
  // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
  // ExecutionDomainFixPass will cleanup domains later on.
  //
  // Each element type gets three patterns: register/register, load in the
  // second source, and load in the first source (operands swapped, immediate
  // passed through a commute transform).
  let Predicates = [HasAVX1Only] in {
    // v4i64 -> vblendpd (256-bit).
    def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2),
                         timm:$src3),
              (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
    def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
              (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
    def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
              (VBLENDPDYrmi VR256:$src1, addr:$src2,
                            (BlendCommuteImm4 timm:$src3))>;

    // Use pblendw for 128-bit integer to keep it in the integer domain and
    // prevent it from becoming movsd via commuting under optsize.
    def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2),
                         timm:$src3),
              (VPBLENDWrri VR128:$src1, VR128:$src2,
                           (BlendScaleImm2 timm:$src3))>;
    def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
              (VPBLENDWrmi VR128:$src1, addr:$src2,
                           (BlendScaleImm2 timm:$src3))>;
    def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
              (VPBLENDWrmi VR128:$src1, addr:$src2,
                           (BlendScaleCommuteImm2 timm:$src3))>;

    // v8i32 -> vblendps (256-bit).
    def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
                         timm:$src3),
              (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
    def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
              (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
    def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
              (VBLENDPSYrmi VR256:$src1, addr:$src2,
                            (BlendCommuteImm8 timm:$src3))>;

    // Use pblendw for 128-bit integer to keep it in the integer domain and
    // prevent it from becoming movss via commuting under optsize.
    def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
                         timm:$src3),
              (VPBLENDWrri VR128:$src1, VR128:$src2,
                           (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
              (VPBLENDWrmi VR128:$src1, addr:$src2,
                           (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
              (VPBLENDWrmi VR128:$src1, addr:$src2,
                           (BlendScaleCommuteImm4 timm:$src3))>;
  } // Predicates = [HasAVX1Only]
  // Legacy-encoded immediate blends. Is2Addr = 1, so the multiclass ties $src1
  // to $dst and uses the two-operand asm string.
  defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, VR128,
                                 memop, f128mem, 1, SSEPackedSingle,
                                 SchedWriteFBlend.XMM, BlendCommuteImm4>;
  defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, VR128,
                                 memop, f128mem, 1, SSEPackedDouble,
                                 SchedWriteFBlend.XMM, BlendCommuteImm2>;
  defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, VR128,
                                 memop, i128mem, 1, SSEPackedInt,
                                 SchedWriteBlend.XMM, BlendCommuteImm8>;
  // Integer blends on SSE4.1 are selected as pblendw with a rescaled
  // immediate. Each element type gets three patterns: register/register, load
  // in the second source, and load in the first source (operands swapped,
  // immediate rescaled and inverted).
  let Predicates = [UseSSE41] in {
    // Use pblendw for 128-bit integer to keep it in the integer domain and
    // prevent it from becoming movsd via commuting under optsize.
    def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2),
                         timm:$src3),
              (PBLENDWrri VR128:$src1, VR128:$src2,
                          (BlendScaleImm2 timm:$src3))>;
    def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
              (PBLENDWrmi VR128:$src1, addr:$src2,
                          (BlendScaleImm2 timm:$src3))>;
    def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
              (PBLENDWrmi VR128:$src1, addr:$src2,
                          (BlendScaleCommuteImm2 timm:$src3))>;

    // Use pblendw for 128-bit integer to keep it in the integer domain and
    // prevent it from becoming movss via commuting under optsize.
    def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
                         timm:$src3),
              (PBLENDWrri VR128:$src1, VR128:$src2,
                          (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
              (PBLENDWrmi VR128:$src1, addr:$src2,
                          (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
              (PBLENDWrmi VR128:$src1, addr:$src2,
                          (BlendScaleCommuteImm4 timm:$src3))>;
  } // Predicates = [UseSSE41]
  // For insertion into the zero index (low half) of a 256-bit vector, it is
  // more efficient to generate a blend with immediate instead of an insert*128.
  let Predicates = [HasAVX] in {
    // 256-bit base in a register: widen the 128-bit value with INSERT_SUBREG
    // into an IMPLICIT_DEF and blend it in with the low-half bits of the
    // immediate set (0x3 for 4 x f64, 0xf for 8 x f32).
    def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPDYrri VR256:$src1,
                            (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                           VR128:$src2, sub_xmm),
                            0x3)>;
    def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPSYrri VR256:$src1,
                            (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                           VR128:$src2, sub_xmm),
                            0xf)>;

    // 256-bit base loaded from memory: the widened register becomes the first
    // blend source and the load is folded as the second, so the immediate has
    // the high-half bits set instead (0xc / 0xf0).
    def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                           VR128:$src1, sub_xmm),
                            addr:$src2, 0xc)>;
    def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                           VR128:$src1, sub_xmm),
                            addr:$src2, 0xf0)>;
  }
  /// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
  ///
  /// Produces a register form (rr) and a load-folding form (rm) of an
  /// instruction with one output and three inputs. Note that the selection
  /// pattern hands the inputs to OpNode in reverse order: ($src3, $src2, $src1).
  multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr,
                                  RegisterClass RC, X86MemOperand x86memop,
                                  ValueType VT, PatFrag mem_frag,
                                  SDNode OpNode,
                                  X86FoldableSchedWrite sched> {
    def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, RC:$src3),
                    OpcodeStr #
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    [(set RC:$dst,
                          (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                    SSEPackedInt>,
             TAPD, VEX_4V, Sched<[sched]>;

    def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, RC:$src3),
                    OpcodeStr #
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    [(set RC:$dst,
                          (OpNode RC:$src3, (mem_frag addr:$src2), RC:$src1))],
                    SSEPackedInt>,
             TAPD, VEX_4V,
             Sched<[sched.Folded, sched.ReadAfterFold,
                    // x86memop:$src2
                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                    ReadDefault,
                    // RC::$src3
                    sched.ReadAfterFold]>;
  }
  // Variable blends built from SS41I_quaternary_avx with the X86Blendv node.
  // The 128-bit forms and the 256-bit FP forms require AVX; the 256-bit byte
  // blend requires AVX2. 256-bit instances add VEX_L.
  let Predicates = [HasAVX] in {
    let ExeDomain = SSEPackedDouble in {
      defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128,
                                             f128mem, v2f64, loadv2f64,
                                             X86Blendv,
                                             SchedWriteFVarBlend.XMM>;
      defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256,
                                             f256mem, v4f64, loadv4f64,
                                             X86Blendv,
                                             SchedWriteFVarBlend.YMM>, VEX_L;
    } // ExeDomain = SSEPackedDouble

    let ExeDomain = SSEPackedSingle in {
      defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128,
                                             f128mem, v4f32, loadv4f32,
                                             X86Blendv,
                                             SchedWriteFVarBlend.XMM>;
      defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256,
                                             f256mem, v8f32, loadv8f32,
                                             X86Blendv,
                                             SchedWriteFVarBlend.YMM>, VEX_L;
    } // ExeDomain = SSEPackedSingle

    defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                          v16i8, loadv16i8, X86Blendv,
                                          SchedWriteVarBlend.XMM>;
  }

  let Predicates = [HasAVX2] in {
    defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                           v32i8, loadv32i8, X86Blendv,
                                           SchedWriteVarBlend.YMM>, VEX_L;
  }
  // Select X86Blendv on 32- and 64-bit integer element vectors with the FP
  // variable-blend instructions of matching element width. The instruction
  // operands are ($src2, $src1, $mask), i.e. the reverse of the node's
  // (mask, src1, src2) order.
  let Predicates = [HasAVX] in {
    def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask),
                                (v4i32 VR128:$src1),
                                (v4i32 VR128:$src2))),
              (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
    def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask),
                                (v2i64 VR128:$src1),
                                (v2i64 VR128:$src2))),
              (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
    def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask),
                                (v8i32 VR256:$src1),
                                (v8i32 VR256:$src2))),
              (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
    def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask),
                                (v4i64 VR256:$src1),
                                (v4i64 VR256:$src2))),
              (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  }
  // Prefer a movss or movsd over a blendps when optimizing for size. These
  // were changed to use blends because blends have better throughput on
  // sandybridge and haswell, but movs[s/d] are 1-2 bytes shorter instructions.
  // Hence the blend patterns below are only active under OptForSpeed.
  let Predicates = [HasAVX, OptForSpeed] in {
    // X86vzmovl: blend the source into a zero vector (V_SET0).
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
              (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
    def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
              (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

    // X86Movss; the last form has the load as first operand, so the operands
    // are swapped and the immediate becomes 0xe.
    def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
              (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
    def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
              (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
    def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
              (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

    // X86Movsd; same scheme, swapped-operand immediate is 2.
    def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
              (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
    def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
              (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
    def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
              (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

    // Move low f32 and clear high bits.
    // The 256-bit forms operate on the low xmm subregister and wrap the
    // 128-bit result with SUBREG_TO_REG.
    def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
              (SUBREG_TO_REG
                (i32 0),
                (v4f32 (VBLENDPSrri
                         (v4f32 (V_SET0)),
                         (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                         (i8 1))),
                sub_xmm)>;
    def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
              (SUBREG_TO_REG
                (i32 0),
                (v4i32 (VPBLENDWrri
                         (v4i32 (V_SET0)),
                         (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                         (i8 3))),
                sub_xmm)>;
  }
  // Prefer a movss or movsd over a blendps when optimizing for size. These
  // were changed to use blends because blends have better throughput on
  // sandybridge and haswell, but movs[s/d] are 1-2 bytes shorter instructions.
  let Predicates = [UseSSE41, OptForSpeed] in {
    // With SSE41 we can use blends for these patterns.

    // X86vzmovl: blend the source into a zero vector (V_SET0).
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
              (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
    def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
              (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

    // X86Movss; with the load as first operand the operands are swapped and
    // the immediate becomes 0xe.
    def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
              (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
    def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
              (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
    def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
              (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

    // X86Movsd; swapped-operand immediate is 2.
    def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
              (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
    def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
              (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
    def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
              (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
  }
  /// SS41I_ternary - SSE 4.1 ternary operator
  ///
  /// Two-address forms ($src1 tied to $dst) that additionally read XMM0
  /// implicitly. XMM0 is the first operand of OpNode in the selection pattern
  /// and is spelled out in the asm string.
  let Uses = [XMM0], Constraints = "$src1 = $dst" in {
    multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                             PatFrag mem_frag, X86MemOperand x86memop,
                             SDNode OpNode, X86FoldableSchedWrite sched> {
      def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      OpcodeStr # "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                      [(set VR128:$dst,
                            (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                Sched<[sched]>;

      def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src2),
                      OpcodeStr # "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                      [(set VR128:$dst,
                            (OpNode XMM0, (mem_frag addr:$src2),
                                    VR128:$src1))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
  // SSE4.1 variable blends (implicit XMM0) instantiated from SS41I_ternary.
  // The FP forms set their execution domain explicitly; PBLENDVB does not
  // override it here.
  let ExeDomain = SSEPackedDouble in
  defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                                X86Blendv, SchedWriteFVarBlend.XMM>;

  let ExeDomain = SSEPackedSingle in
  defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                                X86Blendv, SchedWriteFVarBlend.XMM>;

  defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                                X86Blendv, SchedWriteVarBlend.XMM>;
  // Aliases with the implicit xmm0 argument: two-operand spellings that leave
  // xmm0 out of the assembly text. The trailing 0 is the alias' last
  // InstAlias argument for every entry.
  def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                  (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
  def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                  (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;

  def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                  (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
  def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                  (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;

  def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                  (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
  def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                  (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
  // Select integer X86Blendv whose mask is XMM0 with the FP variable blends of
  // matching element width. The instruction takes ($src2, $src1); XMM0 is its
  // implicit operand.
  let Predicates = [UseSSE41] in {
    def : Pat<(v4i32 (X86Blendv (v4i32 XMM0),
                                (v4i32 VR128:$src1),
                                (v4i32 VR128:$src2))),
              (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
    def : Pat<(v2i64 (X86Blendv (v2i64 XMM0),
                                (v2i64 VR128:$src1),
                                (v2i64 VR128:$src2))),
              (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  }
  let AddedComplexity = 400 in { // Prefer non-temporal versions

    // Instruction definitions. They carry no selection pattern of their own
    // ([]); the patterns that select them follow below.
    let Predicates = [HasAVX, NoVLX] in
    def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst),
                            (ins i128mem:$src),
                            "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;

    let Predicates = [HasAVX2, NoVLX] in
    def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst),
                             (ins i256mem:$src),
                             "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLSNT.YMM.RM]>,
                       VEX, VEX_L, VEX_WIG;

    def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst),
                           (ins i128mem:$src),
                           "movntdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

    // 256-bit aligned non-temporal loads of every vector type -> VMOVNTDQAYrm.
    let Predicates = [HasAVX2, NoVLX] in {
      def : Pat<(v8f32 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
      def : Pat<(v4f64 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
      def : Pat<(v4i64 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
      def : Pat<(v8i32 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
      def : Pat<(v16i16 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
      def : Pat<(v32i8 (alignednontemporalload addr:$src)),
                (VMOVNTDQAYrm addr:$src)>;
    }

    // 128-bit aligned non-temporal loads, VEX encoding -> VMOVNTDQArm.
    let Predicates = [HasAVX, NoVLX] in {
      def : Pat<(v4f32 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
      def : Pat<(v2f64 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
      def : Pat<(v2i64 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
      def : Pat<(v4i32 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
      def : Pat<(v8i16 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
      def : Pat<(v16i8 (alignednontemporalload addr:$src)),
                (VMOVNTDQArm addr:$src)>;
    }

    // 128-bit aligned non-temporal loads, SSE4.1 -> MOVNTDQArm.
    let Predicates = [UseSSE41] in {
      def : Pat<(v4f32 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
      def : Pat<(v2f64 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
      def : Pat<(v2i64 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
      def : Pat<(v4i32 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
      def : Pat<(v8i16 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
      def : Pat<(v16i8 (alignednontemporalload addr:$src)),
                (MOVNTDQArm addr:$src)>;
    }

  } // AddedComplexity
  5836. //===----------------------------------------------------------------------===//
  5837. // SSE4.2 - Compare Instructions
  5838. //===----------------------------------------------------------------------===//
  /// SS42I_binop_rm - Simple SSE 4.2 binary operator
  ///
  /// Emits a register-register (rr) and a register-memory (rm) form. Is2Addr
  /// only selects the assembly string: the two-operand spelling when set
  /// (the default), the three-operand spelling otherwise.
  multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            ValueType OpVT, RegisterClass RC,
                            PatFrag memop_frag, X86MemOperand x86memop,
                            X86FoldableSchedWrite sched, bit Is2Addr = 1> {
    def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2),
                    OpcodeStr #
                      !if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;

    def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2),
                    OpcodeStr #
                      !if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (OpVT (OpNode RC:$src1,
                                        (memop_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  // 64-bit element signed greater-than compare (X86pcmpgt). The VEX forms
  // pass Is2Addr = 0 and use the plain `load` fragment; the SSE form keeps the
  // default Is2Addr = 1, ties $src1 to $dst and uses `memop`.
  let Predicates = [HasAVX] in
  defm VPCMPGTQ  : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                   VEX_4V, VEX_WIG;

  let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                   VEX_4V, VEX_L, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm PCMPGTQ   : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                  memop, i128mem, SchedWriteVecALU.XMM>;
  5871. //===----------------------------------------------------------------------===//
  5872. // SSE4.2 - String/text Processing Instructions
  5873. //===----------------------------------------------------------------------===//
  // PCMPISTRM: register and memory forms with no explicit outputs and no
  // selection patterns. The results are modelled through the implicit
  // definitions of XMM0 and EFLAGS applied at the instantiation site.
  multiclass pcmpistrm_SS42AI<string asm> {
    def rr : SS42AI<0x62, MRMSrcReg, (outs),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}",
                    []>,
             Sched<[WritePCmpIStrM]>;

    let mayLoad = 1 in
    def rm : SS42AI<0x62, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}",
                    []>,
             Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
  }

  let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
    let Predicates = [HasAVX] in
    defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
    defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm">;
  }
  // PCMPESTRM: like PCMPISTRM but the instances additionally read EAX and EDX
  // implicitly. Explicit operands are named $src1, $src3 and $src5.
  multiclass SS42AI_pcmpestrm<string asm> {
    def rr : SS42AI<0x60, MRMSrcReg, (outs),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}",
                    []>,
             Sched<[WritePCmpEStrM]>;

    let mayLoad = 1 in
    def rm : SS42AI<0x60, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}",
                    []>,
             Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
  }

  let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
    let Predicates = [HasAVX] in
    defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
    defm PCMPESTRM  : SS42AI_pcmpestrm<"pcmpestrm">;
  }
  // PCMPISTRI: register and memory forms with no explicit outputs and no
  // selection patterns; the instances implicitly define ECX and EFLAGS.
  multiclass SS42AI_pcmpistri<string asm> {
    def rr : SS42AI<0x63, MRMSrcReg, (outs),
                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}",
                    []>,
             Sched<[WritePCmpIStrI]>;

    let mayLoad = 1 in
    def rm : SS42AI<0x63, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}",
                    []>,
             Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
  }

  let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
    let Predicates = [HasAVX] in
    defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
    defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
  }
  // PCMPESTRI: the instances implicitly define ECX and EFLAGS and implicitly
  // read EAX and EDX. Explicit operands are named $src1, $src3 and $src5.
  multiclass SS42AI_pcmpestri<string asm> {
    def rr : SS42AI<0x61, MRMSrcReg, (outs),
                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}",
                    []>,
             Sched<[WritePCmpEStrI]>;

    let mayLoad = 1 in
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}",
                    []>,
             Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
  }

  let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
    let Predicates = [HasAVX] in
    defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
    defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
  }
  5938. //===----------------------------------------------------------------------===//
  5939. // SSE4.2 - CRC Instructions
  5940. //===----------------------------------------------------------------------===//
  // No CRC instructions have AVX equivalents.

  // CRC intrinsic instructions.
  // Each instruction comes in a register-source form (SS42I_crc32r) and a
  // memory-source form (SS42I_crc32m); the variants differ only in the size of
  // the register and memory operands. `Int` is the pattern operator applied
  // to ($src1, $src2); the memory form wraps $src2 in a `load`.
  class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                     RegisterClass RCIn, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcReg, (outs RCOut:$dst),
             (ins RCOut:$src1, RCIn:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
      Sched<[WriteCRC32]>;

  class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                     X86MemOperand x86memop, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcMem, (outs RCOut:$dst),
             (ins RCOut:$src1, x86memop:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
      Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
  // CRC32 instruction records. The accumulator $src1 is tied to $dst.
  // Opcode 0xF0 is used for the 8-bit source forms and 0xF1 for the 16/32/64
  // bit ones, distinguished by OpSize16 / OpSize32 / REX_W.
  let Constraints = "$src1 = $dst" in {
    def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                   int_x86_sse42_crc32_32_8>;
    def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                   int_x86_sse42_crc32_32_8>;

    def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                   int_x86_sse42_crc32_32_16>, OpSize16;
    def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                   int_x86_sse42_crc32_32_16>, OpSize16;

    def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                   int_x86_sse42_crc32_32_32>, OpSize32;
    def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                   int_x86_sse42_crc32_32_32>, OpSize32;

    def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                   int_x86_sse42_crc32_64_64>, REX_W;
    def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                   int_x86_sse42_crc32_64_64>, REX_W;

    // 64-bit accumulator with an 8-bit source: instantiated with null_frag,
    // so mayLoad / hasSideEffects are stated explicitly instead of being
    // derived from a pattern.
    let hasSideEffects = 0 in {
      let mayLoad = 1 in
      def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                    null_frag>, REX_W;
      def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                    null_frag>, REX_W;
    }
  }
  5982. //===----------------------------------------------------------------------===//
  5983. // SHA-NI Instructions
  5984. //===----------------------------------------------------------------------===//
  // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?

  // SHAI_binop - register (rr) and memory (rm) forms of a SHA instruction
  // driven by intrinsic IntId. When UsesXMM0 is set, XMM0 is appended as a
  // third intrinsic operand and written out explicitly in the asm string;
  // otherwise the instruction is a plain two-operand form.
  multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                        X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
    def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2),
               OpcodeStr #
                 !if(UsesXMM0,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                     "\t{$src2, $dst|$dst, $src2}"),
               [!if(UsesXMM0,
                    (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                    (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8PS, Sched<[sched]>;

    def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, i128mem:$src2),
               OpcodeStr #
                 !if(UsesXMM0,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                     "\t{$src2, $dst|$dst, $src2}"),
               [!if(UsesXMM0,
                    (set VR128:$dst,
                         (IntId VR128:$src1, (memop addr:$src2), XMM0)),
                    (set VR128:$dst,
                         (IntId VR128:$src1, (memop addr:$src2))))]>,
             T8PS, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  // SHA instruction records. All are two-address ($src1 tied to $dst) and
  // require HasSHA.
  let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
    // SHA1RNDS4 carries an 8-bit immediate, so it is defined directly rather
    // than through SHAI_binop.
    def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                           "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR128:$dst,
                                 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                                    (i8 timm:$src3)))]>,
                       TAPS, Sched<[SchedWriteVecIMul.XMM]>;
    def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                           "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                           [(set VR128:$dst,
                                 (int_x86_sha1rnds4 VR128:$src1,
                                                    (memop addr:$src2),
                                                    (i8 timm:$src3)))]>,
                       TAPS,
                       Sched<[SchedWriteVecIMul.XMM.Folded,
                              SchedWriteVecIMul.XMM.ReadAfterFold]>;

    defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                                SchedWriteVecIMul.XMM>;
    defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                                SchedWriteVecIMul.XMM>;
    defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                                SchedWriteVecIMul.XMM>;

    // The only SHA form with an implicit XMM0 input (UsesXMM0 = 1).
    let Uses=[XMM0] in
    defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                  SchedWriteVecIMul.XMM, 1>;

    defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                                 SchedWriteVecIMul.XMM>;
    defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                                 SchedWriteVecIMul.XMM>;
  }
  // Aliases for sha256rnds2 that leave the %xmm0 operand out of the assembly
  // text (the SHA256RNDS2rr/rm asm strings spell it out explicitly).
  def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                  (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
  def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                  (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
  6045. //===----------------------------------------------------------------------===//
  6046. // AES-NI Instructions
  6047. //===----------------------------------------------------------------------===//
  // AESI_binop_rm_int - register (rr) and memory (rm) forms of a binary AES
  // instruction selected from intrinsic IntId. The asm string is supplied via
  // `let AsmString` (the class argument is left empty); Is2Addr picks the
  // two-operand spelling, otherwise the three-operand one is used. RC and
  // MemOp default to the 128-bit register class and memory operand.
  multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, PatFrag ld_frag,
                               bit Is2Addr = 0, RegisterClass RC = VR128,
                               X86MemOperand MemOp = i128mem> {
    let AsmString =
          !strconcat(OpcodeStr,
                     !if(Is2Addr,
                         "\t{$src2, $dst|$dst, $src2}",
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}")) in {
      def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2), "",
                     [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
               Sched<[WriteAESDecEnc]>;

      def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                     (ins RC:$src1, MemOp:$src2), "",
                     [(set RC:$dst,
                           (IntId RC:$src1, (ld_frag addr:$src2)))]>,
               Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
    }
  }
  // Perform One Round of an AES Encryption/Decryption Flow

  // 128-bit VEX forms.
  let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
    defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                                         int_x86_aesni_aesenc, load>,
                       VEX_4V, VEX_WIG;
    defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                         int_x86_aesni_aesenclast, load>,
                       VEX_4V, VEX_WIG;
    defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                                         int_x86_aesni_aesdec, load>,
                       VEX_4V, VEX_WIG;
    defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                         int_x86_aesni_aesdeclast, load>,
                       VEX_4V, VEX_WIG;
  }

  // 256-bit VEX forms (VR256 / i256mem, VEX_L) using the *_256 intrinsics.
  let Predicates = [NoVLX, HasVAES] in {
    defm VAESENCY     : AESI_binop_rm_int<0xDC, "vaesenc",
                                          int_x86_aesni_aesenc_256, load, 0,
                                          VR256, i256mem>,
                        VEX_4V, VEX_L, VEX_WIG;
    defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                                          int_x86_aesni_aesenclast_256, load,
                                          0, VR256, i256mem>,
                        VEX_4V, VEX_L, VEX_WIG;
    defm VAESDECY     : AESI_binop_rm_int<0xDE, "vaesdec",
                                          int_x86_aesni_aesdec_256, load, 0,
                                          VR256, i256mem>,
                        VEX_4V, VEX_L, VEX_WIG;
    defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                          int_x86_aesni_aesdeclast_256, load,
                                          0, VR256, i256mem>,
                        VEX_4V, VEX_L, VEX_WIG;
  }
  // SSE-encoded AES round instructions: two-address ($src1 tied to $dst),
  // Is2Addr = 1, memory operand matched with `memop`.
  let Constraints = "$src1 = $dst" in {
    defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                                        int_x86_aesni_aesenc, memop, 1>;
    defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                        int_x86_aesni_aesenclast, memop, 1>;
    defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                                        int_x86_aesni_aesdec, memop, 1>;
    defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                        int_x86_aesni_aesdeclast, memop, 1>;
  }
  // Perform the AES InvMixColumn Transformation

  // VEX-encoded forms; the memory form matches a plain `load`.
  let Predicates = [HasAVX, HasAES] in {
    def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                          (ins VR128:$src1),
                          "vaesimc\t{$src1, $dst|$dst, $src1}",
                          [(set VR128:$dst,
                                (int_x86_aesni_aesimc VR128:$src1))]>,
                    Sched<[WriteAESIMC]>, VEX, VEX_WIG;
    def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                          (ins i128mem:$src1),
                          "vaesimc\t{$src1, $dst|$dst, $src1}",
                          [(set VR128:$dst,
                                (int_x86_aesni_aesimc (load addr:$src1)))]>,
                    Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
  }

  // SSE-encoded forms; the memory form matches `memop`.
  def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1),
                       "aesimc\t{$src1, $dst|$dst, $src1}",
                       [(set VR128:$dst,
                             (int_x86_aesni_aesimc VR128:$src1))]>,
                 Sched<[WriteAESIMC]>;
  def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1),
                       "aesimc\t{$src1, $dst|$dst, $src1}",
                       [(set VR128:$dst,
                             (int_x86_aesni_aesimc (memop addr:$src1)))]>,
                 Sched<[WriteAESIMC.Folded]>;
  // AES Round Key Generation Assist

  // VEX-encoded forms; the memory form matches a plain `load`.
  let Predicates = [HasAVX, HasAES] in {
    def VAESKEYGENASSIST128rr
      : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                    (int_x86_aesni_aeskeygenassist VR128:$src1,
                                                   timm:$src2))]>,
        Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
    def VAESKEYGENASSIST128rm
      : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
              (ins i128mem:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                    (int_x86_aesni_aeskeygenassist (load addr:$src1),
                                                   timm:$src2))]>,
        Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
  }

  // SSE-encoded forms; the memory form matches `memop`.
  def AESKEYGENASSIST128rr
    : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
                  (int_x86_aesni_aeskeygenassist VR128:$src1,
                                                 timm:$src2))]>,
      Sched<[WriteAESKeyGen]>;
  def AESKEYGENASSIST128rm
    : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
            (ins i128mem:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
                  (int_x86_aesni_aeskeygenassist (memop addr:$src1),
                                                 timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>;
  6151. //===----------------------------------------------------------------------===//
  6152. // PCLMUL Instructions
  6153. //===----------------------------------------------------------------------===//
  // Immediate transform to help with commuting: exchanges the high and low
  // nibbles of the 8-bit immediate.
  def PCLMULCommuteImm : SDNodeXForm<timm, [{
    uint8_t Imm = N->getZExtValue();
    // The shifts are evaluated after integer promotion; the cast truncates
    // the combined value back to eight bits.
    uint8_t Swapped = (uint8_t)((Imm >> 4) | (Imm << 4));
    return getI8Imm(Swapped, SDLoc(N));
  }]>;
  // SSE carry-less Multiplication instructions
  let Predicates = [NoAVX, HasPCLMUL] in {
    let Constraints = "$src1 = $dst" in {
      // Only the register form is marked commutable.
      let isCommutable = 1 in
      def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
                          (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                          "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                                (int_x86_pclmulqdq VR128:$src1, VR128:$src2,
                                                   timm:$src3))]>,
                        Sched<[WriteCLMul]>;

      def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
                          (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                          "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                                (int_x86_pclmulqdq VR128:$src1,
                                                   (memop addr:$src2),
                                                   timm:$src3))]>,
                        Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
    } // Constraints = "$src1 = $dst"

    // A load in the first intrinsic operand is folded by swapping the sources
    // and passing the immediate through PCLMULCommuteImm.
    def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                 (i8 timm:$src3)),
              (PCLMULQDQrm VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm timm:$src3))>;
  } // Predicates = [NoAVX, HasPCLMUL]
  // SSE aliases: pclmul{hq,lq}{hq,lq}dq mnemonics with the immediate folded
  // into the name. The immediate is built as (LO == "hq") << 4 | (HI == "hq"),
  // where HI is the first and LO the second suffix in the mnemonic.
  foreach HI = ["hq","lq"] in
  foreach LO = ["hq","lq"] in {
    def : InstAlias<!strconcat("pclmul", HI, LO,
                               "dq\t{$src, $dst|$dst, $src}"),
                    (PCLMULQDQrr VR128:$dst, VR128:$src,
                                 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))),
                    0>;
    def : InstAlias<!strconcat("pclmul", HI, LO,
                               "dq\t{$src, $dst|$dst, $src}"),
                    (PCLMULQDQrm VR128:$dst, i128mem:$src,
                                 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))),
                    0>;
  }
  // AVX carry-less Multiplication instructions
  //
  // vpclmulqdq - three-operand register (rr) and memory (rm) forms for
  // register class RC, selected from intrinsic IntId, plus a pattern that
  // folds a load appearing as the first intrinsic operand.
  multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                        PatFrag LdFrag, Intrinsic IntId> {
    // Only the register form is marked commutable.
    let isCommutable = 1 in
    def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2, u8imm:$src3),
               "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
               [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
             Sched<[WriteCLMul]>;

    def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, MemOp:$src2, u8imm:$src3),
               "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
               [(set RC:$dst,
                     (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
             Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

    // We can commute a load in the first operand by swapping the sources and
    // rotating the immediate.
    def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
              (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
                                             (PCLMULCommuteImm timm:$src3))>;
  }
  // 128-bit VEX form.
  let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
  defm VPCLMULQDQ  : vpclmulqdq<VR128, i128mem, load, int_x86_pclmulqdq>,
                     VEX_4V, VEX_WIG;

  // 256-bit VEX form (VEX_L) using the _256 intrinsic.
  let Predicates = [NoVLX, HasVPCLMULQDQ] in
  defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, int_x86_pclmulqdq_256>,
                     VEX_4V, VEX_L, VEX_WIG;
  // Emits the rr and rm assembler aliases "vpclmul<Hi><Lo>dq" for the
  // instruction pair named InstStr # "rr" / InstStr # "rm". The immediate is
  // computed from the selectors: Hi == "hq" sets bit 0, Lo == "hq" sets bit 4.
  multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                     X86MemOperand MemOp, string Hi, string Lo> {
    // Register source.
    def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    (!cast<Instruction>(InstStr # "rr")
                        RC:$dst, RC:$src1, RC:$src2,
                        !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                    0>;
    // Memory source.
    def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    (!cast<Instruction>(InstStr # "rm")
                        RC:$dst, RC:$src1, MemOp:$src2,
                        !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                    0>;
  }
  // Instantiates vpclmulqdq_aliases_impl for all four selector combinations,
  // in the order hq/hq, hq/lq, lq/hq, lq/lq.
  multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                                X86MemOperand MemOp> {
    foreach HiSel = ["hq", "lq"] in {
      foreach LoSel = ["hq", "lq"] in {
        defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, HiSel, LoSel>;
      }
    }
  }

  // AVX aliases (128-bit and 256-bit instruction families).
  defm : vpclmulqdq_aliases<"VPCLMULQDQ",  VR128, i128mem>;
  defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
  6239. //===----------------------------------------------------------------------===//
  6240. // SSE4A Instructions
  6241. //===----------------------------------------------------------------------===//
  // SSE4A: EXTRQ / INSERTQ (immediate and register-mask forms) and the
  // MOVNTSS / MOVNTSD scalar non-temporal stores. Everything here is guarded
  // by HasSSE4A.
  let Predicates = [HasSSE4A] in {

    // All four extract/insert forms run in the packed-integer domain and tie
    // their destination to $src.
    let ExeDomain = SSEPackedInt, Constraints = "$src = $dst" in {
      // Extract with immediate length and index.
      def EXTRQI : Ii8<0x78, MRMXr,
                       (outs VR128:$dst),
                       (ins VR128:$src, u8imm:$len, u8imm:$idx),
                       "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                       [(set VR128:$dst,
                             (X86extrqi VR128:$src, timm:$len, timm:$idx))]>,
                   PD, Sched<[SchedWriteVecALU.XMM]>;

      // Extract with the control taken from a register.
      def EXTRQ : I<0x79, MRMSrcReg,
                    (outs VR128:$dst),
                    (ins VR128:$src, VR128:$mask),
                    "extrq\t{$mask, $src|$src, $mask}",
                    [(set VR128:$dst,
                          (int_x86_sse4a_extrq VR128:$src, VR128:$mask))]>,
                  PD, Sched<[SchedWriteVecALU.XMM]>;

      // Insert with immediate length and index.
      def INSERTQI : Ii8<0x78, MRMSrcReg,
                         (outs VR128:$dst),
                         (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                         "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                         [(set VR128:$dst,
                               (X86insertqi VR128:$src, VR128:$src2,
                                            timm:$len, timm:$idx))]>,
                     XD, Sched<[SchedWriteVecALU.XMM]>;

      // Insert with the control taken from a register.
      def INSERTQ : I<0x79, MRMSrcReg,
                      (outs VR128:$dst),
                      (ins VR128:$src, VR128:$mask),
                      "insertq\t{$mask, $src|$src, $mask}",
                      [(set VR128:$dst,
                            (int_x86_sse4a_insertq VR128:$src, VR128:$mask))]>,
                    XD, Sched<[SchedWriteVecALU.XMM]>;
    } // ExeDomain = SSEPackedInt, Constraints = "$src = $dst"

    // Non-temporal (unaligned) scalar stores.
    let AddedComplexity = 400 in { // Prefer non-temporal versions
      // The instructions carry no selection pattern of their own; they are
      // selected through the two Pat records that follow.
      let hasSideEffects = 0, mayStore = 1,
          SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
        def MOVNTSS : I<0x2B, MRMDestMem,
                        (outs), (ins f32mem:$dst, VR128:$src),
                        "movntss\t{$src, $dst|$dst, $src}", []>, XS;
        def MOVNTSD : I<0x2B, MRMDestMem,
                        (outs), (ins f64mem:$dst, VR128:$src),
                        "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
      } // SchedRW

      // Scalar FP values are first copied into VR128 before being stored.
      def : Pat<(nontemporalstore FR32:$src, addr:$dst),
                (MOVNTSS addr:$dst,
                         (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
      def : Pat<(nontemporalstore FR64:$src, addr:$dst),
                (MOVNTSD addr:$dst,
                         (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
    } // AddedComplexity

  } // HasSSE4A
  6285. //===----------------------------------------------------------------------===//
  6286. // AVX Instructions
  6287. //===----------------------------------------------------------------------===//
  6288. //===----------------------------------------------------------------------===//
  6289. // VBROADCAST - Load from memory and broadcast to all elements of the
  6290. // destination operand
  6291. //
  // Memory-source broadcast: loads through x86memop and selects
  // (VT (bcast_frag addr:$src)) into RC.
  class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand x86memop, ValueType VT,
                         PatFrag bcast_frag, SchedWrite Sched>
      : AVX8I<opc, MRMSrcMem,
              (outs RC:$dst), (ins x86memop:$src),
              OpcodeStr # "\t{$src, $dst|$dst, $src}",
              [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
        Sched<[Sched]>, VEX;
  // AVX2 adds register forms: the source is always a VR128 of type OpVT and
  // the result is an X86VBroadcast of type ResVT in RC.
  class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ResVT, ValueType OpVT, SchedWrite Sched>
      : AVX28I<opc, MRMSrcReg,
               (outs RC:$dst), (ins VR128:$src),
               OpcodeStr # "\t{$src, $dst|$dst, $src}",
               [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
        Sched<[Sched]>, VEX;
  // Memory forms (AVX).
  let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
    def VBROADCASTSSrm
        : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32,
                           X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>;
    def VBROADCASTSSYrm
        : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32,
                           X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>,
          VEX_L;
  }

  let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in {
    def VBROADCASTSDYrm
        : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64,
                           X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>,
          VEX_L;
  }

  // Register forms (AVX2).
  let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
    def VBROADCASTSSrr
        : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, v4f32, v4f32,
                            SchedWriteFShuffle.XMM>;
    def VBROADCASTSSYrr
        : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, v8f32, v4f32,
                            WriteFShuffle256>,
          VEX_L;
  }

  let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in {
    def VBROADCASTSDYrr
        : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64,
                            WriteFShuffle256>,
          VEX_L;
  }
  6327. //===----------------------------------------------------------------------===//
  6328. // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
  6329. // halves of a 256-bit vector.
  6330. //
  // Integer 128-bit broadcast (AVX2). Declared without a selection pattern.
  let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in {
    def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem,
                               (outs VR256:$dst), (ins i128mem:$src),
                               "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                         Sched<[WriteShuffleLd]>, VEX, VEX_L;
  }

  // Floating-point 128-bit broadcast (AVX). Declared without a selection
  // pattern; separate Pat records select it.
  let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
      ExeDomain = SSEPackedSingle in {
    def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem,
                               (outs VR256:$dst), (ins f128mem:$src),
                               "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
  }
  // Select every 256-bit X86SubVBroadcastld128 result type to VBROADCASTF128.
  // NOTE: For the integer types we're still using the FP instruction, but
  // execution domain fixing can convert to integer when profitable.
  let Predicates = [HasAVX, NoVLX] in {
    foreach VT = [v4f64, v8f32, v4i64, v8i32, v16i16, v32i8] in {
      def : Pat<(VT (X86SubVBroadcastld128 addr:$src)),
                (VBROADCASTF128 addr:$src)>;
    }
  }
  6358. //===----------------------------------------------------------------------===//
  6359. // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
  6360. //
  // Both forms are declared without a selection pattern; the
  // vperm2x128_lowering multiclass provides the patterns.
  let ExeDomain = SSEPackedSingle in {
    let isCommutable = 1 in {
      def VPERM2F128rr
          : AVXAIi8<0x06, MRMSrcReg,
                    (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                    "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    []>,
            VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
    }

    def VPERM2F128rm
        : AVXAIi8<0x06, MRMSrcMem,
                  (outs VR256:$dst),
                  (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                  "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          VEX_4V, VEX_L,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
  }
  // Immediate transform to help with commuting: toggles bits 1 and 5 of the
  // immediate (XOR with 0x22) and returns it as an i8 target constant.
  def Perm2XCommuteImm : SDNodeXForm<timm, [{
    return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
  }]>;
  // Selection patterns mapping X86VPerm2x128 of type VT onto the instruction
  // pair InstrStr # "rr" / InstrStr # "rm".
  multiclass vperm2x128_lowering<string InstrStr, ValueType VT,
                                 PatFrag memop_frag> {
    // Both sources in registers.
    def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
              (!cast<Instruction>(InstrStr # "rr")
                  VR256:$src1, VR256:$src2, timm:$imm)>;

    // Load in the second source: fold directly.
    def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2),
                                 (i8 timm:$imm))),
              (!cast<Instruction>(InstrStr # "rm")
                  VR256:$src1, addr:$src2, timm:$imm)>;

    // Pattern with load in other operand: swap the sources and rewrite the
    // immediate with Perm2XCommuteImm.
    def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1,
                                 (i8 timm:$imm))),
              (!cast<Instruction>(InstrStr # "rm")
                  VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
  }
  // Floating-point element types: whenever AVX is available.
  let Predicates = [HasAVX] in {
    defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
    defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
  }

  // Integer element types: only under the HasAVX1Only predicate.
  let Predicates = [HasAVX1Only] in {
    defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
    defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
    defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
    defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
  }
  6396. //===----------------------------------------------------------------------===//
  6397. // VINSERTF128 - Insert packed floating-point values
  6398. //
  // Both forms are declared without a selection pattern; the vinsert_lowering
  // multiclass provides the patterns.
  let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
    def VINSERTF128rr
        : AVXAIi8<0x18, MRMSrcReg,
                  (outs VR256:$dst),
                  (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                  "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;

    let mayLoad = 1 in {
      def VINSERTF128rm
          : AVXAIi8<0x18, MRMSrcMem,
                    (outs VR256:$dst),
                    (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
                    "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    []>,
            Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>,
            VEX_4V, VEX_L;
    }
  }
  // To create a 256-bit all ones value, we should produce VCMPTRUEPS
  // with YMM register containing zero: both compare inputs are AVX_SET0 and
  // the condition immediate is 0xf.
  // FIXME: Avoid producing vxorps to clear the fake inputs.
  let Predicates = [HasAVX1Only] in
  def : Pat<(v8i32 immAllOnesV),
            (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
  // Selection patterns for vinsert128_insert of a 128-bit `From` vector into
  // a 256-bit `To` vector. InstrStr names the insert instruction family and
  // PermStr the perm2x128 family used when the 256-bit operand is a load.
  multiclass vinsert_lowering<string InstrStr, string PermStr,
                              ValueType From, ValueType To,
                              PatFrag frommemop_frag, PatFrag tomemop_frag> {
    // Both operands in registers.
    def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                      (From VR128:$src2),
                                      (iPTR imm)),
              (!cast<Instruction>(InstrStr # "rr")
                  VR256:$src1, VR128:$src2,
                  (INSERT_get_vinsert128_imm VR256:$ins))>;

    // Inserted 128-bit value comes from memory.
    def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                      (From (frommemop_frag addr:$src2)),
                                      (iPTR imm)),
              (!cast<Instruction>(InstrStr # "rm")
                  VR256:$src1, addr:$src2,
                  (INSERT_get_vinsert128_imm VR256:$ins))>;

    // Folding "To" vector - convert to perm2x128 and commute inputs. The
    // 128-bit register is widened via INSERT_SUBREG into an IMPLICIT_DEF.
    def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                      (From VR128:$src2),
                                      (iPTR imm)),
              (!cast<Instruction>(PermStr # "rm")
                  (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                  addr:$src1,
                  (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
  }
  // Floating-point element types.
  let Predicates = [HasAVX, NoVLX] in {
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v4f32, v8f32, loadv4f32, loadv8f32>;
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v2f64, v4f64, loadv2f64, loadv4f64>;
  }

  // Integer element types: only under the HasAVX1Only predicate.
  let Predicates = [HasAVX1Only] in {
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v2i64, v4i64, loadv2i64, loadv4i64>;
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v4i32, v8i32, loadv4i32, loadv8i32>;
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v8i16, v16i16, loadv8i16, loadv16i16>;
    defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                            v16i8, v32i8, loadv16i8, loadv32i8>;
  }
  6446. //===----------------------------------------------------------------------===//
  6447. // VEXTRACTF128 - Extract packed floating-point values
  6448. //
  // Both forms are declared without a selection pattern; the
  // vextract_lowering multiclass provides the patterns.
  let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
    def VEXTRACTF128rr
        : AVXAIi8<0x19, MRMDestReg,
                  (outs VR128:$dst),
                  (ins VR256:$src1, u8imm:$src2),
                  "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
          Sched<[WriteFShuffle256]>, VEX, VEX_L;

    let mayStore = 1 in {
      def VEXTRACTF128mr
          : AVXAIi8<0x19, MRMDestMem,
                    (outs),
                    (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
                    "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>,
            Sched<[WriteFStoreX]>, VEX, VEX_L;
    }
  }
  // Selection patterns for vextract128_extract from a 256-bit `From` vector
  // to a 128-bit `To` vector, using InstrStr # "rr" for the register result
  // and InstrStr # "mr" when the extracted value is stored directly.
  multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
    // Extract into a register.
    def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
              (To (!cast<Instruction>(InstrStr # "rr")
                      (From VR256:$src1),
                      (EXTRACT_get_vextract128_imm VR128:$ext)))>;

    // Extract followed by a store: fold into the memory-destination form.
    def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                   (iPTR imm))),
                     addr:$dst),
              (!cast<Instruction>(InstrStr # "mr")
                  addr:$dst, VR256:$src1,
                  (EXTRACT_get_vextract128_imm VR128:$ext))>;
  }
  // AVX1 patterns
  // Floating-point element types.
  let Predicates = [HasAVX, NoVLX] in {
    defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
    defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
  }

  // Integer element types: only under the HasAVX1Only predicate.
  let Predicates = [HasAVX1Only] in {
    defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
    defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
    defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
    defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
  }
  6481. //===----------------------------------------------------------------------===//
  6482. // VMASKMOV - Conditional SIMD Packed Loads and Stores
  6483. //
  // Masked load/store family. Defines four records:
  //   rm  / Yrm - 128/256-bit masked loads (opcode opc_rm), selected from
  //               IntLd / IntLd256;
  //   mr  / Ymr - 128/256-bit masked stores (opcode opc_mr), selected from
  //               IntSt / IntSt256.
  // schedX / schedY supply the scheduling classes for the 128/256-bit forms.
  multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                            Intrinsic IntLd, Intrinsic IntLd256,
                            Intrinsic IntSt, Intrinsic IntSt256,
                            X86SchedWriteMaskMove schedX,
                            X86SchedWriteMaskMove schedY> {
    // Masked loads.
    def rm : AVX8I<opc_rm, MRMSrcMem,
                   (outs VR128:$dst),
                   (ins VR128:$src1, f128mem:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;
    def Yrm : AVX8I<opc_rm, MRMSrcMem,
                    (outs VR256:$dst),
                    (ins VR256:$src1, f256mem:$src2),
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
              VEX_4V, VEX_L, Sched<[schedY.RM]>;

    // Masked stores.
    def mr : AVX8I<opc_mr, MRMDestMem,
                   (outs),
                   (ins f128mem:$dst, VR128:$src1, VR128:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;
    def Ymr : AVX8I<opc_mr, MRMDestMem,
                    (outs),
                    (ins f256mem:$dst, VR256:$src1, VR256:$src2),
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
              VEX_4V, VEX_L, Sched<[schedY.MR]>;
  }
  // Single-precision masked moves.
  let ExeDomain = SSEPackedSingle in {
    defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                     int_x86_avx_maskload_ps,
                                     int_x86_avx_maskload_ps_256,
                                     int_x86_avx_maskstore_ps,
                                     int_x86_avx_maskstore_ps_256,
                                     WriteFMaskMove32, WriteFMaskMove32Y>;
  }

  // Double-precision masked moves.
  let ExeDomain = SSEPackedDouble in {
    defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                     int_x86_avx_maskload_pd,
                                     int_x86_avx_maskload_pd_256,
                                     int_x86_avx_maskstore_pd,
                                     int_x86_avx_maskstore_pd_256,
                                     WriteFMaskMove64, WriteFMaskMove64Y>;
  }
  6524. //===----------------------------------------------------------------------===//
  6525. // AVX_VNNI
  6526. //===----------------------------------------------------------------------===//
  // VEX-encoded AVX-VNNI dot-product instructions.
  //
  // Defines four records for opcode `opc`, all with $src1 tied to $dst:
  //   rr  / rm  - 128-bit, third source in a register / loaded via loadv4i32;
  //   Yrr / Yrm - 256-bit, third source in a register / loaded via loadv8i32.
  // IsCommutable is forwarded to isCommutable on the two register forms only.
  //
  // Scheduling: the load-folding forms use the folded write class together
  // with one ReadAfterFold per register source ($src1 and $src2), and the
  // 256-bit forms use the YMM width class. Previously every form was tagged
  // with the plain SchedWriteVecIMul.XMM register write.
  let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
      ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
  multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         bit IsCommutable> {
    let isCommutable = IsCommutable in
    def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
                                             VR128:$src2, VR128:$src3)))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

    def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
                                             (loadv4i32 addr:$src3))))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                            SchedWriteVecIMul.XMM.ReadAfterFold,
                            SchedWriteVecIMul.XMM.ReadAfterFold]>;

    let isCommutable = IsCommutable in
    def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
                                              VR256:$src2, VR256:$src3)))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

    def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
                                              (loadv8i32 addr:$src3))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                    SchedWriteVecIMul.YMM.ReadAfterFold,
                                    SchedWriteVecIMul.YMM.ReadAfterFold]>;
  }
  // Byte (unsigned x signed) dot products: instantiated as non-commutable.
  defm VPDPBUSD  : avx_vnni_rm<0x50, "vpdpbusd",  X86Vpdpbusd,  0>;
  defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
  // Word (signed x signed) dot products: instantiated as commutable.
  defm VPDPWSSD  : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd,  1>;
  defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
  // X86vpmaddwd that matches only when the node has exactly one use.
  def X86vpmaddwd_su
      : PatFrag<(ops node:$lhs, node:$rhs),
                (X86vpmaddwd node:$lhs, node:$rhs),
                [{
    return N->hasOneUse();
  }]>;
  // Fold (add acc, (single-use vpmaddwd a, b)) into VPDPWSSD, with the
  // accumulator as $src1.
  let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
    // 256-bit, register and load forms.
    def : Pat<(v8i32 (add VR256:$src1,
                          (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
              (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
    def : Pat<(v8i32 (add VR256:$src1,
                          (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
              (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;

    // 128-bit, register and load forms.
    def : Pat<(v4i32 (add VR128:$src1,
                          (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
              (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
    def : Pat<(v4i32 (add VR128:$src1,
                          (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
              (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
  }
  6580. //===----------------------------------------------------------------------===//
  6581. // VPERMIL - Permute Single and Double Floating-Point Values
  6582. //
  // VPERMILPS/VPERMILPD family over register class RC. Defines:
  //   rr / rm - variable control (X86VPermilpv), control vector of type i_vt
  //             in a register or loaded from x86memop_i; opcode opc_rm;
  //   ri / mi - immediate control (X86VPermilpi), data in a register or
  //             loaded from x86memop_f; opcode opc_rmi.
  // `varsched` schedules the variable-control forms, `sched` the immediate
  // forms.
  multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                        RegisterClass RC, X86MemOperand x86memop_f,
                        X86MemOperand x86memop_i,
                        ValueType f_vt, ValueType i_vt,
                        X86FoldableSchedWrite sched,
                        X86FoldableSchedWrite varsched> {
    let Predicates = [HasAVX, NoVLX] in {
      def rr : AVX8I<opc_rm, MRMSrcReg,
                     (outs RC:$dst), (ins RC:$src1, RC:$src2),
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set RC:$dst,
                           (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>,
               VEX_4V, Sched<[varsched]>;

      def rm : AVX8I<opc_rm, MRMSrcMem,
                     (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2),
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set RC:$dst,
                           (f_vt (X86VPermilpv RC:$src1,
                                               (i_vt (load addr:$src2)))))]>,
               VEX_4V, Sched<[varsched.Folded, sched.ReadAfterFold]>;

      def ri : AVXAIi8<opc_rmi, MRMSrcReg,
                       (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
                       OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set RC:$dst,
                             (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>,
               VEX, Sched<[sched]>;

      def mi : AVXAIi8<opc_rmi, MRMSrcMem,
                       (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2),
                       OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set RC:$dst,
                             (f_vt (X86VPermilpi (load addr:$src1),
                                                 (i8 timm:$src2))))]>,
               VEX, Sched<[sched.Folded]>;
    } // Predicates = [HasAVX, NoVLX]
  }
  // Single-precision permutes, 128-bit then 256-bit.
  let ExeDomain = SSEPackedSingle in {
    defm VPERMILPS
        : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                     v4f32, v4i32,
                     SchedWriteFShuffle.XMM, SchedWriteFVarShuffle.XMM>;
    defm VPERMILPSY
        : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                     v8f32, v8i32,
                     SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>,
          VEX_L;
  }

  // Double-precision permutes, 128-bit then 256-bit.
  let ExeDomain = SSEPackedDouble in {
    defm VPERMILPD
        : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                     v2f64, v2i64,
                     SchedWriteFShuffle.XMM, SchedWriteFVarShuffle.XMM>;
    defm VPERMILPDY
        : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                     v4f64, v4i64,
                     SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>,
          VEX_L;
  }
  6630. //===----------------------------------------------------------------------===//
  6631. // VZERO - Zero YMM registers
  // Note: These instructions do not affect registers YMM16-YMM31.
  6633. //
  // Both instructions are modelled as defining YMM0 through YMM15 and are
  // scheduled as WriteSystem. They share opcode 0x77; VZEROALL additionally
  // carries VEX_L.
  let SchedRW = [WriteSystem],
      Defs = [YMM0, YMM1, YMM2,  YMM3,  YMM4,  YMM5,  YMM6,  YMM7,
              YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
    // Zero All YMM registers
    def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                     [(int_x86_avx_vzeroall)]>,
                   PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;

    // Zero Upper bits of YMM registers
    def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                       [(int_x86_avx_vzeroupper)]>,
                     PS, VEX, Requires<[HasAVX]>, VEX_WIG;
  } // SchedRW, Defs
  6647. //===----------------------------------------------------------------------===//
  6648. // Half precision conversion instructions
  6649. //
  // vcvtph2ps: source is always a VR128 (or memory), result goes to RC.
  // The memory form carries no pattern of its own.
  multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                        X86FoldableSchedWrite sched> {
    def rr : I<0x13, MRMSrcReg,
               (outs RC:$dst), (ins VR128:$src),
               "vcvtph2ps\t{$src, $dst|$dst, $src}",
               [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;

    let hasSideEffects = 0, mayLoad = 1 in {
      def rm : I<0x13, MRMSrcMem,
                 (outs RC:$dst), (ins x86memop:$src),
                 "vcvtph2ps\t{$src, $dst|$dst, $src}",
                 []>,
               T8PD, VEX, Sched<[sched.Folded]>;
    }
  }
  // vcvtps2ph: source comes from RC, result is a VR128 (rr) or is written to
  // memory (mr). RR / MR are the scheduling classes of the two forms. The
  // store form carries no pattern of its own.
  multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                        SchedWrite RR, SchedWrite MR> {
    def rr : Ii8<0x1D, MRMDestReg,
                 (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2),
                 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 [(set VR128:$dst,
                       (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
             TAPD, VEX, Sched<[RR]>;

    let hasSideEffects = 0, mayStore = 1 in {
      def mr : Ii8<0x1D, MRMDestMem,
                   (outs), (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
                   "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   []>,
               TAPD, VEX, Sched<[MR]>;
    }
  }
  // F16C instruction instantiations and the load/store folding patterns for
  // the pattern-less memory forms.
  let Predicates = [HasF16C, NoVLX] in {
    // Half -> single.
    defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem,  WriteCvtPH2PS>,
                      SIMD_EXC;
    defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>,
                      VEX_L, SIMD_EXC;

    // Single -> half.
    defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem,
                                 WriteCvtPS2PH, WriteCvtPS2PHSt>,
                      SIMD_EXC;
    defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem,
                                 WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
                      VEX_L, SIMD_EXC;

    // Pattern match vcvtph2ps of a scalar i64 load.
    def : Pat<(v4f32 (X86any_cvtph2ps
                         (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (VCVTPH2PSrm addr:$src)>;
    def : Pat<(v4f32 (X86any_cvtph2ps
                         (bc_v8i16
                             (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (VCVTPH2PSrm addr:$src)>;

    // 256-bit result from a full v8i16 load.
    def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
              (VCVTPH2PSYrm addr:$src)>;

    // Store of element 0 of the 128-bit conversion result, viewed as f64 or
    // i64: fold into the memory-destination form.
    def : Pat<(store (f64 (extractelt
                              (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1,
                                                                timm:$src2))),
                              (iPTR 0))),
                     addr:$dst),
              (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
    def : Pat<(store (i64 (extractelt
                              (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1,
                                                                timm:$src2))),
                              (iPTR 0))),
                     addr:$dst),
              (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;

    // Store of the full v8i16 result of a 256-bit source conversion.
    def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)),
                     addr:$dst),
              (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
  }
  6700. //===----------------------------------------------------------------------===//
  6701. // AVX2 Instructions
  6702. //===----------------------------------------------------------------------===//
  /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate.
  /// Defines the commutable register form (rri), the load-folding form (rmi),
  /// and a pattern that handles a load in the first source by swapping the
  /// operands and applying commuteXForm to the immediate.
  multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            ValueType OpVT, X86FoldableSchedWrite sched,
                            RegisterClass RC,
                            X86MemOperand x86memop, SDNodeXForm commuteXForm> {
    let isCommutable = 1 in {
      def rri : AVX2AIi8<opc, MRMSrcReg,
                         (outs RC:$dst),
                         (ins RC:$src1, RC:$src2, u8imm:$src3),
                         OpcodeStr #
                           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                         [(set RC:$dst,
                               (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                Sched<[sched]>, VEX_4V;
    }

    def rmi : AVX2AIi8<opc, MRMSrcMem,
                       (outs RC:$dst),
                       (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                       OpcodeStr #
                         "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                       [(set RC:$dst,
                             (OpVT (OpNode RC:$src1, (load addr:$src2),
                                           timm:$src3)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

    // Pattern to commute if load is in first source.
    def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
              (!cast<Instruction>(NAME # "rmi")
                  RC:$src1, addr:$src2, (commuteXForm timm:$src3))>;
  }
  // VPBLENDD instantiations plus patterns that select 64-bit-element
  // X86Blendi nodes onto VPBLENDD after transforming the immediate.
  let Predicates = [HasAVX2] in {
    defm VPBLENDD  : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                                    SchedWriteBlend.XMM, VR128, i128mem,
                                    BlendCommuteImm4>;
    defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                    SchedWriteBlend.YMM, VR256, i256mem,
                                    BlendCommuteImm8>,
                     VEX_L;

    // v4i64 blends: register, load in second source, load in first source.
    def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
              (VPBLENDDYrri VR256:$src1, VR256:$src2,
                            (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
              (VPBLENDDYrmi VR256:$src1, addr:$src2,
                            (BlendScaleImm4 timm:$src3))>;
    def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
              (VPBLENDDYrmi VR256:$src1, addr:$src2,
                            (BlendScaleCommuteImm4 timm:$src3))>;

    // v2i64 blends: register, load in second source, load in first source.
    def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
              (VPBLENDDrri VR128:$src1, VR128:$src2,
                           (BlendScaleImm2to4 timm:$src3))>;
    def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
              (VPBLENDDrmi VR128:$src1, addr:$src2,
                           (BlendScaleImm2to4 timm:$src3))>;
    def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
              (VPBLENDDrmi VR128:$src1, addr:$src2,
                           (BlendScaleCommuteImm2to4 timm:$src3))>;
  }
  // For insertion into the zero index (low half) of a 256-bit vector, it is
  // more efficient to generate a blend with immediate instead of an insert*128.
  // NOTE: We're using FP instructions here, but execution domain fixing should
  // take care of using integer instructions when profitable.
  let Predicates = [HasAVX] in {
    // 256-bit destination in a register: the 128-bit value is widened with
    // INSERT_SUBREG into an IMPLICIT_DEF and blended in with immediate 0xf.
    def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPSYrri
                  VR256:$src1,
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                  0xf)>;
    def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPSYrri
                  VR256:$src1,
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                  0xf)>;
    def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPSYrri
                  VR256:$src1,
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                  0xf)>;
    def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                (iPTR 0)),
              (VBLENDPSYrri
                  VR256:$src1,
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                  0xf)>;

    // 256-bit destination loaded from memory: the widened 128-bit value is
    // the first blend source, the load is folded as the second, and the
    // immediate becomes 0xf0.
    def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPSYrmi
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                  addr:$src2, 0xf0)>;
    def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPSYrmi
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                  addr:$src2, 0xf0)>;
    def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPSYrmi
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                  addr:$src2, 0xf0)>;
    def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1),
                                (iPTR 0)),
              (VBLENDPSYrmi
                  (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                  addr:$src2, 0xf0)>;
  }
  6781. //===----------------------------------------------------------------------===//
  6782. // VPBROADCAST - Load from memory and broadcast to all elements of the
  6783. // destination operand
  6784. //
  // Integer element broadcast family. Defines, under [HasAVX2, prd]:
  //   rr  / rm  - broadcast to a 128-bit vector (OpVT128) from a VR128 /
  //               from memory via bcast_frag;
  //   Yrr / Yrm - broadcast to a 256-bit vector (OpVT256) from a VR128 /
  //               from memory via bcast_frag;
  // plus a pattern that accepts a 256-bit register source.
  multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                            X86MemOperand x86memop, PatFrag bcast_frag,
                            ValueType OpVT128, ValueType OpVT256,
                            Predicate prd> {
    let Predicates = [HasAVX2, prd] in {
      def rr : AVX28I<opc, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src),
                      OpcodeStr # "\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
               Sched<[SchedWriteShuffle.XMM]>, VEX;

      def rm : AVX28I<opc, MRMSrcMem,
                      (outs VR128:$dst), (ins x86memop:$src),
                      OpcodeStr # "\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (OpVT128 (bcast_frag addr:$src)))]>,
               Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;

      def Yrr : AVX28I<opc, MRMSrcReg,
                       (outs VR256:$dst), (ins VR128:$src),
                       OpcodeStr # "\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst,
                             (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                Sched<[WriteShuffle256]>, VEX, VEX_L;

      def Yrm : AVX28I<opc, MRMSrcMem,
                       (outs VR256:$dst), (ins x86memop:$src),
                       OpcodeStr # "\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (OpVT256 (bcast_frag addr:$src)))]>,
                Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

      // Provide aliases for broadcast from the same register class that
      // automatically does the extract (EXTRACT_SUBREG of sub_xmm).
      def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
                (!cast<Instruction>(NAME # "Yrr")
                    (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
    }
  }
  // Instantiate the integer broadcasts. The byte and word forms are gated on
  // NoVLX_Or_NoBWI, the dword and qword forms on NoVLX.
  defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem,
                                     X86VBroadcastld8,
                                     v16i8, v32i8, NoVLX_Or_NoBWI>;
  defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem,
                                     X86VBroadcastld16,
                                     v8i16, v16i16, NoVLX_Or_NoBWI>;
  defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem,
                                     X86VBroadcastld32,
                                     v4i32, v8i32, NoVLX>;
  defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem,
                                     X86VBroadcastld64,
                                     v2i64, v4i64, NoVLX>;
  let Predicates = [HasAVX2, NoVLX] in {
    // Register-source fallbacks: when the load feeding a broadcast has other
    // users, the load-folding patterns cannot be selected, so broadcast from
    // the scalar FP register after copying it into VR128.
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSrr
                 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VBROADCASTSSYrr
                 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VBROADCASTSDYrr
                 (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  }
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    // Broadcast from an 8-bit or 16-bit GPR: place the value in the low part
    // of an otherwise undefined i32 (INSERT_SUBREG into IMPLICIT_DEF), move
    // it into a vector register with VMOVDI2PDIrr, then broadcast.

    // 8-bit source.
    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
              (VPBROADCASTBrr
                 (VMOVDI2PDIrr
                    (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                        GR8:$src, sub_8bit))))>;
    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
              (VPBROADCASTBYrr
                 (VMOVDI2PDIrr
                    (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                        GR8:$src, sub_8bit))))>;

    // 16-bit source.
    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
              (VPBROADCASTWrr
                 (VMOVDI2PDIrr
                    (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                        GR16:$src, sub_16bit))))>;
    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
              (VPBROADCASTWYrr
                 (VMOVDI2PDIrr
                    (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                        GR16:$src, sub_16bit))))>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    // Broadcast from a 32-bit GPR: move to a vector register, then broadcast.
    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;

    // Broadcast from a 64-bit GPR, same scheme using VMOV64toPQIrr.
    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
              (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
  }
  6862. // AVX1 broadcast patterns
  let Predicates = [HasAVX1Only] in {
    // Integer-typed broadcast loads are selected to the VBROADCASTSS /
    // VBROADCASTSD memory forms under this predicate.
    def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
              (VBROADCASTSSYrm addr:$src)>;
    def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
              (VBROADCASTSDYrm addr:$src)>;
    def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
              (VBROADCASTSSrm addr:$src)>;
  }
  6871. // Provide fallback in case the load node that is used in the patterns above
  6872. // is used by additional users, which prevents the pattern selection.
  let Predicates = [HasAVX, NoVLX] in {
    // 128-bit v2f64 broadcasts are all selected to VMOVDDUP.

    // Scalar f64 source: copy into VR128 first.
    def : Pat<(v2f64 (X86VBroadcast f64:$src)),
              (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

    // Broadcast load: use the memory form.
    def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
              (VMOVDDUPrm addr:$src)>;

    // Vector source already in VR128.
    def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
              (VMOVDDUPrr VR128:$src)>;
  }
  let Predicates = [HasAVX1Only] in {
    // Under HasAVX1Only, register broadcasts are built from a 128-bit splat
    // (VPERMILPS / VMOVDDUP / VPSHUFD). For 256-bit results the same splat
    // is placed in the low half via INSERT_SUBREG and inserted again at
    // index 1 with VINSERTF128rr.

    // --- f32 element, scalar FR32 source -------------------------------
    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
              (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v8f32 (IMPLICIT_DEF)),
                    (v4f32 (VPERMILPSri
                              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)),
                    sub_xmm),
                 (v4f32 (VPERMILPSri
                           (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)),
                 1)>;

    // --- f32 element, v4f32 vector source ------------------------------
    def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v8f32 (IMPLICIT_DEF)),
                    (v4f32 (VPERMILPSri VR128:$src, 0)),
                    sub_xmm),
                 (v4f32 (VPERMILPSri VR128:$src, 0)),
                 1)>;

    // --- f64 element, scalar FR64 source -------------------------------
    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v4f64 (IMPLICIT_DEF)),
                    (v2f64 (VMOVDDUPrr
                              (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))),
                    sub_xmm),
                 (v2f64 (VMOVDDUPrr
                           (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))),
                 1)>;

    // --- f64 element, v2f64 vector source ------------------------------
    def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v4f64 (IMPLICIT_DEF)),
                    (v2f64 (VMOVDDUPrr VR128:$src)),
                    sub_xmm),
                 (v2f64 (VMOVDDUPrr VR128:$src)),
                 1)>;

    // --- i32 element, GR32 source --------------------------------------
    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
              (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v8i32 (IMPLICIT_DEF)),
                    (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)),
                    sub_xmm),
                 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)),
                 1)>;

    // --- i64 element, GR64 source (VPSHUFD with immediate 0x44) ---------
    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
              (VINSERTF128rr
                 (INSERT_SUBREG
                    (v4i64 (IMPLICIT_DEF)),
                    (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)),
                    sub_xmm),
                 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)),
                 1)>;
    def : Pat<(v2i64 (X86VBroadcast i64:$src)),
              (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;

    // --- i64 broadcast load: use the VMOVDDUP memory form ---------------
    def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
              (VMOVDDUPrm addr:$src)>;
  }
  6916. //===----------------------------------------------------------------------===//
  6917. // VPERM - Permute instructions
  6918. //
  // Two-source 256-bit permute matched against X86VPermv: a register/register
  // form (Yrr) and a form whose second source is loaded from memory (Yrm).
  // Both are predicated on HasAVX2 and NoVLX.
  multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                       ValueType OpVT, X86FoldableSchedWrite Sched,
                       X86MemOperand memOp> {
    let Predicates = [HasAVX2, NoVLX] in {
      def Yrr : AVX28I<opc, MRMSrcReg,
                       (outs VR256:$dst),
                       (ins VR256:$src1, VR256:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR256:$dst,
                             (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                Sched<[Sched]>, VEX_4V, VEX_L;

      def Yrm : AVX28I<opc, MRMSrcMem,
                       (outs VR256:$dst),
                       (ins VR256:$src1, memOp:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR256:$dst,
                             (OpVT (X86VPermv VR256:$src1,
                                              (load addr:$src2))))]>,
                Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
    }
  }
  // Integer and single-precision instantiations of avx2_perm. VPERMPS is
  // placed in the SSEPackedSingle execution domain.
  defm VPERMD : avx2_perm<0x36, "vpermd", v8i32,
                          WriteVarShuffle256, i256mem>;
  let ExeDomain = SSEPackedSingle in {
    defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32,
                             WriteFVarShuffle256, f256mem>;
  }
  // 256-bit permute controlled by an 8-bit immediate, matched against
  // X86VPermi: Yri takes the data from a register, Ymi from memory through
  // `mem_frag`. Both are predicated on HasAVX2 and NoVLX.
  multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                           ValueType OpVT, X86FoldableSchedWrite Sched,
                           X86MemOperand memOp> {
    let Predicates = [HasAVX2, NoVLX] in {
      def Yri : AVX2AIi8<opc, MRMSrcReg,
                         (outs VR256:$dst),
                         (ins VR256:$src1, u8imm:$src2),
                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                               (OpVT (X86VPermi VR256:$src1,
                                                (i8 timm:$src2))))]>,
                Sched<[Sched]>, VEX, VEX_L;

      def Ymi : AVX2AIi8<opc, MRMSrcMem,
                         (outs VR256:$dst),
                         (ins memOp:$src1, u8imm:$src2),
                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                               (OpVT (X86VPermi (mem_frag addr:$src1),
                                                (i8 timm:$src2))))]>,
                Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
    }
  }
  // Instantiations of avx2_perm_imm; both add VEX_W. VPERMPD is placed in
  // the SSEPackedDouble execution domain.
  defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                              WriteShuffle256, i256mem>,
                VEX_W;
  let ExeDomain = SSEPackedDouble in {
    defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                                 WriteFShuffle256, f256mem>,
                   VEX_W;
  }
  6969. //===----------------------------------------------------------------------===//
  6970. // VPERM2I128 - Permute Integer vector Values in 128-bit chunks
  6971. //
  // Register/register form. It carries no selection pattern of its own and
  // is the only one of the two forms marked commutable.
  let isCommutable = 1 in {
    def VPERM2I128rr
        : AVX2AIi8<0x46, MRMSrcReg,
                   (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                   "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                   []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
  }

  // Register/memory form, likewise without a pattern.
  def VPERM2I128rm
      : AVX2AIi8<0x46, MRMSrcMem,
                 (outs VR256:$dst),
                 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                 []>,
        Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
        VEX_4V, VEX_L;
  // Attach the vperm2x128_lowering patterns to VPERM2I128 for each 256-bit
  // integer vector type.
  let Predicates = [HasAVX2] in {
    defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
    defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
    defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
    defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
  }
  6987. //===----------------------------------------------------------------------===//
  6988. // VINSERTI128 - Insert packed integer values
  6989. //
  // Both forms are pattern-less, so hasSideEffects is stated explicitly and
  // the memory form is additionally marked mayLoad.
  let hasSideEffects = 0 in {
    def VINSERTI128rr
        : AVX2AIi8<0x38, MRMSrcReg,
                   (outs VR256:$dst),
                   (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                   "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                   []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;

    let mayLoad = 1 in {
      def VINSERTI128rm
          : AVX2AIi8<0x38, MRMSrcMem,
                     (outs VR256:$dst),
                     (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
                     "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                     []>,
            Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
            VEX_4V, VEX_L;
    }
  }
  // Attach the vinsert_lowering patterns (using VINSERTI128 and VPERM2I128)
  // for each 128-bit / 256-bit integer vector type pair.
  let Predicates = [HasAVX2, NoVLX] in {
    defm : vinsert_lowering<"VINSERTI128", "VPERM2I128",
                            v2i64, v4i64,  loadv2i64, loadv4i64>;
    defm : vinsert_lowering<"VINSERTI128", "VPERM2I128",
                            v4i32, v8i32,  loadv4i32, loadv8i32>;
    defm : vinsert_lowering<"VINSERTI128", "VPERM2I128",
                            v8i16, v16i16, loadv8i16, loadv16i16>;
    defm : vinsert_lowering<"VINSERTI128", "VPERM2I128",
                            v16i8, v32i8,  loadv16i8, loadv32i8>;
  }
  7007. //===----------------------------------------------------------------------===//
  7008. // VEXTRACTI128 - Extract packed integer values
  7009. //
  // Register destination form (no pattern attached here).
  def VEXTRACTI128rr
      : AVX2AIi8<0x39, MRMDestReg,
                 (outs VR128:$dst),
                 (ins VR256:$src1, u8imm:$src2),
                 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 []>,
        Sched<[WriteShuffle256]>, VEX, VEX_L;

  // Memory destination form: pattern-less, so the store behaviour is
  // declared explicitly.
  let hasSideEffects = 0, mayStore = 1 in {
    def VEXTRACTI128mr
        : AVX2AIi8<0x39, MRMDestMem,
                   (outs),
                   (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
                   "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
  }
  // Attach the vextract_lowering patterns to VEXTRACTI128 for each
  // 256-bit / 128-bit integer vector type pair.
  let Predicates = [HasAVX2, NoVLX] in {
    defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
    defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
    defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
    defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
  }
  7025. //===----------------------------------------------------------------------===//
  7026. // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
  7027. //
  // Masked integer load/store instructions selected directly from the given
  // intrinsics:
  //   rm / Yrm - masked load  (opcode 0x8c), 128-bit / 256-bit
  //   mr / Ymr - masked store (opcode 0x8e), 128-bit / 256-bit
  // schedX supplies the scheduling classes of the 128-bit forms and schedY
  // those of the 256-bit forms.
  multiclass avx2_pmovmask<string OpcodeStr,
                           Intrinsic IntLd128, Intrinsic IntLd256,
                           Intrinsic IntSt128, Intrinsic IntSt256,
                           X86SchedWriteMaskMove schedX,
                           X86SchedWriteMaskMove schedY> {
    // Loads: $src1 is passed as the intrinsic's second operand, after the
    // address.
    def rm : AVX28I<0x8c, MRMSrcMem,
                    (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;

    def Yrm : AVX28I<0x8c, MRMSrcMem,
                     (outs VR256:$dst),
                     (ins VR256:$src1, i256mem:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
              VEX_4V, VEX_L, Sched<[schedY.RM]>;

    // Stores: no register result.
    def mr : AVX28I<0x8e, MRMDestMem,
                    (outs),
                    (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;

    def Ymr : AVX28I<0x8e, MRMDestMem,
                     (outs),
                     (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
              VEX_4V, VEX_L, Sched<[schedY.MR]>;
  }
  // Dword and qword instantiations of avx2_pmovmask; the qword one adds
  // VEX_W.
  defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                  int_x86_avx2_maskload_d,
                                  int_x86_avx2_maskload_d_256,
                                  int_x86_avx2_maskstore_d,
                                  int_x86_avx2_maskstore_d_256,
                                  WriteVecMaskMove32,
                                  WriteVecMaskMove32Y>;

  defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                  int_x86_avx2_maskload_q,
                                  int_x86_avx2_maskload_q_256,
                                  int_x86_avx2_maskstore_q,
                                  int_x86_avx2_maskstore_q_256,
                                  WriteVecMaskMove64,
                                  WriteVecMaskMove64Y>,
                    VEX_W;
  // Selects generic masked_store / masked_load nodes of type VT (mask type
  // MaskVT, register class RC) to the instruction records named
  // InstrStr#"mr" and InstrStr#"rm".
  multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                              ValueType MaskVT> {
    // Masked store -> "mr" form, operands (address, mask, value).
    def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
              (!cast<Instruction>(InstrStr#"mr")
                 addr:$ptr, RC:$mask, RC:$src)>;

    // Masked load with an undef pass-through -> "rm" form.
    def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
              (!cast<Instruction>(InstrStr#"rm")
                 RC:$mask, addr:$ptr)>;

    // Masked load with an all-zeros pass-through -> the same "rm" form.
    def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                               (VT immAllZerosV))),
              (!cast<Instruction>(InstrStr#"rm")
                 RC:$mask, addr:$ptr)>;
  }
  // Floating-point masked load/store lowering through VMASKMOVPS/PD.
  let Predicates = [HasAVX] in {
    defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
    defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
    defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
    defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
  }
  let Predicates = [HasAVX1Only] in {
    // Masked i32/i64 loads and stores are not supported here, so the
    // ps/pd instructions are used for the integer vector types instead.
    defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
    defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
    defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
    defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
  }
  // Integer masked load/store lowering through VPMASKMOVD/Q.
  let Predicates = [HasAVX2] in {
    defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
    defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
    defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
    defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
  }
  7097. //===----------------------------------------------------------------------===//
  7098. // Variable Bit Shifts
  7099. //
  // Per-element variable shift matched against OpNode. Four forms are
  // produced: rr / rm operate on VR128 with type vt128, Yrr / Yrm on VR256
  // with type vt256 (VEX_L). The memory forms take the second operand from
  // a plain `load`.
  multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            ValueType vt128, ValueType vt256> {
    // ---- 128-bit forms ----
    def rr : AVX28I<opc, MRMSrcReg,
                    (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                          (vt128 (OpNode VR128:$src1,
                                         (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;

    def rm : AVX28I<opc, MRMSrcMem,
                    (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                          (vt128 (OpNode VR128:$src1,
                                         (vt128 (load addr:$src2)))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;

    // ---- 256-bit forms ----
    def Yrr : AVX28I<opc, MRMSrcReg,
                     (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                           (vt256 (OpNode VR256:$src1,
                                          (vt256 VR256:$src2))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;

    def Yrm : AVX28I<opc, MRMSrcMem,
                     (outs VR256:$dst),
                     (ins VR256:$src1, i256mem:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                           (vt256 (OpNode VR256:$src1,
                                          (vt256 (load addr:$src2)))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                    SchedWriteVarVecShift.YMM.ReadAfterFold]>;
  }
  // Variable shift instantiations. The q-suffixed ones add VEX_W; only a
  // dword variant of the X86vsrav shift is instantiated here.
  let Predicates = [HasAVX2, NoVLX] in {
    defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
    defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                   VEX_W;
    defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
    defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                   VEX_W;
    defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
  }
  7138. //===----------------------------------------------------------------------===//
  7139. // VGATHER - GATHER Operations
  7140. // FIXME: Improve scheduling of gather instructions.
  // Gather instruction pair without selection patterns. Each form has two
  // outputs, the gathered data ($dst) and a written-back mask ($mask_wb).
  // The 128-bit form (rm) always uses VR128; the VEX_L form (Yrm) uses the
  // caller-chosen register class RC256.
  multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                         X86MemOperand memop128, X86MemOperand memop256> {
    let mayLoad = 1, hasSideEffects = 0 in {
      def rm : AVX28I<opc, MRMSrcMem4VOp3,
                      (outs VR128:$dst, VR128:$mask_wb),
                      (ins VR128:$src1, memop128:$src2, VR128:$mask),
                      OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                      []>,
               VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;

      def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
                       (outs RC256:$dst, RC256:$mask_wb),
                       (ins RC256:$src1, memop256:$src2, RC256:$mask),
                       OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                       []>,
                VEX, VEX_L,
                Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
    }
  }
  // Gather instantiations. Every record gets the same operand constraints:
  // $dst and $mask_wb are early-clobber, $src1 is tied to $dst and $mask is
  // tied to $mask_wb.
  let Predicates = [HasAVX2],
      mayLoad = 1, hasSideEffects = 0,
      Constraints =
        "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
    // Integer gathers.
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>,
                      VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>,
                      VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    // Double-precision gathers.
    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>,
                        VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>,
                        VEX_W;
    }

    // Single-precision gathers.
    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
  7182. //===----------------------------------------------------------------------===//
  7183. // GFNI instructions
  7184. //===----------------------------------------------------------------------===//
  // Register and memory forms of a GF2P8MULB-style instruction matched
  // against X86GF2P8mulb. The asm string is supplied through `let` rather
  // than the (empty) string argument: Is2Addr = 1 selects the two-operand
  // syntax, otherwise the three-operand syntax is used. Only the
  // register/register form is marked commutable.
  multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86MemOp, bit Is2Addr = 0> {
    let ExeDomain = SSEPackedInt,
        AsmString = !if(Is2Addr,
          OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
          OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
      let isCommutable = 1 in {
        def rr : PDI<0xCF, MRMSrcReg,
                     (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                     [(set RC:$dst,
                           (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
                 Sched<[SchedWriteVecALU.XMM]>, T8PD;
      }

      def rm : PDI<0xCF, MRMSrcMem,
                   (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                   [(set RC:$dst,
                         (OpVT (X86GF2P8mulb RC:$src1,
                                             (MemOpFrag addr:$src2))))]>,
               Sched<[SchedWriteVecALU.XMM.Folded,
                      SchedWriteVecALU.XMM.ReadAfterFold]>,
               T8PD;
    }
  }
  // Register (rri) and memory (rmi) forms of an affine GFNI instruction with
  // an 8-bit immediate, matched against OpNode. As the string argument is
  // empty, the asm string comes from the surrounding `let`: Is2Addr = 1
  // selects the two-operand-plus-immediate syntax, otherwise all three
  // register operands are printed.
  multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                             SDNode OpNode, RegisterClass RC,
                             PatFrag MemOpFrag, X86MemOperand X86MemOp,
                             bit Is2Addr = 0> {
    let AsmString = !if(Is2Addr,
          OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
      def rri : Ii8<Op, MRMSrcReg,
                    (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                    [(set RC:$dst,
                          (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                    SSEPackedInt>,
                Sched<[SchedWriteVecALU.XMM]>;

      def rmi : Ii8<Op, MRMSrcMem,
                    (outs RC:$dst),
                    (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                    [(set RC:$dst,
                          (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))],
                    SSEPackedInt>,
                Sched<[SchedWriteVecALU.XMM.Folded,
                       SchedWriteVecALU.XMM.ReadAfterFold]>;
    }
  }
  // Instantiates one affine GFNI instruction in three flavours:
  //   NAME      - legacy SSE encoding, two-address ($src1 tied to $dst),
  //               predicated on HasGFNI and UseSSE2
  //   V#NAME    - VEX 128-bit encoding (VEX_4V, VEX_W)
  //   V#NAME#Y  - VEX 256-bit encoding (VEX_4V, VEX_L, VEX_W)
  // The VEX flavours are predicated on HasGFNI, HasAVX and NoVLX_Or_NoBWI.
  multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
    // The legacy SSE form folds its memory operand through `memop`, not
    // `load`, matching the legacy SSE GF2P8MULB definition in this file.
    // Per the Intel SDM, a non-VEX 128-bit memory operand of this
    // instruction must be 16-byte aligned, so folding an arbitrary load
    // could produce an instruction that faults on unaligned data.
    let Constraints = "$src1 = $dst",
        Predicates  = [HasGFNI, UseSSE2] in
    defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                VR128, memop, i128mem, 1>;

    // The VEX-encoded forms keep folding plain loads.
    let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
      defm V#NAME   : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>,
                      VEX_4V, VEX_W;
      defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>,
                      VEX_4V, VEX_L, VEX_W;
    }
  }
  7232. // GF2P8MULB
  // Legacy SSE encoding: two-address ($src1 tied to $dst), memory operand
  // folded through `memop`.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in {
    defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                  i128mem, 1>;
  }

  // VEX encodings, 128-bit and 256-bit (VEX_L).
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                    i128mem>,
                       VEX_4V;
    defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                    i256mem>,
                       VEX_4V, VEX_L;
  }
  7243. // GF2P8AFFINEINVQB, GF2P8AFFINEQB
  // Affine GFNI instantiations; both are explicitly marked non-commutable
  // and add TAPD.
  let isCommutable = 0 in {
    defm GF2P8AFFINEINVQB
        : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", X86GF2P8affineinvqb>,
          TAPD;
    defm GF2P8AFFINEQB
        : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", X86GF2P8affineqb>,
          TAPD;
  }