;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
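; Vertical prediction replicates the row of pixels directly above the block
; into all 16 rows. As an illustrative C sketch (names are ours, not
; FFmpeg's reference code):
;
;     const uint8_t *top = src - stride;       // row above the block
;     for (int y = 0; y < 16; y++)
;         memcpy(src + y * stride, top, 16);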
INIT_MMX mmx
cglobal pred16x16_vertical_8, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
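; Horizontal prediction fills each row with the pixel immediately to its
; left. Roughly, in C (illustrative sketch):
;
;     for (int y = 0; y < 16; y++)
;         memset(src + y * stride, src[y * stride - 1], 16);
;
; The SSSE3 path broadcasts the left byte with pshufb; the MMX paths use
; punpcklbw + SPLATW and store the high 8 bytes with a second mova.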
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov r2, 8
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW m0, m0, 3
    SPLATW m1, m1, 3
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
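; DC prediction: average the 16 neighbours above and the 16 to the left with
; rounding, then fill the block. A sketch in C (this variant assumes both
; edges are available):
;
;     int dc = 16;                              // rounding term
;     for (int i = 0; i < 16; i++)
;         dc += src[i - stride] + src[i * stride - 1];
;     dc >>= 5;                                 // then splat dc over 16x16
;
; Below, psadbw sums the top row and a scalar loop sums the left column.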
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%if cpuflag(ssse3)
    pxor m1, m1
%endif
    SPLATB_REG m0, r2, m1
%if mmsize == 8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
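; VP8 "TrueMotion": pred[y][x] = clip(left[y] + top[x] - topleft), with the
; clip to 0..255 coming for free from packuswb. Illustrative C sketch:
;
;     const uint8_t *top = src - stride;
;     int tl = top[-1];
;     for (int y = 0; y < 16; y++)
;         for (int x = 0; x < 16; x++)
;             src[y * stride + x] =
;                 av_clip_uint8(src[y * stride - 1] + top[x] - tl);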
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
    SPLATW mm4, mm4, 0
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM

INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
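; Plane prediction fits a linear ramp a + b*(x-7) + c*(y-7) through the top
; and left edges. The h264 variant follows the spec (subclause 8.3.3.4);
; rv40 and svq3 scale H and V differently, and svq3 additionally swaps the
; two gradients. A rough C sketch of the h264 case (illustrative only):
;
;     const uint8_t *top = src - stride;        // top[-1] is the top-left
;     int H = 0, V = 0;
;     for (int i = 1; i <= 8; i++) {
;         H += i * (top[7 + i] - top[7 - i]);
;         V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
;     }
;     int a = 16 * (top[15] + src[15 * stride - 1] + 1);
;     int b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6;
;     for (int y = 0; y < 16; y++)
;         for (int x = 0; x < 16; x++)
;             src[y * stride + x] =
;                 av_clip_uint8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);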
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1     ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%if ARCH_X86_64
    lea r6, [r7+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%if ARCH_X86_64
    movzx r7, byte [r3 +r2 ]
    sub r7, r4
    sub r5, r7
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
%ifidn %1, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %1, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %1, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea r1d, [r1d*5+32]
%else ; rv40
    lea r1d, [r1d*5]
%endif
    sar r1d, 6
%else ; svq3
    test r1d, r1d
    lea r4d, [r1d+3]
    cmovs r1d, r4d
    sar r1d, 2 ; H/4
    lea r1d, [r1d*5] ; 5*(H/4)
    test r1d, r1d
    lea r4d, [r1d+15]
    cmovs r1d, r4d
    sar r1d, 4 ; (5*(H/4))/16
%endif
    movd m0, r1d
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%ifidn %1, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1     ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if notcpuflag(ssse3)
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%endif ; !ssse3
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
    sub r5, r7
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    imul r1d, 17
    add r1d, 16
    sar r1d, 5
    movd m0, r1d
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE

;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov r2, 4
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2 ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2 ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0 ; dc0 (w)
    packuswb mm0, mm1 ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
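; H.264 chroma DC works on four 4x4 quadrants: the top-left and bottom-right
; quadrants average both of their edges, the other two use a single edge.
; With s0/s1 the two 4-pixel top sums and s2/s3 the two 4-pixel left sums
; (as annotated below), the per-quadrant values are, roughly:
;
;     dc0 = (s0 + s2 + 4) >> 3;    // top-left:     top + left
;     dc1 = (s1 + 2) >> 2;         // top-right:    top only
;     dc2 = (s3 + 2) >> 2;         // bottom-left:  left only
;     dc3 = (s1 + s3 + 4) >> 3;    // bottom-right: top + left
;
; The pshufw/paddw/psrlw/pavgw sequence evaluates all four at once.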
INIT_MMX mmxext
cglobal pred8x8_dc_8, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7 ; s0
    mov r4, r0
    psadbw m1, m7 ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2 ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_dc_rv40_8, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
    SPLATW mm2, mm2, 0
    SPLATW mm4, mm4, 0
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM

INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

INIT_XMM ssse3
cglobal pred8x8_tm_vp8_8, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
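; (%1 = dest, %2 = left, %3 = right, %4 = src, i.e. the centre tap, %5 = tmp.)
; The bytewise rounding trick: pavgb rounds up, so subtracting (%2^%3)&1
; first gives a floor average of the two outer taps, and a second pavgb
; against the centre tap then yields exactly (l + 2*c + r + 2) >> 2 without
; ever widening to 16 bits.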
%macro PRED4x4_LOWPASS 5
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    mova %1, %4
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
;                           int stride)
;-----------------------------------------------------------------------------
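; The pred8x8l_* functions handle H.264 luma 8x8 blocks, whose neighbouring
; samples are first smoothed with the (1,2,1) filter above; the
; .fix_lt_*/.fix_tr_* paths patch the top-left/top-right bytes when those
; neighbours are unavailable (has_topleft/has_topright). Top DC is then the
; rounded average of the eight filtered top samples, roughly:
;
;     dc = (t'[0] + t'[1] + ... + t'[7] + 4) >> 3;   // t' = filtered top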
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
;                       int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
;                               int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
;                              int has_topright, int stride)
;-----------------------------------------------------------------------------
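; Diagonal down-left extrapolates along 45-degree diagonals from the
; filtered top and top-right samples. With t'[0..15] the filtered 16-sample
; top row, the spec's formula is roughly:
;
;     pred[y][x] = (t'[x+y] + 2*t'[x+y+1] + t'[x+y+2] + 2) >> 2;
;     // except x == y == 7, which uses (t'[14] + 3*t'[15] + 2) >> 2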
INIT_MMX mmxext
cglobal pred8x8l_down_left_8, 4,5
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm7, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea r1, [r0+r3*2]
    movq mm6, mm1
    psrlq mm1, 56
    movq mm4, mm1
    lea r2, [r1+r3*2]
    movq mm2, mm6
    PALIGNR mm2, mm7, 1, mm0
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm4, mm6, 1, mm0
    movq mm5, mm7
    movq mm1, mm7
    movq mm7, mm6
    lea r4, [r2+r3*2]
    psllq mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r4+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r0+r3*2], mm1
    psllq mm1, 8
    psrlq mm0, 56
    por mm1, mm0
    movq [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test r2, r2 ; top_right
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq mm1, 56
    movq2dq xmm5, mm1
    lea r1, [r0+r3*2]
    pslldq xmm4, 8
    por xmm3, xmm4
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por xmm2, xmm5
    lea r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
;                                      int has_topright, int stride)
;-----------------------------------------------------------------------------
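; Diagonal down-right uses the opposite diagonal, mixing the filtered left
; column l', the top-left corner tl' and the top row t'. Roughly:
;
;     pred[y][x] = x > y ? (t'[x-y-2] + 2*t'[x-y-1] + t'[x-y] + 2) >> 2
;                : x < y ? (l'[y-x-2] + 2*l'[y-x-1] + l'[y-x] + 2) >> 2
;                :         (t'[0] + 2*tl' + l'[0] + 2) >> 2;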
INIT_MMX mmxext
cglobal pred8x8l_down_right_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    jmp .body
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.body:
    lea r1, [r0+r3*2]
    movq mm1, mm7
    movq mm7, mm5
    movq mm5, mm6
    movq mm2, mm7
    lea r2, [r1+r3*2]
    PALIGNR mm2, mm6, 1, mm0
    movq mm3, mm7
    PALIGNR mm3, mm6, 7, mm0
    movq mm4, mm7
    lea r4, [r2+r3*2]
    psrlq mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r4+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r0+r3*2], mm0
    psrlq mm0, 8
    psllq mm1, 56
    por mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    lea r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por xmm3, xmm4
    lea r2, [r1+r3*2]
    pslldq xmm4, 1
    por xmm1, xmm4
    psrldq xmm0, 7
    pslldq xmm0, 15
    psrldq xmm0, 7
    por xmm1, xmm0
    lea r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
;                                   int has_topright, int stride)
;-----------------------------------------------------------------------------
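; Vertical-right interleaves two phases: even rows are 2-tap rounded
; averages of adjacent filtered top samples (the pavgb below), odd rows are
; 3-tap lowpass values, and the left column feeds the lower-left triangle
; (see H.264 subclause 8.3.2.2.5 for the exact per-pixel formula).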
  1654. INIT_MMX mmxext
  1655. cglobal pred8x8l_vertical_right_8, 4,5
  1656. sub r0, r3
  1657. lea r4, [r0+r3*2]
  1658. movq mm0, [r0+r3*1-8]
  1659. punpckhbw mm0, [r0+r3*0-8]
  1660. movq mm1, [r4+r3*1-8]
  1661. punpckhbw mm1, [r0+r3*2-8]
  1662. mov r4, r0
  1663. punpckhwd mm1, mm0
  1664. lea r0, [r0+r3*4]
  1665. movq mm2, [r0+r3*1-8]
  1666. punpckhbw mm2, [r0+r3*0-8]
  1667. lea r0, [r0+r3*2]
  1668. movq mm3, [r0+r3*1-8]
  1669. punpckhbw mm3, [r0+r3*0-8]
  1670. punpckhwd mm3, mm2
  1671. punpckhdq mm3, mm1
  1672. lea r0, [r0+r3*2]
  1673. movq mm0, [r0+r3*0-8]
  1674. movq mm1, [r4]
  1675. mov r0, r4
  1676. movq mm4, mm3
  1677. movq mm2, mm3
  1678. PALIGNR mm4, mm0, 7, mm0
  1679. PALIGNR mm1, mm2, 1, mm2
  1680. test r1, r1
  1681. jz .fix_lt_1
  1682. jmp .do_left
  1683. .fix_lt_1:
  1684. movq mm5, mm3
  1685. pxor mm5, mm4
  1686. psrlq mm5, 56
  1687. psllq mm5, 48
  1688. pxor mm1, mm5
  1689. jmp .do_left
  1690. .fix_lt_2:
  1691. movq mm5, mm3
  1692. pxor mm5, mm2
  1693. psllq mm5, 56
  1694. psrlq mm5, 56
  1695. pxor mm2, mm5
  1696. test r2, r2
  1697. jnz .do_top
  1698. .fix_tr_1:
  1699. movq mm5, mm3
  1700. pxor mm5, mm1
  1701. psrlq mm5, 56
  1702. psllq mm5, 56
  1703. pxor mm1, mm5
  1704. jmp .do_top
  1705. .do_left:
  1706. movq mm0, mm4
  1707. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1708. movq mm7, mm2
  1709. movq mm0, [r0-8]
  1710. movq mm3, [r0]
  1711. movq mm1, [r0+8]
  1712. movq mm2, mm3
  1713. movq mm4, mm3
  1714. PALIGNR mm2, mm0, 7, mm0
  1715. PALIGNR mm1, mm4, 1, mm4
  1716. test r1, r1
  1717. jz .fix_lt_2
  1718. test r2, r2
  1719. jz .fix_tr_1
  1720. .do_top:
  1721. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1722. lea r1, [r0+r3*2]
  1723. movq mm2, mm6
  1724. movq mm3, mm6
  1725. PALIGNR mm3, mm7, 7, mm0
  1726. PALIGNR mm6, mm7, 6, mm1
  1727. movq mm4, mm3
  1728. pavgb mm3, mm2
  1729. lea r2, [r1+r3*2]
  1730. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1731. movq [r0+r3*1], mm3
  1732. movq [r0+r3*2], mm0
  1733. movq mm5, mm0
  1734. movq mm6, mm3
  1735. movq mm1, mm7
  1736. movq mm2, mm1
  1737. psllq mm2, 8
  1738. movq mm3, mm1
  1739. psllq mm3, 16
  1740. lea r4, [r2+r3*2]
  1741. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1742. PALIGNR mm6, mm0, 7, mm2
  1743. movq [r1+r3*1], mm6
  1744. psllq mm0, 8
  1745. PALIGNR mm5, mm0, 7, mm1
  1746. movq [r1+r3*2], mm5
  1747. psllq mm0, 8
  1748. PALIGNR mm6, mm0, 7, mm2
  1749. movq [r2+r3*1], mm6
  1750. psllq mm0, 8
  1751. PALIGNR mm5, mm0, 7, mm1
  1752. movq [r2+r3*2], mm5
  1753. psllq mm0, 8
  1754. PALIGNR mm6, mm0, 7, mm2
  1755. movq [r4+r3*1], mm6
  1756. psllq mm0, 8
  1757. PALIGNR mm5, mm0, 7, mm1
  1758. movq [r4+r3*2], mm5
  1759. RET
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM 7
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq2dq xmm4, mm6
    pslldq xmm4, 8
    por xmm0, xmm4
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb xmm2, xmm0
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss xmm5, xmm6
    psrldq xmm2, 4
    movss xmm2, xmm4
    lea r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro
INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
;                                  int has_topright, int stride)
;-----------------------------------------------------------------------------
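; Hedged sketch of the rule applied here: the top and top-right samples,
; smoothed with lowpass(a,b,c) = (a + 2*b + c + 2) >> 2 (the top-right
; byte replicated via .fix_tr_2 when unavailable), become t'[0..15];
; then with i = x + (y >> 1):
;   pred(x,y) = (y & 1) ? (t'[i] + 2*t'[i+1] + t'[i+2] + 2) >> 2
;                       : (t'[i] + t'[i+1] + 1) >> 1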
%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea r1, [r0+r3*2]
    pslldq xmm3, 8
    por xmm4, xmm3
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb xmm3, xmm2
    lea r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro
INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, int stride)
;-----------------------------------------------------------------------------
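; Hedged sketch: the left column is lowpass-filtered into l'[0..7]
; (indices past 7 replicate l'[7]), then with z = x + 2*y:
;   pred(x,y) = (z & 1) ? (l'[z>>1] + 2*l'[(z>>1)+1] + l'[(z>>1)+2] + 2) >> 2
;                       : (l'[z>>1] + l'[(z>>1)+1] + 1) >> 1
; The pshufw/psllw/psrlw sequence below reverses the column into l0-first
; byte order so plain shifts can walk down it.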
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    lea r1, [r0+r3*2]
    pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq mm7, 56              ; l7 .. .. .. .. .. .. ..
    movq mm2, mm0
    psllw mm0, 8
    psrlw mm2, 8
    por mm2, mm0               ; l7 l6 l5 l4 l3 l2 l1 l0
    movq mm3, mm2
    movq mm4, mm2
    movq mm5, mm2
    psrlq mm2, 8
    psrlq mm3, 16
    lea r2, [r1+r3*2]
    por mm2, mm7               ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm7, mm7
    por mm3, mm7               ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq mm5, mm4
    punpcklbw mm4, mm1         ; p4 p3 p2 p1
    punpckhbw mm5, mm1         ; p8 p7 p6 p5
    movq mm6, mm5
    movq mm7, mm5
    movq mm0, mm5
    PALIGNR mm5, mm4, 2, mm1
    pshufw mm1, mm6, 11111001b
    PALIGNR mm6, mm4, 4, mm2
    pshufw mm2, mm7, 11111110b
    PALIGNR mm7, mm4, 6, mm3
    pshufw mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro
INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
;                                    int has_topright, int stride)
;-----------------------------------------------------------------------------
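; Hedged sketch: both edges plus the corner are lowpass-filtered, then the
; H.264 horizontal-down rule runs on them: with z = 2*y - x, even z
; interleaves 2-tap averages of adjacent filtered left samples with 3-tap
; lowpass() values, and once z < -1 the prediction crosses over to 3-tap
; values taken from the filtered top row.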
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    lea r1, [r0+r3*2]
    psllq mm7, 56
    movq mm2, mm5
    movq mm3, mm6
    movq mm4, mm2
    PALIGNR mm2, mm6, 7, mm5
    PALIGNR mm6, mm7, 7, mm0
    lea r2, [r1+r3*2]
    PALIGNR mm4, mm3, 1, mm7
    movq mm5, mm3
    pavgb mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq mm4, mm2
    movq mm1, mm2
    lea r4, [r2+r3*2]
    psrlq mm4, 16
    psrlq mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq mm7, mm3
    punpcklbw mm3, mm0
    punpckhbw mm7, mm0
    movq mm1, mm7
    movq mm0, mm7
    movq mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq mm2, mm6
    movq mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    pslldq xmm0, 8
    movq mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq xmm2, mm1
    pslldq xmm2, 15
    psrldq xmm2, 8
    por xmm0, xmm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm1, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm5, mm1
    pslldq xmm5, 8
    por xmm1, xmm5
INIT_XMM cpuname
    lea r2, [r4+r3*2]
    movdqa xmm2, xmm1
    movdqa xmm3, xmm1
    PALIGNR xmm1, xmm0, 7, xmm4
    PALIGNR xmm2, xmm0, 9, xmm5
    lea r1, [r2+r3*2]
    PALIGNR xmm3, xmm0, 8, xmm0
    movdqa xmm4, xmm1
    pavgb xmm4, xmm3
    lea r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps xmm0, xmm4
    movq [r0+r3*2], xmm4
    movq [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq [r0+r3*1], xmm4
    movq [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq [r1+r3*2], xmm4
    movq [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq [r1+r3*1], xmm4
    movq [r4+r3*1], xmm0
    RET
%endmacro
INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
;-------------------------------------------------------------------------------
; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-------------------------------------------------------------------------------
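; Hedged C equivalent (a sketch with illustrative names, not the template
; code): t0..t3 is the row above, l0..l3 the column to the left; psadbw
; against zero sums the four top bytes in one instruction.
;   int dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3;
;   for (int y = 0; y < 4; y++)
;       AV_WN32A(src + y*stride, 0x01010101u * dc);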
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    pxor mm7, mm7
    mov r4, r0
    sub r0, r2
    movd mm0, [r0]
    psadbw mm0, mm7
    movzx r1d, byte [r0+r2*1-1]
    movd r3d, mm0
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    lea r0, [r0+r2*2]
    add r3d, r1d
    movzx r1d, byte [r0+r2*1-1]
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    add r3d, r1d
    add r3d, 4
    shr r3d, 3
    imul r3d, 0x01010101
    mov [r4+r2*0], r3d
    mov [r0+r2*0], r3d
    mov [r0+r2*1], r3d
    mov [r0+r2*2], r3d
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                 int stride)
;-----------------------------------------------------------------------------
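; Hedged C equivalent of VP8 TrueMotion prediction (a sketch; clip stands
; in for av_clip_uint8):
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < 4; x++)
;           src[y*stride + x] = clip(top[x] + left[y] - topleft);
; The SIMD keeps top[x] as words and broadcasts left[y] - topleft, so the
; signed-to-unsigned saturation of packuswb provides the clip for free.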
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
    sub r0, r2
    pxor mm7, mm7
    movd mm0, [r0]
    punpcklbw mm0, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 2
.loop:
    movzx r1d, byte [r0+r2*1-1]
    movzx r3d, byte [r0+r2*2-1]
    sub r1d, r4d
    sub r3d, r4d
    movd mm2, r1d
    movd mm4, r3d
%if cpuflag(mmxext)
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%else
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%endif
    paddw mm2, mm0
    paddw mm4, mm0
    packuswb mm2, mm2
    packuswb mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea r0, [r0+r2*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub r0, r2
    movq mm6, [tm_shuf]
    pxor mm1, mm1
    movd mm0, [r0]
    punpcklbw mm0, mm1
    movd mm7, [r0-4]
    pshufb mm7, mm6
    lea r1, [r0+r2*2]
    movd mm2, [r0+r2*1-4]
    movd mm3, [r0+r2*2-4]
    movd mm4, [r1+r2*1-4]
    movd mm5, [r1+r2*2-4]
    pshufb mm2, mm6
    pshufb mm3, mm6
    pshufb mm4, mm6
    pshufb mm5, mm6
    psubw mm0, mm7
    paddw mm2, mm0
    paddw mm3, mm0
    paddw mm4, mm0
    paddw mm5, mm0
    packuswb mm2, mm2
    packuswb mm3, mm3
    packuswb mm4, mm4
    packuswb mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                       int stride)
;-----------------------------------------------------------------------------
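; Hedged sketch: one filtered row is computed and written to all four
; lines, with t[-1] the top-left sample and t[4] the first top-right
; sample:
;   row[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2   for x = 0..3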
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
    sub r0, r2
    movd m1, [r0-1]
    movd m0, [r0]
    mova m2, m0        ; t0 t1 t2 t3
    punpckldq m0, [r1] ; t0 t1 t2 t3 t4 t5 t6 t7
    lea r1, [r0+r2*2]
    psrlq m0, 8        ; t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                    int stride)
;-----------------------------------------------------------------------------
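; Hedged sketch (t[0..7] = top row then top-right, indices past 7 clamped
; to t[7]; the pxor/psrlq dance below builds the clamped "next" vector
; without a branch):
;   src[y*stride + x] = (t[x+y] + 2*t[x+y+1] + t[min(x+y+2, 7)] + 2) >> 2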
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
    sub r0, r2
    movq m1, [r0]
    punpckldq m1, [r1]
    movq m2, m1
    movq m3, m1
    psllq m1, 8
    pxor m2, m1
    psrlq m2, 8
    pxor m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea r1, [r0+r2*2]
    psrlq m0, 8
    movd [r0+r2*1], m0
    psrlq m0, 8
    movd [r0+r2*2], m0
    psrlq m0, 8
    movd [r1+r2*1], m0
    psrlq m0, 8
    movd [r1+r2*2], m0
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        int stride)
;------------------------------------------------------------------------------
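; Hedged sketch (t[0..7] = top row then top-right; even rows use 2-tap
; averages, odd rows the 3-tap lowpass, each row pair shifted one sample
; right of the previous one), with i = x + (y >> 1):
;   pred(x,y) = (y & 1) ? (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2
;                       : (t[i] + t[i+1] + 1) >> 1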
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
    sub r0, r2
    movq m1, [r0]
    punpckldq m1, [r1]
    movq m3, m1
    movq m2, m1
    psrlq m3, 8
    psrlq m2, 16
    movq m4, m3
    pavgb m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea r1, [r0+r2*2]
    movh [r0+r2*1], m4
    movh [r0+r2*2], m0
    psrlq m4, 8
    psrlq m0, 8
    movh [r1+r2*1], m4
    movh [r1+r2*2], m0
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        int stride)
;------------------------------------------------------------------------------
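; Hedged sketch (l[0..3] = left column, indices past 3 replicating l[3]),
; with z = x + 2*y:
;   pred(x,y) = (z & 1) ? (l[z>>1] + 2*l[(z>>1)+1] + l[(z>>1)+2] + 2) >> 2
;                       : (l[z>>1] + l[(z>>1)+1] + 1) >> 1
; The punpckhbw/pshufw pair below replicates l3 to get that clamping.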
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
    sub r0, r2
    lea r1, [r0+r2*2]
    movd m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq m1, m0
    punpckhbw m1, m1
    pshufw m1, m1, 0xFF
    punpckhdq m0, m1
    movq m2, m0
    movq m3, m0
    movq m7, m0
    psrlq m2, 16
    psrlq m3, 8
    pavgb m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw m7, m4
    movd [r0+r2*1], m7
    psrlq m7, 16
    movd [r0+r2*2], m7
    psrlq m7, 16
    movd [r1+r2*1], m7
    movd [r1+r2*2], m1
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
;                                          const uint8_t *topright, int stride)
;------------------------------------------------------------------------------
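; Hedged sketch (l[-1] = t[-1] = lt), with z = 2*y - x:
;   z >= 0, even: (l[y-(x>>1)-1] + l[y-(x>>1)] + 1) >> 1
;   z >= 1, odd : (l[y-(x>>1)-2] + 2*l[y-(x>>1)-1] + l[y-(x>>1)] + 2) >> 2
;   z == -1     : (l[0] + 2*lt + t[0] + 2) >> 2
;   z <  -1     : (t[x-2*y-3] + 2*t[x-2*y-2] + t[x-2*y-1] + 2) >> 2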
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
    sub r0, r2
    lea r1, [r0+r2*2]
    movh m0, [r0-4]           ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq m0, 8               ; t2 t1 t0 lt .. .. .. ..
    movd m1, [r1+r2*2-4]      ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd m2, [r0+r2*2-4]      ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq m0, m1
    movq m2, m1
    movq m5, m1
    psrlq m0, 16              ; .. .. t2 t1 t0 lt l0 l1
    psrlq m2, 8               ; .. t2 t1 t0 lt l0 l1 l2
    pavgb m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw m5, m3
    psrlq m3, 32
    PALIGNR m3, m5, 6, m4
    movh [r1+r2*2], m5
    psrlq m5, 16
    movh [r1+r2*1], m5
    psrlq m5, 16
    movh [r0+r2*2], m5
    movh [r0+r2*1], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
;                                         const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
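; Hedged sketch (t[-1] = l[-1] = lt), with z = 2*x - y:
;   z >= 0, even: (t[x-(y>>1)-1] + t[x-(y>>1)] + 1) >> 1
;   z >= 1, odd : (t[x-(y>>1)-2] + 2*t[x-(y>>1)-1] + t[x-(y>>1)] + 2) >> 2
;   z == -1     : (l[0] + 2*lt + t[0] + 2) >> 2
;   z <  -1     : (l[y-3] + 2*l[y-2] + l[y-1] + 2) >> 2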
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
    sub r0, r2
    lea r1, [r0+r2*2]
    movh m0, [r0]                  ; ........t3t2t1t0
    movq m5, m0
    PALIGNR m0, [r0-8], 7, m1      ; ......t3t2t1t0lt
    pavgb m5, m0
    PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
    movq m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
    movq m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq m1, m3
    psrlq m3, 16
    psllq m1, 48
    movh [r0+r2*1], m5
    movh [r0+r2*2], m3
    PALIGNR m5, m1, 7, m2
    psllq m1, 8
    movh [r1+r2*1], m5
    PALIGNR m3, m1, 7, m1
    movh [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                     int stride)
;-----------------------------------------------------------------------------
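; Hedged sketch (t[-1] = l[-1] = lt), with d = x - y:
;   d >  0: (t[d-2] + 2*t[d-1] + t[d] + 2) >> 2
;   d <  0: (l[-d-2] + 2*l[-d-1] + l[-d] + 2) >> 2
;   d == 0: (t[0] + 2*lt + l[0] + 2) >> 2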
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
    sub r0, r2
    lea r1, [r0+r2*2]
    movq m1, [r1-8]
    movq m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh m3, [r0]
    punpckhwd m1, m2
    PALIGNR m3, m1, 5, m1
    movq m1, m3
    PALIGNR m3, [r1+r2*1-8], 7, m4
    movq m2, m3
    PALIGNR m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh [r1+r2*2], m0
    psrlq m0, 8
    movh [r1+r2*1], m0
    psrlq m0, 8
    movh [r0+r2*2], m0
    psrlq m0, 8
    movh [r0+r2*1], m0
    RET