
;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1, 1, 2, 3, 4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_5
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
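; Reference sketch in C terms: vertical prediction fills the block by copying
; the row of 16 pixels directly above it into each of the 16 rows, roughly
;   for (y = 0; y < 16; y++) memcpy(src + y*stride, src - stride, 16);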
cglobal pred16x16_vertical_mmx, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
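; Reference sketch in C terms: horizontal prediction fills each row with its
; left neighbour, roughly
;   for (y = 0; y < 16; y++) memset(src + y*stride, src[y*stride - 1], 16);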
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov r2, 8
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
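; Reference sketch: the block is filled with the rounded average of the 16
; pixels above and the 16 pixels to the left,
;   dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5
; which is what the psadbw/movzx accumulation and the +16, >>5 below compute.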
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%ifidn %1, mmxext
    movd m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse2
    movd m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor m1, m1
    movd m0, r2d
    pshufb m0, m1
%endif
%if mmsize==8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
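; Reference sketch of VP8 "TrueMotion" prediction, which the loop below applies
; row by row:  pred[y][x] = clip_uint8(top[x] + left[y] - topleft).
; The pred8x8/pred4x4 _tm_vp8 versions further down follow the same formula.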
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
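; Reference sketch: plane (planar) prediction.  A horizontal gradient H is
; derived from the top row, a vertical gradient V from the left column, and a
; base value a from the corner pixels; the block is then filled with roughly
;   pred[y][x] = clip_uint8((a + x*H + y*V) >> 5)
; (the +16 rounding term is folded into a).  The h264/rv40/svq3 variants differ
; only in how the raw H and V sums are scaled and rounded.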
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%else ; ssse3
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
%ifidn %3, h264
    pmullw m0, [pw_5]
    paddw m0, [pw_32]
    psraw m0, 6
%elifidn %3, rv40
    pmullw m0, [pw_5]
    psraw m0, 6
%elifidn %3, svq3
    movd r3d, m0
    movsx r3, r3w
    test r3, r3
    lea r4, [r3+3]
    cmovs r3, r4
    sar r3, 2 ; H/4
    lea r3, [r3*5] ; 5*(H/4)
    test r3, r3
    lea r4, [r3+15]
    cmovs r3, r4
    sar r3, 4 ; (5*(H/4))/16
    movd m0, r3d
%endif
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%ifdef ARCH_X86_64
    lea r6, [r10+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%ifdef ARCH_X86_64
    movzx r10, byte [r3 +r2 ]
    sub r10, r4
    sub r5, r10
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
%ifidn %3, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %3, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %3, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx, 0, h264
H264_PRED16x16_PLANE mmx, 0, rv40
H264_PRED16x16_PLANE mmx, 0, svq3
H264_PRED16x16_PLANE mmx2, 0, h264
H264_PRED16x16_PLANE mmx2, 0, rv40
H264_PRED16x16_PLANE mmx2, 0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2, 8, h264
H264_PRED16x16_PLANE sse2, 8, rv40
H264_PRED16x16_PLANE sse2, 8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
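; Reference sketch: 8x8 chroma plane prediction, same scheme as the 16x16
; version above, except the gradient sums are scaled as (17*H + 16) >> 5 and
; (17*V + 16) >> 5 (hence the pw_17/pw_16 constants and lea-based *17 below).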
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%else ; ssse3
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifnidn %1, ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%endif ; !ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
    pmullw m0, [pw_17]
    paddw m0, [pw_16]
    psraw m0, 5
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
    sub r5, r10
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx, 0
H264_PRED8x8_PLANE mmx2, 0
INIT_XMM
H264_PRED8x8_PLANE sse2, 8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
cglobal pred8x8_vertical_mmx, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov r2, 4
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
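; Reference sketch: the 8x8 block is filled with the rounded average of the 8
; pixels above and the 8 pixels to the left,
;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4.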
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
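; Reference sketch: dc = (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3, splatted
; to all 16 pixels of the 4x4 block via the 0x01010101 multiply below.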
cglobal pred4x4_dc_mmxext, 3,5
    pxor mm7, mm7
    mov r4, r0
    sub r0, r2
    movd mm0, [r0]
    psadbw mm0, mm7
    movzx r1d, byte [r0+r2*1-1]
    movd r3d, mm0
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    lea r0, [r0+r2*2]
    add r3d, r1d
    movzx r1d, byte [r0+r2*1-1]
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    add r3d, r1d
    add r3d, 4
    shr r3d, 3
    imul r3d, 0x01010101
    mov [r4+r2*0], r3d
    mov [r0+r2*0], r3d
    mov [r0+r2*1], r3d
    mov [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub r0, r2
    pxor mm7, mm7
    movd mm0, [r0]
    punpcklbw mm0, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 2
.loop:
    movzx r1d, byte [r0+r2*1-1]
    movzx r3d, byte [r0+r2*2-1]
    sub r1d, r4d
    sub r3d, r4d
    movd mm2, r1d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    paddw mm2, mm0
    paddw mm4, mm0
    packuswb mm2, mm2
    packuswb mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea r0, [r0+r2*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub r0, r2
    movq mm6, [tm_shuf]
    pxor mm1, mm1
    movd mm0, [r0]
    punpcklbw mm0, mm1
    movd mm7, [r0-4]
    pshufb mm7, mm6
    lea r1, [r0+r2*2]
    movd mm2, [r0+r2*1-4]
    movd mm3, [r0+r2*2-4]
    movd mm4, [r1+r2*1-4]
    movd mm5, [r1+r2*2-4]
    pshufb mm2, mm6
    pshufb mm3, mm6
    pshufb mm4, mm6
    pshufb mm5, mm6
    psubw mm2, mm7
    psubw mm3, mm7
    psubw mm4, mm7
    psubw mm5, mm7
    paddw mm2, mm0
    paddw mm3, mm0
    paddw mm4, mm0
    paddw mm5, mm0
    packuswb mm2, mm2
    packuswb mm3, mm3
    packuswb mm4, mm4
    packuswb mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
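; The pavgb-based sequence below avoids widening to words: a plain
; avg(src, avg(left, right)) would round up twice, so the pxor/pand/psubusb
; steps subtract the carry bit of (left+right) from avg(left, right) first,
; making the final pavgb equal to (left + 2*src + right + 2) >> 2 exactly.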
%macro PRED4x4_LOWPASS 5
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    mova %1, %4
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
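; Reference sketch: all four rows are filled with the lowpass-filtered top row,
;   t'[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2  for x = 0..3,
; where t[-1] is the top-left pixel and t[4] is the first byte of *topright.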
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub r0, r2
    movd m1, [r0-1]
    movd m0, [r0]
    mova m2, m0 ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea r1, [r0+r2*2]
    psrlq m0, 8 ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET