; rv40dsp.asm
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION_RODATA
  25. align 16
  26. pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
  27. sixtap_filter_hb_m: times 8 db 1, -5
  28. times 8 db 52, 20
  29. ; multiplied by 2 to have the same shift
  30. times 8 db 2, -10
  31. times 8 db 40, 40
  32. ; back to normal
  33. times 8 db 1, -5
  34. times 8 db 20, 52
  35. sixtap_filter_v_m: times 8 dw 1
  36. times 8 dw -5
  37. times 8 dw 52
  38. times 8 dw 20
  39. ; multiplied by 2 to have the same shift
  40. times 8 dw 2
  41. times 8 dw -10
  42. times 8 dw 40
  43. times 8 dw 40
  44. ; back to normal
  45. times 8 dw 1
  46. times 8 dw -5
  47. times 8 dw 20
  48. times 8 dw 52
  49. %ifdef PIC
  50. %define sixtap_filter_hw picregq
  51. %define sixtap_filter_hb picregq
  52. %define sixtap_filter_v picregq
  53. %define npicregs 1
  54. %else
  55. %define sixtap_filter_hw sixtap_filter_hw_m
  56. %define sixtap_filter_hb sixtap_filter_hb_m
  57. %define sixtap_filter_v sixtap_filter_v_m
  58. %define npicregs 0
  59. %endif
  60. filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  61. filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
  62. filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  63. cextern pw_32
  64. cextern pw_16
  65. cextern pw_512
  66. SECTION .text
  67. ;-----------------------------------------------------------------------------
  68. ; subpel MC functions:
  69. ;
  70. ; void ff_[put|rv40]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  71. ; uint8_t *src, int srcstride,
  72. ; int len, int m);
  73. ;----------------------------------------------------------------------
  74. %macro LOAD 2
  75. %if WIN64
  76. movsxd %1q, %1d
  77. %endif
  78. %ifdef PIC
  79. add %1q, picregq
  80. %else
  81. add %1q, %2
  82. %endif
  83. %endmacro
  84. %macro STORE 3
  85. %ifidn %3, avg
  86. movh %2, [dstq]
  87. %endif
  88. packuswb %1, %1
  89. %ifidn %3, avg
  90. PAVGB %1, %2
  91. %endif
  92. movh [dstq], %1
  93. %endmacro
  94. %macro FILTER_V 1
  95. cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
  96. %ifdef PIC
  97. lea picregq, [sixtap_filter_v_m]
  98. %endif
  99. pxor m7, m7
  100. LOAD my, sixtap_filter_v
  101. ; read 5 lines
  102. sub srcq, srcstrideq
  103. sub srcq, srcstrideq
  104. movh m0, [srcq]
  105. movh m1, [srcq+srcstrideq]
  106. movh m2, [srcq+srcstrideq*2]
  107. lea srcq, [srcq+srcstrideq*2]
  108. add srcq, srcstrideq
  109. movh m3, [srcq]
  110. movh m4, [srcq+srcstrideq]
  111. punpcklbw m0, m7
  112. punpcklbw m1, m7
  113. punpcklbw m2, m7
  114. punpcklbw m3, m7
  115. punpcklbw m4, m7
  116. %ifdef m8
  117. mova m8, [myq+ 0]
  118. mova m9, [myq+16]
  119. mova m10, [myq+32]
  120. mova m11, [myq+48]
  121. %define COEFF05 m8
  122. %define COEFF14 m9
  123. %define COEFF2 m10
  124. %define COEFF3 m11
  125. %else
  126. %define COEFF05 [myq+ 0]
  127. %define COEFF14 [myq+16]
  128. %define COEFF2 [myq+32]
  129. %define COEFF3 [myq+48]
  130. %endif
  131. .nextrow:
  132. mova m6, m1
  133. movh m5, [srcq+2*srcstrideq] ; read new row
  134. paddw m6, m4
  135. punpcklbw m5, m7
  136. pmullw m6, COEFF14
  137. paddw m0, m5
  138. pmullw m0, COEFF05
  139. paddw m6, m0
  140. mova m0, m1
  141. paddw m6, [pw_32]
  142. mova m1, m2
  143. pmullw m2, COEFF2
  144. paddw m6, m2
  145. mova m2, m3
  146. pmullw m3, COEFF3
  147. paddw m6, m3
  148. ; round/clip/store
  149. mova m3, m4
  150. psraw m6, 6
  151. mova m4, m5
  152. STORE m6, m5, %1
  153. ; go to next line
  154. add dstq, dststrideq
  155. add srcq, srcstrideq
  156. dec heightd ; next row
  157. jg .nextrow
  158. REP_RET
  159. %endmacro
  160. %macro FILTER_H 1
  161. cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
  162. %ifdef PIC
  163. lea picregq, [sixtap_filter_v_m]
  164. %endif
  165. pxor m7, m7
  166. LOAD mx, sixtap_filter_v
  167. mova m6, [pw_32]
  168. %ifdef m8
  169. mova m8, [mxq+ 0]
  170. mova m9, [mxq+16]
  171. mova m10, [mxq+32]
  172. mova m11, [mxq+48]
  173. %define COEFF05 m8
  174. %define COEFF14 m9
  175. %define COEFF2 m10
  176. %define COEFF3 m11
  177. %else
  178. %define COEFF05 [mxq+ 0]
  179. %define COEFF14 [mxq+16]
  180. %define COEFF2 [mxq+32]
  181. %define COEFF3 [mxq+48]
  182. %endif
  183. .nextrow:
  184. movq m0, [srcq-2]
  185. movq m5, [srcq+3]
  186. movq m1, [srcq-1]
  187. movq m4, [srcq+2]
  188. punpcklbw m0, m7
  189. punpcklbw m5, m7
  190. punpcklbw m1, m7
  191. punpcklbw m4, m7
  192. movq m2, [srcq-0]
  193. movq m3, [srcq+1]
  194. paddw m0, m5
  195. paddw m1, m4
  196. punpcklbw m2, m7
  197. punpcklbw m3, m7
  198. pmullw m0, COEFF05
  199. pmullw m1, COEFF14
  200. pmullw m2, COEFF2
  201. pmullw m3, COEFF3
  202. paddw m0, m6
  203. paddw m1, m2
  204. paddw m0, m3
  205. paddw m0, m1
  206. psraw m0, 6
  207. STORE m0, m1, %1
  208. ; go to next line
  209. add dstq, dststrideq
  210. add srcq, srcstrideq
  211. dec heightd ; next row
  212. jg .nextrow
  213. REP_RET
  214. %endmacro
  215. %if ARCH_X86_32
  216. INIT_MMX mmx
  217. FILTER_V put
  218. FILTER_H put
  219. INIT_MMX mmxext
  220. FILTER_V avg
  221. FILTER_H avg
  222. INIT_MMX 3dnow
  223. FILTER_V avg
  224. FILTER_H avg
  225. %endif
  226. INIT_XMM sse2
  227. FILTER_H put
  228. FILTER_H avg
  229. FILTER_V put
  230. FILTER_V avg
  231. %macro FILTER_SSSE3 1
  232. cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
  233. %ifdef PIC
  234. lea picregq, [sixtap_filter_hb_m]
  235. %endif
  236. ; read 5 lines
  237. sub srcq, srcstrideq
  238. LOAD my, sixtap_filter_hb
  239. sub srcq, srcstrideq
  240. movh m0, [srcq]
  241. movh m1, [srcq+srcstrideq]
  242. movh m2, [srcq+srcstrideq*2]
  243. lea srcq, [srcq+srcstrideq*2]
  244. add srcq, srcstrideq
  245. mova m5, [myq]
  246. movh m3, [srcq]
  247. movh m4, [srcq+srcstrideq]
  248. lea srcq, [srcq+2*srcstrideq]
  249. .nextrow:
  250. mova m6, m2
  251. punpcklbw m0, m1
  252. punpcklbw m6, m3
  253. pmaddubsw m0, m5
  254. pmaddubsw m6, [myq+16]
  255. movh m7, [srcq] ; read new row
  256. paddw m6, m0
  257. mova m0, m1
  258. mova m1, m2
  259. mova m2, m3
  260. mova m3, m4
  261. mova m4, m7
  262. punpcklbw m7, m3
  263. pmaddubsw m7, m5
  264. paddw m6, m7
  265. pmulhrsw m6, [pw_512]
  266. STORE m6, m7, %1
  267. ; go to next line
  268. add dstq, dststrideq
  269. add srcq, srcstrideq
  270. dec heightd ; next row
  271. jg .nextrow
  272. REP_RET
  273. cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
  274. %ifdef PIC
  275. lea picregq, [sixtap_filter_hb_m]
  276. %endif
  277. mova m3, [filter_h6_shuf2]
  278. mova m4, [filter_h6_shuf3]
  279. LOAD mx, sixtap_filter_hb
  280. mova m5, [mxq] ; set up 6tap filter in bytes
  281. mova m6, [mxq+16]
  282. mova m7, [filter_h6_shuf1]
  283. .nextrow:
  284. movu m0, [srcq-2]
  285. mova m1, m0
  286. mova m2, m0
  287. pshufb m0, m7
  288. pshufb m1, m3
  289. pshufb m2, m4
  290. pmaddubsw m0, m5
  291. pmaddubsw m1, m6
  292. pmaddubsw m2, m5
  293. paddw m0, m1
  294. paddw m0, m2
  295. pmulhrsw m0, [pw_512]
  296. STORE m0, m1, %1
  297. ; go to next line
  298. add dstq, dststrideq
  299. add srcq, srcstrideq
  300. dec heightd ; next row
  301. jg .nextrow
  302. REP_RET
  303. %endmacro
  304. INIT_XMM ssse3
  305. FILTER_SSSE3 put
  306. FILTER_SSSE3 avg
  307. ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
  308. %macro RV40_WCORE 4-5
  309. movh m4, [%3 + r6 + 0]
  310. movh m5, [%4 + r6 + 0]
  311. %if %0 == 4
  312. %define OFFSET r6 + mmsize / 2
  313. %else
  314. ; 8x8 block and sse2, stride was provided
  315. %define OFFSET r6
  316. add r6, r5
  317. %endif
  318. movh m6, [%3 + OFFSET]
  319. movh m7, [%4 + OFFSET]
  320. %if %1 == 0
  321. ; 14bits weights
  322. punpcklbw m4, m0
  323. punpcklbw m5, m0
  324. punpcklbw m6, m0
  325. punpcklbw m7, m0
  326. psllw m4, 7
  327. psllw m5, 7
  328. psllw m6, 7
  329. psllw m7, 7
  330. pmulhw m4, m3
  331. pmulhw m5, m2
  332. pmulhw m6, m3
  333. pmulhw m7, m2
  334. paddw m4, m5
  335. paddw m6, m7
  336. %else
  337. ; 5bits weights
  338. %if cpuflag(ssse3)
  339. punpcklbw m4, m5
  340. punpcklbw m6, m7
  341. pmaddubsw m4, m3
  342. pmaddubsw m6, m3
  343. %else
  344. punpcklbw m4, m0
  345. punpcklbw m5, m0
  346. punpcklbw m6, m0
  347. punpcklbw m7, m0
  348. pmullw m4, m3
  349. pmullw m5, m2
  350. pmullw m6, m3
  351. pmullw m7, m2
  352. paddw m4, m5
  353. paddw m6, m7
  354. %endif
  355. %endif
  356. ; bias and shift down
  357. %if cpuflag(ssse3)
  358. pmulhrsw m4, m1
  359. pmulhrsw m6, m1
  360. %else
  361. paddw m4, m1
  362. paddw m6, m1
  363. psrlw m4, 5
  364. psrlw m6, 5
  365. %endif
  366. packuswb m4, m6
  367. %if %0 == 5
  368. ; Only called for 8x8 blocks and sse2
  369. sub r6, r5
  370. movh [%2 + r6], m4
  371. add r6, r5
  372. movhps [%2 + r6], m4
  373. %else
  374. mova [%2 + r6], m4
  375. %endif
  376. %endmacro
  377. %macro MAIN_LOOP 2
  378. %if mmsize == 8
  379. RV40_WCORE %2, r0, r1, r2
  380. %if %1 == 16
  381. RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
  382. %endif
  383. ; Prepare for next loop
  384. add r6, r5
  385. %else
  386. %ifidn %1, 8
  387. RV40_WCORE %2, r0, r1, r2, r5
  388. ; Prepare 2 next lines
  389. add r6, r5
  390. %else
  391. RV40_WCORE %2, r0, r1, r2
  392. ; Prepare single next line
  393. add r6, r5
  394. %endif
  395. %endif
  396. %endmacro
  397. ; void ff_rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
  398. ; %1=size %2=num of xmm regs
  399. ; The weights are FP0.14 notation of fractions depending on pts.
  400. ; For timebases without rounding error (i.e. PAL), the fractions
  401. ; can be simplified, and several operations can be avoided.
  402. ; Therefore, we check here whether they are multiples of 2^9 for
  403. ; those simplifications to occur.
  404. %macro RV40_WEIGHT 3
  405. cglobal rv40_weight_func_%1_%2, 6, 7, 8
  406. %if cpuflag(ssse3)
  407. mova m1, [pw_1024]
  408. %else
  409. mova m1, [pw_16]
  410. %endif
  411. pxor m0, m0
  412. ; Set loop counter and increments
  413. mov r6, r5
  414. shl r6, %3
  415. add r0, r6
  416. add r1, r6
  417. add r2, r6
  418. neg r6
  419. movd m2, r3d
  420. movd m3, r4d
  421. %ifidn %1,rnd
  422. %define RND 0
  423. SPLATW m2, m2
  424. %else
  425. %define RND 1
  426. %if cpuflag(ssse3)
  427. punpcklbw m3, m2
  428. %else
  429. SPLATW m2, m2
  430. %endif
  431. %endif
  432. SPLATW m3, m3
  433. .loop:
  434. MAIN_LOOP %2, RND
  435. jnz .loop
  436. REP_RET
  437. %endmacro
  438. INIT_MMX mmxext
  439. RV40_WEIGHT rnd, 8, 3
  440. RV40_WEIGHT rnd, 16, 4
  441. RV40_WEIGHT nornd, 8, 3
  442. RV40_WEIGHT nornd, 16, 4
  443. INIT_XMM sse2
  444. RV40_WEIGHT rnd, 8, 3
  445. RV40_WEIGHT rnd, 16, 4
  446. RV40_WEIGHT nornd, 8, 3
  447. RV40_WEIGHT nornd, 16, 4
  448. INIT_XMM ssse3
  449. RV40_WEIGHT rnd, 8, 3
  450. RV40_WEIGHT rnd, 16, 4
  451. RV40_WEIGHT nornd, 8, 3
  452. RV40_WEIGHT nornd, 16, 4