; output.asm
  1. ;******************************************************************************
  2. ;* x86-optimized vertical line scaling functions
  3. ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Kieran Kunhya <kieran@kunhya.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION_RODATA
  24. minshort: times 8 dw 0x8000
  25. yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
  26. yuv2yuvX_10_start: times 4 dd 0x10000
  27. yuv2yuvX_9_start: times 4 dd 0x20000
  28. yuv2yuvX_10_upper: times 8 dw 0x3ff
  29. yuv2yuvX_9_upper: times 8 dw 0x1ff
  30. pd_4: times 4 dd 4
  31. pd_4min0x40000:times 4 dd 4 - (0x40000)
  32. pw_16: times 8 dw 16
  33. pw_32: times 8 dw 32
  34. pw_512: times 8 dw 512
  35. pw_1024: times 8 dw 1024
  36. SECTION .text
  37. ;-----------------------------------------------------------------------------
  38. ; vertical line scaling
  39. ;
  40. ; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
  41. ; const uint8_t *dither, int offset)
  42. ; and
  43. ; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
  44. ; const int16_t **src, uint8_t *dst, int dstW,
  45. ; const uint8_t *dither, int offset)
  46. ;
  47. ; Scale one or $filterSize lines of source data to generate one line of output
  48. ; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
  49. ; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
  50. ; of 2. $offset is either 0 or 3. $dither holds 8 values.
  51. ;-----------------------------------------------------------------------------
  52. %macro yuv2planeX_mainloop 2
  53. .pixelloop_%2:
  54. %assign %%i 0
  55. ; the rep here is for the 8-bit output MMX case, where dither covers
  56. ; 8 pixels but we can only handle 2 pixels per register, and thus 4
  57. ; pixels per iteration. In order to not have to keep track of where
  58. ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
  59. %if %1 == 8
  60. %assign %%repcnt 16/mmsize
  61. %else
  62. %assign %%repcnt 1
  63. %endif
  64. %rep %%repcnt
  65. %if %1 == 8
  66. %if ARCH_X86_32
  67. mova m2, [rsp+mmsize*(0+%%i)]
  68. mova m1, [rsp+mmsize*(1+%%i)]
  69. %else ; x86-64
  70. mova m2, m8
  71. mova m1, m_dith
  72. %endif ; x86-32/64
  73. %else ; %1 == 9/10/16
  74. mova m1, [yuv2yuvX_%1_start]
  75. mova m2, m1
  76. %endif ; %1 == 8/9/10/16
  77. movsx cntr_reg, fltsizem
  78. .filterloop_%2_ %+ %%i:
  79. ; input pixels
  80. mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
  81. %if %1 == 16
  82. mova m3, [r6+r5*4]
  83. mova m5, [r6+r5*4+mmsize]
  84. %else ; %1 == 8/9/10
  85. mova m3, [r6+r5*2]
  86. %endif ; %1 == 8/9/10/16
  87. mov r6, [srcq+gprsize*cntr_reg-gprsize]
  88. %if %1 == 16
  89. mova m4, [r6+r5*4]
  90. mova m6, [r6+r5*4+mmsize]
  91. %else ; %1 == 8/9/10
  92. mova m4, [r6+r5*2]
  93. %endif ; %1 == 8/9/10/16
  94. ; coefficients
  95. movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
  96. %if %1 == 16
  97. pshuflw m7, m0, 0 ; coeff[0]
  98. pshuflw m0, m0, 0x55 ; coeff[1]
  99. pmovsxwd m7, m7 ; word -> dword
  100. pmovsxwd m0, m0 ; word -> dword
  101. pmulld m3, m7
  102. pmulld m5, m7
  103. pmulld m4, m0
  104. pmulld m6, m0
  105. paddd m2, m3
  106. paddd m1, m5
  107. paddd m2, m4
  108. paddd m1, m6
  109. %else ; %1 == 10/9/8
  110. punpcklwd m5, m3, m4
  111. punpckhwd m3, m4
  112. SPLATD m0
  113. pmaddwd m5, m0
  114. pmaddwd m3, m0
  115. paddd m2, m5
  116. paddd m1, m3
  117. %endif ; %1 == 8/9/10/16
  118. sub cntr_reg, 2
  119. jg .filterloop_%2_ %+ %%i
  120. %if %1 == 16
  121. psrad m2, 31 - %1
  122. psrad m1, 31 - %1
  123. %else ; %1 == 10/9/8
  124. psrad m2, 27 - %1
  125. psrad m1, 27 - %1
  126. %endif ; %1 == 8/9/10/16
  127. %if %1 == 8
  128. packssdw m2, m1
  129. packuswb m2, m2
  130. movh [dstq+r5*1], m2
  131. %else ; %1 == 9/10/16
  132. %if %1 == 16
  133. packssdw m2, m1
  134. paddw m2, [minshort]
  135. %else ; %1 == 9/10
  136. %if cpuflag(sse4)
  137. packusdw m2, m1
  138. %else ; mmxext/sse2
  139. packssdw m2, m1
  140. pmaxsw m2, m6
  141. %endif ; mmxext/sse2/sse4/avx
  142. pminsw m2, [yuv2yuvX_%1_upper]
  143. %endif ; %1 == 9/10/16
  144. mov%2 [dstq+r5*2], m2
  145. %endif ; %1 == 8/9/10/16
  146. add r5, mmsize/2
  147. sub wd, mmsize/2
  148. %assign %%i %%i+2
  149. %endrep
  150. jg .pixelloop_%2
  151. %endmacro
  152. %macro yuv2planeX_fn 3
  153. %if ARCH_X86_32
  154. %define cntr_reg fltsizeq
  155. %define movsx mov
  156. %else
  157. %define cntr_reg r7
  158. %define movsx movsxd
  159. %endif
  160. cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
  161. %if %1 == 8 || %1 == 9 || %1 == 10
  162. pxor m6, m6
  163. %endif ; %1 == 8/9/10
  164. %if %1 == 8
  165. %if ARCH_X86_32
  166. %assign pad 0x2c - (stack_offset & 15)
  167. SUB rsp, pad
  168. %define m_dith m7
  169. %else ; x86-64
  170. %define m_dith m9
  171. %endif ; x86-32
  172. ; create registers holding dither
  173. movq m_dith, [ditherq] ; dither
  174. test offsetd, offsetd
  175. jz .no_rot
  176. %if mmsize == 16
  177. punpcklqdq m_dith, m_dith
  178. %endif ; mmsize == 16
  179. PALIGNR m_dith, m_dith, 3, m0
  180. .no_rot:
  181. %if mmsize == 16
  182. punpcklbw m_dith, m6
  183. %if ARCH_X86_64
  184. punpcklwd m8, m_dith, m6
  185. pslld m8, 12
  186. %else ; x86-32
  187. punpcklwd m5, m_dith, m6
  188. pslld m5, 12
  189. %endif ; x86-32/64
  190. punpckhwd m_dith, m6
  191. pslld m_dith, 12
  192. %if ARCH_X86_32
  193. mova [rsp+ 0], m5
  194. mova [rsp+16], m_dith
  195. %endif
  196. %else ; mmsize == 8
  197. punpcklbw m5, m_dith, m6
  198. punpckhbw m_dith, m6
  199. punpcklwd m4, m5, m6
  200. punpckhwd m5, m6
  201. punpcklwd m3, m_dith, m6
  202. punpckhwd m_dith, m6
  203. pslld m4, 12
  204. pslld m5, 12
  205. pslld m3, 12
  206. pslld m_dith, 12
  207. mova [rsp+ 0], m4
  208. mova [rsp+ 8], m5
  209. mova [rsp+16], m3
  210. mova [rsp+24], m_dith
  211. %endif ; mmsize == 8/16
  212. %endif ; %1 == 8
  213. xor r5, r5
  214. %if mmsize == 8 || %1 == 8
  215. yuv2planeX_mainloop %1, a
  216. %else ; mmsize == 16
  217. test dstq, 15
  218. jnz .unaligned
  219. yuv2planeX_mainloop %1, a
  220. REP_RET
  221. .unaligned:
  222. yuv2planeX_mainloop %1, u
  223. %endif ; mmsize == 8/16
  224. %if %1 == 8
  225. %if ARCH_X86_32
  226. ADD rsp, pad
  227. RET
  228. %else ; x86-64
  229. REP_RET
  230. %endif ; x86-32/64
  231. %else ; %1 == 9/10/16
  232. REP_RET
  233. %endif ; %1 == 8/9/10/16
  234. %endmacro
  235. %if ARCH_X86_32
  236. INIT_MMX mmxext
  237. yuv2planeX_fn 8, 0, 7
  238. yuv2planeX_fn 9, 0, 5
  239. yuv2planeX_fn 10, 0, 5
  240. %endif
  241. INIT_XMM sse2
  242. yuv2planeX_fn 8, 10, 7
  243. yuv2planeX_fn 9, 7, 5
  244. yuv2planeX_fn 10, 7, 5
  245. INIT_XMM sse4
  246. yuv2planeX_fn 8, 10, 7
  247. yuv2planeX_fn 9, 7, 5
  248. yuv2planeX_fn 10, 7, 5
  249. yuv2planeX_fn 16, 8, 5
  250. %if HAVE_AVX_EXTERNAL
  251. INIT_XMM avx
  252. yuv2planeX_fn 8, 10, 7
  253. yuv2planeX_fn 9, 7, 5
  254. yuv2planeX_fn 10, 7, 5
  255. %endif
  256. ; %1=outout-bpc, %2=alignment (u/a)
  257. %macro yuv2plane1_mainloop 2
  258. .loop_%2:
  259. %if %1 == 8
  260. paddsw m0, m2, [srcq+wq*2+mmsize*0]
  261. paddsw m1, m3, [srcq+wq*2+mmsize*1]
  262. psraw m0, 7
  263. psraw m1, 7
  264. packuswb m0, m1
  265. mov%2 [dstq+wq], m0
  266. %elif %1 == 16
  267. paddd m0, m4, [srcq+wq*4+mmsize*0]
  268. paddd m1, m4, [srcq+wq*4+mmsize*1]
  269. paddd m2, m4, [srcq+wq*4+mmsize*2]
  270. paddd m3, m4, [srcq+wq*4+mmsize*3]
  271. psrad m0, 3
  272. psrad m1, 3
  273. psrad m2, 3
  274. psrad m3, 3
  275. %if cpuflag(sse4) ; avx/sse4
  276. packusdw m0, m1
  277. packusdw m2, m3
  278. %else ; mmx/sse2
  279. packssdw m0, m1
  280. packssdw m2, m3
  281. paddw m0, m5
  282. paddw m2, m5
  283. %endif ; mmx/sse2/sse4/avx
  284. mov%2 [dstq+wq*2+mmsize*0], m0
  285. mov%2 [dstq+wq*2+mmsize*1], m2
  286. %else ; %1 == 9/10
  287. paddsw m0, m2, [srcq+wq*2+mmsize*0]
  288. paddsw m1, m2, [srcq+wq*2+mmsize*1]
  289. psraw m0, 15 - %1
  290. psraw m1, 15 - %1
  291. pmaxsw m0, m4
  292. pmaxsw m1, m4
  293. pminsw m0, m3
  294. pminsw m1, m3
  295. mov%2 [dstq+wq*2+mmsize*0], m0
  296. mov%2 [dstq+wq*2+mmsize*1], m1
  297. %endif
  298. add wq, mmsize
  299. jl .loop_%2
  300. %endmacro
  301. %macro yuv2plane1_fn 3
  302. cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
  303. movsxdifnidn wq, wd
  304. add wq, mmsize - 1
  305. and wq, ~(mmsize - 1)
  306. %if %1 == 8
  307. add dstq, wq
  308. %else ; %1 != 8
  309. lea dstq, [dstq+wq*2]
  310. %endif ; %1 == 8
  311. %if %1 == 16
  312. lea srcq, [srcq+wq*4]
  313. %else ; %1 != 16
  314. lea srcq, [srcq+wq*2]
  315. %endif ; %1 == 16
  316. neg wq
  317. %if %1 == 8
  318. pxor m4, m4 ; zero
  319. ; create registers holding dither
  320. movq m3, [ditherq] ; dither
  321. test offsetd, offsetd
  322. jz .no_rot
  323. %if mmsize == 16
  324. punpcklqdq m3, m3
  325. %endif ; mmsize == 16
  326. PALIGNR m3, m3, 3, m2
  327. .no_rot:
  328. %if mmsize == 8
  329. mova m2, m3
  330. punpckhbw m3, m4 ; byte->word
  331. punpcklbw m2, m4 ; byte->word
  332. %else
  333. punpcklbw m3, m4
  334. mova m2, m3
  335. %endif
  336. %elif %1 == 9
  337. pxor m4, m4
  338. mova m3, [pw_512]
  339. mova m2, [pw_32]
  340. %elif %1 == 10
  341. pxor m4, m4
  342. mova m3, [pw_1024]
  343. mova m2, [pw_16]
  344. %else ; %1 == 16
  345. %if cpuflag(sse4) ; sse4/avx
  346. mova m4, [pd_4]
  347. %else ; mmx/sse2
  348. mova m4, [pd_4min0x40000]
  349. mova m5, [minshort]
  350. %endif ; mmx/sse2/sse4/avx
  351. %endif ; %1 == ..
  352. ; actual pixel scaling
  353. %if mmsize == 8
  354. yuv2plane1_mainloop %1, a
  355. %else ; mmsize == 16
  356. test dstq, 15
  357. jnz .unaligned
  358. yuv2plane1_mainloop %1, a
  359. REP_RET
  360. .unaligned:
  361. yuv2plane1_mainloop %1, u
  362. %endif ; mmsize == 8/16
  363. REP_RET
  364. %endmacro
  365. %if ARCH_X86_32
  366. INIT_MMX mmx
  367. yuv2plane1_fn 8, 0, 5
  368. yuv2plane1_fn 16, 0, 3
  369. INIT_MMX mmxext
  370. yuv2plane1_fn 9, 0, 3
  371. yuv2plane1_fn 10, 0, 3
  372. %endif
  373. INIT_XMM sse2
  374. yuv2plane1_fn 8, 5, 5
  375. yuv2plane1_fn 9, 5, 3
  376. yuv2plane1_fn 10, 5, 3
  377. yuv2plane1_fn 16, 6, 3
  378. INIT_XMM sse4
  379. yuv2plane1_fn 16, 5, 3
  380. %if HAVE_AVX_EXTERNAL
  381. INIT_XMM avx
  382. yuv2plane1_fn 8, 5, 5
  383. yuv2plane1_fn 9, 5, 3
  384. yuv2plane1_fn 10, 5, 3
  385. yuv2plane1_fn 16, 5, 3
  386. %endif