;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;* (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000: times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_255: times 8 dd 255
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024

pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
pd_yuv2gbrp16_start: times 8 dd -0x40000000
pd_yuv2gbrp_y_start: times 8 dd (1 << 9)
pd_yuv2gbrp_uv_start: times 8 dd ((1 << 9) - (128 << 19))
pd_yuv2gbrp_a_start: times 8 dd (1 << 18)
pd_yuv2gbrp16_offset: times 8 dd 0x10000 ;(1 << 16)
pd_yuv2gbrp16_round13: times 8 dd 0xE0002000 ;(1 << 13) - (1 << 29)
pd_yuv2gbrp16_a_offset: times 8 dd 0x20002000
pd_yuv2gbrp16_upper30: times 8 dd 0x3FFFFFFF ;(1<<30) - 1
pd_yuv2gbrp16_upper27: times 8 dd 0x07FFFFFF ;(1<<27) - 1
pd_yuv2gbrp16_upper16: times 8 dd 0x0000FFFF ;(1<<16) - 1
pd_yuv2gbrp16_upperC: times 8 dd 0xC0000000
pd_yuv2gbrp_debias: times 8 dd 0x00008000 ;(1 << 29 - 14)

pb_pack_shuffle8: db 0, 4, 8, 12, \
                     -1, -1, -1, -1, \
                     -1, -1, -1, -1, \
                     -1, -1, -1, -1, \
                     -1, -1, -1, -1, \
                     0, 4, 8, 12, \
                     -1, -1, -1, -1, \
                     -1, -1, -1, -1
pb_pack_shuffle16le: db 0, 1, 4, 5, \
                        8, 9, 12, 13, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        0, 1, 4, 5, \
                        8, 9, 12, 13
pb_pack_shuffle16be: db 1, 0, 5, 4, \
                        9, 8, 13, 12, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        -1, -1, -1, -1, \
                        1, 0, 5, 4, \
                        9, 8, 13, 12
pb_shuffle32be: db 3, 2, 1, 0, \
                   7, 6, 5, 4, \
                   11, 10, 9, 8, \
                   15, 14, 13, 12, \
                   3, 2, 1, 0, \
                   7, 6, 5, 4, \
                   11, 10, 9, 8, \
                   15, 14, 13, 12
yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4, 0, 12, 8, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
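
; As a rough scalar sketch of the 8-bit yuv2planeX computation (illustrative
; only, assuming the usual swscale rounding and the av_clip_uint8() helper):
;
;     for (i = 0; i < dstW; i++) {
;         int val = dither[(i + offset) & 7] << 12;
;         for (j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];
;         dst[i] = av_clip_uint8(val >> 19);
;     }
;
; The vector loop below keeps two dword accumulators (m2/m1), multiplies two
; source lines per iteration with pmaddwd (pmulld for the 16-bit case), and
; packs/clips the result at the end.
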
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova m2, [rsp+mmsize*(0+%%i)]
    mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova m2, m8
    mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova m1, [yuv2yuvX_%1_start]
    mova m2, m1
%endif ; %1 == 8/9/10/16
    movsx cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova m3, [r6+r5*4]
    mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova m4, [r6+r5*4]
    mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw m7, m0, 0 ; coeff[0]
    pshuflw m0, m0, 0x55 ; coeff[1]
    pmovsxwd m7, m7 ; word -> dword
    pmovsxwd m0, m0 ; word -> dword
    pmulld m3, m7
    pmulld m5, m7
    pmulld m4, m0
    pmulld m6, m0
    paddd m2, m3
    paddd m1, m5
    paddd m2, m4
    paddd m1, m6
%else ; %1 == 10/9/8
    punpcklwd m5, m3, m4
    punpckhwd m3, m4
    SPLATD m0
    pmaddwd m5, m0
    pmaddwd m3, m0
    paddd m2, m5
    paddd m1, m3
%endif ; %1 == 8/9/10/16

    sub cntr_reg, 2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad m2, 31 - %1
    psrad m1, 31 - %1
%else ; %1 == 10/9/8
    psrad m2, 27 - %1
    psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw m2, m1
    packuswb m2, m2
    movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw m2, m1
    paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw m2, m1
%else ; mmxext/sse2
    packssdw m2, m1
    pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
    pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add r5, mmsize/2
    sub wd, mmsize/2
%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor m6, m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq m_dith, [ditherq] ; dither
    test offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
    punpcklbw m_dith, m6
%if ARCH_X86_64
    punpcklwd m8, m_dith, m6
    pslld m8, 12
%else ; x86-32
    punpcklwd m5, m_dith, m6
    pslld m5, 12
%endif ; x86-32/64
    punpckhwd m_dith, m6
    pslld m_dith, 12
%if ARCH_X86_32
    mova [rsp+ 0], m5
    mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw m5, m_dith, m6
    punpckhbw m_dith, m6
    punpcklwd m4, m5, m6
    punpckhwd m5, m6
    punpcklwd m3, m_dith, m6
    punpckhwd m_dith, m6
    pslld m4, 12
    pslld m5, 12
    pslld m3, 12
    pslld m_dith, 12
    mova [rsp+ 0], m4
    mova [rsp+ 8], m5
    mova [rsp+16], m3
    mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor r5, r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
%endif

INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5

INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
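
; A rough scalar sketch of the 8-bit yuv2plane1 path below (illustrative only;
; av_clip_uint8() is the usual libavutil clip helper):
;
;     for (i = 0; i < dstW; i++)
;         dst[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);
;
; The 9/10-bit variants shift by (15 - output_size) and clamp with pmaxsw/
; pminsw; the 16-bit variant adds a rounding bias and shifts by 3.
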
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw m0, m2, [srcq+wq*2+mmsize*0]
    paddsw m1, m3, [srcq+wq*2+mmsize*1]
    psraw m0, 7
    psraw m1, 7
    packuswb m0, m1
    mov%2 [dstq+wq], m0
%elif %1 == 16
    paddd m0, m4, [srcq+wq*4+mmsize*0]
    paddd m1, m4, [srcq+wq*4+mmsize*1]
    paddd m2, m4, [srcq+wq*4+mmsize*2]
    paddd m3, m4, [srcq+wq*4+mmsize*3]
    psrad m0, 3
    psrad m1, 3
    psrad m2, 3
    psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw m0, m1
    packusdw m2, m3
%else ; mmx/sse2
    packssdw m0, m1
    packssdw m2, m3
    paddw m0, m5
    paddw m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw m0, m2, [srcq+wq*2+mmsize*0]
    paddsw m1, m2, [srcq+wq*2+mmsize*1]
    psraw m0, 15 - %1
    psraw m1, 15 - %1
    pmaxsw m0, m4
    pmaxsw m1, m4
    pminsw m0, m3
    pminsw m1, m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn wq, wd
    add wq, mmsize - 1
    and wq, ~(mmsize - 1)
%if %1 == 8
    add dstq, wq
%else ; %1 != 8
    lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea srcq, [srcq+wq*4]
%else ; %1 != 16
    lea srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg wq

%if %1 == 8
    pxor m4, m4 ; zero

    ; create registers holding dither
    movq m3, [ditherq] ; dither
    test offsetd, offsetd
    jz .no_rot
    punpcklqdq m3, m3
    PALIGNR m3, m3, 3, m2
.no_rot:
    punpcklbw m3, m4
    mova m2, m3
%elif %1 == 9
    pxor m4, m4
    mova m3, [pw_512]
    mova m2, [pw_32]
%elif %1 == 10
    pxor m4, m4
    mova m3, [pw_1024]
    mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova m4, [pd_4]
%else ; sse2
    mova m4, [pd_4min0x40000]
    mova m5, [minshort]
%endif ; sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
    test dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
    REP_RET
%endmacro

INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------
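
; A rough scalar model of what these kernels compute (illustrative only; the
; vector code below applies the same 8-byte dither table in a lane-friendly
; order, and the nv21 variant simply swaps the two byte stores):
;
;     for (i = 0; i < dstWidth; i++) {
;         int du = dither[i & 7] << 12;
;         int dv = dither[(i + 3) & 7] << 12;
;         for (j = 0; j < filterSize; j++) {
;             du += u[j][i] * filter[j];
;             dv += v[j][i] * filter[j];
;         }
;         dst[2 * i]     = av_clip_uint8(du >> 19);
;         dst[2 * i + 1] = av_clip_uint8(dv >> 19);
;     }
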
%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth
    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q
    ror tmp1q, 24
    movq xm1, tmp1q

    pmovzxbd m0, xm0
    pslld m0, m0, 12 ; ditherLo
    pmovzxbd m1, xm1
    pslld m1, m1, 12 ; ditherHi

    pxor m9, m9 ; uint8_min dwords
    mova m10, [pd_255] ; uint8_max dwords
    mova m11, [%1_shuffle_mask] ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask] ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor r8q, r8q

nv12_outer_%1:
    mova m2, m0 ; resultLo
    mova m3, m1 ; resultHi
    xor r9q, r9q

nv12_inner_%1:
    movsx r10d, word [filterq + (2 * r9q)]
    movd xm4, r10d
    vpbroadcastd m4, xm4 ; filter

    mov tmp1q, [uq + (gprsize * r9q)]
    mova xm7, oword [tmp1q + 2 * r8q]

    mov tmp2q, [vq + (gprsize * r9q)]
    mova xm8, oword [tmp2q + 2 * r8q]

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5 ; multiplicandsLo
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6 ; multiplicandsHi

    pmulld m7, m5, m4 ; mulResultLo
    pmulld m8, m6, m4 ; mulResultHi
    paddd m2, m2, m7 ; resultLo += mulResultLo
    paddd m3, m3, m8 ; resultHi += mulResultHi

    inc r9d
    cmp r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ;     m2: u1 0 0 0 v1 0 0 0 [...]
    ;     m3: u5 0 0 0 v5 0 0 0 [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX2 doesn't have cross-lane shuffling, so we'll end up with:
    ;     m2: u1 v1 u2 v2 0 0 0 0 0 0 0 0 u3 v3 u4 v4
    ;     m3: u5 v5 u6 v6 0 0 0 0 0 0 0 0 u7 v7 u8 v8
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add r8d, 8
    add dstq, 16

    cmp r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; planar gbr yuv2anyX functions
; void ff_yuv2<gbr_format>_full_X_<opt>(SwsContext *c, const int16_t *lumFilter,
;                                       const int16_t **lumSrcx, int lumFilterSize,
;                                       const int16_t *chrFilter, const int16_t **chrUSrcx,
;                                       const int16_t **chrVSrcx, int chrFilterSize,
;                                       const int16_t **alpSrcx, uint8_t **dest,
;                                       int dstW, int y)
;-----------------------------------------------------------------------------
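
; Roughly, for each pixel these kernels evaluate (a sketch of the integer path
; for depth < 16, following the code below; SH = 22 + 8 - depth):
;
;     Y = ((1 << 9) + sum_j lumFilter[j] * lumSrc[j][i]) >> 10;
;     U = ((1 << 9) - (128 << 19) + sum_j chrFilter[j] * chrUSrc[j][i]) >> 10;
;     V = ((1 << 9) - (128 << 19) + sum_j chrFilter[j] * chrVSrc[j][i]) >> 10;
;     Y = (Y - yuv2rgb_y_offset) * yuv2rgb_y_coeff + (1 << (SH - 1));
;     R = Y + V * yuv2rgb_v2r_coeff;
;     G = Y + V * yuv2rgb_v2g_coeff + U * yuv2rgb_u2g_coeff;
;     B = Y + U * yuv2rgb_u2b_coeff;
;     out = av_clip_uintp2(R, 30) >> SH;      // likewise for G and B
;
; The >= 16-bit and float variants use different bias/round constants and a
; final 1/65535 scale for float, as set up in the macro below.
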
%if ARCH_X86_64
struc SwsContext
    .padding: resb 40292 ; offsetof(SwsContext, yuv2rgb_y_offset)
    .yuv2rgb_y_offset: resd 1
    .yuv2rgb_y_coeff: resd 1
    .yuv2rgb_v2r_coeff: resd 1
    .yuv2rgb_v2g_coeff: resd 1
    .yuv2rgb_u2g_coeff: resd 1
    .yuv2rgb_u2b_coeff: resd 1
endstruc

%define R m0
%define G m1
%define B m2
%define A m3

%define Y m4
%define U m5
%define V m6

; Clip a signed integer to an unsigned power of two range.
; av_clip_uintp2
; 1 - dest
; 2 - bit position to clip at
%macro CLIPP2 2
    ; (~a) >> 31 & ((1<<p) - 1);
    pcmpeqb m4, m4
    pxor m4, %1
    psrad m4, 31
    movu m5, [pd_yuv2gbrp16_upper%2]
    pand m4, m5

    ; (a & ~((1<<p) - 1)) == 0
    pandn m5, %1
    pxor m6, m6
    pcmpeqd m5, m6

%if cpuflag(avx2)
    vpblendvb %1, m4, %1, m5
%else
    pxor %1, m4
    pand %1, m5
    pxor %1, m4
%endif
%endmacro
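
; For reference, the scalar av_clip_uintp2() pattern the macro above
; vectorizes (sketch only):
;
;     if (a & ~((1 << p) - 1))              // any bit outside the p-bit range?
;         a = (~a) >> 31 & ((1 << p) - 1);  // 0 if negative, max if too large
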
; 1 - dest
; 2 - source
%macro LOAD16 2
%if cpuflag(avx2)
    movu xm%1, %2
    vpmovsxwd m%1, xm%1
%elif cpuflag(sse4)
    movsd m%1, %2
    pmovsxwd m%1, m%1
%else
    movsd m%1, %2
    punpcklwd m%1, m%1
    psrad m%1, 16 ; sign extend
%endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
%macro LOAD_PIXELS 3
    mov ptrq, [%2 + jq*8]
%if %3 >= 16
    movu m%1, [ptrq + xq*4]
%else
    LOAD16 %1, [ptrq + xq*2]
%endif
%endmacro

; 1 - dest
; 2 - source
%macro STORE8 2
    mov ptrq, %1
%if mmsize > 16
    pshufb m%2, [pb_pack_shuffle8]
    vextractf128 xm4, m%2, 1
    por xm%2, xm4
    movq [ptrq + xq], xm%2
%else
%if cpuflag(sse4)
    pshufb m%2, [pb_pack_shuffle8]
%else
    psrldq m4, m%2, 3
    por m%2, m4
    psrldq m4, m%2, 6
    por m%2, m4
%endif
    movd [ptrq + xq], m%2
%endif
%endmacro

; 1 - dest
; 2 - source
; 3 - is big endian
%macro STORE16 3
    mov ptrq, %1
%if mmsize > 16
%if %3 ; bigendian
    pshufb m%2, [pb_pack_shuffle16be]
%else
    pshufb m%2, [pb_pack_shuffle16le]
%endif
    vpermq m%2, m%2, (3 << 6 | 0 << 4 | 3 << 2 | 0 << 0)
    movu [ptrq + xq*2], xm%2
%else
%if cpuflag(sse4) && %3 ; bigendian
    pshufb m%2, [pb_pack_shuffle16be]
%elif cpuflag(sse4)
    pshufb m%2, [pb_pack_shuffle16le]
%else
    pshuflw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
    pshufhw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
    pshufd m%2, m%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
%if %3 ; bigendian
    psrlw m4, m%2, 8
    psllw m%2, 8
    por m%2, m4
%endif
%endif
    movq [ptrq + xq*2], m%2
%endif
%endmacro

%macro SWAP32 1
%if mmsize > 16 || cpuflag(sse4)
    pshufb m%1, [pb_shuffle32be]
%else
    psrlw m4, m%1, 8
    psllw m%1, 8
    por m%1, m4
    pshuflw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
    pshufhw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
%endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
; 4 - is big endian
%macro STORE_PIXELS 4
%if %3 > 16
%if %4
    SWAP32 %2
%endif
    mov ptrq, %1
    movu [ptrq + xq*4], m%2
%elif %3 > 8
    STORE16 %1, %2, %4
%else
    STORE8 %1, %2
%endif
%endmacro

%macro PMULLO 3
%if cpuflag(sse4) || mmsize > 16
    pmulld %1, %2, %3
%else
%ifidni %1, %2
%else
    mova %1, %2
%endif
    ; sse2 fallback for pmulld: pmuludq only multiplies the even dword lanes,
    ; so do the even and the (swapped-in) odd lanes separately, keep the low
    ; 32 bits of each product and interleave them back together
    pshufd m7, %1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pshufd m8, %3, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pmuludq m7, m8
    pshufd m7, m7, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    pmuludq %1, %3
    pshufd %1, %1, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    punpckldq %1, m7
%endif
%endmacro

; 1 - name
; 2 - depth
; 3 - has alpha
; 4 - is big endian
; 5 - is float
%macro yuv2gbrp_fn 5
%define DEPTH %2
%define HAS_ALPHA %3
%define IS_BE %4
%define FLOAT %5
%define SH (22 + 8 - DEPTH)

%if DEPTH >= 16
%define RGB_SHIFT 14
%define A_SHIFT 14
%elif 22 != SH
%define RGB_SHIFT SH
%define A_SHIFT (SH-3)
%else
%define RGB_SHIFT 22
%define A_SHIFT 19
%endif

%if DEPTH >= 16
%define YUV_SHIFT 14
%define Y_START m9
%define Y_ROUND [pd_yuv2gbrp16_round13]
%define UV_START m9
%define A_START m9
%define A_CLIP2P 30
%else
%define YUV_SHIFT 10
%define Y_START [pd_yuv2gbrp_y_start]
%define Y_ROUND m9
%define UV_START [pd_yuv2gbrp_uv_start]
%define A_START [pd_yuv2gbrp_a_start]
%define A_CLIP2P 27
%endif

cglobal yuv2%1_full_X, 12, 14, 16, ptr, lumFilter, lumSrcx, lumFilterSize, chrFilter, chrUSrcx, chrVSrcx, chrFilterSize, alpSrcx, dest, dstW, y, x, j
    VBROADCASTSS m10, dword [ptrq + SwsContext.yuv2rgb_y_offset]
    VBROADCASTSS m11, dword [ptrq + SwsContext.yuv2rgb_y_coeff]
    VBROADCASTSS m12, dword [ptrq + SwsContext.yuv2rgb_v2r_coeff]
    VBROADCASTSS m13, dword [ptrq + SwsContext.yuv2rgb_v2g_coeff]
    VBROADCASTSS m14, dword [ptrq + SwsContext.yuv2rgb_u2g_coeff]
    VBROADCASTSS m15, dword [ptrq + SwsContext.yuv2rgb_u2b_coeff]

%if DEPTH >= 16
    movu m9, [pd_yuv2gbrp16_start]
%else
    mov xq, (1 << (SH-1))
    movq xm9, xq
    VBROADCASTSS m9, xm9
%endif
    xor xq, xq

%%loop_x:
    movu Y, Y_START
    movu U, UV_START
    movu V, UV_START

    xor jq, jq
%%loop_luma:
    movsx ptrd, word [lumFilterq + jq*2]
    movd xm0, ptrd
    VBROADCASTSS m0, xm0

    LOAD_PIXELS 1, lumSrcxq, DEPTH
    PMULLO m1, m1, m0
    paddd Y, m1

    inc jd
    cmp jd, lumFilterSized
    jl %%loop_luma

%if HAS_ALPHA
    cmp alpSrcxq, 0
    je %%skip_alpha_load

    xor jq, jq
    movu A, A_START
%%loop_alpha:
    movsx ptrd, word [lumFilterq + jq*2]
    movd xm0, ptrd
    VBROADCASTSS m0, xm0

    LOAD_PIXELS 1, alpSrcxq, DEPTH
    PMULLO m1, m1, m0
    paddd A, m1

    inc jd
    cmp jd, lumFilterSized
    jl %%loop_alpha

%if DEPTH >= 16
    psrad A, 1
    paddd A, [pd_yuv2gbrp16_a_offset]
%endif
%%skip_alpha_load:
%endif

    xor jq, jq
%%loop_chr:
    movsx ptrd, word [chrFilterq + jq*2]
    movd xm0, ptrd
    VBROADCASTSS m0, xm0

    LOAD_PIXELS 1, chrUSrcxq, DEPTH
    LOAD_PIXELS 2, chrVSrcxq, DEPTH
    PMULLO m1, m1, m0
    PMULLO m2, m2, m0
    paddd U, m1
    paddd V, m2

    inc jd
    cmp jd, chrFilterSized
    jl %%loop_chr

    psrad Y, YUV_SHIFT
%if DEPTH >= 16
    paddd Y, [pd_yuv2gbrp16_offset]
%endif
    psrad U, YUV_SHIFT
    psrad V, YUV_SHIFT

    psubd Y, m10 ; yuv2rgb_y_offset
    PMULLO Y, Y, m11 ; yuv2rgb_y_coeff
    paddd Y, Y_ROUND

    PMULLO R, V, m12 ; yuv2rgb_v2r_coeff
    PMULLO B, U, m15 ; yuv2rgb_u2b_coeff
    PMULLO U, U, m14 ; yuv2rgb_u2g_coeff
    PMULLO V, V, m13 ; yuv2rgb_v2g_coeff

    paddd G, U, V
    paddd R, Y
    paddd G, Y
    paddd B, Y

%if DEPTH < 16
    CLIPP2 R, 30
    CLIPP2 G, 30
    CLIPP2 B, 30
%endif

    psrad R, RGB_SHIFT
    psrad G, RGB_SHIFT
    psrad B, RGB_SHIFT

%if DEPTH >= 16
    paddd R, [pd_yuv2gbrp_debias]
    paddd G, [pd_yuv2gbrp_debias]
    paddd B, [pd_yuv2gbrp_debias]

    CLIPP2 R, 16
    CLIPP2 G, 16
    CLIPP2 B, 16
%endif

%if FLOAT
    cvtdq2ps R, R
    cvtdq2ps G, G
    cvtdq2ps B, B
    mulps R, [pd_65535_invf]
    mulps G, [pd_65535_invf]
    mulps B, [pd_65535_invf]
%endif

    STORE_PIXELS [destq + 0], 1, DEPTH, IS_BE ; G
    STORE_PIXELS [destq + 8], 2, DEPTH, IS_BE ; B
    STORE_PIXELS [destq + 16], 0, DEPTH, IS_BE ; R

%if HAS_ALPHA
    cmp alpSrcxq, 0
    je %%skip_alpha_store
    CLIPP2 A, A_CLIP2P
    psrad A, A_SHIFT
%if FLOAT
    cvtdq2ps A, A
    mulps A, [pd_65535_invf]
%endif
    STORE_PIXELS [destq + 24], 3, DEPTH, IS_BE
%%skip_alpha_store:
%endif

    add xq, mmsize/4
    cmp xd, dstWd
    jl %%loop_x
    RET
%endmacro

%macro yuv2gbrp_fn_decl 2
INIT_%1 %2
yuv2gbrp_fn gbrp, 8, 0, 0, 0
yuv2gbrp_fn gbrap, 8, 1, 0, 0
yuv2gbrp_fn gbrp9le, 9, 0, 0, 0
yuv2gbrp_fn gbrp10le, 10, 0, 0, 0
yuv2gbrp_fn gbrap10le, 10, 1, 0, 0
yuv2gbrp_fn gbrp12le, 12, 0, 0, 0
yuv2gbrp_fn gbrap12le, 12, 1, 0, 0
yuv2gbrp_fn gbrp14le, 14, 0, 0, 0
yuv2gbrp_fn gbrp16le, 16, 0, 0, 0
yuv2gbrp_fn gbrap16le, 16, 1, 0, 0
yuv2gbrp_fn gbrpf32le, 32, 0, 0, 1
yuv2gbrp_fn gbrapf32le, 32, 1, 0, 1
yuv2gbrp_fn gbrp9be, 9, 0, 1, 0
yuv2gbrp_fn gbrp10be, 10, 0, 1, 0
yuv2gbrp_fn gbrap10be, 10, 1, 1, 0
yuv2gbrp_fn gbrp12be, 12, 0, 1, 0
yuv2gbrp_fn gbrap12be, 12, 1, 1, 0
yuv2gbrp_fn gbrp14be, 14, 0, 1, 0
yuv2gbrp_fn gbrp16be, 16, 0, 1, 0
yuv2gbrp_fn gbrap16be, 16, 1, 1, 0
yuv2gbrp_fn gbrpf32be, 32, 0, 1, 1
yuv2gbrp_fn gbrapf32be, 32, 1, 1, 1
%endmacro

yuv2gbrp_fn_decl XMM, sse2
yuv2gbrp_fn_decl XMM, sse4

%if HAVE_AVX2_EXTERNAL
yuv2gbrp_fn_decl YMM, avx2
%endif

%endif ; ARCH_X86_64