;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1:      times 8 dw 1
pw_2:      times 8 dw 2
pw_4:      times 8 dw 4
pw_8:      times 8 dw 8
pw_16:     times 8 dw 16
pw_64:     times 8 dw 64
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

pd_1:      times 4 dd 1
pd_2:      times 4 dd 2
pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text
; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])
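;
; Per pixel, with 14-bit coefficients and sh = 14 + in_depth - out_depth,
; this computes roughly:
;   y_out = (cy  * (y - yoff_in) + cyu * (u - uvoff_in) + cyv * (v - uvoff_in)
;            + (yoff_out << sh) + (1 << (sh - 1))) >> sh
; and u/v analogously from (u, v) only; the y->u and y->v coefficients
; (cq+3*16 and cq+6*16) are never loaded, i.e. they are assumed to be zero.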
%if ARCH_X86_64
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)

cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                     yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
    inc         wd
    sar         wd, 1
%if %4 == 1
    inc         hd
    sar         hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    mov         [rsp+3*mmsize+0], wd
    mov         [rsp+3*mmsize+4], hd
    mova        m10, [cq]
    pxor        m11, m11
    mova        m12, [pd_ %+ %%uvoutoff]
    pslld       m12, %%sh
    paddd       m12, [pd_ %+ %%rnd]
    mova        m13, [pw_ %+ %%uvinoff]
    mova        m14, [yoffq+ 0]  ; y_off_in
    mova        m15, [yoffq+16]  ; y_off_out
%if %%yoffsh != 0
    psllw       m15, %%yoffsh
%endif
    paddw       m15, [pw_ %+ %%yprnd]
    punpcklwd   m10, m15
    mova        m15, [pw_ %+ %%ypmul]
    movh        m0, [cq+1*16]    ; cyu
    movh        m1, [cq+2*16]    ; cyv
    movh        m2, [cq+4*16]    ; cuu
    movh        m3, [cq+5*16]    ; cuv
    movh        m4, [cq+7*16]    ; cvu
    movh        m5, [cq+8*16]    ; cvv
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpcklwd   m4, m5
    mova        [rsp+0*mmsize], m0
    mova        [rsp+1*mmsize], m2
    mova        [rsp+2*mmsize], m4
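
    ; from here on: m10 = (cy, scaled y_off_out + rounding) pairs, m12 =
    ; (uv_off_out << sh) + rounding, m13 = uv_off_in, m14 = y_off_in and
    ; m15 = the y multiplier paired with m10; the stack caches the
    ; interleaved (cyu, cyv), (cuu, cuv) and (cvu, cvv) coefficient pairs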
DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
    mov         uiq, [yiq+gprsize*1]
    mov         viq, [yiq+gprsize*2]
    mov         yiq, [yiq+gprsize*0]
    mov         uoq, [yoq+gprsize*1]
    mov         voq, [yoq+gprsize*2]
    mov         yoq, [yoq+gprsize*0]
    mov         uisq, [yisq+gprsize*1]
    mov         visq, [yisq+gprsize*2]
    mov         yisq, [yisq+gprsize*0]
    mov         uosq, [yosq+gprsize*1]
    mov         vosq, [yosq+gprsize*2]
    mov         yosq, [yosq+gprsize*0]

.loop_v:
    xor         xq, xq

.loop_h:
%if %4 == 1
    lea         tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
    movu        m0, [yiq+xq*(1<<%3)]         ; y00/01
%if %4 == 1
    movu        m2, [tmpq+xq*2]              ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh        m4, [uiq+xq]                 ; u
    movh        m5, [viq+xq]                 ; v
%else ; %3 != 1
    movu        m4, [uiq+xq]                 ; u
    movu        m5, [viq+xq]                 ; v
%endif ; %3 ==/!= 1
    punpckhbw   m1, m0, m11
    punpcklbw   m0, m11
%if %4 == 1
    punpckhbw   m3, m2, m11
    punpcklbw   m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw   m2, m4, m11
    punpckhbw   m3, m5, m11
%endif ; %3 == 0
    punpcklbw   m4, m11
    punpcklbw   m5, m11
%else ; %1 != 8
    movu        m0, [yiq+xq*(2<<%3)]         ; y00/01
    movu        m1, [yiq+xq*(2<<%3)+mmsize]  ; y00/01
%if %4 == 1
    movu        m2, [tmpq+xq*4]              ; y10/11
    movu        m3, [tmpq+xq*4+mmsize]       ; y10/11
%endif ; %4 == 1
    movu        m4, [uiq+xq*2]               ; u
    movu        m5, [viq+xq*2]               ; v
%if %3 == 0
    movu        m2, [uiq+xq*2+mmsize]
    movu        m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    psubw       m0, m14
    psubw       m1, m14
%if %4 == 1
    psubw       m2, m14
    psubw       m3, m14
%endif ; %4 == 1
    psubw       m4, m13
    psubw       m5, m13
%if %3 == 0
    psubw       m2, m13
    psubw       m3, m13
%endif ; %3 == 0

    SBUTTERFLY  wd, 4, 5, 6
    pmaddwd     m6, m4, [rsp+1*mmsize]
    pmaddwd     m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY  wd, 2, 3, 8
    pmaddwd     m8, m2, [rsp+1*mmsize]
    pmaddwd     m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd     m8, m4, [rsp+2*mmsize]
    pmaddwd     m9, m5, [rsp+2*mmsize]
%endif
    paddd       m6, m12
    paddd       m7, m12
    paddd       m8, m12
    paddd       m9, m12
    psrad       m6, %%sh
    psrad       m7, %%sh
    psrad       m8, %%sh
    psrad       m9, %%sh
    packssdw    m6, m7
    packssdw    m8, m9
%if %2 == 8
    packuswb    m6, m8
%if %3 == 0
    movu        [uoq+xq], m6
%else ; %3 != 0
    movh        [uoq+xq], m6
    movhps      [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW       m6, m11, [pw_ %+ %%maxval]
    CLIPW       m8, m11, [pw_ %+ %%maxval]
    movu        [uoq+xq*2], m6
%if %3 == 0
    movu        [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu        [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8
%if %3 == 0
    pmaddwd     m6, m4, [rsp+2*mmsize]
    pmaddwd     m7, m5, [rsp+2*mmsize]
    pmaddwd     m8, m2, [rsp+2*mmsize]
    pmaddwd     m9, m3, [rsp+2*mmsize]
    paddd       m6, m12
    paddd       m7, m12
    paddd       m8, m12
    paddd       m9, m12
    psrad       m6, %%sh
    psrad       m7, %%sh
    psrad       m8, %%sh
    psrad       m9, %%sh
    packssdw    m6, m7
    packssdw    m8, m9
%if %2 == 8
    packuswb    m6, m8
    movu        [voq+xq], m6
%else ; %2 != 8
    CLIPW       m6, m11, [pw_ %+ %%maxval]
    CLIPW       m8, m11, [pw_ %+ %%maxval]
    movu        [voq+xq*2], m6
    movu        [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0
    pmaddwd     m4, [rsp+0*mmsize]
    pmaddwd     m5, [rsp+0*mmsize]  ; uv_val
%if %3 == 0
    pmaddwd     m2, [rsp+0*mmsize]
    pmaddwd     m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
%if %3 == 1
    punpckhdq   m8, m4, m4
    punpckldq   m4, m4
    punpckhdq   m9, m5, m5
    punpckldq   m5, m5
%else ; %3 != 1
    SWAP        8, 5, 2
    SWAP        3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    punpckhwd   m6, m2, m15
    punpcklwd   m2, m15
    punpckhwd   m7, m3, m15
    punpcklwd   m3, m15
    pmaddwd     m2, m10
    pmaddwd     m6, m10
    pmaddwd     m3, m10
    pmaddwd     m7, m10
    paddd       m2, m4
    paddd       m6, m8
    paddd       m3, m5
    paddd       m7, m9
    psrad       m2, %%sh
    psrad       m6, %%sh
    psrad       m3, %%sh
    psrad       m7, %%sh
    packssdw    m2, m6
    packssdw    m3, m7
    lea         tmpq, [yoq+yosq]
%if %2 == 8
    packuswb    m2, m3
    movu        [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW       m2, m11, [pw_ %+ %%maxval]
    CLIPW       m3, m11, [pw_ %+ %%maxval]
    movu        [tmpq+xq*4], m2
    movu        [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1
    punpckhwd   m6, m0, m15
    punpcklwd   m0, m15
    punpckhwd   m7, m1, m15
    punpcklwd   m1, m15
    pmaddwd     m0, m10
    pmaddwd     m6, m10
    pmaddwd     m1, m10
    pmaddwd     m7, m10
    paddd       m0, m4
    paddd       m6, m8
    paddd       m1, m5
    paddd       m7, m9
    psrad       m0, %%sh
    psrad       m6, %%sh
    psrad       m1, %%sh
    psrad       m7, %%sh
    packssdw    m0, m6
    packssdw    m1, m7
%if %2 == 8
    packuswb    m0, m1
    movu        [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW       m0, m11, [pw_ %+ %%maxval]
    CLIPW       m1, m11, [pw_ %+ %%maxval]
    movu        [yoq+xq*(2<<%3)], m0
    movu        [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8
    add         xq, mmsize >> %3
    cmp         xd, dword [rsp+3*mmsize+0]
    jl          .loop_h
%if %4 == 1
    lea         yiq, [yiq+yisq*2]
    lea         yoq, [yoq+yosq*2]
%else ; %4 != 1
    add         yiq, yisq
    add         yoq, yosq
%endif ; %4 ==/!= 1
    add         uiq, uisq
    add         viq, visq
    add         uoq, uosq
    add         voq, vosq
    dec         dword [rsp+3*mmsize+4]
    jg          .loop_v
    RET
%endmacro

%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1

; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
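;
; Per pixel, with sh = depth - 1, this is the usual fixed-point matrixing:
;   r = (cy * (y - yoff) + crv * (v - uvoff)                     + rnd) >> sh
;   g = (cy * (y - yoff) + cgu * (u - uvoff) + cgv * (v - uvoff) + rnd) >> sh
;   b = (cy * (y - yoff) + cbu * (u - uvoff)                     + rnd) >> sh
; written out as full-resolution int16_t planes; for subsampled input, each
; u/v sample is duplicated across its 2x1 (422) or 2x2 (420) luma block.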
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
                                 rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
    inc         wwd
    sar         wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc         hd
    sar         hd, 1
%endif ; %3 == 1
    pxor        m11, m11
    mova        m15, [yoffq]   ; yoff
    movh        m14, [cq+  0]  ; cy
    movh        m10, [cq+ 32]  ; crv
    movh        m13, [cq+112]  ; cbu
    movh        m12, [cq+ 64]  ; cgu
    movh        m9, [cq+ 80]   ; cgv
    punpcklwd   m14, [pw_ %+ %%rnd]  ; cy, rnd
    punpcklwd   m13, m11             ; cbu, 0
    punpcklwd   m11, m10             ; 0, crv
    punpcklwd   m12, m9              ; cgu, cgv
    mova        [rsp+0*mmsize], m11
    mova        [rsp+1*mmsize], m12
    mova        [rsp+2*mmsize], m13
    mova        [rsp+3*mmsize], m14
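
    ; stack layout: rsp+0 = (0, crv), rsp+1 = (cgu, cgv), rsp+2 = (cbu, 0),
    ; rsp+3 = (cy, rnd); rsp+4..7 later cache the bottom-line y terms (420)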
    pxor        m14, m14
DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
    mov         gq, [rq+1*gprsize]
    mov         bq, [rq+2*gprsize]
    mov         rq, [rq+0*gprsize]
    mov         uq, [yq+1*gprsize]
    mov         vq, [yq+2*gprsize]
    mov         yq, [yq+0*gprsize]
    mov         usq, [ysq+1*gprsize]
    mov         vsq, [ysq+2*gprsize]
    mov         ysq, [ysq+0*gprsize]

.loop_v:
    xor         xq, xq

.loop_h:
%if %3 == 1
    lea         tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
    movu        m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu        m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh        m4, [uq+xq]
    movh        m5, [vq+xq]
%else ; %2 != 1
    movu        m4, [uq+xq]
    movu        m5, [vq+xq]
%endif ; %2 ==/!= 1
    punpckhbw   m1, m0, m14
    punpcklbw   m0, m14
%if %3 == 1
    punpckhbw   m3, m2, m14
    punpcklbw   m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw   m2, m4, m14
    punpckhbw   m3, m5, m14
%endif ; %2 == 0
    punpcklbw   m4, m14
    punpcklbw   m5, m14
%else ; %1 != 8
    movu        m0, [yq+xq*(2<<%2)]
    movu        m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu        m2, [tmpq+xq*4]
    movu        m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu        m4, [uq+xq*2]
    movu        m5, [vq+xq*2]
%if %2 == 0
    movu        m2, [uq+xq*2+mmsize]
    movu        m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    psubw       m0, m15
    psubw       m1, m15
%if %3 == 1
    psubw       m2, m15
    psubw       m3, m15
%endif ; %3 == 1
    psubw       m4, [pw_ %+ %%uvoff]
    psubw       m5, [pw_ %+ %%uvoff]
    SBUTTERFLY  wd, 4, 5, 6
%if %2 == 0
    psubw       m2, [pw_ %+ %%uvoff]
    psubw       m3, [pw_ %+ %%uvoff]
    SBUTTERFLY  wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd   m6, m0, [pw_1]  ; y, 1
    punpcklwd   m0, [pw_1]      ; y, 1
    punpckhwd   m7, m1, [pw_1]  ; y, 1
    punpcklwd   m1, [pw_1]      ; y, 1
    pmaddwd     m0, [rsp+3*mmsize]
    pmaddwd     m6, [rsp+3*mmsize]
    pmaddwd     m1, [rsp+3*mmsize]
    pmaddwd     m7, [rsp+3*mmsize]
%if %3 == 1
    punpckhwd   m8, m2, [pw_1]  ; y, 1
    punpcklwd   m2, [pw_1]      ; y, 1
    punpckhwd   m9, m3, [pw_1]  ; y, 1
    punpcklwd   m3, [pw_1]      ; y, 1
    pmaddwd     m2, [rsp+3*mmsize]
    pmaddwd     m8, [rsp+3*mmsize]
    pmaddwd     m3, [rsp+3*mmsize]
    pmaddwd     m9, [rsp+3*mmsize]
    mova        [rsp+4*mmsize], m2
    mova        [rsp+5*mmsize], m8
    mova        [rsp+6*mmsize], m3
    mova        [rsp+7*mmsize], m9
%endif ; %3 == 1

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd     m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd     m12, m5, [rsp+0*mmsize]
    punpckhdq   m11, m10, m10
    punpckldq   m10, m10
    punpckhdq   m13, m12, m12
    punpckldq   m12, m12
%else ; %2 != 1
    pmaddwd     m11, m5, [rsp+0*mmsize]
    pmaddwd     m12, m2, [rsp+0*mmsize]
    pmaddwd     m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd       m2, m10, [rsp+4*mmsize]
    paddd       m3, m11, [rsp+5*mmsize]
    paddd       m8, m12, [rsp+6*mmsize]
    paddd       m9, m13, [rsp+7*mmsize]
%endif
    paddd       m10, m0
    paddd       m11, m6
    paddd       m12, m1
    paddd       m13, m7
%if %3 == 1
    psrad       m2, %%sh
    psrad       m3, %%sh
    psrad       m8, %%sh
    psrad       m9, %%sh
%endif ; %3 == 1
    psrad       m10, %%sh
    psrad       m11, %%sh
    psrad       m12, %%sh
    psrad       m13, %%sh
%if %3 == 1
    lea         tmpq, [rq+rgbsq*2]
    packssdw    m2, m3
    packssdw    m8, m9
    mova        [tmpq+xq*4], m2
    mova        [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw    m10, m11
    packssdw    m12, m13
    mova        [rq+xq*(2 << %2)], m10
    mova        [rq+xq*(2 << %2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd     m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd     m12, m5, [rsp+1*mmsize]
    punpckhdq   m11, m10, m10
    punpckldq   m10, m10
    punpckhdq   m13, m12, m12
    punpckldq   m12, m12
%else ; %2 != 1
    pmaddwd     m11, m5, [rsp+1*mmsize]
    pmaddwd     m12, m2, [rsp+1*mmsize]
    pmaddwd     m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd       m2, m10, [rsp+4*mmsize]
    paddd       m3, m11, [rsp+5*mmsize]
    paddd       m8, m12, [rsp+6*mmsize]
    paddd       m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd       m10, m0
    paddd       m11, m6
    paddd       m12, m1
    paddd       m13, m7
%if %3 == 1
    psrad       m2, %%sh
    psrad       m3, %%sh
    psrad       m8, %%sh
    psrad       m9, %%sh
%endif ; %3 == 1
    psrad       m10, %%sh
    psrad       m11, %%sh
    psrad       m12, %%sh
    psrad       m13, %%sh
%if %3 == 1
    lea         tmpq, [gq+rgbsq*2]
    packssdw    m2, m3
    packssdw    m8, m9
    mova        [tmpq+xq*4], m2
    mova        [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw    m10, m11
    packssdw    m12, m13
    mova        [gq+xq*(2 << %2)], m10
    mova        [gq+xq*(2 << %2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd     m4, [rsp+2*mmsize]
    pmaddwd     m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq   m2, m4, m4
    punpckldq   m4, m4
    punpckhdq   m3, m5, m5
    punpckldq   m5, m5
%else ; %2 != 1
    pmaddwd     m2, [rsp+2*mmsize]
    pmaddwd     m3, [rsp+2*mmsize]
    SWAP        2, 5
%endif ; %2 ==/!= 1
    paddd       m0, m4
    paddd       m6, m2
    paddd       m1, m5
    paddd       m7, m3
%if %3 == 1
    paddd       m4, [rsp+4*mmsize]
    paddd       m2, [rsp+5*mmsize]
    paddd       m5, [rsp+6*mmsize]
    paddd       m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad       m0, %%sh
    psrad       m6, %%sh
    psrad       m1, %%sh
    psrad       m7, %%sh
%if %3 == 1
    psrad       m4, %%sh
    psrad       m2, %%sh
    psrad       m5, %%sh
    psrad       m3, %%sh
%endif ; %3 == 1
    packssdw    m0, m6
    packssdw    m1, m7
    movu        [bq+xq*(2 << %2)], m0
    movu        [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
    lea         tmpq, [bq+rgbsq*2]
    packssdw    m4, m2
    packssdw    m5, m3
    movu        [tmpq+xq*4], m4
    movu        [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1
    add         xd, mmsize >> %2
    cmp         xd, wwd
    jl          .loop_h
    lea         rq, [rq+rgbsq*(2 << %3)]
    lea         gq, [gq+rgbsq*(2 << %3)]
    lea         bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
    lea         yq, [yq+ysq*2]
%else ; %3 != 1
    add         yq, ysq
%endif ; %3 ==/!= 1
    add         uq, usq
    add         vq, vsq
    dec         hd
    jg          .loop_v
    RET
%endmacro
%macro YUV2RGB_FNS 2
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1
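
; void ff_rgb2yuv_420p8_sse2(uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            int w, int h, const int16_t rgb2yuv_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
;
; Per pixel, with sh = 29 - depth (15-bit rgb samples, 14-bit coefficients),
; this computes roughly:
;   y = (cry * r + cgy * g + cby * b + (yoff << sh) + (1 << (sh - 1))) >> sh
; and u/v analogously from the (box-averaged, for 422/420) r/g/b values.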
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
                                 yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
    inc         wwd
    sar         wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc         hd
    sar         hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh        m8, [offq]
    movh        m9, [pw_ %+ %%uvrnd]
    psllw       m8, %%sh - 14
    paddw       m9, [pw_ %+ %%rnd]
    paddw       m8, [pw_ %+ %%rnd]
    movh        m0, [cq+  0]
    movh        m1, [cq+ 16]
    movh        m2, [cq+ 32]
    movh        m3, [cq+ 48]
    movh        m4, [cq+ 64]
    movh        m5, [cq+ 80]
    movh        m6, [cq+112]
    movh        m7, [cq+128]
    punpcklwd   m0, m1
    punpcklwd   m2, m8
    punpcklwd   m3, m4
    punpcklwd   m4, m5, m9
    punpcklwd   m5, m6
    punpcklwd   m7, m9
    mova        [rsp+0*mmsize], m0  ; cry, cgy
    mova        [rsp+1*mmsize], m2  ; cby, off + rnd
    mova        [rsp+2*mmsize], m3  ; cru, cgu
    mova        [rsp+3*mmsize], m4  ; cburv, uvoff + rnd
    mova        [rsp+4*mmsize], m5  ; cburv, cgv
    mova        [rsp+5*mmsize], m7  ; cbv, uvoff + rnd
DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov         gq, [rq+gprsize*1]
    mov         bq, [rq+gprsize*2]
    mov         rq, [rq+gprsize*0]
    mov         uq, [yq+gprsize*1]
    mov         vq, [yq+gprsize*2]
    mov         yq, [yq+gprsize*0]
    mov         usq, [ysq+gprsize*1]
    mov         vsq, [ysq+gprsize*2]
    mov         ysq, [ysq+gprsize*0]
    pxor        m15, m15

.loop_v:
    xor         xd, xd

.loop_h:
    ; top line y
    mova        m0, [rq+xq*(2<<%2)]
    mova        m3, [rq+xq*(2<<%2)+mmsize]
    mova        m1, [gq+xq*(2<<%2)]
    mova        m4, [gq+xq*(2<<%2)+mmsize]
    mova        m2, [bq+xq*(2<<%2)]
    mova        m5, [bq+xq*(2<<%2)+mmsize]
    punpcklwd   m6, m0, m1
    punpckhwd   m7, m0, m1
    punpcklwd   m8, m3, m4
    punpckhwd   m9, m3, m4
    punpcklwd   m10, m2, [pw_16384]
    punpckhwd   m11, m2, [pw_16384]
    punpcklwd   m12, m5, [pw_16384]
    punpckhwd   m13, m5, [pw_16384]
    pmaddwd     m6, [rsp+0*mmsize]
    pmaddwd     m7, [rsp+0*mmsize]
    pmaddwd     m8, [rsp+0*mmsize]
    pmaddwd     m9, [rsp+0*mmsize]
    pmaddwd     m10, [rsp+1*mmsize]
    pmaddwd     m11, [rsp+1*mmsize]
    pmaddwd     m12, [rsp+1*mmsize]
    pmaddwd     m13, [rsp+1*mmsize]
    paddd       m6, m10
    paddd       m7, m11
    paddd       m8, m12
    paddd       m9, m13
    psrad       m6, %%sh
    psrad       m7, %%sh
    psrad       m8, %%sh
    psrad       m9, %%sh
    packssdw    m6, m7
    packssdw    m8, m9
%if %1 == 8
    packuswb    m6, m8
    movu        [yq+xq*(1<<%2)], m6
%else
    CLIPW       m6, m15, [pw_ %+ %%maxval]
    CLIPW       m8, m15, [pw_ %+ %%maxval]
    movu        [yq+xq*(2<<%2)], m6
    movu        [yq+xq*(2<<%2)+mmsize], m8
%endif
%if %2 == 1
    ; subsampling cached data
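    ; (pmaddwd against pw_1 sums each horizontal pair of words, giving the
    ; 2x1 box sums used for chroma subsampling)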
    pmaddwd     m0, [pw_1]
    pmaddwd     m1, [pw_1]
    pmaddwd     m2, [pw_1]
    pmaddwd     m3, [pw_1]
    pmaddwd     m4, [pw_1]
    pmaddwd     m5, [pw_1]
%if %3 == 1
    ; bottom line y, r/g portion only
    lea         tmpq, [rgbsq+xq*2]
    mova        m6, [rq+tmpq*2]
    mova        m9, [rq+tmpq*2+mmsize]
    mova        m7, [gq+tmpq*2]
    mova        m10, [gq+tmpq*2+mmsize]
    mova        m8, [bq+tmpq*2]
    mova        m11, [bq+tmpq*2+mmsize]
    punpcklwd   m12, m6, m7
    punpckhwd   m13, m6, m7
    punpcklwd   m14, m9, m10
    punpckhwd   m15, m9, m10

    ; release two more registers
    pmaddwd     m6, [pw_1]
    pmaddwd     m7, [pw_1]
    pmaddwd     m9, [pw_1]
    pmaddwd     m10, [pw_1]
    paddd       m0, m6
    paddd       m3, m9
    paddd       m1, m7
    paddd       m4, m10

    ; bottom line y, b/rnd portion only
    punpcklwd   m6, m8, [pw_16384]
    punpckhwd   m7, m8, [pw_16384]
    punpcklwd   m9, m11, [pw_16384]
    punpckhwd   m10, m11, [pw_16384]
    pmaddwd     m12, [rsp+0*mmsize]
    pmaddwd     m13, [rsp+0*mmsize]
    pmaddwd     m14, [rsp+0*mmsize]
    pmaddwd     m15, [rsp+0*mmsize]
    pmaddwd     m6, [rsp+1*mmsize]
    pmaddwd     m7, [rsp+1*mmsize]
    pmaddwd     m9, [rsp+1*mmsize]
    pmaddwd     m10, [rsp+1*mmsize]
    paddd       m12, m6
    paddd       m13, m7
    paddd       m14, m9
    paddd       m15, m10
    psrad       m12, %%sh
    psrad       m13, %%sh
    psrad       m14, %%sh
    psrad       m15, %%sh
    packssdw    m12, m13
    packssdw    m14, m15
    lea         tmpq, [yq+ysq]
%if %1 == 8
    packuswb    m12, m14
    movu        [tmpq+xq*2], m12
%else
    pxor        m15, m15
    CLIPW       m12, m15, [pw_ %+ %%maxval]
    CLIPW       m14, m15, [pw_ %+ %%maxval]
    movu        [tmpq+xq*4], m12
    movu        [tmpq+xq*4+mmsize], m14
%endif

    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd     m8, [pw_1]
    pmaddwd     m11, [pw_1]
    paddd       m2, m8
    paddd       m5, m11
    paddd       m0, [pd_2]
    paddd       m1, [pd_2]
    paddd       m2, [pd_2]
    paddd       m3, [pd_2]
    paddd       m4, [pd_2]
    paddd       m5, [pd_2]
    psrad       m0, 2
    psrad       m1, 2
    psrad       m2, 2
    psrad       m3, 2
    psrad       m4, 2
    psrad       m5, 2
%else ; %3 != 1
    paddd       m0, [pd_1]
    paddd       m1, [pd_1]
    paddd       m2, [pd_1]
    paddd       m3, [pd_1]
    paddd       m4, [pd_1]
    paddd       m5, [pd_1]
    psrad       m0, 1
    psrad       m1, 1
    psrad       m2, 1
    psrad       m3, 1
    psrad       m4, 1
    psrad       m5, 1
%endif ; %3 ==/!= 1
    packssdw    m0, m3
    packssdw    m1, m4
    packssdw    m2, m5
%endif ; %2 == 1

    ; convert u/v pixels
    SBUTTERFLY  wd, 0, 1, 6
    punpckhwd   m6, m2, [pw_16384]
    punpcklwd   m2, [pw_16384]
    pmaddwd     m7, m0, [rsp+2*mmsize]
    pmaddwd     m8, m1, [rsp+2*mmsize]
    pmaddwd     m9, m2, [rsp+3*mmsize]
    pmaddwd     m10, m6, [rsp+3*mmsize]
    pmaddwd     m0, [rsp+4*mmsize]
    pmaddwd     m1, [rsp+4*mmsize]
    pmaddwd     m2, [rsp+5*mmsize]
    pmaddwd     m6, [rsp+5*mmsize]
    paddd       m7, m9
    paddd       m8, m10
    paddd       m0, m2
    paddd       m1, m6
    psrad       m7, %%sh
    psrad       m8, %%sh
    psrad       m0, %%sh
    psrad       m1, %%sh
    packssdw    m7, m8
    packssdw    m0, m1
%if %2 == 1
%if %1 == 8
    packuswb    m7, m0
    movh        [uq+xq], m7
    movhps      [vq+xq], m7
%else
    CLIPW       m7, m15, [pw_ %+ %%maxval]
    CLIPW       m0, m15, [pw_ %+ %%maxval]
    movu        [uq+xq*2], m7
    movu        [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY  wd, 3, 4, 6
    punpckhwd   m6, m5, [pw_16384]
    punpcklwd   m5, [pw_16384]
    pmaddwd     m8, m3, [rsp+2*mmsize]
    pmaddwd     m9, m4, [rsp+2*mmsize]
    pmaddwd     m10, m5, [rsp+3*mmsize]
    pmaddwd     m11, m6, [rsp+3*mmsize]
    pmaddwd     m3, [rsp+4*mmsize]
    pmaddwd     m4, [rsp+4*mmsize]
    pmaddwd     m5, [rsp+5*mmsize]
    pmaddwd     m6, [rsp+5*mmsize]
    paddd       m8, m10
    paddd       m9, m11
    paddd       m3, m5
    paddd       m4, m6
    psrad       m8, %%sh
    psrad       m9, %%sh
    psrad       m3, %%sh
    psrad       m4, %%sh
    packssdw    m8, m9
    packssdw    m3, m4
%if %1 == 8
    packuswb    m7, m8
    packuswb    m0, m3
    movu        [uq+xq], m7
    movu        [vq+xq], m0
%else
    CLIPW       m7, m15, [pw_ %+ %%maxval]
    CLIPW       m0, m15, [pw_ %+ %%maxval]
    CLIPW       m8, m15, [pw_ %+ %%maxval]
    CLIPW       m3, m15, [pw_ %+ %%maxval]
    movu        [uq+xq*2], m7
    movu        [uq+xq*2+mmsize], m8
    movu        [vq+xq*2], m0
    movu        [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1
    add         xq, mmsize >> %2
    cmp         xd, wwd
    jl          .loop_h
%if %3 == 0
    add         yq, ysq
%else ; %3 != 0
    lea         yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add         uq, usq
    add         vq, vsq
    lea         rq, [rq+rgbsq*(2<<%3)]
    lea         gq, [gq+rgbsq*(2<<%3)]
    lea         bq, [bq+rgbsq*(2<<%3)]
    dec         hd
    jg          .loop_v
    RET
%endmacro
%macro RGB2YUV_FNS 2
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
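;
; In-place multiply of the three int16_t planes by a 3x3 matrix of Q14
; coefficients; per sample, this computes roughly:
;   out[p] = (c[p][0] * in[0] + c[p][1] * in[1] + c[p][2] * in[2] + 8192) >> 14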
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    movh        m0, [cq+  0]
    movh        m1, [cq+ 32]
    movh        m2, [cq+ 48]
    movh        m3, [cq+ 80]
    movh        m4, [cq+ 96]
    movh        m5, [cq+128]
    punpcklwd   m0, [cq+ 16]
    punpcklwd   m1, [pw_8192]
    punpcklwd   m2, [cq+ 64]
    punpcklwd   m3, [pw_8192]
    punpcklwd   m4, [cq+112]
    punpcklwd   m5, [pw_8192]
DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl         strideq, 1
    mov         data1q, [data0q+gprsize*1]
    mov         data2q, [data0q+gprsize*2]
    mov         data0q, [data0q+gprsize*0]

.loop_v:
    xor         xd, xd

.loop_h:
    mova        m6, [data0q+xq*2]
    mova        m7, [data1q+xq*2]
    mova        m8, [data2q+xq*2]
    SBUTTERFLY  wd, 6, 7, 9
    punpckhwd   m9, m8, [pw_1]
    punpcklwd   m8, [pw_1]
    pmaddwd     m10, m6, m0
    pmaddwd     m11, m7, m0
    pmaddwd     m12, m8, m1
    pmaddwd     m13, m9, m1
    paddd       m10, m12
    paddd       m11, m13
    psrad       m10, 14
    psrad       m11, 14
    pmaddwd     m12, m6, m2
    pmaddwd     m13, m7, m2
    pmaddwd     m14, m8, m3
    pmaddwd     m15, m9, m3
    paddd       m12, m14
    paddd       m13, m15
    psrad       m12, 14
    psrad       m13, 14
    pmaddwd     m6, m4
    pmaddwd     m7, m4
    pmaddwd     m8, m5
    pmaddwd     m9, m5
    paddd       m6, m8
    paddd       m7, m9
    psrad       m6, 14
    psrad       m7, 14
    packssdw    m10, m11
    packssdw    m12, m13
    packssdw    m6, m7
    mova        [data0q+xq*2], m10
    mova        [data1q+xq*2], m12
    mova        [data2q+xq*2], m6
    add         xd, mmsize / 2
    cmp         xd, wwd
    jl          .loop_h
    add         data0q, strideq
    add         data1q, strideq
    add         data2q, strideq
    dec         hd
    jg          .loop_v
    RET
  1016. RET
  1017. %endif