;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
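
; Note: SSE2 has no packed absolute difference for words, so ABS_SUB and
; DIFF_LT build |%1-%2| from two unsigned saturating subtractions OR'd
; together (one of the two is always zero) before applying the threshold.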

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd      %1, [%2]
    punpcklbw %1, %1
%if mmsize == 8
    pshufw    %1, %1, 0
%else
    pshuflw   %1, %1, 01010000b
    pshufd    %1, %1, 01010000b
%endif
    psraw     %1, 6
%endmacro
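
; The punpcklbw+psraw 6 above sign-extends each int8_t tc0 value and, for the
; non-negative values used here, effectively scales it by 4 to match the
; `shl r2d/r3d, 2` scaling of alpha/beta for 10-bit; a negative tc0 (meaning
; "skip this edge") only needs to keep its sign, which it does.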

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand    %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor    %7, %7
    pand    %8, %9
    pcmpgtw %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro
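
; For reference, DEBLOCK_P0_Q0 is the usual H.264 p0/q0 update (sketch; %5 is
; the per-pixel tc limit and is already zero wherever filtering is disabled):
;   delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
;   p0'   = clip(0, pixel_max, p0 + delta)
;   q0'   = clip(0, pixel_max, q0 - delta)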

; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4 ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro
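
; LUMA_Q1 is (roughly) the standard p1/q1 update; with x2=p2 and x1=p1 it
; leaves in %1:
;   p1' = p1 + clip3(-tc, tc, ((p2 + ((p0+q0+1)>>1)) >> 1) - p1)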

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor    m6, m6
    mova    %3, m4
    pcmpgtw m6, tcm
    pand    m4, tcm
    pandn   m6, m7
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am [rsp+mmsize*3]
    %define bm [rsp+mmsize*4]
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, 32/mmsize
    mov     r2, r0
    sub     r0, r1
    mova    am, m4
    sub     r0, r1
    mova    bm, m5
    sub     r0, r1
.loop:
    mova    m0, [r0+r1]
    mova    m1, [r0+r1*2]
    mova    m2, [r2]
    mova    m3, [r2+r1]
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6
    mova    m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    [r0+r1], m5
    mova    m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    [r2+r1], m5
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    [r0+r1*2], m1
    mova    [r2], m2
    add     r0, mmsize
    add     r2, mmsize
    add     r4, mmsize/8
    dec     r3
    jg .loop
    ADD     rsp, pad
    RET
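
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------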
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am [rsp+mmsize*5]
    %define bm [rsp+mmsize*6]
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, r1
    mova    am, m4
    add     r3, r1
    mov     r5, 32/mmsize
    mova    bm, m5
    add     r3, r1
%if mmsize == 16
    mov     r2, r0
    add     r2, r3
%endif
.loop:
%if mmsize == 8
    movq    m2, [r0-8] ; y q2 q1 q0
    movq    m7, [r0+0]
    movq    m5, [r0+r1-8]
    movq    m3, [r0+r1+0]
    movq    m0, [r0+r1*2-8]
    movq    m6, [r0+r1*2+0]
    movq    m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP    2, 7
    movq    m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu    m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova    tcm, m3
    movu    m4, [r2+r1-8]
    movu    m1, [r2+r1*2-8]
    movu    m3, [r2+r3-8]
    movu    m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6
    mova    m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif
    mova    p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    p1m, m5
    mova    m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    p2m, m5
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    m0, p1m
    mova    m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3
    add     r4, mmsize/8
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
    dec     r5
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand    m6, m7
    mova    m14, m6
    pxor    m4, m4
    pcmpgtw m6, m4
    pand    m6, m14
    mova    m5, m10
    pand    m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4
    mova    m5, m11
    pand    m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4
    pxor    m4, m4
    psubw   m6, m10
    pcmpgtw m4, m14
    pandn   m4, m7
    psubw   m6, m11
    pand    m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
    SWAP    0, 8
    SWAP    3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r0
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2
    dec     r3
    jg .loop
    REP_RET
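
; ff_deblock_h_luma_10, x86-64 variant (same prototype as the version above)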
cglobal deblock_h_luma_10, 5,7,15
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r1
    add     r2, r1
    add     r2, r1
    mov     r5, r0
    add     r5, r2
    mov     r6, 2
.loop:
    movu    m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]
    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10
    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7
    DEBLOCK_LUMA_INTER_SSE2
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP    %1, %2
%else
    mova    %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw   t0, %3, %2
    mova    t2, %4
    paddw   t2, %3
%else
    mova    t0, %3
    mova    t2, %4
    paddw   t0, %2
    paddw   t2, %3
%endif
    paddw   t0, %1
    paddw   t2, t2
    paddw   t0, %5
    paddw   t2, %9
    paddw   t0, %9 ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
    psrlw   t2, 3
    psrlw   t1, t0, 2
    psubw   t2, %3
    psubw   t1, %2
    pand    t2, %8
    pand    t1, %8
    paddw   t2, %3
    paddw   t1, %2
    SWAPMOVA %11, t1
    psubw   t1, t0, %3
    paddw   t0, t0
    psubw   t1, %5
    psubw   t0, %3
    paddw   t1, %6
    paddw   t1, %2
    paddw   t0, %6
    psrlw   t1, 2 ; (2*p1 + p0 + q1 + 2)/4
    psrlw   t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
    pxor    t0, t1
    pxor    t1, %1
    pand    t0, %8
    pand    t1, %7
    pxor    t0, t1
    pxor    t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
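
; The final pxor/pand sequence in LUMA_INTRA_P012 is a branchless select:
; where mask1p (%8) is set the strong 5-tap p0' is kept, where only mask0 (%7)
; is set the 4-tap (2*p1 + p0 + q1 + 2)>>2 value is used, and elsewhere p0
; passes through unchanged.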

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB     rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0 ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0 ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2] ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5 ; q2
    mova    %1, t2 ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4 ; p2
    mova    %3, t2 ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2 ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0 ; p3
    mova    t5, t1 ; p2
    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0 ; q2
    mova    t7, t1 ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0 ; p3
    mova    t5, t1 ; p2
    mova    t6, t2 ; q2
    mova    t7, t3 ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r4-8], m%4
    movq    m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r5-8], m%4
    movhps  [r4-8], m%1
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
%ifnum %8
    SWAP    %1, %8
%else
    mova    m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r5], m%1
    movhps  [r4], m%5
    movhps  [r4+r1], m%6
    movhps  [r4+r1*2], m%7
    movhps  [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0 ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0 ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif
%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl     r2d, 2
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5] ; p0
    mova    m2, [r0] ; q0
    mova    m3, [r0+r1] ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0 ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro
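
; i.e. CHROMA_DEBLOCK_P0_Q0_INTRA applies, wherever the mask is set:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2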

%macro CHROMA_V_LOAD 1
    mova    m0, [r0] ; p1
    mova    m1, [r0+r1] ; p0
    mova    m2, [%1] ; q0
    mova    m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova    [r0+1*r1], m1
    mova    [r0+2*r1], m2
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd      %1, [%2]
    punpcklbw %1, %1
    punpcklwd %1, %1
    psraw     %1, 6
%endmacro
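
; CHROMA_V_LOAD_TC expands the int8_t tc0 values like LOAD_TC (sign-extend,
; roughly x4 for 10-bit), but duplicates each tc0 across four adjacent words.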

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov     r5, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor    m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw   m6, [pw_3]
    pmaxsw  m6, m4
    pand    m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r5, mmsize
    add     r4, mmsize/4
    dec     r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov     r4, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r4, mmsize
    dec     r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif