yadif_template.c 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
  20. #ifdef COMPILE_TEMPLATE_SSE
  21. #define MM "%%xmm"
  22. #define MOV "movq"
  23. #define MOVQ "movdqa"
  24. #define MOVQU "movdqu"
  25. #define STEP 8
  26. #define LOAD(mem,dst) \
  27. MOV" "mem", "dst" \n\t"\
  28. "punpcklbw "MM"7, "dst" \n\t"
  29. #define PSRL1(reg) "psrldq $1, "reg" \n\t"
  30. #define PSRL2(reg) "psrldq $2, "reg" \n\t"
  31. #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
  32. "psrldq $2, "src" \n\t"
  33. #else
  34. #define MM "%%mm"
  35. #define MOV "movd"
  36. #define MOVQ "movq"
  37. #define MOVQU "movq"
  38. #define STEP 4
  39. #define LOAD(mem,dst) \
  40. MOV" "mem", "dst" \n\t"\
  41. "punpcklbw "MM"7, "dst" \n\t"
  42. #define PSRL1(reg) "psrlq $8, "reg" \n\t"
  43. #define PSRL2(reg) "psrlq $16, "reg" \n\t"
  44. #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
  45. #endif
  46. #ifdef COMPILE_TEMPLATE_SSSE3
  47. #define PABS(tmp,dst) \
  48. "pabsw "dst", "dst" \n\t"
  49. #else
  50. #define PABS(tmp,dst) \
  51. "pxor "tmp", "tmp" \n\t"\
  52. "psubw "dst", "tmp" \n\t"\
  53. "pmaxsw "tmp", "dst" \n\t"
  54. #endif
  55. #define CHECK(pj,mj) \
  56. MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
  57. MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
  58. MOVQ" "MM"2, "MM"4 \n\t"\
  59. MOVQ" "MM"2, "MM"5 \n\t"\
  60. "pxor "MM"3, "MM"4 \n\t"\
  61. "pavgb "MM"3, "MM"5 \n\t"\
  62. "pand "MANGLE(pb_1)", "MM"4 \n\t"\
  63. "psubusb "MM"4, "MM"5 \n\t"\
  64. PSRL1(MM"5") \
  65. "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
  66. MOVQ" "MM"2, "MM"4 \n\t"\
  67. "psubusb "MM"3, "MM"2 \n\t"\
  68. "psubusb "MM"4, "MM"3 \n\t"\
  69. "pmaxub "MM"3, "MM"2 \n\t"\
  70. MOVQ" "MM"2, "MM"3 \n\t"\
  71. MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
  72. PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
  73. PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
  74. "punpcklbw "MM"7, "MM"2 \n\t"\
  75. "punpcklbw "MM"7, "MM"3 \n\t"\
  76. "punpcklbw "MM"7, "MM"4 \n\t"\
  77. "paddw "MM"3, "MM"2 \n\t"\
  78. "paddw "MM"4, "MM"2 \n\t" /* score */
  79. #define CHECK1 \
  80. MOVQ" "MM"0, "MM"3 \n\t"\
  81. "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
  82. "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
  83. MOVQ" "MM"3, "MM"6 \n\t"\
  84. "pand "MM"3, "MM"5 \n\t"\
  85. "pandn "MM"1, "MM"3 \n\t"\
  86. "por "MM"5, "MM"3 \n\t"\
  87. MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
  88. #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
  89. hurts both quality and speed, but matches the C version. */\
  90. "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
  91. "psllw $14, "MM"6 \n\t"\
  92. "paddsw "MM"6, "MM"2 \n\t"\
  93. MOVQ" "MM"0, "MM"3 \n\t"\
  94. "pcmpgtw "MM"2, "MM"3 \n\t"\
  95. "pminsw "MM"2, "MM"0 \n\t"\
  96. "pand "MM"3, "MM"5 \n\t"\
  97. "pandn "MM"1, "MM"3 \n\t"\
  98. "por "MM"5, "MM"3 \n\t"\
  99. MOVQ" "MM"3, "MM"1 \n\t"
  100. void RENAME(ff_yadif_filter_line)(uint8_t *dst,
  101. uint8_t *prev, uint8_t *cur, uint8_t *next,
  102. int w, int prefs, int mrefs, int parity, int mode)
  103. {
  104. uint8_t tmp[5*16];
  105. uint8_t *tmpA= (uint8_t*)(((uint64_t)(tmp+15)) & ~15);
  106. int x;
  107. #define FILTER\
  108. for(x=0; x<w; x+=STEP){\
  109. __asm__ volatile(\
  110. "pxor "MM"7, "MM"7 \n\t"\
  111. LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
  112. LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
  113. LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
  114. LOAD("(%["next2"])", MM"3") /* next2[x] */\
  115. MOVQ" "MM"3, "MM"4 \n\t"\
  116. "paddw "MM"2, "MM"3 \n\t"\
  117. "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
  118. MOVQ" "MM"0, (%[tmpA]) \n\t" /* c */\
  119. MOVQ" "MM"3, 16(%[tmpA]) \n\t" /* d */\
  120. MOVQ" "MM"1, 32(%[tmpA]) \n\t" /* e */\
  121. "psubw "MM"4, "MM"2 \n\t"\
  122. PABS( MM"4", MM"2") /* temporal_diff0 */\
  123. LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
  124. LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
  125. "psubw "MM"0, "MM"3 \n\t"\
  126. "psubw "MM"1, "MM"4 \n\t"\
  127. PABS( MM"5", MM"3")\
  128. PABS( MM"5", MM"4")\
  129. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
  130. "psrlw $1, "MM"2 \n\t"\
  131. "psrlw $1, "MM"3 \n\t"\
  132. "pmaxsw "MM"3, "MM"2 \n\t"\
  133. LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
  134. LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
  135. "psubw "MM"0, "MM"3 \n\t"\
  136. "psubw "MM"1, "MM"4 \n\t"\
  137. PABS( MM"5", MM"3")\
  138. PABS( MM"5", MM"4")\
  139. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
  140. "psrlw $1, "MM"3 \n\t"\
  141. "pmaxsw "MM"3, "MM"2 \n\t"\
  142. MOVQ" "MM"2, 48(%[tmpA]) \n\t" /* diff */\
  143. \
  144. "paddw "MM"0, "MM"1 \n\t"\
  145. "paddw "MM"0, "MM"0 \n\t"\
  146. "psubw "MM"1, "MM"0 \n\t"\
  147. "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
  148. PABS( MM"2", MM"0") /* ABS(c-e) */\
  149. \
  150. MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
  151. MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
  152. MOVQ" "MM"2, "MM"4 \n\t"\
  153. "psubusb "MM"3, "MM"2 \n\t"\
  154. "psubusb "MM"4, "MM"3 \n\t"\
  155. "pmaxub "MM"3, "MM"2 \n\t"\
  156. PSHUF(MM"3", MM"2") \
  157. "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
  158. "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
  159. "paddw "MM"2, "MM"0 \n\t"\
  160. "paddw "MM"3, "MM"0 \n\t"\
  161. "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
  162. \
  163. CHECK(-2,0)\
  164. CHECK1\
  165. CHECK(-3,1)\
  166. CHECK2\
  167. CHECK(0,-2)\
  168. CHECK1\
  169. CHECK(1,-3)\
  170. CHECK2\
  171. \
  172. /* if(p->mode<2) ... */\
  173. MOVQ" 48(%[tmpA]), "MM"6 \n\t" /* diff */\
  174. "cmpl $2, %[mode] \n\t"\
  175. "jge 1f \n\t"\
  176. LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
  177. LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
  178. LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
  179. LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
  180. "paddw "MM"4, "MM"2 \n\t"\
  181. "paddw "MM"5, "MM"3 \n\t"\
  182. "psrlw $1, "MM"2 \n\t" /* b */\
  183. "psrlw $1, "MM"3 \n\t" /* f */\
  184. MOVQ" (%[tmpA]), "MM"4 \n\t" /* c */\
  185. MOVQ" 16(%[tmpA]), "MM"5 \n\t" /* d */\
  186. MOVQ" 32(%[tmpA]), "MM"7 \n\t" /* e */\
  187. "psubw "MM"4, "MM"2 \n\t" /* b-c */\
  188. "psubw "MM"7, "MM"3 \n\t" /* f-e */\
  189. MOVQ" "MM"5, "MM"0 \n\t"\
  190. "psubw "MM"4, "MM"5 \n\t" /* d-c */\
  191. "psubw "MM"7, "MM"0 \n\t" /* d-e */\
  192. MOVQ" "MM"2, "MM"4 \n\t"\
  193. "pminsw "MM"3, "MM"2 \n\t"\
  194. "pmaxsw "MM"4, "MM"3 \n\t"\
  195. "pmaxsw "MM"5, "MM"2 \n\t"\
  196. "pminsw "MM"5, "MM"3 \n\t"\
  197. "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
  198. "pminsw "MM"0, "MM"3 \n\t" /* min */\
  199. "pxor "MM"4, "MM"4 \n\t"\
  200. "pmaxsw "MM"3, "MM"6 \n\t"\
  201. "psubw "MM"2, "MM"4 \n\t" /* -max */\
  202. "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
  203. "1: \n\t"\
  204. \
  205. MOVQ" 16(%[tmpA]), "MM"2 \n\t" /* d */\
  206. MOVQ" "MM"2, "MM"3 \n\t"\
  207. "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
  208. "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
  209. "pmaxsw "MM"2, "MM"1 \n\t"\
  210. "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
  211. "packuswb "MM"1, "MM"1 \n\t"\
  212. \
  213. :\
  214. :[tmpA] "r"(tmpA),\
  215. [prev] "r"(prev),\
  216. [cur] "r"(cur),\
  217. [next] "r"(next),\
  218. [prefs]"r"((x86_reg)prefs),\
  219. [mrefs]"r"((x86_reg)mrefs),\
  220. [mode] "g"(mode)\
  221. );\
  222. __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
  223. dst += STEP;\
  224. prev+= STEP;\
  225. cur += STEP;\
  226. next+= STEP;\
  227. }
  228. if (parity) {
  229. #define prev2 "prev"
  230. #define next2 "cur"
  231. FILTER
  232. #undef prev2
  233. #undef next2
  234. } else {
  235. #define prev2 "cur"
  236. #define next2 "next"
  237. FILTER
  238. #undef prev2
  239. #undef next2
  240. }
  241. }
  242. #undef STEP
  243. #undef MM
  244. #undef MOV
  245. #undef MOVQ
  246. #undef MOVQU
  247. #undef PSHUF
  248. #undef PSRL1
  249. #undef PSRL2
  250. #undef LOAD
  251. #undef PABS
  252. #undef CHECK
  253. #undef CHECK1
  254. #undef CHECK2
  255. #undef FILTER