/* yadif_template.c */
/*
 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
  20. #ifdef COMPILE_TEMPLATE_SSE
  21. #define MM "%%xmm"
  22. #define MOV "movq"
  23. #define MOVQ "movdqa"
  24. #define MOVQU "movdqu"
  25. #define STEP 8
  26. #define LOAD(mem,dst) \
  27. MOV" "mem", "dst" \n\t"\
  28. "punpcklbw "MM"7, "dst" \n\t"
  29. #define PSRL1(reg) "psrldq $1, "reg" \n\t"
  30. #define PSRL2(reg) "psrldq $2, "reg" \n\t"
  31. #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
  32. "psrldq $2, "src" \n\t"
  33. #else
  34. #define MM "%%mm"
  35. #define MOV "movd"
  36. #define MOVQ "movq"
  37. #define MOVQU "movq"
  38. #define STEP 4
  39. #define LOAD(mem,dst) \
  40. MOV" "mem", "dst" \n\t"\
  41. "punpcklbw "MM"7, "dst" \n\t"
  42. #define PSRL1(reg) "psrlq $8, "reg" \n\t"
  43. #define PSRL2(reg) "psrlq $16, "reg" \n\t"
  44. #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
  45. #endif
  46. #ifdef COMPILE_TEMPLATE_SSSE3
  47. #define PABS(tmp,dst) \
  48. "pabsw "dst", "dst" \n\t"
  49. #else
  50. #define PABS(tmp,dst) \
  51. "pxor "tmp", "tmp" \n\t"\
  52. "psubw "dst", "tmp" \n\t"\
  53. "pmaxsw "tmp", "dst" \n\t"
  54. #endif
  55. #define CHECK(pj,mj) \
  56. MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
  57. MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
  58. MOVQ" "MM"2, "MM"4 \n\t"\
  59. MOVQ" "MM"2, "MM"5 \n\t"\
  60. "pxor "MM"3, "MM"4 \n\t"\
  61. "pavgb "MM"3, "MM"5 \n\t"\
  62. "pand "MANGLE(ff_pb_1)", "MM"4 \n\t"\
  63. "psubusb "MM"4, "MM"5 \n\t"\
  64. PSRL1(MM"5") \
  65. "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
  66. MOVQ" "MM"2, "MM"4 \n\t"\
  67. "psubusb "MM"3, "MM"2 \n\t"\
  68. "psubusb "MM"4, "MM"3 \n\t"\
  69. "pmaxub "MM"3, "MM"2 \n\t"\
  70. MOVQ" "MM"2, "MM"3 \n\t"\
  71. MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
  72. PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
  73. PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
  74. "punpcklbw "MM"7, "MM"2 \n\t"\
  75. "punpcklbw "MM"7, "MM"3 \n\t"\
  76. "punpcklbw "MM"7, "MM"4 \n\t"\
  77. "paddw "MM"3, "MM"2 \n\t"\
  78. "paddw "MM"4, "MM"2 \n\t" /* score */
  79. #define CHECK1 \
  80. MOVQ" "MM"0, "MM"3 \n\t"\
  81. "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
  82. "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
  83. MOVQ" "MM"3, "MM"6 \n\t"\
  84. "pand "MM"3, "MM"5 \n\t"\
  85. "pandn "MM"1, "MM"3 \n\t"\
  86. "por "MM"5, "MM"3 \n\t"\
  87. MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
  88. #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
  89. hurts both quality and speed, but matches the C version. */\
  90. "paddw "MANGLE(ff_pw_1)", "MM"6 \n\t"\
  91. "psllw $14, "MM"6 \n\t"\
  92. "paddsw "MM"6, "MM"2 \n\t"\
  93. MOVQ" "MM"0, "MM"3 \n\t"\
  94. "pcmpgtw "MM"2, "MM"3 \n\t"\
  95. "pminsw "MM"2, "MM"0 \n\t"\
  96. "pand "MM"3, "MM"5 \n\t"\
  97. "pandn "MM"1, "MM"3 \n\t"\
  98. "por "MM"5, "MM"3 \n\t"\
  99. MOVQ" "MM"3, "MM"1 \n\t"
  100. void RENAME(ff_yadif_filter_line)(uint8_t *dst,
  101. uint8_t *prev, uint8_t *cur, uint8_t *next,
  102. int w, int refs, int parity, int mode)
  103. {
  104. DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
  105. DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
  106. DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
  107. DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
  108. int x;
  109. #define FILTER\
  110. for(x=0; x<w; x+=STEP){\
  111. __asm__ volatile(\
  112. "pxor "MM"7, "MM"7 \n\t"\
  113. LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
  114. LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
  115. LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
  116. LOAD("(%["next2"])", MM"3") /* next2[x] */\
  117. MOVQ" "MM"3, "MM"4 \n\t"\
  118. "paddw "MM"2, "MM"3 \n\t"\
  119. "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
  120. MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
  121. MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
  122. MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
  123. "psubw "MM"4, "MM"2 \n\t"\
  124. PABS( MM"4", MM"2") /* temporal_diff0 */\
  125. LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
  126. LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
  127. "psubw "MM"0, "MM"3 \n\t"\
  128. "psubw "MM"1, "MM"4 \n\t"\
  129. PABS( MM"5", MM"3")\
  130. PABS( MM"5", MM"4")\
  131. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
  132. "psrlw $1, "MM"2 \n\t"\
  133. "psrlw $1, "MM"3 \n\t"\
  134. "pmaxsw "MM"3, "MM"2 \n\t"\
  135. LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
  136. LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
  137. "psubw "MM"0, "MM"3 \n\t"\
  138. "psubw "MM"1, "MM"4 \n\t"\
  139. PABS( MM"5", MM"3")\
  140. PABS( MM"5", MM"4")\
  141. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
  142. "psrlw $1, "MM"3 \n\t"\
  143. "pmaxsw "MM"3, "MM"2 \n\t"\
  144. MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
  145. \
  146. "paddw "MM"0, "MM"1 \n\t"\
  147. "paddw "MM"0, "MM"0 \n\t"\
  148. "psubw "MM"1, "MM"0 \n\t"\
  149. "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
  150. PABS( MM"2", MM"0") /* ABS(c-e) */\
  151. \
  152. MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
  153. MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
  154. MOVQ" "MM"2, "MM"4 \n\t"\
  155. "psubusb "MM"3, "MM"2 \n\t"\
  156. "psubusb "MM"4, "MM"3 \n\t"\
  157. "pmaxub "MM"3, "MM"2 \n\t"\
  158. PSHUF(MM"3", MM"2") \
  159. "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
  160. "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
  161. "paddw "MM"2, "MM"0 \n\t"\
  162. "paddw "MM"3, "MM"0 \n\t"\
  163. "psubw "MANGLE(ff_pw_1)", "MM"0 \n\t" /* spatial_score */\
  164. \
  165. CHECK(-2,0)\
  166. CHECK1\
  167. CHECK(-3,1)\
  168. CHECK2\
  169. CHECK(0,-2)\
  170. CHECK1\
  171. CHECK(1,-3)\
  172. CHECK2\
  173. \
  174. /* if(p->mode<2) ... */\
  175. MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
  176. "cmpl $2, %[mode] \n\t"\
  177. "jge 1f \n\t"\
  178. LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
  179. LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
  180. LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
  181. LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
  182. "paddw "MM"4, "MM"2 \n\t"\
  183. "paddw "MM"5, "MM"3 \n\t"\
  184. "psrlw $1, "MM"2 \n\t" /* b */\
  185. "psrlw $1, "MM"3 \n\t" /* f */\
  186. MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
  187. MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
  188. MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
  189. "psubw "MM"4, "MM"2 \n\t" /* b-c */\
  190. "psubw "MM"7, "MM"3 \n\t" /* f-e */\
  191. MOVQ" "MM"5, "MM"0 \n\t"\
  192. "psubw "MM"4, "MM"5 \n\t" /* d-c */\
  193. "psubw "MM"7, "MM"0 \n\t" /* d-e */\
  194. MOVQ" "MM"2, "MM"4 \n\t"\
  195. "pminsw "MM"3, "MM"2 \n\t"\
  196. "pmaxsw "MM"4, "MM"3 \n\t"\
  197. "pmaxsw "MM"5, "MM"2 \n\t"\
  198. "pminsw "MM"5, "MM"3 \n\t"\
  199. "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
  200. "pminsw "MM"0, "MM"3 \n\t" /* min */\
  201. "pxor "MM"4, "MM"4 \n\t"\
  202. "pmaxsw "MM"3, "MM"6 \n\t"\
  203. "psubw "MM"2, "MM"4 \n\t" /* -max */\
  204. "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
  205. "1: \n\t"\
  206. \
  207. MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
  208. MOVQ" "MM"2, "MM"3 \n\t"\
  209. "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
  210. "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
  211. "pmaxsw "MM"2, "MM"1 \n\t"\
  212. "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
  213. "packuswb "MM"1, "MM"1 \n\t"\
  214. \
  215. :[tmp0]"=m"(tmp0),\
  216. [tmp1]"=m"(tmp1),\
  217. [tmp2]"=m"(tmp2),\
  218. [tmp3]"=m"(tmp3)\
  219. :[prev] "r"(prev),\
  220. [cur] "r"(cur),\
  221. [next] "r"(next),\
  222. [prefs]"r"((x86_reg)refs),\
  223. [mrefs]"r"((x86_reg)-refs),\
  224. [mode] "g"(mode)\
  225. );\
  226. __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
  227. dst += STEP;\
  228. prev+= STEP;\
  229. cur += STEP;\
  230. next+= STEP;\
  231. }
  232. if (parity) {
  233. #define prev2 "prev"
  234. #define next2 "cur"
  235. FILTER
  236. #undef prev2
  237. #undef next2
  238. } else {
  239. #define prev2 "cur"
  240. #define next2 "next"
  241. FILTER
  242. #undef prev2
  243. #undef next2
  244. }
  245. }
  246. #undef STEP
  247. #undef MM
  248. #undef MOV
  249. #undef MOVQ
  250. #undef MOVQU
  251. #undef PSHUF
  252. #undef PSRL1
  253. #undef PSRL2
  254. #undef LOAD
  255. #undef PABS
  256. #undef CHECK
  257. #undef CHECK1
  258. #undef CHECK2
  259. #undef FILTER