/* yadif_template.c */
  /*
   * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   * GNU General Public License for more details.
   *
   * You should have received a copy of the GNU General Public License along
   * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
   */
  /*
   * Instruction-set dependent building blocks for the inline assembly below.
   * All of them expand to string literals (AT&T operand order: source first,
   * destination last) that are concatenated into one asm template.
   *
   *   MM     register name prefix ("%%xmm" or "%%mm"); append the register number.
   *   MOV    load/store of STEP packed bytes (movq = 8 bytes, movd = 4 bytes).
   *   MOVQ   full-width register/memory move (movdqa requires 16-byte alignment
   *          for memory operands).
   *   MOVQU  full-width move that tolerates unaligned memory operands.
   *   STEP   number of pixels produced per loop iteration.
   *   PSRL1  shift the whole register right by one byte.
   *   PSRL2  shift the whole register right by two bytes.
   *   PSHUF(src,dst)
   *          despite the parameter names, the FIRST argument is written: it
   *          receives the second argument shifted right by two bytes. In the MMX
   *          variant (pshufw $9) only the two low 16-bit words follow that
   *          pattern, which is all that a subsequent punpcklbw consumes.
   */
  #ifdef COMPILE_TEMPLATE_SSE2
  #define MM "%%xmm"
  #define MOV "movq"
  #define MOVQ "movdqa"
  #define MOVQU "movdqu"
  #define STEP 8
  #define PSRL1(reg) "psrldq $1, "reg" \n\t"
  #define PSRL2(reg) "psrldq $2, "reg" \n\t"
  #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
                         "psrldq $2, "src" \n\t"
  #else
  #define MM "%%mm"
  #define MOV "movd"
  #define MOVQ "movq"
  #define MOVQU "movq"
  #define STEP 4
  #define PSRL1(reg) "psrlq $8, "reg" \n\t"
  #define PSRL2(reg) "psrlq $16, "reg" \n\t"
  #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
  #endif

  /*
   * LOAD(mem,dst): fetch STEP bytes from mem into dst and widen them to 16-bit
   * words by interleaving with register 7, which the surrounding asm is expected
   * to hold at zero. The definition is identical for both instruction sets; MOV
   * and MM are resolved when LOAD is expanded.
   */
  #define LOAD(mem,dst) \
      MOV" "mem", "dst" \n\t"\
      "punpcklbw "MM"7, "dst" \n\t"
  /*
   * PABS(tmp,dst): replace every signed 16-bit word of dst by its absolute value.
   *
   * With SSSE3 this is a single pabsw and tmp is left untouched. Otherwise tmp is
   * used as scratch: tmp = 0 - dst, then dst = max(dst, tmp).
   */
  #ifdef COMPILE_TEMPLATE_SSSE3
  #define PABS(tmp,dst) \
      "pabsw "dst", "dst" \n\t"
  #else
  #define PABS(tmp,dst) \
      "pxor "tmp", "tmp" \n\t"\
      "psubw "dst", "tmp" \n\t"\
      "pmaxsw "tmp", "dst" \n\t"
  #endif
  /*
   * CHECK(pj,mj): evaluate one diagonal interpolation direction.
   *
   * pj and mj are byte displacements applied to the row at cur+mrefs and the row
   * at cur+prefs respectively; they are stringified into the addressing mode.
   *
   * On exit:
   *   register 5  candidate prediction, the truncating average of the two rows
   *               (pavgb rounds up, so the low bit of the XOR is subtracted
   *               again), shifted by one byte and widened to 16-bit words;
   *   register 2  score: sum of the absolute row differences at three adjacent
   *               byte positions, as 16-bit words.
   * Registers 3 and 4 are clobbered; register 7 must be zero.
   *
   * NOTE(review): pb_1 is defined elsewhere; it is presumably a vector with every
   * byte set to 1 -- confirm against its definition.
   */
  #define CHECK(pj,mj) \
      MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
      MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
      MOVQ" "MM"2, "MM"4 \n\t"\
      MOVQ" "MM"2, "MM"5 \n\t"\
      "pxor "MM"3, "MM"4 \n\t"\
      "pavgb "MM"3, "MM"5 \n\t"\
      "pand "MANGLE(pb_1)", "MM"4 \n\t"\
      "psubusb "MM"4, "MM"5 \n\t"\
      PSRL1(MM"5") \
      "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
      MOVQ" "MM"2, "MM"4 \n\t"\
      "psubusb "MM"3, "MM"2 \n\t"\
      "psubusb "MM"4, "MM"3 \n\t"\
      "pmaxub "MM"3, "MM"2 \n\t"\
      MOVQ" "MM"2, "MM"3 \n\t"\
      MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
      PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
      PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
      "punpcklbw "MM"7, "MM"2 \n\t"\
      "punpcklbw "MM"7, "MM"3 \n\t"\
      "punpcklbw "MM"7, "MM"4 \n\t"\
      "paddw "MM"3, "MM"2 \n\t"\
      "paddw "MM"4, "MM"2 \n\t" /* score */
  /*
   * CHECK1: accept the direction just evaluated by CHECK where it scores better.
   *
   * Inputs:  register 0 = best spatial_score so far, register 1 = current
   *          spatial_pred, register 2 = new score, register 5 = new prediction.
   * Outputs: register 0 = per-word minimum of both scores,
   *          register 1 = new prediction where score < spatial_score, otherwise
   *                       the previous prediction,
   *          register 6 = the comparison mask, kept for a following CHECK2.
   * Registers 3 and 5 are clobbered.
   */
  #define CHECK1 \
      MOVQ" "MM"0, "MM"3 \n\t"\
      "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
      "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
      MOVQ" "MM"3, "MM"6 \n\t"\
      "pand "MM"3, "MM"5 \n\t"\
      "pandn "MM"1, "MM"3 \n\t"\
      "por "MM"5, "MM"3 \n\t"\
      MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
  /*
   * CHECK2: like CHECK1, but only for lanes in which the preceding CHECK1
   * succeeded.
   *
   * pretend not to have checked dir=2 if dir=1 was bad.
   * hurts both quality and speed, but matches the C version.
   *
   * Register 6 holds the CHECK1 mask (all ones where it succeeded). Adding pw_1
   * and shifting left by 14 turns it into a per-word penalty that is zero for
   * accepted lanes; the penalty is added with signed saturation to the new score
   * in register 2 before the usual compare-and-select. Registers 3, 5 and 6 are
   * clobbered.
   *
   * NOTE(review): pw_1 is defined elsewhere; it is presumably a vector with every
   * 16-bit word set to 1 -- confirm against its definition.
   */
  #define CHECK2 \
      "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
      "psllw $14, "MM"6 \n\t"\
      "paddsw "MM"6, "MM"2 \n\t"\
      MOVQ" "MM"0, "MM"3 \n\t"\
      "pcmpgtw "MM"2, "MM"3 \n\t"\
      "pminsw "MM"2, "MM"0 \n\t"\
      "pand "MM"3, "MM"5 \n\t"\
      "pandn "MM"1, "MM"3 \n\t"\
      "por "MM"5, "MM"3 \n\t"\
      MOVQ" "MM"3, "MM"1 \n\t"
  100. static void RENAME(yadif_filter_line)(uint8_t *dst, uint8_t *prev, uint8_t *cur,
  101. uint8_t *next, int w, int prefs,
  102. int mrefs, int parity, int mode)
  103. {
  104. uint8_t tmpU[5*16];
  105. uint8_t *tmp= (uint8_t*)(((uint64_t)(tmpU+15)) & ~15);
  106. int x;
  107. #define FILTER\
  108. for(x=0; x<w; x+=STEP){\
  109. __asm__ volatile(\
  110. "pxor "MM"7, "MM"7 \n\t"\
  111. LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
  112. LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
  113. LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
  114. LOAD("(%["next2"])", MM"3") /* next2[x] */\
  115. MOVQ" "MM"3, "MM"4 \n\t"\
  116. "paddw "MM"2, "MM"3 \n\t"\
  117. "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
  118. MOVQ" "MM"0, (%[tmp]) \n\t" /* c */\
  119. MOVQ" "MM"3, 16(%[tmp]) \n\t" /* d */\
  120. MOVQ" "MM"1, 32(%[tmp]) \n\t" /* e */\
  121. "psubw "MM"4, "MM"2 \n\t"\
  122. PABS( MM"4", MM"2") /* temporal_diff0 */\
  123. LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
  124. LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
  125. "psubw "MM"0, "MM"3 \n\t"\
  126. "psubw "MM"1, "MM"4 \n\t"\
  127. PABS( MM"5", MM"3")\
  128. PABS( MM"5", MM"4")\
  129. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
  130. "psrlw $1, "MM"2 \n\t"\
  131. "psrlw $1, "MM"3 \n\t"\
  132. "pmaxsw "MM"3, "MM"2 \n\t"\
  133. LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
  134. LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
  135. "psubw "MM"0, "MM"3 \n\t"\
  136. "psubw "MM"1, "MM"4 \n\t"\
  137. PABS( MM"5", MM"3")\
  138. PABS( MM"5", MM"4")\
  139. "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
  140. "psrlw $1, "MM"3 \n\t"\
  141. "pmaxsw "MM"3, "MM"2 \n\t"\
  142. MOVQ" "MM"2, 48(%[tmp]) \n\t" /* diff */\
  143. \
  144. "paddw "MM"0, "MM"1 \n\t"\
  145. "paddw "MM"0, "MM"0 \n\t"\
  146. "psubw "MM"1, "MM"0 \n\t"\
  147. "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
  148. PABS( MM"2", MM"0") /* ABS(c-e) */\
  149. \
  150. MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
  151. MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
  152. MOVQ" "MM"2, "MM"4 \n\t"\
  153. "psubusb "MM"3, "MM"2 \n\t"\
  154. "psubusb "MM"4, "MM"3 \n\t"\
  155. "pmaxub "MM"3, "MM"2 \n\t"\
  156. PSHUF(MM"3", MM"2") \
  157. "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
  158. "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
  159. "paddw "MM"2, "MM"0 \n\t"\
  160. "paddw "MM"3, "MM"0 \n\t"\
  161. "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
  162. \
  163. CHECK(-2,0)\
  164. CHECK1\
  165. CHECK(-3,1)\
  166. CHECK2\
  167. CHECK(0,-2)\
  168. CHECK1\
  169. CHECK(1,-3)\
  170. CHECK2\
  171. \
  172. /* if(p->mode<2) ... */\
  173. MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
  174. "cmpl $2, %[mode] \n\t"\
  175. "jge 1f \n\t"\
  176. LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
  177. LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
  178. LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
  179. LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
  180. "paddw "MM"4, "MM"2 \n\t"\
  181. "paddw "MM"5, "MM"3 \n\t"\
  182. "psrlw $1, "MM"2 \n\t" /* b */\
  183. "psrlw $1, "MM"3 \n\t" /* f */\
  184. MOVQ" (%[tmp]), "MM"4 \n\t" /* c */\
  185. MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
  186. MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
  187. "psubw "MM"4, "MM"2 \n\t" /* b-c */\
  188. "psubw "MM"7, "MM"3 \n\t" /* f-e */\
  189. MOVQ" "MM"5, "MM"0 \n\t"\
  190. "psubw "MM"4, "MM"5 \n\t" /* d-c */\
  191. "psubw "MM"7, "MM"0 \n\t" /* d-e */\
  192. MOVQ" "MM"2, "MM"4 \n\t"\
  193. "pminsw "MM"3, "MM"2 \n\t"\
  194. "pmaxsw "MM"4, "MM"3 \n\t"\
  195. "pmaxsw "MM"5, "MM"2 \n\t"\
  196. "pminsw "MM"5, "MM"3 \n\t"\
  197. "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
  198. "pminsw "MM"0, "MM"3 \n\t" /* min */\
  199. "pxor "MM"4, "MM"4 \n\t"\
  200. "pmaxsw "MM"3, "MM"6 \n\t"\
  201. "psubw "MM"2, "MM"4 \n\t" /* -max */\
  202. "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
  203. "1: \n\t"\
  204. \
  205. MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
  206. MOVQ" "MM"2, "MM"3 \n\t"\
  207. "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
  208. "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
  209. "pmaxsw "MM"2, "MM"1 \n\t"\
  210. "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
  211. "packuswb "MM"1, "MM"1 \n\t"\
  212. \
  213. ::[prev] "r"(prev),\
  214. [cur] "r"(cur),\
  215. [next] "r"(next),\
  216. [prefs]"r"((x86_reg)prefs),\
  217. [mrefs]"r"((x86_reg)mrefs),\
  218. [mode] "g"(mode),\
  219. [tmp] "r"(tmp)\
  220. );\
  221. __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
  222. dst += STEP;\
  223. prev+= STEP;\
  224. cur += STEP;\
  225. next+= STEP;\
  226. }
  227. if (parity) {
  228. #define prev2 "prev"
  229. #define next2 "cur"
  230. FILTER
  231. #undef prev2
  232. #undef next2
  233. } else {
  234. #define prev2 "cur"
  235. #define next2 "next"
  236. FILTER
  237. #undef prev2
  238. #undef next2
  239. }
  240. }
  /*
   * Drop every helper macro defined by this template so that the file can be
   * included again with a different COMPILE_TEMPLATE_* selection.
   */
  #undef STEP
  #undef MM
  #undef MOV
  #undef MOVQ
  #undef MOVQU
  #undef PSHUF
  #undef PSRL1
  #undef PSRL2
  #undef LOAD
  #undef PABS
  #undef CHECK
  #undef CHECK1
  #undef CHECK2
  #undef FILTER