hscale_fast_bilinear_simd.c
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;
    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
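
    /* Rough sketch of what happens below: each of the two __asm__ blocks only
     * plants one code template in the text section; the template body itself
     * is jumped over at run time ("jmp 9f"), and the trailing lea/sub
     * arithmetic merely reports the template's address, its length, and the
     * byte offsets of the two patchable pshufw immediates (the bytes just
     * before labels 1: and 2:). The loop further down copies one template per
     * group of four output pixels into filterCode, patches those immediates,
     * and terminates the buffer with a RET so it can be invoked with a plain
     * CALL. */
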
    // code fragment
    __asm__ volatile (
        "jmp 9f                                     \n\t"
        // Begin
        "0:                                         \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3   \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0   \n\t"
        "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1  \n\t"
        "punpcklbw %%mm7, %%mm1                     \n\t"
        "punpcklbw %%mm7, %%mm0                     \n\t"
        "pshufw $0xFF, %%mm1, %%mm1                 \n\t"
        "1:                                         \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                 \n\t"
        "2:                                         \n\t"
        "psubw %%mm1, %%mm0                         \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi  \n\t"
        "pmullw %%mm3, %%mm0                        \n\t"
        "psllw $7, %%mm1                            \n\t"
        "paddw %%mm1, %%mm0                         \n\t"
        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")   \n\t"
        "add $8, %%"FF_REG_a"                       \n\t"
        // End
        "9:                                         \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0               \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1               \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2               \n\t"
        "dec %1                                     \n\t"
        "dec %2                                     \n\t"
        "sub %0, %1                                 \n\t"
        "sub %0, %2                                 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3               \n\t"
        "sub %0, %3                                 \n\t"
        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );

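    /* Second template (fragmentB): used when all four output pixels of a group
     * can be served by a single 4-byte load; both the left and the right
     * neighbour of each output pixel are then picked out of the same quad with
     * pshufw, so the extra unaligned load from src + 1 is not needed. */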
    __asm__ volatile (
        "jmp 9f                                     \n\t"
        // Begin
        "0:                                         \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3   \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0   \n\t"
        "punpcklbw %%mm7, %%mm0                     \n\t"
        "pshufw $0xFF, %%mm0, %%mm1                 \n\t"
        "1:                                         \n\t"
        "pshufw $0xFF, %%mm0, %%mm0                 \n\t"
        "2:                                         \n\t"
        "psubw %%mm1, %%mm0                         \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi  \n\t"
        "pmullw %%mm3, %%mm0                        \n\t"
        "psllw $7, %%mm1                            \n\t"
        "paddw %%mm1, %%mm0                         \n\t"
        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a")   \n\t"
        "add $8, %%"FF_REG_a"                       \n\t"
        // End
        "9:                                         \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0               \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1               \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2               \n\t"
        "dec %1                                     \n\t"
        "dec %2                                     \n\t"
        "sub %0, %1                                 \n\t"
        "sub %0, %2                                 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3               \n\t"
        "sub %0, %3                                 \n\t"
        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;
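
    /* xpos walks the source position in 16.16 fixed point: xpos >> 16 is the
     * integer source index, the low 16 bits are the sub-pixel position that
     * turns into the blend weight. The loop covers dstW / numSplits output
     * pixels and emits one template fragment per group of four. */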
    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;
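
            /* a..d are the source offsets of the group's four output pixels
             * relative to xx (always a == 0). If even the last pixel's right
             * neighbour still lies within the first four source bytes
             * (d + 1 < 4), the cheaper fragmentB is chosen and inc == 1 bumps
             * the right-neighbour shuffle selectors accordingly; otherwise
             * fragmentA with its second, offset-by-one load is used. */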
            if (filterCode) {
                filter[i]        = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4)    |
                                                          (d << 6);
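
                /* Each pshufw immediate packs four 2-bit word selectors, one
                 * per output pixel: the first patched immediate picks the
                 * right-hand neighbours, the second the left-hand ones. The
                 * 7-bit filter values above (0..127) are the inverted
                 * fractional positions, matching the "psllw $7" scaling inside
                 * the template. The alignment fix-up below adds `shift` to all
                 * four selectors at once via 0x55 * shift. */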
                if (i + 4 - inc >= dstW)
                    shift = maxShift;             // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
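
            /* The RET written here is overwritten by the next group's memcpy;
             * only the one following the last fragment survives, so filterCode
             * ends up as a single callable routine that produces roughly
             * dstW / numSplits output pixels per invocation. The size returned
             * below, fragmentPos + 1, includes that final RET byte. */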
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;

#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif
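
    /* The generated code is entered with CALL. On x86-64 the CALL pushes its
     * return address into the red zone at -8(%rsp), which may hold compiler
     * data, so that slot is saved in retsave and restored afterwards. On
     * x86-32 builds where EBX is reserved (HAVE_EBX_AVAILABLE is 0, e.g. as
     * the PIC register) it cannot be listed as a clobber, so it is saved and
     * restored by hand instead. */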
    __asm__ volatile(
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", %5            \n\t"  // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov            %%"FF_REG_b", %5            \n\t"  // ebxsave
#endif
#endif
        "pxor                  %%mm7, %%mm7         \n\t"
        "mov                      %0, %%"FF_REG_c"  \n\t"
        "mov                      %1, %%"FF_REG_D"  \n\t"
        "mov                      %2, %%"FF_REG_d"  \n\t"
        "mov                      %3, %%"FF_REG_b"  \n\t"
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t"  // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
    "movl            (%%"FF_REG_b"), %%esi          \n\t"\
    "call                        *%4                \n\t"\
    "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi       \n\t"\
    "add            %%"FF_REG_S", %%"FF_REG_c"      \n\t"\
    "add            %%"FF_REG_a", %%"FF_REG_D"      \n\t"\
    "xor            %%"FF_REG_a", %%"FF_REG_a"      \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
    "movl            (%%"FF_REG_b"), %%esi          \n\t"\
    "call                        *%4                \n\t"\
    "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
    "add            %%"FF_REG_a", %%"FF_REG_D"      \n\t"\
    "xor            %%"FF_REG_a", %%"FF_REG_a"      \n\t"\

#endif /* ARCH_X86_64 */
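
        /* One expansion of this macro runs the generated routine once: it
         * loads the first per-group source offset from filterPos into esi,
         * CALLs the runtime-built code, then advances the source pointer to
         * where the next chunk starts (also taken from filterPos), advances
         * the destination pointer by the bytes just written, and clears the
         * output counter in FF_REG_a. */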
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                      %5, %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", -8(%%rsp)     \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                      %5, %%"FF_REG_b"  \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );
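
    /* The rightmost output pixels, whose source position reaches the last
     * input pixel, are written here by replicating src[srcW - 1], scaled by
     * 128 to match the 7-bit filter precision used above. */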
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;

#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
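
    /* Same save/restore of -8(%rsp) resp. EBX as in the luma scaler above. The
     * generated code is run four times for src1/dst1, then the pointers are
     * switched to src2/dst2 and it is run four more times. */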
    __asm__ volatile(
#if ARCH_X86_64
        "mov               -8(%%rsp), %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", %7            \n\t"  // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov            %%"FF_REG_b", %7            \n\t"  // ebxsave
#endif
#endif
        "pxor                  %%mm7, %%mm7         \n\t"
        "mov                      %0, %%"FF_REG_c"  \n\t"
        "mov                      %1, %%"FF_REG_D"  \n\t"
        "mov                      %2, %%"FF_REG_d"  \n\t"
        "mov                      %3, %%"FF_REG_b"  \n\t"
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t"  // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor            %%"FF_REG_a", %%"FF_REG_a"  \n\t"  // i
        "mov                      %5, %%"FF_REG_c"  \n\t"  // src2
        "mov                      %6, %%"FF_REG_D"  \n\t"  // dst2
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                      %7, %%"FF_REG_a"  \n\t"
        "mov            %%"FF_REG_a", -8(%%rsp)     \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                      %7, %%"FF_REG_b"  \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}

#endif //HAVE_INLINE_ASM