/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET      0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;
    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
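
    /* For orientation: per output pixel, each generated fragment computes an
     * ordinary 16.16 fixed-point bilinear blend of two neighbouring source
     * pixels using a 7-bit weight.  A scalar sketch of that step is shown
     * below with illustrative names (ref_src/ref_dst stand in for real
     * buffers and are not variables of this function; the block is not
     * compiled).  The fragments differ only in rounding details and in which
     * neighbour carries which weight, since the filter table built further
     * down stores the inverted fraction. */
#if 0
    {
        const uint8_t *ref_src = NULL; // hypothetical source line
        int16_t *ref_dst       = NULL; // hypothetical 7-bit fixed-point output
        unsigned int ref_xpos  = 0;    // 16.16 source position

        for (int n = 0; n < dstW; n++) {
            unsigned int xx     = ref_xpos >> 16;           // integer source index
            unsigned int xalpha = (ref_xpos & 0xFFFF) >> 9; // 7-bit fractional weight
            ref_dst[n] = (ref_src[xx] << 7) + (ref_src[xx + 1] - ref_src[xx]) * xalpha;
            ref_xpos  += xInc;
        }
    }
#endif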
    // code fragment

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "movd   1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1    \n\t"
        "punpcklbw                    %%mm7, %%mm1      \n\t"
        "punpcklbw                    %%mm7, %%mm0      \n\t"
        "pshufw                $0xFF, %%mm1, %%mm1      \n\t"
        "1:                                             \n\t"
        "pshufw                $0xFF, %%mm0, %%mm0      \n\t"
        "2:                                             \n\t"
        "psubw                        %%mm1, %%mm0      \n\t"
        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
        "pmullw                       %%mm3, %%mm0      \n\t"
        "psllw                           $7, %%mm1      \n\t"
        "paddw                        %%mm1, %%mm0      \n\t"
        "movq  %%mm0, (%%"FF_REG_D", %%"FF_REG_a")      \n\t"
        "add                             $8, %%"FF_REG_a" \n\t"
        // End
        "9:                                             \n\t"
        "lea    " LOCAL_MANGLE(0b) ", %0                \n\t"
        "lea    " LOCAL_MANGLE(1b) ", %1                \n\t"
        "lea    " LOCAL_MANGLE(2b) ", %2                \n\t"
        "dec                             %1             \n\t"
        "dec                             %2             \n\t"
        "sub                             %0, %1         \n\t"
        "sub                             %0, %2         \n\t"
        "lea    " LOCAL_MANGLE(9b) ", %3                \n\t"
        "sub                             %0, %3         \n\t"
        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"FF_REG_d", %%"FF_REG_a"), %%mm3    \n\t"
        "movd    (%%"FF_REG_c", %%"FF_REG_S"), %%mm0    \n\t"
        "punpcklbw                    %%mm7, %%mm0      \n\t"
        "pshufw                $0xFF, %%mm0, %%mm1      \n\t"
        "1:                                             \n\t"
        "pshufw                $0xFF, %%mm0, %%mm0      \n\t"
        "2:                                             \n\t"
        "psubw                        %%mm1, %%mm0      \n\t"
        "movl   8(%%"FF_REG_b", %%"FF_REG_a"), %%esi    \n\t"
        "pmullw                       %%mm3, %%mm0      \n\t"
        "psllw                           $7, %%mm1      \n\t"
        "paddw                        %%mm1, %%mm0      \n\t"
        "movq  %%mm0, (%%"FF_REG_D", %%"FF_REG_a")      \n\t"
        "add                             $8, %%"FF_REG_a" \n\t"
        // End
        "9:                                             \n\t"
        "lea    " LOCAL_MANGLE(0b) ", %0                \n\t"
        "lea    " LOCAL_MANGLE(1b) ", %1                \n\t"
        "lea    " LOCAL_MANGLE(2b) ", %2                \n\t"
        "dec                             %1             \n\t"
        "dec                             %2             \n\t"
        "sub                             %0, %1         \n\t"
        "sub                             %0, %2         \n\t"
        "lea    " LOCAL_MANGLE(9b) ", %3                \n\t"
        "sub                             %0, %3         \n\t"
        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;
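
            /* Copy the chosen fragment and patch its two pshufw immediates so
             * the generated code gathers the right source bytes (offsets a..d
             * relative to xx) for this group of four output pixels; the 7-bit
             * blend weights go into the filter table. */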
            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                              (c << 4) |
                                                              (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;             // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
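
/*
 * Typical use of ff_init_hscaler_mmxext() is two-pass: a first call with
 * filterCode == NULL only measures how many bytes of generated code this
 * dstW/xInc combination needs; the caller then allocates an executable buffer
 * of that size and calls again to actually emit the fragments and the filter
 * tables.  A rough sketch follows (not compiled here): alloc_executable() is a
 * hypothetical placeholder for however executable memory is obtained, error
 * handling is elided, and the split count of 8 mirrors the eight
 * CALL_MMXEXT_FILTER_CODE chunks used by ff_hyscale_fast_mmxext() below.
 */
#if 0
static int init_lum_scaler_sketch(SwsInternal *c, int dstW, int xInc)
{
    // Pass 1: size query only -- nothing is written yet.
    int codeSize = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);

    // Pass 2: emit the code and coefficients into real buffers.
    c->lumMmxextFilterCode = alloc_executable(codeSize); // hypothetical helper
    if (!c->lumMmxextFilterCode)
        return AVERROR(ENOMEM);
    ff_init_hscaler_mmxext(dstW, xInc, c->lumMmxextFilterCode,
                           c->hLumFilter, c->hLumFilterPos, 8);
    return 0;
}
#endif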
void ff_hyscale_fast_mmxext(SwsInternal *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;

#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", %5                 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov       %%"FF_REG_b", %5                 \n\t" // ebxsave
#endif
#endif
        "pxor             %%mm7, %%mm7              \n\t"
        "mov                 %0, %%"FF_REG_c"       \n\t"
        "mov                 %1, %%"FF_REG_D"       \n\t"
        "mov                 %2, %%"FF_REG_d"       \n\t"
        "mov                 %3, %%"FF_REG_b"       \n\t"
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl    (%%"FF_REG_b"), %%esi              \n\t"\
        "call                *%4                    \n\t"\
        "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi   \n\t"\
        "add       %%"FF_REG_S", %%"FF_REG_c"       \n\t"\
        "add       %%"FF_REG_a", %%"FF_REG_D"       \n\t"\
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl    (%%"FF_REG_b"), %%esi              \n\t"\
        "call                *%4                    \n\t"\
        "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
        "add       %%"FF_REG_a", %%"FF_REG_D"       \n\t"\
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t"\

#endif /* ARCH_X86_64 */
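
        /* Each CALL_MMXEXT_FILTER_CODE chunk seeds %esi with the first
         * filterPos entry, calls the generated code for one split of the
         * output row, then advances the source pointer by the "next part"
         * offset stored after the per-split filterPos entries, advances the
         * destination pointer by the bytes just written, and clears the
         * counter in FF_REG_a. */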
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                 %5, %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", -8(%%rsp)          \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                 %5, %%"FF_REG_b"       \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );
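
    /* Output pixels whose source position falls at or beyond srcW - 1 cannot
     * be interpolated from two neighbours, so they are filled by replicating
     * the last input pixel, scaled into the 7-bit fixed-point output range. */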
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
void ff_hcscale_fast_mmxext(SwsInternal *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;

#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", %7                 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov       %%"FF_REG_b", %7                 \n\t" // ebxsave
#endif
#endif
        "pxor             %%mm7, %%mm7              \n\t"
        "mov                 %0, %%"FF_REG_c"       \n\t"
        "mov                 %1, %%"FF_REG_D"       \n\t"
        "mov                 %2, %%"FF_REG_d"       \n\t"
        "mov                 %3, %%"FF_REG_b"       \n\t"
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor       %%"FF_REG_a", %%"FF_REG_a"       \n\t" // i
        "mov                 %5, %%"FF_REG_c"       \n\t" // src2
        "mov                 %6, %%"FF_REG_D"       \n\t" // dst2
        PREFETCH"   (%%"FF_REG_c")                  \n\t"
        PREFETCH" 32(%%"FF_REG_c")                  \n\t"
        PREFETCH" 64(%%"FF_REG_c")                  \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov                 %7, %%"FF_REG_a"       \n\t"
        "mov       %%"FF_REG_a", -8(%%rsp)          \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov                 %7, %%"FF_REG_b"       \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
          ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
          ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
         ,"%"FF_REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}

#endif // HAVE_INLINE_ASM