hscale_fast_bilinear_simd.c

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#define RET      0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;
    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
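    /* Illustration of how the fragments get patched (derived from the loop
     * below): per group of four output pixels, the two pshufw immediates are
     * rewritten so that mm0 ends up holding each pixel's left source
     * neighbour and mm1 its right neighbour; the fragment then computes
     * (left - right) * weight + right * 128, i.e. a 7-bit bilinear blend.
     * For a hypothetical plain 2x upscale (xInc == 0x8000) a group has the
     * source offsets a=0, b=0, c=1, d=1, so fragmentB is picked (inc == 1)
     * and the patched immediate bytes become
     *     imm8OfPShufW2: a | (b<<2) | (c<<4) | (d<<6)                     = 0x50
     *     imm8OfPShufW1: (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6)     = 0xA5
     */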

    // code fragment

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
        "movd   1(%%"REG_c", %%"REG_S"), %%mm1          \n\t"
        "punpcklbw                %%mm7, %%mm1          \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw                   $0xFF, %%mm1, %%mm1   \n\t"
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"
        "movq    %%mm0, (%%"REG_D", %%"REG_a")          \n\t"
        "add                         $8, %%"REG_a"      \n\t"
        // End
        "9:                                             \n\t"
        // "int $3                                      \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0                   \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1                   \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2                   \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3                   \n\t"
        "sub                         %0, %3             \n\t"
        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );

    __asm__ volatile (
        "jmp                         9f                 \n\t"
        // Begin
        "0:                                             \n\t"
        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
        "punpcklbw                %%mm7, %%mm0          \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm1   \n\t"
        "1:                                             \n\t"
        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
        "2:                                             \n\t"
        "psubw                    %%mm1, %%mm0          \n\t"
        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
        "pmullw                   %%mm3, %%mm0          \n\t"
        "psllw                       $7, %%mm1          \n\t"
        "paddw                    %%mm1, %%mm0          \n\t"
        "movq    %%mm0, (%%"REG_D", %%"REG_a")          \n\t"
        "add                         $8, %%"REG_a"      \n\t"
        // End
        "9:                                             \n\t"
        // "int $3                                      \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0                   \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1                   \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2                   \n\t"
        "dec                         %1                 \n\t"
        "dec                         %2                 \n\t"
        "sub                         %0, %1             \n\t"
        "sub                         %0, %2             \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3                   \n\t"
        "sub                         %0, %3             \n\t"
        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;
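
    /* xpos and xInc are 16.16 fixed point: xpos >> 16 is the integer source
     * position and the low 16 bits are the subpixel phase the 7-bit blend
     * weights below are derived from. Only dstW / numSplits output pixels are
     * covered here; the wrappers below run the generated routine numSplits
     * times back to back (8 times for luma, 4 per chroma plane). */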
    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a        |
                                                          (b << 2) |
                                                          (c << 4) |
                                                          (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift; // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
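
/* ff_init_hscaler_mmxext() is normally called twice by the scaler setup code:
 * first with filterCode == NULL just to learn how many bytes of executable
 * memory are needed (the if (filterCode) guards skip every store and only the
 * size, fragmentPos + 1, is returned), then again with the allocated buffer
 * to actually emit the fragments and fill filter / filterPos. The wrappers
 * below execute that generated code. */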
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if defined(PIC)
    uint64_t ebxsave;
#endif
#if ARCH_X86_64
    uint64_t retsave;
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov          %%"REG_b", %5             \n\t"
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"      \n\t"
        "mov          %%"REG_a", %6             \n\t"
#endif
#else
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"      \n\t"
        "mov          %%"REG_a", %5             \n\t"
#endif
#endif
        "pxor             %%mm7, %%mm7          \n\t"
        "mov                 %0, %%"REG_c"      \n\t"
        "mov                 %1, %%"REG_D"      \n\t"
        "mov                 %2, %%"REG_d"      \n\t"
        "mov                 %3, %%"REG_b"      \n\t"
        "xor          %%"REG_a", %%"REG_a"      \n\t" // i
        PREFETCH"   (%%"REG_c")                 \n\t"
        PREFETCH" 32(%%"REG_c")                 \n\t"
        PREFETCH" 64(%%"REG_c")                 \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
        "add               %%"REG_S", %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */
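
        /* Each CALL_MMXEXT_FILTER_CODE expansion loads the first filterPos
         * entry into %esi (the source offset used by the first generated
         * 4-pixel group), calls the generated fragment chain (terminated by
         * the RET opcode emitted in ff_init_hscaler_mmxext), then advances
         * the source pointer by the chunk's input width (read back from
         * filterPos), advances the destination pointer by the bytes just
         * written and clears the pixel counter in REG_a. */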
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if defined(PIC)
        "mov                 %5, %%"REG_b"      \n\t"
#if ARCH_X86_64
        "mov                 %6, %%"REG_a"      \n\t"
        "mov          %%"REG_a", -8(%%rsp)      \n\t"
#endif
#else
#if ARCH_X86_64
        "mov                 %5, %%"REG_a"      \n\t"
        "mov          %%"REG_a", -8(%%rsp)      \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m" (retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
         ,"%"REG_b
#endif
    );
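
    /* Output pixels whose source position lands at or beyond the last input
     * pixel cannot be interpolated from two valid samples, so overwrite them
     * with the last pixel, scaled by 128 to match the 7-bit intermediate
     * format produced by the generated code. */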
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov          %%"REG_b", %7             \n\t"
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"      \n\t"
        "mov          %%"REG_a", %8             \n\t"
#endif
#else
#if ARCH_X86_64
        "mov          -8(%%rsp), %%"REG_a"      \n\t"
        "mov          %%"REG_a", %7             \n\t"
#endif
#endif
        "pxor             %%mm7, %%mm7          \n\t"
        "mov                 %0, %%"REG_c"      \n\t"
        "mov                 %1, %%"REG_D"      \n\t"
        "mov                 %2, %%"REG_d"      \n\t"
        "mov                 %3, %%"REG_b"      \n\t"
        "xor          %%"REG_a", %%"REG_a"      \n\t" // i
        PREFETCH"   (%%"REG_c")                 \n\t"
        PREFETCH" 32(%%"REG_c")                 \n\t"
        PREFETCH" 64(%%"REG_c")                 \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        "xor          %%"REG_a", %%"REG_a"      \n\t" // i
        "mov                 %5, %%"REG_c"      \n\t" // src2
        "mov                 %6, %%"REG_D"      \n\t" // dst2
        PREFETCH"   (%%"REG_c")                 \n\t"
        PREFETCH" 32(%%"REG_c")                 \n\t"
        PREFETCH" 64(%%"REG_c")                 \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if defined(PIC)
        "mov                 %7, %%"REG_b"      \n\t"
#if ARCH_X86_64
        "mov                 %8, %%"REG_a"      \n\t"
        "mov          %%"REG_a", -8(%%rsp)      \n\t"
#endif
#else
#if ARCH_X86_64
        "mov                 %7, %%"REG_a"      \n\t"
        "mov          %%"REG_a", -8(%%rsp)      \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m" (dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m" (retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
         ,"%"REG_b
#endif
    );

    /* Same tail fix-up as in the luma path, applied to both chroma planes. */
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}

#endif /* HAVE_INLINE_ASM */