vp6dsp_mmx.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. /**
  2. * @file libavcodec/x86/vp6dsp_mmx.c
  3. * MMX-optimized functions for the VP6 decoder
  4. *
  5. * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "libavutil/x86_cpu.h"
  24. #include "libavcodec/dsputil.h"
  25. #include "dsputil_mmx.h"
  26. #include "vp6dsp_mmx.h"
  27. #define DIAG4_MMX(in1,in2,in3,in4) \
  28. "movq "#in1"(%0), %%mm0 \n\t" \
  29. "movq "#in2"(%0), %%mm1 \n\t" \
  30. "movq %%mm0, %%mm3 \n\t" \
  31. "movq %%mm1, %%mm4 \n\t" \
  32. "punpcklbw %%mm7, %%mm0 \n\t" \
  33. "punpcklbw %%mm7, %%mm1 \n\t" \
  34. "punpckhbw %%mm7, %%mm3 \n\t" \
  35. "punpckhbw %%mm7, %%mm4 \n\t" \
  36. "pmullw 0(%2), %%mm0 \n\t" /* src[x-8 ] * biweight [0] */ \
  37. "pmullw 8(%2), %%mm1 \n\t" /* src[x ] * biweight [1] */ \
  38. "pmullw 0(%2), %%mm3 \n\t" /* src[x-8 ] * biweight [0] */ \
  39. "pmullw 8(%2), %%mm4 \n\t" /* src[x ] * biweight [1] */ \
  40. "paddw %%mm1, %%mm0 \n\t" \
  41. "paddw %%mm4, %%mm3 \n\t" \
  42. "movq "#in3"(%0), %%mm1 \n\t" \
  43. "movq "#in4"(%0), %%mm2 \n\t" \
  44. "movq %%mm1, %%mm4 \n\t" \
  45. "movq %%mm2, %%mm5 \n\t" \
  46. "punpcklbw %%mm7, %%mm1 \n\t" \
  47. "punpcklbw %%mm7, %%mm2 \n\t" \
  48. "punpckhbw %%mm7, %%mm4 \n\t" \
  49. "punpckhbw %%mm7, %%mm5 \n\t" \
  50. "pmullw 16(%2), %%mm1 \n\t" /* src[x+8 ] * biweight [2] */ \
  51. "pmullw 24(%2), %%mm2 \n\t" /* src[x+16] * biweight [3] */ \
  52. "pmullw 16(%2), %%mm4 \n\t" /* src[x+8 ] * biweight [2] */ \
  53. "pmullw 24(%2), %%mm5 \n\t" /* src[x+16] * biweight [3] */ \
  54. "paddw %%mm2, %%mm1 \n\t" \
  55. "paddw %%mm5, %%mm4 \n\t" \
  56. "paddsw %%mm1, %%mm0 \n\t" \
  57. "paddsw %%mm4, %%mm3 \n\t" \
  58. "paddsw %%mm6, %%mm0 \n\t" /* Add 64 */ \
  59. "paddsw %%mm6, %%mm3 \n\t" /* Add 64 */ \
  60. "psraw $7, %%mm0 \n\t" \
  61. "psraw $7, %%mm3 \n\t" \
  62. "packuswb %%mm3, %%mm0 \n\t" \
  63. "movq %%mm0, (%1) \n\t"
  64. void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
  65. const int16_t *h_weights, const int16_t *v_weights)
  66. {
  67. uint8_t tmp[8*11], *t = tmp;
  68. int16_t weights[4*4];
  69. int i;
  70. src -= stride;
  71. for (i=0; i<4*4; i++)
  72. weights[i] = h_weights[i>>2];
  73. __asm__ volatile(
  74. "pxor %%mm7, %%mm7 \n\t"
  75. "movq "MANGLE(ff_pw_64)", %%mm6 \n\t"
  76. "1: \n\t"
  77. DIAG4_MMX(-1,0,1,2)
  78. "add $8, %1 \n\t"
  79. "add %3, %0 \n\t"
  80. "decl %4 \n\t"
  81. "jnz 1b \n\t"
  82. : "+r"(src), "+r"(t)
  83. : "r"(weights), "r"((x86_reg)stride), "r"(11)
  84. : "memory");
  85. t = tmp + 8;
  86. for (i=0; i<4*4; i++)
  87. weights[i] = v_weights[i>>2];
  88. __asm__ volatile(
  89. "pxor %%mm7, %%mm7 \n\t"
  90. "movq "MANGLE(ff_pw_64)", %%mm6 \n\t"
  91. "1: \n\t"
  92. DIAG4_MMX(-8,0,8,16)
  93. "add $8, %0 \n\t"
  94. "add %3, %1 \n\t"
  95. "decl %4 \n\t"
  96. "jnz 1b \n\t"
  97. : "+r"(t), "+r"(dst)
  98. : "r"(weights), "r"((x86_reg)stride), "r"(8)
  99. : "memory");
  100. }