/**
 * @file libavcodec/x86/vp6dsp_sse2.c
 * SSE2-optimized functions for the VP6 decoder
 *
 * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "vp6dsp_sse2.h"
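
/* VP6's "diag4" filter interpolates diagonally offset predictions with two
 * separable 4-tap passes: a horizontal pass filters src into an 8x11
 * temporary buffer (8 output rows plus 3 extra rows for the vertical taps),
 * then a vertical pass filters that buffer into dst. Each output pixel is
 * (4-tap weighted sum + 64) >> 7, saturated to 8 bits. */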
#define DIAG4_SSE2(in1,in2,in3,in4)                                          \
    "movq  "#in1"(%0), %%xmm0           \n\t"                                \
    "movq  "#in2"(%0), %%xmm1           \n\t"                                \
    "punpcklbw %%xmm7, %%xmm0           \n\t"                                \
    "punpcklbw %%xmm7, %%xmm1           \n\t"                                \
    "pmullw    %%xmm4, %%xmm0           \n\t" /* src[x-8 ] * biweight [0] */ \
    "pmullw    %%xmm5, %%xmm1           \n\t" /* src[x   ] * biweight [1] */ \
    "paddw     %%xmm1, %%xmm0           \n\t"                                \
    "movq  "#in3"(%0), %%xmm1           \n\t"                                \
    "movq  "#in4"(%0), %%xmm2           \n\t"                                \
    "punpcklbw %%xmm7, %%xmm1           \n\t"                                \
    "punpcklbw %%xmm7, %%xmm2           \n\t"                                \
    "pmullw    %%xmm6, %%xmm1           \n\t" /* src[x+8 ] * biweight [2] */ \
    "pmullw    %%xmm3, %%xmm2           \n\t" /* src[x+16] * biweight [3] */ \
    "paddw     %%xmm2, %%xmm1           \n\t"                                \
    "paddsw    %%xmm1, %%xmm0           \n\t"                                \
    "paddsw "MANGLE(ff_pw_64)", %%xmm0  \n\t" /* Add 64 */                   \
    "psraw     $7,     %%xmm0           \n\t"                                \
    "packuswb  %%xmm0, %%xmm0           \n\t"                                \
    "movq      %%xmm0, (%1)             \n\t"
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
                              const int16_t *h_weights, const int16_t *v_weights)
{
    uint8_t tmp[8*11], *t = tmp;
    int h = 11;

    src -= stride;

    /* Horizontal pass: filter 11 rows of 8 pixels into the temp buffer.
     * The loop counter must be a read-write operand ("+r"), since decl
     * modifies it; an input-only "r"(11) would be undefined behavior. */
    __asm__ volatile(
    "pxor       %%xmm7, %%xmm7          \n\t"
    "movq       %4,     %%xmm3          \n\t"
    /* broadcast each 16-bit weight into every lane of its own register */
    "pshuflw    $0,     %%xmm3, %%xmm4  \n\t"
    "punpcklqdq %%xmm4, %%xmm4          \n\t"
    "pshuflw    $85,    %%xmm3, %%xmm5  \n\t"
    "punpcklqdq %%xmm5, %%xmm5          \n\t"
    "pshuflw    $170,   %%xmm3, %%xmm6  \n\t"
    "punpcklqdq %%xmm6, %%xmm6          \n\t"
    "pshuflw    $255,   %%xmm3, %%xmm3  \n\t"
    "punpcklqdq %%xmm3, %%xmm3          \n\t"
    "1:                                 \n\t"
    DIAG4_SSE2(-1,0,1,2)
    "add        $8,     %1              \n\t"
    "add        %3,     %0              \n\t"
    "decl       %2                      \n\t"
    "jnz        1b                      \n\t"
    : "+r"(src), "+r"(t), "+r"(h)
    : "g"((x86_reg)stride), "m"(*(const int64_t*)h_weights)
    : "memory");
    /* Vertical pass: start one row into the temp buffer so the filter can
     * read the row above, writing 8 rows of 8 pixels to dst. */
    t = tmp + 8;
    h = 8;

    __asm__ volatile(
    "pxor       %%xmm7, %%xmm7          \n\t" /* xmm7 is not preserved across asm blocks */
    "movq       %4,     %%xmm3          \n\t"
    "pshuflw    $0,     %%xmm3, %%xmm4  \n\t"
    "punpcklqdq %%xmm4, %%xmm4          \n\t"
    "pshuflw    $85,    %%xmm3, %%xmm5  \n\t"
    "punpcklqdq %%xmm5, %%xmm5          \n\t"
    "pshuflw    $170,   %%xmm3, %%xmm6  \n\t"
    "punpcklqdq %%xmm6, %%xmm6          \n\t"
    "pshuflw    $255,   %%xmm3, %%xmm3  \n\t"
    "punpcklqdq %%xmm3, %%xmm3          \n\t"
    "1:                                 \n\t"
    DIAG4_SSE2(-8,0,8,16)
    "add        $8,     %0              \n\t"
    "add        %3,     %1              \n\t"
    "decl       %2                      \n\t"
    "jnz        1b                      \n\t"
    : "+r"(t), "+r"(dst), "+r"(h)
    : "g"((x86_reg)stride), "m"(*(const int64_t*)v_weights)
    : "memory");
}
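
/* A sketch of how this function is typically wired up at init time; the
 * field and CPU-flag names below match the dsputil API of this era, but the
 * snippet is illustrative, not a quote of the actual init code:
 *
 *     if (mm_flags & FF_MM_SSE2)
 *         c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
 */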