fft_3dn2.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. /*
  2. * FFT/MDCT transform with Extended 3DNow! optimizations
  3. * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/x86_cpu.h"
  22. #include "libavcodec/dsputil.h"
  23. DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
  24. #ifdef EMULATE_3DNOWEXT
  25. #define PSWAPD(s,d)\
  26. "movq "#s","#d"\n"\
  27. "psrlq $32,"#d"\n"\
  28. "punpckldq "#s","#d"\n"
  29. #define ff_fft_calc_3dn2 ff_fft_calc_3dn
  30. #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
  31. #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
  32. #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
  33. #define ff_imdct_half_3dn2 ff_imdct_half_3dn
  34. #else
  35. #define PSWAPD(s,d) "pswapd "#s","#d"\n"
  36. #endif
  37. void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
  38. void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
  39. void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
  40. {
  41. int n = 1<<s->nbits;
  42. int i;
  43. ff_fft_dispatch_interleave_3dn2(z, s->nbits);
  44. __asm__ volatile("femms");
  45. if(n <= 8)
  46. for(i=0; i<n; i+=2)
  47. FFSWAP(FFTSample, z[i].im, z[i+1].re);
  48. }
  49. void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
  50. {
  51. x86_reg j, k;
  52. long n = 1 << s->nbits;
  53. long n2 = n >> 1;
  54. long n4 = n >> 2;
  55. long n8 = n >> 3;
  56. const uint16_t *revtab = s->fft.revtab;
  57. const FFTSample *tcos = s->tcos;
  58. const FFTSample *tsin = s->tsin;
  59. const FFTSample *in1, *in2;
  60. FFTComplex *z = (FFTComplex *)output;
  61. /* pre rotation */
  62. in1 = input;
  63. in2 = input + n2 - 1;
  64. #ifdef EMULATE_3DNOWEXT
  65. __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31));
  66. #endif
  67. for(k = 0; k < n4; k++) {
  68. // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
  69. __asm__ volatile(
  70. "movd %0, %%mm0 \n"
  71. "movd %2, %%mm1 \n"
  72. "punpckldq %1, %%mm0 \n"
  73. "punpckldq %3, %%mm1 \n"
  74. "movq %%mm0, %%mm2 \n"
  75. PSWAPD( %%mm1, %%mm3 )
  76. "pfmul %%mm1, %%mm0 \n"
  77. "pfmul %%mm3, %%mm2 \n"
  78. #ifdef EMULATE_3DNOWEXT
  79. "movq %%mm0, %%mm1 \n"
  80. "punpckhdq %%mm2, %%mm0 \n"
  81. "punpckldq %%mm2, %%mm1 \n"
  82. "pxor %%mm7, %%mm0 \n"
  83. "pfadd %%mm1, %%mm0 \n"
  84. #else
  85. "pfpnacc %%mm2, %%mm0 \n"
  86. #endif
  87. ::"m"(in2[-2*k]), "m"(in1[2*k]),
  88. "m"(tcos[k]), "m"(tsin[k])
  89. );
  90. __asm__ volatile(
  91. "movq %%mm0, %0 \n\t"
  92. :"=m"(z[revtab[k]])
  93. );
  94. }
  95. ff_fft_dispatch_3dn2(z, s->fft.nbits);
  96. #define CMUL(j,mm0,mm1)\
  97. "movq (%2,"#j",2), %%mm6 \n"\
  98. "movq 8(%2,"#j",2), "#mm0"\n"\
  99. "movq %%mm6, "#mm1"\n"\
  100. "movq "#mm0",%%mm7 \n"\
  101. "pfmul (%3,"#j"), %%mm6 \n"\
  102. "pfmul (%4,"#j"), "#mm0"\n"\
  103. "pfmul (%4,"#j"), "#mm1"\n"\
  104. "pfmul (%3,"#j"), %%mm7 \n"\
  105. "pfsub %%mm6, "#mm0"\n"\
  106. "pfadd %%mm7, "#mm1"\n"
  107. /* post rotation */
  108. j = -n2;
  109. k = n2-8;
  110. __asm__ volatile(
  111. "1: \n"
  112. CMUL(%0, %%mm0, %%mm1)
  113. CMUL(%1, %%mm2, %%mm3)
  114. "movd %%mm0, (%2,%0,2) \n"
  115. "movd %%mm1,12(%2,%1,2) \n"
  116. "movd %%mm2, (%2,%1,2) \n"
  117. "movd %%mm3,12(%2,%0,2) \n"
  118. "psrlq $32, %%mm0 \n"
  119. "psrlq $32, %%mm1 \n"
  120. "psrlq $32, %%mm2 \n"
  121. "psrlq $32, %%mm3 \n"
  122. "movd %%mm0, 8(%2,%0,2) \n"
  123. "movd %%mm1, 4(%2,%1,2) \n"
  124. "movd %%mm2, 8(%2,%1,2) \n"
  125. "movd %%mm3, 4(%2,%0,2) \n"
  126. "sub $8, %1 \n"
  127. "add $8, %0 \n"
  128. "jl 1b \n"
  129. :"+r"(j), "+r"(k)
  130. :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
  131. :"memory"
  132. );
  133. __asm__ volatile("femms");
  134. }
  135. void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
  136. {
  137. x86_reg j, k;
  138. long n = 1 << s->nbits;
  139. long n4 = n >> 2;
  140. ff_imdct_half_3dn2(s, output+n4, input);
  141. j = -n;
  142. k = n-8;
  143. __asm__ volatile(
  144. "movq %4, %%mm7 \n"
  145. "1: \n"
  146. PSWAPD((%2,%1), %%mm0)
  147. PSWAPD((%3,%0), %%mm1)
  148. "pxor %%mm7, %%mm0 \n"
  149. "movq %%mm1, (%3,%1) \n"
  150. "movq %%mm0, (%2,%0) \n"
  151. "sub $8, %1 \n"
  152. "add $8, %0 \n"
  153. "jl 1b \n"
  154. :"+r"(j), "+r"(k)
  155. :"r"(output+n4), "r"(output+n4*3),
  156. "m"(*m1m1)
  157. );
  158. __asm__ volatile("femms");
  159. }