audiodsp.asm

;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
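; Computes the dot product of two int16 vectors. A rough C equivalent
; (a sketch for orientation, not part of the original source):
;
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;
;
; Each iteration below consumes 2*mmsize bytes, so order is assumed to be
; a multiple of mmsize int16 elements (8 for MMX, 16 for SSE2). v1 is read
; with unaligned loads (movu), but v2 is used as a direct memory operand
; of pmaddwd, so it is assumed to be mmsize-aligned.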
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1               ; order in elements -> order in bytes
    add v1q, orderq             ; point both vectors at their ends and
    add v2q, orderq             ; walk a negative offset up towards zero,
    neg orderq                  ; so a single register serves as counter
    pxor m2, m2                 ; dword accumulator
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; multiply int16 pairs, sum into dwords
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
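    ; HADDD (from x86util.asm) sums the dword lanes of m2 horizontally,
    ; leaving the total in the low dword; m0 serves as scratch.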
    HADDD m2, m0
    movd eax, m2
%if mmsize == 8
    emms                        ; leave MMX state before returning
%endif
    RET
%endmacro

; instantiate the dot product once per instruction set
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
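; Clips every element of src into [min, max] and stores it to dst.
; A rough C equivalent (sketch only, not from the original file):
;
;     for (unsigned i = 0; i < len; i++)
;         dst[i] = av_clip(src[i], min, max);
;
; The mova loads/stores assume dst and src are mmsize-aligned, and len
; (counted in int32 elements) is assumed to be a multiple of the
; per-iteration element count, mmsize*(%2+%3).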
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm           ; float-based CLIPD wants min/max as floats
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4                   ; broadcast min to all lanes
    SPLATD m5                   ; broadcast max to all lanes
.loop:
%assign %%i 0
%rep %2
    mova m0, [srcq+mmsize*(0+%%i)]
    mova m1, [srcq+mmsize*(1+%%i)]
    mova m2, [srcq+mmsize*(2+%%i)]
    mova m3, [srcq+mmsize*(3+%%i)]
%if %3
    mova m7,  [srcq+mmsize*(4+%%i)]
    mova m8,  [srcq+mmsize*(5+%%i)]
    mova m9,  [srcq+mmsize*(6+%%i)]
    mova m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7,  m4, m5, m6
    CLIPD m8,  m4, m5, m6
    CLIPD m9,  m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*(0+%%i)], m0
    mova [dstq+mmsize*(1+%%i)], m1
    mova [dstq+mmsize*(2+%%i)], m2
    mova [dstq+mmsize*(3+%%i)], m3
%if %3
    mova [dstq+mmsize*(4+%%i)], m7
    mova [dstq+mmsize*(5+%%i)], m8
    mova [dstq+mmsize*(6+%%i)], m9
    mova [dstq+mmsize*(7+%%i)], m10
%endif
%assign %%i %%i+4*(%3+1)
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)    ; len counts int32 elements
    jg .loop
    REP_RET                     ; x86inc RET variant, safe as a branch target
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8 ; m8-m15 exist only on x86-64, which the wide variant needs
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------
;void ff_vector_clipf(float *dst, const float *src,
;                     float min, float max, int len)
;-----------------------------------------------------
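; Float counterpart of the clipper above. A rough C equivalent (sketch
; only, not from the original file):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = av_clipf(src[i], min, max);
;
; The loop below processes four mmsize loads per iteration, so len is
; assumed to be a multiple of 16 floats, with dst/src mmsize-aligned.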
INIT_XMM sse
%if UNIX64
cglobal vector_clipf, 3,3,6, dst, src, len ; min/max arrive in xmm0/xmm1
%else
cglobal vector_clipf, 5,5,6, dst, src, min, max, len
%endif
%if WIN64
    SWAP 0, 2                   ; Win64 passes min/max in xmm2/xmm3;
    SWAP 1, 3                   ; move them to m0/m1
%elif ARCH_X86_32
    movss m0, minm              ; x86-32 passes min/max on the stack
    movss m1, maxm
%endif
    SPLATD m0                   ; broadcast min to all lanes
    SPLATD m1                   ; broadcast max to all lanes
    shl lend, 2                 ; len in floats -> len in bytes
    add srcq, lenq              ; same negative-offset trick as above
    add dstq, lenq
    neg lenq
.loop:
    mova m2, [srcq+lenq+mmsize*0]
    mova m3, [srcq+lenq+mmsize*1]
    mova m4, [srcq+lenq+mmsize*2]
    mova m5, [srcq+lenq+mmsize*3]
    maxps m2, m0                ; clamp to min
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0
    minps m2, m1                ; clamp to max
    minps m3, m1
    minps m4, m1
    minps m5, m1
    mova [dstq+lenq+mmsize*0], m2
    mova [dstq+lenq+mmsize*1], m3
    mova [dstq+lenq+mmsize*2], m4
    mova [dstq+lenq+mmsize*3], m5
    add lenq, mmsize*4
    jl .loop
    REP_RET