alacdsp.asm 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. ;******************************************************************************
  2. ;* ALAC DSP SIMD optimizations
  3. ;*
  4. ;* Copyright (C) 2015 James Almer
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION .text
  ;------------------------------------------------------------------------------
  ; alac_decorrelate_stereo(buf0, len, shift, weight)
  ;
  ; In-place stereo decorrelation over two buffers of signed 32-bit samples.
  ;
  ; In:   buf0   - pointer to an array of two buffer pointers (ch0, ch1)
  ;       len    - number of 32-bit samples in each buffer
  ;       shift  - arithmetic right-shift count applied to the product
  ;       weight - 32-bit multiplier applied to the ch1 samples
  ;
  ; Per sample, with a = ch0[i] and b = ch1[i]:
  ;       a     -= (b * weight) >> shift      ; low 32 bits of the product
  ;       b     += a
  ;       ch0[i] = b
  ;       ch1[i] = a
  ;
  ; Requirements visible from the code:
  ;   - every buffer access uses mova, so both buffers must be mmsize-aligned;
  ;   - one pass handles 2*mmsize bytes per buffer and the exit test sits at
  ;     the bottom of the loop, so at least one full pass always runs and the
  ;     processed length is effectively rounded up to a whole pass.
  ;   NOTE(review): presumably the caller pads its buffers to cover that
  ;   overshoot - confirm against the C side.
  ;------------------------------------------------------------------------------
  INIT_XMM sse4                           ; pmulld is an SSE4.1 instruction
  %if ARCH_X86_64
  cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
  %else
  ; Only three GPRs are requested on x86-32, so buf1 lives in r2; shift is
  ; only ever read through shiftm below.
  cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
  %define buf1q r2q
  %endif
      shl         lend, 2                 ; sample count -> byte count
      movd        m6, shiftm              ; m6 = shift count for psrad
      movd        m7, weightm
      SPLATD      m7                      ; m7 = weight in every dword lane

      ; Fetch both channel pointers; buf1 must be loaded before buf0q is
      ; overwritten with the ch0 pointer.
      mov         buf1q, [buf0q + gprsize]
      mov         buf0q, [buf0q]

      ; Point both channels at their end and walk a negative byte offset up
      ; towards zero, so the add at the bottom doubles as the loop test.
      add         buf0q, lenq
      add         buf1q, lenq
      neg         lenq

  align 16
  .next_pair:
      mova        m2, [buf1q + lenq]              ; b (first vector)
      mova        m3, [buf1q + lenq + mmsize]     ; b (second vector)
      pmulld      m4, m2, m7                      ; b * weight
      pmulld      m5, m3, m7
      mova        m0, [buf0q + lenq]              ; a (first vector)
      mova        m1, [buf0q + lenq + mmsize]     ; a (second vector)
      psrad       m4, m6                          ; (b * weight) >> shift
      psrad       m5, m6
      psubd       m0, m4                          ; a -= scaled b
      psubd       m1, m5
      paddd       m2, m0                          ; b += a
      paddd       m3, m1

      ; The channels are written back swapped: updated a goes to ch1 and
      ; updated b goes to ch0.
      mova        [buf1q + lenq], m0
      mova        [buf1q + lenq + mmsize], m1
      mova        [buf0q + lenq], m2
      mova        [buf0q + lenq + mmsize], m3

      add         lenq, mmsize*2
      jl          .next_pair                      ; continue while offset < 0
      RET
  ;------------------------------------------------------------------------------
  ; alac_append_extra_bits_stereo(buf0, exbuf0, <exbits>, <arg 3>, len)
  ;
  ; Shifts every 32-bit sample of two channels left by exbits and ORs in the
  ; matching dword from the corresponding extra-bits buffer, in place:
  ;       ch[i] = (ch[i] << exbits) | ex[i]
  ;
  ; In:   buf0   - pointer to an array of two sample-buffer pointers
  ;       exbuf0 - pointer to an array of two extra-bits-buffer pointers
  ;       arg 2  - exbits, read through r2m before r2 is reused as buf1
  ;       arg 3  - never read here; its register is reused as exbuf1
  ;       len    - number of 32-bit samples in each buffer
  ;
  ; Requirements visible from the code:
  ;   - all four buffers are accessed with mova or as SSE memory operands, so
  ;     they must be mmsize-aligned;
  ;   - one pass handles 2*mmsize bytes per buffer and the exit test sits at
  ;     the bottom of the loop, so at least one full pass always runs and the
  ;     processed length is effectively rounded up to a whole pass.
  ;   NOTE(review): presumably the caller pads its buffers to cover that
  ;   overshoot - confirm against the C side.
  ;------------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
      movd        m0, r2m                 ; exbits; grab it before r2 becomes buf1
      movifnidn   lend, lenm
      shl         lend, 2                 ; sample count -> byte count

      ; Load the second pointer of each pair before its array pointer is
      ; overwritten with the first.
      mov         exbuf1q, [exbuf0q + gprsize]
      mov         exbuf0q, [exbuf0q]
      mov         buf1q, [buf0q + gprsize]
      mov         buf0q, [buf0q]

      ; Point everything at the end of the data and walk a negative byte
      ; offset up towards zero.
      add         exbuf0q, lenq
      add         exbuf1q, lenq
      add         buf0q, lenq
      add         buf1q, lenq
      neg         lenq

  align 16
  .next_pair:
      mova        m1, [buf0q + lenq]
      mova        m2, [buf0q + lenq + mmsize]
      mova        m3, [buf1q + lenq]
      mova        m4, [buf1q + lenq + mmsize]

      pslld       m1, m0                  ; make room for the extra bits
      pslld       m2, m0
      pslld       m3, m0
      pslld       m4, m0

      por         m1, [exbuf0q + lenq]
      por         m2, [exbuf0q + lenq + mmsize]
      por         m3, [exbuf1q + lenq]
      por         m4, [exbuf1q + lenq + mmsize]

      mova        [buf0q + lenq], m1
      mova        [buf0q + lenq + mmsize], m2
      mova        [buf1q + lenq], m3
      mova        [buf1q + lenq + mmsize], m4

      add         lenq, mmsize*2
      jl          .next_pair              ; continue while offset < 0
      RET
  ;------------------------------------------------------------------------------
  ; alac_append_extra_bits_mono(buf, exbuf, exbits, <arg 3>, len)
  ;
  ; Single-channel variant: shifts every 32-bit sample left by exbits and ORs
  ; in the matching dword from the extra-bits buffer, in place:
  ;       ch[i] = (ch[i] << exbits) | ex[i]
  ;
  ; In:   buf    - pointer to an array of buffer pointers; only entry 0 is used
  ;       exbuf  - pointer to an array of extra-bits-buffer pointers; only
  ;                entry 0 is used
  ;       exbits - left-shift count
  ;       arg 3  - never read here (named ch on x86-64)
  ;       len    - number of 32-bit samples, fetched from r4m
  ;
  ; Requirements visible from the code:
  ;   - both buffers are accessed with mova or as SSE memory operands, so
  ;     they must be mmsize-aligned;
  ;   - one pass handles 2*mmsize bytes per buffer and the exit test sits at
  ;     the bottom of the loop, so at least one full pass always runs and the
  ;     processed length is effectively rounded up to a whole pass.
  ;   NOTE(review): presumably the caller pads its buffers to cover that
  ;   overshoot - confirm against the C side.
  ;------------------------------------------------------------------------------
  %if ARCH_X86_64
  cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
  %else
  ; Only three GPRs are requested on x86-32, so len takes r2 and exbits is
  ; reached through its argument slot instead of a named register.
  cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
  %define exbitsm r2m
  %endif
      movd        m0, exbitsm             ; m0 = left-shift count
      movifnidn   lend, r4m               ; len is the fifth argument
      shl         lend, 2                 ; sample count -> byte count

      mov         exbufq, [exbufq]        ; entry 0 of each pointer array
      mov         bufq, [bufq]

      ; Point both buffers at their end and walk a negative byte offset up
      ; towards zero.
      add         exbufq, lenq
      add         bufq, lenq
      neg         lenq

  align 16
  .next_pair:
      mova        m1, [bufq + lenq]
      mova        m2, [bufq + lenq + mmsize]
      pslld       m1, m0                  ; make room for the extra bits
      pslld       m2, m0
      por         m1, [exbufq + lenq]
      por         m2, [exbufq + lenq + mmsize]
      mova        [bufq + lenq], m1
      mova        [bufq + lenq + mmsize], m2

      add         lenq, mmsize*2
      jl          .next_pair              ; continue while offset < 0
      RET