bswapdsp.asm

;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
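; pshufb control vector: reverses the byte order within each 32-bit lane,
; so on SSSE3 a single shuffle byte-swaps four dwords at once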
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION .text

; %1 = aligned/unaligned
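; Byte-swaps r2d dwords from [r1] into [r0], eight dwords (two XMM registers)
; per loop iteration, then one more group of four if bit 2 of the count is
; set.  The final 0-3 dwords are left for the caller (see .left below).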
%macro BSWAP_LOOPS 1
    mov      r3d, r2d
    sar      r2d, 3
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
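    ; no pshufb before SSSE3, so emulate the dword byte swap in two steps:
    ; pshuflw/pshufhw with 10110001b swap the two 16-bit words inside each
    ; dword, then the shift/or sequence swaps the two bytes inside each word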
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2d
    jnz      .loop8_%1
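; restore the full count for the tail handling, then swap one more group of
; four dwords (one XMM register) if bit 2 of the count is set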
.left4_%1:
    mov      r2d, r3d
    test     r3d, 4
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
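; r0 = dst, r1 = src, r2d = w (count in dwords).  Aligned loads/stores (mova)
; are only safe when both pointers are 16-byte aligned, so (dst | src) & 15
; selects the aligned or unaligned instantiation of BSWAP_LOOPS.  cglobal
; reserves 4 GPRs and 3 XMM registers for SSSE3 (m2 holds the shuffle mask)
; or 5 XMM registers for the SSE2 fallback.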
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0
    test     r3, 15
    jz       .start_align
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
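; 0-3 dwords remain.  With SSSE3, two of them can still be swapped with a
; single movq + pshufb and the last one with a scalar bswap; the SSE2 path
; falls back to a scalar bswap loop.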
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro
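
; Instantiate an SSE2 and an SSSE3 version; with cglobal these are emitted
; as ff_bswap32_buf_sse2 and ff_bswap32_buf_ssse3.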
INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
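
; A minimal C-side sketch (not part of this file) of how these kernels are
; reached through the BswapDSPContext function pointer; it assumes the usual
; libavcodec/bswapdsp.h API, where ff_bswapdsp_init() picks the fastest
; implementation available on the host CPU:
;
;     #include "libavcodec/bswapdsp.h"
;
;     BswapDSPContext bdsp;
;     ff_bswapdsp_init(&bdsp);
;     /* swap w 32-bit words from src into dst (both uint32_t *) */
;     bdsp.bswap_buf(dst, src, w);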