scale_avx2.asm 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. ;******************************************************************************
  2. ;* x86-optimized horizontal line scaling functions
  3. ;* Copyright 2020 Google LLC
  4. ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION_RODATA 32 ; read-only data, 32-byte aligned (both tables are loaded with aligned mova)
  24. swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7 ; dword indices for the vpermd that precedes the 16-pixel store
  25. four: times 8 dd 4 ; 4 in each of 8 dwords; added to the gather offsets after every X4 pass
  26. SECTION .text
  27. ;-----------------------------------------------------------------------------
  28. ; horizontal line scaling
  29. ;
  30. ; void hscale8to15_<filterSize>_<opt>
  31. ; (SwsInternal *c, int16_t *dst,
  32. ; int dstW, const uint8_t *src,
  33. ; const int16_t *filter,
  34. ; const int32_t *filterPos, int filterSize);
  35. ;
  36. ; Scale one horizontal line. Input is 8 bits wide. Filter is 14 bits. Output is
  37. ; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
  38. ; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
  39. ;-----------------------------------------------------------------------------
  40. %macro SCALE_FUNC 1 ; %1 = 4 (one 4-tap pass per output) or X4 (fltsize/4 passes of 4 taps each)
  41. cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner ; x86inc prologue: 7 args, 9 GPRs, 16 vector regs; pos0 (arg 0) is never referenced below
  42. pxor m0, m0 ; m0 = 0: zero operand for the byte->word unpacks
  43. mova m15, [swizzle] ; m15 = dword indices for the final vpermd
  44. xor countq, countq ; count = number of output pixels written so far
  45. movsxd wq, wd ; sign-extend the 32-bit width argument to 64 bits
  46. %ifidn %1, X4
  47. mova m14, [four] ; X4 only: m14 = 4 in every dword, the per-pass step for the gather offsets
  48. shr fltsized, 2 ; X4 only: fltsize /= 4 = number of 4-tap passes per output
  49. %endif
  50. cmp wq, 0x10
  51. jl .tail_loop ; fewer than 16 outputs: skip the 16-wide loop entirely
  52. sub wq, 0x10 ; bias w by -16 for the loop test below; restored after the loop
  53. .loop: ; main loop: 16 output pixels per iteration
  54. movu m1, [fltposq] ; 8 source byte offsets (one dword each)
  55. movu m2, [fltposq+32] ; the next 8 source byte offsets
  56. %ifidn %1, X4
  57. pxor m9, m9 ; X4 only: clear the four dword accumulators m9..m12
  58. pxor m10, m10
  59. pxor m11, m11
  60. pxor m12, m12
  61. xor innerq, innerq ; inner = pass counter
  62. .innerloop:
  63. %endif
  64. vpcmpeqd m13, m13 ; all-ones gather mask; rebuilt before every gather because the gather clears its mask register
  65. vpgatherdd m3,[srcmemq + m1], m13 ; per offset in m1: load one dword (4 source bytes) from srcmem + offset
  66. vpcmpeqd m13, m13
  67. vpgatherdd m4,[srcmemq + m2], m13 ; same for the 8 offsets in m2
  68. vpunpcklbw m5, m3, m0 ; zero-extend bytes to words: low 8 bytes of each 128-bit lane of m3
  69. vpunpckhbw m6, m3, m0 ; ... high 8 bytes of each lane of m3
  70. vpunpcklbw m7, m4, m0 ; ... low 8 bytes of each lane of m4
  71. vpunpckhbw m8, m4, m0 ; ... high 8 bytes of each lane of m4
  72. vpmaddwd m5, m5, [filterq] ; multiply words by int16 coefficients, summing adjacent products into dwords
  73. vpmaddwd m6, m6, [filterq + 32]
  74. vpmaddwd m7, m7, [filterq + 64]
  75. vpmaddwd m8, m8, [filterq + 96]
  76. add filterq, 0x80 ; 128 bytes of coefficients consumed (16 pixels * 4 taps * 2 bytes)
  77. %ifidn %1, X4
  78. paddd m9, m5 ; X4 only: accumulate this pass's partial sums
  79. paddd m10, m6
  80. paddd m11, m7
  81. paddd m12, m8
  82. paddd m1, m14 ; advance every gather offset by 4 source bytes for the next pass
  83. paddd m2, m14
  84. add innerq, 1
  85. cmp innerq, fltsizeq
  86. jl .innerloop ; run fltsize (already divided by 4) passes in total
  87. vphaddd m5, m9, m10 ; add adjacent dword pairs: one 32-bit sum per output pixel
  88. vphaddd m6, m11, m12
  89. %else
  90. vphaddd m5, m5, m6 ; add adjacent dword pairs: one 32-bit sum per output pixel
  91. vphaddd m6, m7, m8
  92. %endif
  93. vpsrad m5, 7 ; arithmetic >> 7 (8-bit input * 14-bit filter -> 15-bit output, per the header comment)
  94. vpsrad m6, 7
  95. vpackssdw m5, m5, m6 ; saturate the 16 dword sums to int16
  96. vpermd m5, m15, m5 ; reorder dwords by the swizzle table. NOTE(review): the resulting pixel order depends on how the caller laid out filterPos/filter - confirm against the C setup code
  97. vmovdqu [dstq + countq * 2], m5 ; store 16 int16 outputs (32 bytes) at dst + count*2
  98. add fltposq, 0x40 ; 16 offsets (64 bytes) consumed
  99. add countq, 0x10
  100. cmp countq, wq
  101. jle .loop ; wq holds w - 16 here: loop while another full group of 16 remains
  102. add wq, 0x10 ; undo the -16 bias
  103. cmp countq, wq
  104. jge .end ; all outputs written: no tail needed
  105. .tail_loop: ; tail: 4 output pixels per iteration, using xmm registers
  106. movu xm1, [fltposq] ; 4 source byte offsets
  107. %ifidn %1, X4
  108. pxor xm9, xm9 ; X4 only: clear the two dword accumulators
  109. pxor xm10, xm10
  110. xor innerq, innerq ; inner = pass counter
  111. .tail_innerloop:
  112. %endif
  113. vpcmpeqd xm13, xm13 ; all-ones gather mask (the gather clears it)
  114. vpgatherdd xm3,[srcmemq + xm1], xm13 ; per offset in xm1: load one dword (4 source bytes) from srcmem + offset
  115. vpunpcklbw xm5, xm3, xm0 ; zero-extend bytes to words: low 8 bytes
  116. vpunpckhbw xm6, xm3, xm0 ; ... high 8 bytes
  117. vpmaddwd xm5, xm5, [filterq] ; multiply words by int16 coefficients, summing adjacent products into dwords
  118. vpmaddwd xm6, xm6, [filterq + 0x10]
  119. add filterq, 0x20 ; 32 bytes of coefficients consumed (4 pixels * 4 taps * 2 bytes)
  120. %ifidn %1, X4
  121. paddd xm9, xm5 ; X4 only: accumulate this pass's partial sums
  122. paddd xm10, xm6
  123. paddd xm1, xm14 ; advance every gather offset by 4 source bytes for the next pass
  124. add innerq, 1
  125. cmp innerq, fltsizeq
  126. jl .tail_innerloop ; run fltsize (already divided by 4) passes in total
  127. vphaddd xm5, xm9, xm10 ; add adjacent dword pairs: one 32-bit sum per output pixel
  128. %else
  129. vphaddd xm5, xm5, xm6 ; add adjacent dword pairs: one 32-bit sum per output pixel
  130. %endif
  131. vpsrad xm5, 7 ; arithmetic >> 7, as in the main loop
  132. vpackssdw xm5, xm5, xm5 ; saturate to int16; the 4 results sit in the low 64 bits
  133. vmovq [dstq + countq * 2], xm5 ; store 4 int16 outputs (8 bytes) at dst + count*2
  134. add fltposq, 0x10 ; 4 offsets (16 bytes) consumed
  135. add countq, 0x4
  136. cmp countq, wq
  137. jl .tail_loop ; the test follows the store, so the tail always writes whole groups of 4 (at least one group when entered)
  138. .end:
  139. RET
  140. %endmacro
  141. %if ARCH_X86_64 ; presumably required because the macro uses vector registers up to m15 and 9 GPRs - confirm
  142. %if HAVE_AVX2_EXTERNAL ; assemble only when the build enables external AVX2 code
  143. INIT_YMM avx2 ; x86inc: mN = 256-bit ymmN, function names get the avx2 suffix
  144. SCALE_FUNC 4 ; emits hscale8to15_4: one 4-tap pass per output
  145. SCALE_FUNC X4 ; emits hscale8to15_X4: inner loop of fltsize/4 four-tap passes
  146. %endif
  147. %endif