yuv2yuvX.asm

;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text
;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
;                        int srcOffset, uint8_t *dest, int dstW,
;                        const uint8_t *dither, int offset);
;
;-----------------------------------------------------------------------------
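; As the loops below read it, the filter argument is a list of 16-byte
; entries: a source-plane pointer in the first qword and the tap's 16-bit
; coefficient broadcast across the second qword; a NULL source pointer
; terminates the list. Each call writes dest bytes from index
; offset + srcOffset up to dstW - 1.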
%macro YUV2YUVX_FUNC 0
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
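; movr is the store used for the destination (unaligned movdqu on the SIMD
; paths) and unroll is the number of mmsize-wide output blocks written per
; outer-loop iteration.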
%if notcpuflag(sse3)
%define movr mova
%define unroll 1
%else
%define movr movdqu
%define unroll 2
%endif
    movsxdifnidn         dstWq, dstWd
    movsxdifnidn         offsetq, offsetd
    movsxdifnidn         srcq, srcd
%if cpuflag(avx2)
    vpbroadcastq         m3, [ditherq]
%else
    movq                 xm3, [ditherq]
%endif ; avx2
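    ; If offset is non-zero, rotate the eight dither bytes by three positions
    ; (>> 24 | << 40 over the 64-bit pattern), presumably so the dither
    ; pattern stays aligned with the shifted output position.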
    cmp                  offsetd, 0
    jz                   .offset
    ; offset != 0 path.
    psrlq                m5, m3, $18
    psllq                m3, m3, $28
    por                  m3, m3, m5
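    ; Fold srcOffset into offset, point filterSizeq at the first filter entry
    ; and load its source pointer, then build the per-lane rounding bias that
    ; seeds every outer iteration: m7 = (dither_byte + 8 * filterSize) >> 4
    ; as 16-bit words.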
.offset:
    add                  offsetq, srcq
    movd                 xm1, filterSized
    SPLATW               m1, xm1, 0
    pxor                 m0, m0, m0
    mov                  filterSizeq, filterq
    mov                  srcq, [filterSizeq]
    punpcklbw            m3, m0
    psllw                m1, m1, 3
    paddw                m3, m3, m1
    psraw                m7, m3, 4
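    ; Outer loop: one iteration per mmsize * unroll output bytes; the word
    ; accumulators start from the bias kept in m7.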
.outerloop:
    mova                 m4, m7
    mova                 m3, m7
%if cpuflag(sse3)
    mova                 m6, m7
    mova                 m1, m7
%endif
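    ; Inner loop over the filter taps: broadcast the entry's coefficient
    ; (qword at +8), multiply the int16 source samples with pmulhw (high 16
    ; bits of the signed product) and accumulate. Advancing filterSizeq by 16
    ; bytes moves to the next {pointer, coefficient} entry; a NULL source
    ; pointer ends the list.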
.loop:
%if cpuflag(avx2)
    vpbroadcastq         m0, [filterSizeq + 8]
%elif cpuflag(sse3)
    movddup              m0, [filterSizeq + 8]
%else
    mova                 m0, [filterSizeq + 8]
%endif
    pmulhw               m2, m0, [srcq + offsetq * 2]
    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
    paddw                m3, m3, m2
    paddw                m4, m4, m5
%if cpuflag(sse3)
    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
    paddw                m6, m6, m2
    paddw                m1, m1, m5
%endif
    add                  filterSizeq, $10
    mov                  srcq, [filterSizeq]
    test                 srcq, srcq
    jnz                  .loop
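    ; Scale the accumulated words down and pack them to bytes with unsigned
    ; saturation.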
    psraw                m3, m3, 3
    psraw                m4, m4, 3
%if cpuflag(sse3)
    psraw                m6, m6, 3
    psraw                m1, m1, 3
%endif
    packuswb             m3, m3, m4
%if cpuflag(sse3)
    packuswb             m6, m6, m1
%endif
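    ; Reload the first source pointer for the next outer iteration. On AVX2,
    ; packuswb packs within each 128-bit lane, so vpermq (216 = 0xD8: qword
    ; order 0, 2, 1, 3) restores the byte order before the stores.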
    mov                  srcq, [filterq]
%if cpuflag(avx2)
    vpermq               m3, m3, 216
    vpermq               m6, m6, 216
%endif
    movr                 [destq + offsetq], m3
%if cpuflag(sse3)
    movr                 [destq + offsetq + mmsize], m6
%endif
    add                  offsetq, mmsize * unroll
    mov                  filterSizeq, filterq
    cmp                  offsetq, dstWq
    jb                   .outerloop
    RET
%endmacro
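; Instantiate the routine for MMXEXT (mmsize = 8), SSE3 (mmsize = 16) and,
; when HAVE_AVX2_EXTERNAL is set, AVX2 (mmsize = 32).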
INIT_MMX mmxext
YUV2YUVX_FUNC
INIT_XMM sse3
YUV2YUVX_FUNC
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif