scale_avx2.asm 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. ;******************************************************************************
  2. ;* x86-optimized horizontal line scaling functions
  3. ;* Copyright 2020 Google LLC
  4. ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  ; Read-only tables. Both are loaded with mova, so each must start on a
  ; 32-byte boundary: the section is 32-byte aligned, swizzle comes first and is
  ; exactly 32 bytes long, which puts four at offset 32.
  SECTION_RODATA 32

  ; Dword permutation control for vpermd (interleaves dwords 0-3 with 4-7).
  swizzle:
      dd 0, 4, 1, 5
      dd 2, 6, 3, 7

  ; The constant 4 in every dword lane; added to the gather indices between
  ; inner passes of the X4 variant.
  four:
      dd 4, 4, 4, 4, 4, 4, 4, 4

  SECTION .text
  27. ;-----------------------------------------------------------------------------
  28. ; horizontal line scaling
  29. ;
  30. ; void hscale8to15_<filterSize>_<opt>
  31. ; (SwsContext *c, int16_t *dst,
  32. ; int dstW, const uint8_t *src,
  33. ; const int16_t *filter,
  34. ; const int32_t *filterPos, int filterSize);
  35. ;
  ; Scale one horizontal line. Input is 8 bits wide, the filter is 14 bits, and
  ; the output is 15 bits (in int16_t). Each output pixel is generated from
  ; $filterSize input pixels; the position of the first pixel is given in
  ; filterPos[nOutputPixel].
  39. ;-----------------------------------------------------------------------------
  ;-----------------------------------------------------------------------------
  ; SCALE_FUNC <4|X4>
  ;
  ; Emits hscale8to15_4 or hscale8to15_X4. Produces 16 output pixels per outer
  ; iteration.
  ;
  ; Named registers (cglobal argument order, then two scratch registers):
  ;   pos0    - 1st argument; never referenced in the body
  ;   dst     - output pointer; one 32-byte store per outer iteration
  ;   w       - output width (32-bit argument, sign-extended on entry)
  ;   srcmem  - base address for the dword gathers
  ;   filter  - coefficient pointer; advances 0x80 bytes per gather/madd pass
  ;   fltpos  - dword index table; advances 0x40 bytes per outer iteration
  ;   fltsize - X4 only: tap count, divided by 4 on entry to give the number
  ;             of inner passes
  ;   count   - number of outputs produced so far
  ;   inner   - X4 only: inner pass counter
  ;
  ; Vector registers:
  ;   m0        all zeros (high bytes for the byte -> word unpacks)
  ;   m1,  m2   gather indices for the current 16 outputs
  ;   m3,  m4   gathered source dwords (4 bytes per index)
  ;   m5 - m8   unpacked words, then pmaddwd partial sums
  ;   m9 - m12  X4 only: dword accumulators across inner passes
  ;   m13       gather mask
  ;   m14       X4 only: 4 in every dword (index step between inner passes)
  ;   m15       vpermd control (swizzle)
  ;
  ; Both loops test their condition at the bottom, so each body executes at
  ; least once, and every outer iteration stores a full 16 outputs.
  ; NOTE(review): this presumably relies on the caller only selecting these
  ; functions for widths that are a positive multiple of 16, and on the filter
  ; and filterPos tables being laid out to match the unpack/hadd/pack/vpermd
  ; order used here - confirm against the C-side setup code.
  ;-----------------------------------------------------------------------------
  %macro SCALE_FUNC 1
  cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
      pxor            m0, m0
      mova            m15, [swizzle]
      xor             countd, countd          ; 32-bit write zeroes all of countq
      movsxd          wq, wd                  ; w is a 32-bit int; widen for the 64-bit compare
  %ifidn %1, X4
      mova            m14, [four]
      ; taps -> number of 4-tap passes; the 32-bit write also clears the
      ; upper half of fltsizeq, which the 64-bit compare below depends on
      shr             fltsized, 2
  %endif

  .loop:
      movu            m1, [fltposq]           ; first 8 gather indices of this group
      movu            m2, [fltposq + 32]      ; next 8 gather indices
  %ifidn %1, X4
      ; fresh accumulators for this group of 16 outputs
      pxor            m9, m9
      pxor            m10, m10
      pxor            m11, m11
      pxor            m12, m12
      xor             innerd, innerd          ; zeroes all of innerq
  .innerloop:
  %endif
      ; A gather clears its mask register as elements complete, so the
      ; all-ones mask has to be rebuilt before each gather.
      vpcmpeqd        m13, m13
      vpgatherdd      m3, [srcmemq + m1], m13 ; 4 source bytes per index
      vpcmpeqd        m13, m13
      vpgatherdd      m4, [srcmemq + m2], m13

      ; zero-extend the gathered bytes to words (per 128-bit lane)
      vpunpcklbw      m5, m3, m0
      vpunpckhbw      m6, m3, m0
      vpunpcklbw      m7, m4, m0
      vpunpckhbw      m8, m4, m0

      ; multiply by the 16-bit coefficients and add adjacent products:
      ; two dword partial sums per output
      vpmaddwd        m5, m5, [filterq]
      vpmaddwd        m6, m6, [filterq + 32]
      vpmaddwd        m7, m7, [filterq + 64]
      vpmaddwd        m8, m8, [filterq + 96]
      add             filterq, 0x80           ; 4 x 32 bytes of coefficients consumed
  %ifidn %1, X4
      paddd           m9, m5
      paddd           m10, m6
      paddd           m11, m7
      paddd           m12, m8
      paddd           m1, m14                 ; move every index 4 source bytes ahead
      paddd           m2, m14
      add             innerq, 1
      cmp             innerq, fltsizeq
      jl              .innerloop
      ; fold the two partial sums of each output into one dword
      vphaddd         m5, m9, m10
      vphaddd         m6, m11, m12
  %else
      ; fold the two partial sums of each output into one dword
      vphaddd         m5, m5, m6
      vphaddd         m6, m7, m8
  %endif
      ; 8-bit input x 14-bit filter, shifted down by 7 -> 15-bit result
      vpsrad          m5, 7
      vpsrad          m6, 7
      vpackssdw       m5, m5, m6              ; saturate to int16, packed per 128-bit lane
      vpermd          m5, m15, m5             ; reorder dwords across lanes via swizzle
      vmovdqu         [dstq + countq * 2], m5 ; 16 int16 outputs
      add             fltposq, 0x40           ; 16 dword indices consumed
      add             countq, 0x10
      cmp             countq, wq              ; signed: w was sign-extended above
      jl              .loop
      RET
  %endmacro
  ; Instantiate the AVX2 scalers. SCALE_FUNC asks cglobal for 16 vector
  ; registers (m0-m15), so the code is only assembled for x86-64 builds, and
  ; only when the build configuration sets HAVE_AVX2_EXTERNAL.
  %if ARCH_X86_64
    %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  SCALE_FUNC 4                ; variant that never reads fltsize (one 4-tap pass)
  SCALE_FUNC X4               ; variant that runs fltsize / 4 passes per output group
    %endif ; HAVE_AVX2_EXTERNAL
  %endif ; ARCH_X86_64