range_convert.asm 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. ;******************************************************************************
  2. ;* Copyright (c) 2024 Ramiro Polla
  3. ;*
  4. ;* This file is part of FFmpeg.
  5. ;*
  6. ;* FFmpeg is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* FFmpeg is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with FFmpeg; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Fixed-point coefficients used by the conversion macros in this file.
  ; Every sample is transformed as
  ;     out = (in * mult + offset) >> shift
  ;
  ;   <name>_mult    128 bits of (mult, 0) word pairs. The zero word lines up
  ;                  with the zero word the macros interleave next to each
  ;                  sample, so pmaddwd produces in*mult + 0*0 per dword.
  ;   <name>_offset  128 bits holding the same dword offset four times.
  ;   <name>_shift   assemble-time shift count passed to psrad.

  ; chroma, used by chrRangeToJpeg
  %define chr_to_shift      12
  chr_to_mult:      times 4 dw 4663, 0
  chr_to_offset:    times 4 dd -9289992

  ; chroma, used by chrRangeFromJpeg
  %define chr_from_shift    11
  chr_from_mult:    times 4 dw 1799, 0
  chr_from_offset:  times 4 dd 4081085

  ; luma, used by lumRangeToJpeg
  %define lum_to_shift      14
  lum_to_mult:      times 4 dw 19077, 0
  lum_to_offset:    times 4 dd -39057361

  ; luma, used by lumRangeFromJpeg
  %define lum_from_shift    14
  lum_from_mult:    times 4 dw 14071, 0
  lum_from_offset:  times 4 dd 33561947

  SECTION .text
  35. ; NOTE: there is no need to clamp the input when converting to jpeg range
  36. ; (like we do in the C code) because packssdw will saturate the output.
  ;-----------------------------------------------------------------------------
  ; lumConvertRange
  ;
  ; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
  ; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
  ;
  ; Converts the samples of dst in place:
  ;     dst[i] = saturate_int16((dst[i] * mult + offset) >> shift)
  ;
  ; Macro arguments:
  ;   %1  function name handed to cglobal
  ;   %2  label of the 128-bit (mult, 0) word-pair table
  ;   %3  label of the 128-bit dword offset table
  ;   %4  right-shift count (arithmetic shift)
  ;
  ; Register roles:
  ;   m0 / m1  low / high half of the current vector, widened to dwords
  ;   m2       multiplier pairs
  ;   m3       offsets
  ;   m4       zero, interleaved with the samples when widening
  ;
  ; The loop tests its condition at the bottom and always handles a whole
  ; vector, so at least mmsize bytes are read and written and the processed
  ; length is rounded up to a multiple of mmsize.
  ; NOTE(review): this presumably relies on dst being padded by the caller;
  ; confirm against the call sites.
  ;-----------------------------------------------------------------------------
  %macro LUMCONVERTRANGE 4
  cglobal %1, 2, 2, 5, dst, width
      pxor               m4, m4               ; m4 = 0
      VBROADCASTI128     m3, [%3]             ; m3 = offset in every dword
      VBROADCASTI128     m2, [%2]             ; m2 = (mult, 0) in every dword
      add            widthd, widthd           ; samples -> bytes; 32-bit write clears the upper half
      add              dstq, widthq           ; dstq = one past the last sample
      neg            widthq                   ; widthq = negative byte index, counts up to 0
  .loop:
      movu               m0, [dstq+widthq]
      ; Widen words to dwords by pairing each sample with a zero word.
      ; The high half must be extracted first: the low unpack overwrites m0.
      punpckhwd          m1, m0, m4
      punpcklwd          m0, m4
      pmaddwd            m1, m2               ; sample * mult (+ 0 * 0)
      pmaddwd            m0, m2
      paddd              m1, m3               ; + offset
      paddd              m0, m3
      psrad              m1, %4               ; >> shift
      psrad              m0, %4
      ; Unpack and pack both operate per 128-bit lane, so sample order is
      ; restored here for ymm as well. packssdw saturates to int16.
      packssdw           m0, m1
      movu [dstq+widthq], m0
      add            widthq, mmsize
      jl .loop                                ; continue while the index is still negative
      RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; chrConvertRange
  ;
  ; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
  ; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
  ;
  ; Converts the samples of dstU and dstV in place, applying the same
  ; coefficients to both planes:
  ;     dst[i] = saturate_int16((dst[i] * mult + offset) >> shift)
  ;
  ; Macro arguments:
  ;   %1  function name handed to cglobal
  ;   %2  label of the 128-bit (mult, 0) word-pair table
  ;   %3  label of the 128-bit dword offset table
  ;   %4  right-shift count (arithmetic shift)
  ;
  ; Register roles:
  ;   m0 / m1  low / high half of the current U vector, widened to dwords
  ;   m2 / m3  low / high half of the current V vector, widened to dwords
  ;   m4       multiplier pairs
  ;   m5       offsets
  ;   m6       zero, interleaved with the samples when widening
  ;
  ; The loop tests its condition at the bottom and always handles a whole
  ; vector per plane, so at least mmsize bytes of each plane are read and
  ; written and the processed length is rounded up to a multiple of mmsize.
  ; NOTE(review): this presumably relies on both planes being padded by the
  ; caller; confirm against the call sites.
  ;-----------------------------------------------------------------------------
  %macro CHRCONVERTRANGE 4
  cglobal %1, 3, 3, 7, dstU, dstV, width
      pxor               m6, m6               ; m6 = 0
      VBROADCASTI128     m5, [%3]             ; m5 = offset in every dword
      VBROADCASTI128     m4, [%2]             ; m4 = (mult, 0) in every dword
      add            widthd, widthd           ; samples -> bytes; 32-bit write clears the upper half
      add             dstUq, widthq           ; both pointers = one past the last sample
      add             dstVq, widthq
      neg            widthq                   ; widthq = negative byte index, counts up to 0
  .loop:
      movu               m2, [dstVq+widthq]
      movu               m0, [dstUq+widthq]
      ; Widen words to dwords by pairing each sample with a zero word.
      ; High halves are extracted first: the low unpacks overwrite m0 / m2.
      punpckhwd          m3, m2, m6
      punpckhwd          m1, m0, m6
      punpcklwd          m2, m6
      punpcklwd          m0, m6
      pmaddwd            m0, m4               ; sample * mult (+ 0 * 0)
      pmaddwd            m1, m4
      pmaddwd            m2, m4
      pmaddwd            m3, m4
      paddd              m0, m5               ; + offset
      paddd              m1, m5
      paddd              m2, m5
      paddd              m3, m5
      psrad              m0, %4               ; >> shift
      psrad              m1, %4
      psrad              m2, %4
      psrad              m3, %4
      ; Unpack and pack both operate per 128-bit lane, so sample order is
      ; restored here for ymm as well. packssdw saturates to int16.
      packssdw           m2, m3
      packssdw           m0, m1
      movu [dstUq+widthq], m0
      movu [dstVq+widthq], m2
      add            widthq, mmsize
      jl .loop                                ; continue while the index is still negative
      RET
  %endmacro
  ; Emits the four range-conversion functions for whichever instruction set
  ; the preceding INIT_* invocation selected. The order matches the original
  ; per-set lists: lum to, chr to, lum from, chr from.
  %macro RANGE_CONVERT_FUNCS 0
  LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
  CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
  LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
  CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
  %endmacro

  ; 128-bit (xmm) versions
  INIT_XMM sse2
  RANGE_CONVERT_FUNCS

  ; 256-bit (ymm) versions, built only when the assembler supports AVX2
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  RANGE_CONVERT_FUNCS
  %endif