range_convert.asm 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. ;******************************************************************************
  2. ;* Copyright (c) 2024 Ramiro Polla
  3. ;*
  4. ;* This file is part of FFmpeg.
  5. ;*
  6. ;* FFmpeg is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* FFmpeg is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with FFmpeg; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Four dwords of 2^19 - 1; loaded into a vector register by the "To"
  ; variants of the 16-bit paths and applied with PMINSD as an upper bound.
  pack19:     times 4 dd (1 << 19) - 1

  SECTION .text
  24. ;-----------------------------------------------------------------------------
  25. ; lumConvertRange
  26. ;
  27. ; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
  28. ; uint32_t coeff, int64_t offset);
  29. ; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
  30. ; uint32_t coeff, int64_t offset);
  31. ;
  32. ;-----------------------------------------------------------------------------
  %macro LUMCONVERTRANGE 2 ; %1 = To/From, %2 = 8/16
  cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
      ; Scale width by 2 (%2 == 8) or by 4 (%2 == 16), advance dst by that
      ; amount and negate the counter: [dstq+widthq] then starts at the
      ; original dst and the loop runs until widthq stops being negative.
      shl             widthd, %2 >> 3
      add               dstq, widthq
      neg             widthq

      ; m2 = coeff replicated into every dword
      movd               xm2, coeffd
      VBROADCASTSS        m2, xm2

      ; low qword of xm3 = offset (register on x86-64, memory operand otherwise)
  %if ARCH_X86_64
      movq               xm3, offsetq
  %else
      movq               xm3, offsetm
  %endif

  %if %2 == 8
      ; m4 = 0 (interleave source), m3 = low dword of offset in every dword
      pxor                m4, m4
      VBROADCASTSS        m3, xm3
  %elif %2 == 16
      ; m3 = 64-bit offset in every qword, m4 = upper bound (To only)
      VBROADCASTSD        m3, xm3
  %ifidni %1,To
      VBROADCASTI128      m4, [pack19]
  %endif
  %endif ; %2 == 8/16

  .loop:
      movu                m0, [dstq+widthq]
  %if %2 == 8
      ; Interleave the words with zero words: upper half goes to m1, lower
      ; half stays in m0. With a zero partner word, pmaddwd reduces to
      ; word * low word of coeff, giving one 32-bit product per input word.
      punpckhwd           m1, m0, m4
      punpcklwd           m0, m4

      ; lower half: (x * coeff + offset) >> 14
      pmaddwd             m0, m2
      paddd               m0, m3
      psrad               m0, 14

      ; upper half: (x * coeff + offset) >> 14
      pmaddwd             m1, m2
      paddd               m1, m3
      psrad               m1, 14

      ; back to words with signed saturation
      packssdw            m0, m1
  %elif %2 == 16
      ; pmuldq only multiplies the even dwords, so m1 gets a copy with the
      ; dwords of each pair swapped (0xb1) to expose the odd ones.
      pshufd              m1, m0, 0xb1

      ; even dwords: (x * coeff + offset) >> 18 on 64-bit lanes, then gather
      ; the low dword of each qword into the bottom half (0xd8)
      pmuldq              m0, m2
      paddq               m0, m3
      psrlq               m0, 18
      pshufd              m0, m0, 0xd8

      ; odd dwords: same sequence
      pmuldq              m1, m2
      paddq               m1, m3
      psrlq               m1, 18
      pshufd              m1, m1, 0xd8

      ; re-interleave even/odd results into their original order
      punpckldq           m0, m1
  %ifidni %1,To
      ; bound the result by pack19; m1 is dead here and serves as scratch
      PMINSD              m0, m4, m1
  %endif
  %endif ; %2 == 8/16
      movu     [dstq+widthq], m0
      add             widthq, mmsize
      jl .loop
      RET
  %endmacro
  86. ;-----------------------------------------------------------------------------
  87. ; chrConvertRange
  88. ;
  89. ; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
  90. ; uint32_t coeff, int64_t offset);
  91. ; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
  92. ; uint32_t coeff, int64_t offset);
  93. ;
  94. ;-----------------------------------------------------------------------------
  %macro CHRCONVERTRANGE 2 ; %1 = To/From, %2 = 8/16
  cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
      ; Scale width by 2 (%2 == 8) or by 4 (%2 == 16), advance both plane
      ; pointers by that amount and negate the counter, so that
      ; [dstXq+widthq] starts at the original pointer and the loop runs
      ; until widthq stops being negative.
      shl             widthd, %2 >> 3
      add              dstUq, widthq
      add              dstVq, widthq
      neg             widthq

      ; m4 = coeff replicated into every dword
      movd               xm4, coeffd
      VBROADCASTSS        m4, xm4

      ; low qword of xm5 = offset (register on x86-64, memory operand otherwise)
  %if ARCH_X86_64
      movq               xm5, offsetq
  %else
      movq               xm5, offsetm
  %endif

  %if %2 == 8
      ; m6 = 0 (interleave source), m5 = low dword of offset in every dword
      pxor                m6, m6
      VBROADCASTSS        m5, xm5
  %elif %2 == 16
      ; m5 = 64-bit offset in every qword, m6 = upper bound (To only)
      VBROADCASTSD        m5, xm5
  %ifidni %1,To
      VBROADCASTI128      m6, [pack19]
  %endif
  %endif ; %2 == 8/16

  .loop:
      movu                m0, [dstUq+widthq]
      movu                m2, [dstVq+widthq]
  %if %2 == 8
      ; ---- U: m0 = lower half, m1 = upper half, words interleaved with 0 ----
      punpckhwd           m1, m0, m6
      punpcklwd           m0, m6
      pmaddwd             m0, m4
      paddd               m0, m5
      psrad               m0, 14
      pmaddwd             m1, m4
      paddd               m1, m5
      psrad               m1, 14
      packssdw            m0, m1

      ; ---- V: m2 = lower half, m3 = upper half, words interleaved with 0 ----
      punpckhwd           m3, m2, m6
      punpcklwd           m2, m6
      pmaddwd             m2, m4
      paddd               m2, m5
      psrad               m2, 14
      pmaddwd             m3, m4
      paddd               m3, m5
      psrad               m3, 14
      packssdw            m2, m3
  %elif %2 == 16
      ; ---- U: m0 = even dwords, m1 = odd dwords (pairs swapped by 0xb1) ----
      pshufd              m1, m0, 0xb1
      pmuldq              m0, m4
      paddq               m0, m5
      psrlq               m0, 18
      pshufd              m0, m0, 0xd8
      pmuldq              m1, m4
      paddq               m1, m5
      psrlq               m1, 18
      pshufd              m1, m1, 0xd8
      punpckldq           m0, m1
  %ifidni %1,To
      ; bound by pack19; m1 is dead here and serves as scratch
      PMINSD              m0, m6, m1
  %endif

      ; ---- V: m2 = even dwords, m3 = odd dwords (pairs swapped by 0xb1) ----
      pshufd              m3, m2, 0xb1
      pmuldq              m2, m4
      paddq               m2, m5
      psrlq               m2, 18
      pshufd              m2, m2, 0xd8
      pmuldq              m3, m4
      paddq               m3, m5
      psrlq               m3, 18
      pshufd              m3, m3, 0xd8
      punpckldq           m2, m3
  %ifidni %1,To
      ; bound by pack19; m3 is dead here and serves as scratch
      PMINSD              m2, m6, m3
  %endif
  %endif ; %2 == 8/16
      movu    [dstUq+widthq], m0
      movu    [dstVq+widthq], m2
      add             widthq, mmsize
      jl .loop
      RET
  %endmacro
  ; 128-bit, SSE2: 8-bit paths only
  INIT_XMM sse2
  LUMCONVERTRANGE   To,  8
  CHRCONVERTRANGE   To,  8
  LUMCONVERTRANGE From,  8
  CHRCONVERTRANGE From,  8

  ; 128-bit, SSE4: 16-bit paths (these use pmuldq)
  INIT_XMM sse4
  LUMCONVERTRANGE   To, 16
  CHRCONVERTRANGE   To, 16
  LUMCONVERTRANGE From, 16
  CHRCONVERTRANGE From, 16

  ; 256-bit, AVX2: every variant, when the assembler supports AVX2
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  LUMCONVERTRANGE   To,  8
  LUMCONVERTRANGE   To, 16
  CHRCONVERTRANGE   To,  8
  CHRCONVERTRANGE   To, 16
  LUMCONVERTRANGE From,  8
  LUMCONVERTRANGE From, 16
  CHRCONVERTRANGE From,  8
  CHRCONVERTRANGE From, 16
  %endif