range_convert_neon.S 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. /*
  2. * Copyright (c) 2024 Ramiro Polla
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/aarch64/asm.S"
  .macro lumConvertRange fromto, bit_depth
  /*
   * Emits ff_lumRange<fromto>Jpeg<bit_depth>_neon, which rescales one row
   * in place as dst[i] = (dst[i] * coeff + offset) >> shift.
   *
   * Arguments (AAPCS64):
   *   x0  dst     row pointer (declared int16_t *; accessed as 32-bit
   *               lanes when bit_depth == 16, 16-bit lanes otherwise)
   *   w1  width   element count
   *   w2  coeff   multiplier, broadcast to every 32-bit lane
   *   x3  offset  additive term; all 64 bits are used when
   *               bit_depth == 16, only w3 otherwise
   *
   * bit_depth == 16: signed 32x32->64 multiply-accumulate, shift 18,
   *                  truncating narrow back to 32 bits; the To variant
   *                  then clamps each result to at most (1 << 19) - 1.
   * otherwise:       sign-extend to 32 bits, 32-bit multiply-accumulate,
   *                  shift 14, narrow to 16 bits; the To variant narrows
   *                  with signed saturation, the From variant truncates.
   *
   * The loop body runs at least once and always covers 8 elements, so
   * when width is not a multiple of 8 the elements up to the next
   * multiple of 8 are read and rewritten as well.
   * NOTE(review): presumably callers pad the row accordingly - confirm.
   *
   * Clobbers: x0, w1, flags, v0-v1, v16-v20, v22, v24-v27 (all volatile).
   */
  function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1
  .if \bit_depth == 16
          dup             v26.2d, x3                      // offset in both 64-bit lanes
  .ifc \fromto, To
          // v24 = (8 << 16) - 1 = (1 << 19) - 1: upper clamp for the To direction
          movi            v27.4s, #1
          movi            v24.4s, #1<<3, lsl #16
          sub             v24.4s, v24.4s, v27.4s
  .endif
          dup             v25.4s, w2                      // coeff in every 32-bit lane
  2:
          ld1             {v0.4s, v1.4s}, [x0]            // 8 x 32-bit samples, no writeback
          // seed each 64-bit accumulator pair with the offset
          mov             v16.16b, v26.16b
          mov             v17.16b, v26.16b
          mov             v18.16b, v26.16b
          mov             v19.16b, v26.16b
          // acc += sample * coeff (signed widening multiply)
          smlal           v16.2d, v0.2s, v25.2s
          smlal2          v17.2d, v0.4s, v25.4s
          smlal           v18.2d, v1.2s, v25.2s
          smlal2          v19.2d, v1.4s, v25.4s
          // acc >> 18, keeping the low 32 bits of each shifted lane
          shrn            v0.2s, v16.2d, #18
          shrn2           v0.4s, v17.2d, #18
          shrn            v1.2s, v18.2d, #18
          shrn2           v1.4s, v19.2d, #18
          subs            w1, w1, #8                      // NEON ops below leave the flags alone
  .ifc \fromto, To
          smin            v0.4s, v0.4s, v24.4s
          smin            v1.4s, v1.4s, v24.4s
  .endif
          st1             {v0.4s, v1.4s}, [x0], #32       // store and advance by 8 samples
          b.gt            2b
  .else
          dup             v25.4s, w2                      // coeff in every 32-bit lane
          dup             v26.4s, w3                      // low 32 bits of offset in every lane
  2:
          ld1             {v0.8h}, [x0]                   // 8 x 16-bit samples, no writeback
          // seed the 32-bit accumulators with the offset
          mov             v16.16b, v26.16b
          mov             v18.16b, v26.16b
          // widen the samples to signed 32 bits
          sxtl            v20.4s, v0.4h
          sxtl2           v22.4s, v0.8h
          // acc += sample * coeff
          mla             v16.4s, v20.4s, v25.4s
          mla             v18.4s, v22.4s, v25.4s
  .ifc \fromto, To
          // acc >> 14 with signed saturation to 16 bits
          sqshrn          v0.4h, v16.4s, #14
          sqshrn2         v0.8h, v18.4s, #14
  .else
          // acc >> 14, truncating to 16 bits
          shrn            v0.4h, v16.4s, #14
          shrn2           v0.8h, v18.4s, #14
  .endif
          subs            w1, w1, #8
          st1             {v0.8h}, [x0], #16              // store and advance by 8 samples
          b.gt            2b
  .endif
          ret
  endfunc
  .endm
  .macro chrConvertRange fromto, bit_depth
  /*
   * Emits ff_chrRange<fromto>Jpeg<bit_depth>_neon, which rescales two
   * rows (U and V) in place with the same coefficient and offset:
   * dst[i] = (dst[i] * coeff + offset) >> shift.
   *
   * Arguments (AAPCS64):
   *   x0  dstU    first row pointer  (declared int16_t *; accessed as
   *   x1  dstV    second row pointer  32-bit lanes when bit_depth == 16,
   *                                   16-bit lanes otherwise)
   *   w2  width   element count per row
   *   w3  coeff   multiplier, broadcast to every 32-bit lane
   *   x4  offset  additive term; all 64 bits are used when
   *               bit_depth == 16, only w4 otherwise
   *
   * bit_depth == 16: signed 32x32->64 multiply-accumulate, shift 18,
   *                  truncating narrow back to 32 bits; the To variant
   *                  then clamps each result to at most (1 << 19) - 1.
   * otherwise:       sign-extend to 32 bits, 32-bit multiply-accumulate,
   *                  shift 14, narrow to 16 bits; the To variant narrows
   *                  with signed saturation, the From variant truncates.
   *
   * The loop body runs at least once and always covers 8 elements of
   * each row, so when width is not a multiple of 8 the elements up to
   * the next multiple of 8 are read and rewritten as well.
   * NOTE(review): presumably callers pad both rows accordingly - confirm.
   *
   * Clobbers: x0, x1, w2, flags, v0-v3, v16-v27 (all volatile).
   */
  function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1
  .if \bit_depth == 16
          dup             v26.2d, x4                      // offset in both 64-bit lanes
  .ifc \fromto, To
          // v24 = (8 << 16) - 1 = (1 << 19) - 1: upper clamp for the To direction
          movi            v27.4s, #1
          movi            v24.4s, #1<<3, lsl #16
          sub             v24.4s, v24.4s, v27.4s
  .endif
          dup             v25.4s, w3                      // coeff in every 32-bit lane
  2:
          ld1             {v0.4s, v1.4s}, [x0]            // 8 x 32-bit U samples
          ld1             {v2.4s, v3.4s}, [x1]            // 8 x 32-bit V samples
          // seed each 64-bit accumulator pair with the offset
          mov             v16.16b, v26.16b
          mov             v17.16b, v26.16b
          mov             v18.16b, v26.16b
          mov             v19.16b, v26.16b
          mov             v20.16b, v26.16b
          mov             v21.16b, v26.16b
          mov             v22.16b, v26.16b
          mov             v23.16b, v26.16b
          // acc += sample * coeff (signed widening multiply); U first, then V
          smlal           v16.2d, v0.2s, v25.2s
          smlal2          v17.2d, v0.4s, v25.4s
          smlal           v18.2d, v1.2s, v25.2s
          smlal2          v19.2d, v1.4s, v25.4s
          smlal           v20.2d, v2.2s, v25.2s
          smlal2          v21.2d, v2.4s, v25.4s
          smlal           v22.2d, v3.2s, v25.2s
          smlal2          v23.2d, v3.4s, v25.4s
          // acc >> 18, keeping the low 32 bits of each shifted lane
          shrn            v0.2s, v16.2d, #18
          shrn2           v0.4s, v17.2d, #18
          shrn            v1.2s, v18.2d, #18
          shrn2           v1.4s, v19.2d, #18
          shrn            v2.2s, v20.2d, #18
          shrn2           v2.4s, v21.2d, #18
          shrn            v3.2s, v22.2d, #18
          shrn2           v3.4s, v23.2d, #18
          subs            w2, w2, #8                      // NEON ops below leave the flags alone
  .ifc \fromto, To
          smin            v0.4s, v0.4s, v24.4s
          smin            v1.4s, v1.4s, v24.4s
          smin            v2.4s, v2.4s, v24.4s
          smin            v3.4s, v3.4s, v24.4s
  .endif
          st1             {v0.4s, v1.4s}, [x0], #32       // store U, advance by 8 samples
          st1             {v2.4s, v3.4s}, [x1], #32       // store V, advance by 8 samples
          b.gt            2b
  .else
          dup             v25.4s, w3                      // coeff in every 32-bit lane
          dup             v26.4s, w4                      // low 32 bits of offset in every lane
  2:
          ld1             {v0.8h}, [x0]                   // 8 x 16-bit U samples
          ld1             {v1.8h}, [x1]                   // 8 x 16-bit V samples
          // seed the 32-bit accumulators with the offset
          mov             v16.16b, v26.16b
          mov             v17.16b, v26.16b
          mov             v18.16b, v26.16b
          mov             v19.16b, v26.16b
          // widen the samples to signed 32 bits (low halves, then high halves)
          sxtl            v20.4s, v0.4h
          sxtl            v21.4s, v1.4h
          sxtl2           v22.4s, v0.8h
          sxtl2           v23.4s, v1.8h
          // acc += sample * coeff
          mla             v16.4s, v20.4s, v25.4s
          mla             v17.4s, v21.4s, v25.4s
          mla             v18.4s, v22.4s, v25.4s
          mla             v19.4s, v23.4s, v25.4s
  .ifc \fromto, To
          // acc >> 14 with signed saturation to 16 bits
          sqshrn          v0.4h, v16.4s, #14
          sqshrn          v1.4h, v17.4s, #14
          sqshrn2         v0.8h, v18.4s, #14
          sqshrn2         v1.8h, v19.4s, #14
  .else
          // acc >> 14, truncating to 16 bits
          shrn            v0.4h, v16.4s, #14
          shrn            v1.4h, v17.4s, #14
          shrn2           v0.8h, v18.4s, #14
          shrn2           v1.8h, v19.4s, #14
  .endif
          subs            w2, w2, #8
          st1             {v0.8h}, [x0], #16              // store U, advance by 8 samples
          st1             {v1.8h}, [x1], #16              // store V, advance by 8 samples
          b.gt            2b
  .endif
          ret
  endfunc
  .endm
  // Instantiate the eight exported kernels: for each direction (To, then
  // From) emit luma 8, luma 16, chroma 8 and chroma 16, in that order.
  .irp direction, To, From
          lumConvertRange \direction, 8
          lumConvertRange \direction, 16
          chrConvertRange \direction, 8
          chrConvertRange \direction, 16
  .endr