input.S

/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
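
/*
 * RGB24 -> Y/U/V input conversion for libswscale, NEON version.
 *
 * The coefficient table layout and RGB2YUV_SHIFT == 15 are assumed to
 * match libswscale: the rgb2yuv table holds 32-bit coefficients in the
 * order { ry, gy, by, ru, gu, bu, rv, gv, bv, ... }, and each output
 * sample is a fixed-point dot product, e.g. for luma:
 *
 *     Y = (ry * r + gy * g + by * b + const_offset) >> 9
 *
 * stored as a 16-bit value. The vector loops below produce 16 (or 8)
 * samples per iteration; a scalar tail handles the remainder.
 */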
// Load and deinterleave 16 RGB24 pixels, widening each component to
// sixteen unsigned 16-bit lanes split across two registers.
.macro rgb24_to_yuv_load_rgb, src
        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
        uxtl            v19.8h, v16.8b          // v19: r
        uxtl            v20.8h, v17.8b          // v20: g
        uxtl            v21.8h, v18.8b          // v21: b
        uxtl2           v22.8h, v16.16b         // v22: r
        uxtl2           v23.8h, v17.16b         // v23: g
        uxtl2           v24.8h, v18.16b         // v24: b
.endm
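
/*
 * Accumulate the three coefficient * component products into dst1/dst2
 * (32-bit lanes, both pre-loaded with const_offset from v6), then
 * narrow back to 16 bits with signed saturation:
 *     dst = sat16((const_offset + coef0*r + coef1*g + coef2*b) >> right_shift)
 */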
.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
.endm
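
/*
 * Assumed C prototype (from the libswscale input converter API):
 * void ff_rgb24ToY_neon(uint8_t *dst, const uint8_t *src,
 *                       const uint8_t *unused1, const uint8_t *unused2,
 *                       int width, uint32_t *rgb2yuv, void *opq);
 * so dst is in x0, src in x1, width in w4 and rgb2yuv in x5.
 */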
function ff_rgb24ToY_neon, export=1
        cmp             w4, #0                  // check width > 0
        ldp             w10, w11, [x5]          // w10: ry, w11: gy
        ldr             w12, [x5, #8]           // w12: by
        b.le            3f

        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
        dup             v6.4s, w9               // v6: const_offset in each lane

        cmp             w4, #16
        dup             v0.8h, w10              // v0: ry
        dup             v1.8h, w11              // v1: gy
        dup             v2.8h, w12              // v2: by
        b.lt            2f
1:
        rgb24_to_yuv_load_rgb x1
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        sub             w4, w4, #16             // width -= 16
        add             x1, x1, #48             // src += 48
        cmp             w4, #16                 // width >= 16 ?
        stp             q16, q17, [x0], #32     // store to dst
        b.ge            1b
        cbz             x4, 3f
2:
        ldrb            w13, [x1]               // w13: r
        ldrb            w14, [x1, #1]           // w14: g
        ldrb            w15, [x1, #2]           // w15: b

        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
        smaddl          x13, w14, w11, x13      // x13 += gy * g
        smaddl          x13, w15, w12, x13      // x13 += by * b
        asr             w13, w13, #9            // w13 >>= 9
        sub             w4, w4, #1              // width--
        add             x1, x1, #3              // src += 3
        strh            w13, [x0], #2           // store to dst
        cbnz            w4, 2b
3:
        ret
endfunc
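
/*
 * Broadcast the six chroma coefficients (ru..bv) into v0-v5 and set up
 * const_offset in w9/v6. The half=1 variant doubles the offset because
 * the "half" function sums two adjacent pixels per output and shifts
 * by 10 instead of 9.
 */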
.macro rgb24_load_uv_coeff half
        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
.if \half
        mov             w9, #512
        movk            w9, #128, lsl #16       // w9: const_offset
.else
        mov             w9, #256
        movk            w9, #64, lsl #16        // w9: const_offset
.endif
        dup             v0.8h, w10              // v0: ru
        dup             v1.8h, w11              // v1: gu
        dup             v2.8h, w12              // v2: bu
        dup             v3.8h, w13              // v3: rv
        dup             v4.8h, w14              // v4: gv
        dup             v5.8h, w15              // v5: bv
        dup             v6.4s, w9               // v6: const_offset in each lane
.endm
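
/*
 * Assumed C prototype (from the libswscale input converter API):
 * void ff_rgb24ToUV_half_neon(uint8_t *dstU, uint8_t *dstV,
 *                             const uint8_t *unused0, const uint8_t *src1,
 *                             const uint8_t *src2, int width,
 *                             uint32_t *rgb2yuv, void *opq);
 * so dstU is in x0, dstV in x1, src1 in x3, width in w5 and rgb2yuv in
 * x6. Each U/V output is computed from the sum of two adjacent pixels.
 */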
function ff_rgb24ToUV_half_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f

        cmp             w5, #8
        rgb24_load_uv_coeff half=1
        b.lt            2f
1:
        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
        uaddlp          v19.8h, v16.16b         // v19: r1 + r2 of each pixel pair
        uaddlp          v20.8h, v17.16b         // v20: g1 + g2 of each pixel pair
        uaddlp          v21.8h, v18.16b         // v21: b1 + b2 of each pixel pair
        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
        sub             w5, w5, #8              // width -= 8
        add             x3, x3, #48             // src += 48
        cmp             w5, #8                  // width >= 8 ?
        str             q16, [x0], #16          // store dst_u
        str             q17, [x1], #16          // store dst_v
        b.ge            1b
        cbz             w5, 3f
2:
        ldrb            w2, [x3]                // w2: r1
        ldrb            w4, [x3, #3]            // w4: r2
        add             w2, w2, w4              // w2 = r1 + r2
        ldrb            w4, [x3, #1]            // w4: g1
        ldrb            w7, [x3, #4]            // w7: g2
        add             w4, w4, w7              // w4 = g1 + g2
        ldrb            w7, [x3, #2]            // w7: b1
        ldrb            w8, [x3, #5]            // w8: b2
        add             w7, w7, w8              // w7 = b1 + b2

        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
        smaddl          x8, w4, w11, x8         // dst_u += gu * g
        smaddl          x8, w7, w12, x8         // dst_u += bu * b
        asr             x8, x8, #10             // dst_u >>= 10
        strh            w8, [x0], #2            // store dst_u

        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
        smaddl          x8, w4, w14, x8         // dst_v += gv * g
        smaddl          x8, w7, w15, x8         // dst_v += bv * b
        asr             x8, x8, #10             // dst_v >>= 10
        sub             w5, w5, #1              // width--
        add             x3, x3, #6              // src += 6
        strh            w8, [x1], #2            // store dst_v
        cbnz            w5, 2b
3:
        ret
endfunc
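
/*
 * Assumed C prototype (from the libswscale input converter API):
 * void ff_rgb24ToUV_neon(uint8_t *dstU, uint8_t *dstV,
 *                        const uint8_t *unused0, const uint8_t *src1,
 *                        const uint8_t *src2, int width,
 *                        uint32_t *rgb2yuv, void *opq);
 * register mapping as in the half variant, but one pixel per output.
 */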
function ff_rgb24ToUV_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f

        cmp             w5, #16
        rgb24_load_uv_coeff half=0
        b.lt            2f
1:
        rgb24_to_yuv_load_rgb x3
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        rgb24_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
        rgb24_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
        sub             w5, w5, #16             // width -= 16
        add             x3, x3, #48             // src += 48
        cmp             w5, #16                 // width >= 16 ?
        stp             q16, q17, [x0], #32     // store to dst_u
        stp             q18, q19, [x1], #32     // store to dst_v
        b.ge            1b
        cbz             w5, 3f
2:
        ldrb            w16, [x3]               // w16: r
        ldrb            w17, [x3, #1]           // w17: g
        ldrb            w4, [x3, #2]            // w4: b

        smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
        smaddl          x8, w17, w11, x8        // x8 += gu * g
        smaddl          x8, w4, w12, x8         // x8 += bu * b
        asr             w8, w8, #9              // w8 >>= 9
        strh            w8, [x0], #2            // store to dst_u

        smaddl          x8, w16, w13, x9        // x8 = rv * r + const_offset
        smaddl          x8, w17, w14, x8        // x8 += gv * g
        smaddl          x8, w4, w15, x8         // x8 += bv * b
        asr             w8, w8, #9              // w8 >>= 9
        sub             w5, w5, #1              // width--
        add             x3, x3, #3              // src += 3
        strh            w8, [x1], #2            // store to dst_v
        cbnz            w5, 2b
3:
        ret
endfunc