/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
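
// Load 16 packed RGB(A) pixels from \src and widen each channel to 16 bits:
// v19/v20/v21 = r/g/b of the low 8 pixels, v22/v23/v24 = r/g/b of the high 8.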
.macro rgb_to_yuv_load_rgb src, element=3
.if \element == 3
        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
.else
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
.endif
        uxtl            v19.8h, v16.8b          // v19: r
        uxtl            v20.8h, v17.8b          // v20: g
        uxtl            v21.8h, v18.8b          // v21: b
        uxtl2           v22.8h, v16.16b         // v22: r
        uxtl2           v23.8h, v17.16b         // v23: g
        uxtl2           v24.8h, v18.16b         // v24: b
.endm
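
// Same, for alpha-first 32-bit formats: the alpha channel (v16) is ignored
// and channels 1/2/3 are widened into the same r/g/b registers.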
.macro argb_to_yuv_load_rgb src
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
        uxtl            v21.8h, v19.8b          // v21: b
        uxtl2           v24.8h, v19.16b         // v24: b
        uxtl            v19.8h, v17.8b          // v19: r
        uxtl            v20.8h, v18.8b          // v20: g
        uxtl2           v22.8h, v17.16b         // v22: r
        uxtl2           v23.8h, v18.16b         // v23: g
.endm
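
// Compute 8 output samples:
// dst = (coef0 * r + coef1 * g + coef2 * b + const_offset) >> right_shift,
// where dst1/dst2 are 32-bit scratch accumulators and const_offset is
// preloaded in v6.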
.macro rgb_to_yuv_product r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset
        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset
        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx * r
        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx * g
        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx * b
        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx * r
        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx * g
        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx * b
        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = dst1 >> right_shift
        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
.endm
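
// ff_<fmt>ToY_neon: convert packed 24/32-bit RGB formats to luma.
// Register usage: x0: dst, x1: src, w4: width, x5: rgb2yuv coefficients
// (ry, gy, by at byte offsets 0, 4, 8). The bgr entry point loads the
// coefficients in swapped order so the shared loop can treat channels
// 0/1/2 uniformly as r/g/b.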
.macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToY_neon, export=1
        cmp             w4, #0                  // check width > 0
        ldp             w12, w11, [x5]          // w12: ry, w11: gy
        ldr             w10, [x5, #8]           // w10: by
        b.gt            4f
        ret
endfunc

function ff_\fmt_rgb\()ToY_neon, export=1
        cmp             w4, #0                  // check width > 0
        ldp             w10, w11, [x5]          // w10: ry, w11: gy
        ldr             w12, [x5, #8]           // w12: by
        b.le            3f
4:
        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
        dup             v6.4s, w9               // v6: const_offset
        cmp             w4, #16
        dup             v0.8h, w10
        dup             v1.8h, w11
        dup             v2.8h, w12
        b.lt            2f
1:
.if \alpha_first
        argb_to_yuv_load_rgb x1
.else
        rgb_to_yuv_load_rgb x1, \element
.endif
        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        sub             w4, w4, #16             // width -= 16
        add             x1, x1, #(16*\element)
        cmp             w4, #16                 // width >= 16 ?
        stp             q16, q17, [x0], #32     // store to dst
        b.ge            1b
        cbz             x4, 3f
2:
.if \alpha_first
        ldrb            w13, [x1, #1]           // w13: r
        ldrb            w14, [x1, #2]           // w14: g
        ldrb            w15, [x1, #3]           // w15: b
.else
        ldrb            w13, [x1]               // w13: r
        ldrb            w14, [x1, #1]           // w14: g
        ldrb            w15, [x1, #2]           // w15: b
.endif
        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
        smaddl          x13, w14, w11, x13      // x13 += gy * g
        smaddl          x13, w15, w12, x13      // x13 += by * b
        asr             w13, w13, #9            // w13 >>= 9
        sub             w4, w4, #1              // width--
        add             x1, x1, #\element
        strh            w13, [x0], #2           // store to dst
        cbnz            w4, 2b
3:
        ret
endfunc
.endm

rgbToY_neon bgr24, rgb24, element=3
rgbToY_neon bgra32, rgba32, element=4
rgbToY_neon abgr32, argb32, element=4, alpha_first=1
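
// Broadcast the six chroma coefficients (w10-w15) into v0-v5 and the rounding
// offset into v6. The half variant uses a doubled offset because each input
// channel holds the sum of two pixels (results are shifted right by 10
// instead of 9).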
.macro rgb_set_uv_coeff half
.if \half
        mov             w9, #512
        movk            w9, #128, lsl #16       // w9: const_offset
.else
        mov             w9, #256
        movk            w9, #64, lsl #16        // w9: const_offset
.endif
        dup             v0.8h, w10
        dup             v1.8h, w11
        dup             v2.8h, w12
        dup             v3.8h, w13
        dup             v4.8h, w14
        dup             v5.8h, w15
        dup             v6.4s, w9               // v6: const_offset
.endm
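
// Load two horizontally adjacent pixels and sum them per channel:
// w2 = r1 + r2, w4 = g1 + g2, w7 = b1 + b2.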
.macro rgb_load_add_half off_r1, off_r2, off_g1, off_g2, off_b1, off_b2
        ldrb            w2, [x3, #\off_r1]      // w2: r1
        ldrb            w4, [x3, #\off_r2]      // w4: r2
        add             w2, w2, w4              // w2 = r1 + r2
        ldrb            w4, [x3, #\off_g1]      // w4: g1
        ldrb            w7, [x3, #\off_g2]      // w7: g2
        add             w4, w4, w7              // w4 = g1 + g2
        ldrb            w7, [x3, #\off_b1]      // w7: b1
        ldrb            w8, [x3, #\off_b2]      // w8: b2
        add             w7, w7, w8              // w7 = b1 + b2
.endm
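
// ff_<fmt>ToUV_half_neon: convert packed 24/32-bit RGB formats to chroma,
// processing pixels in horizontal pairs (each output uses the sum of two
// adjacent pixels). Register usage: x0: dst_u, x1: dst_v, x3: src,
// w5: width, x6: rgb2yuv coefficients (ru, gu, bu, rv, gv, bv at byte
// offsets 12..32).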
.macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_half_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f
        ldp             w12, w11, [x6, #12]     // w12: ru, w11: gu
        ldp             w10, w15, [x6, #20]     // w10: bu, w15: rv
        ldp             w14, w13, [x6, #28]     // w14: gv, w13: bv
        b               4f
endfunc

function ff_\fmt_rgb\()ToUV_half_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f
        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
4:
        cmp             w5, #8
        rgb_set_uv_coeff half=1
        b.lt            2f
1:
.if \element == 3
        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
.else
        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
.endif
.if \alpha_first
        uaddlp          v21.8h, v19.16b         // v21: b
        uaddlp          v20.8h, v18.16b         // v20: g
        uaddlp          v19.8h, v17.16b         // v19: r
.else
        uaddlp          v19.8h, v16.16b         // v19: r
        uaddlp          v20.8h, v17.16b         // v20: g
        uaddlp          v21.8h, v18.16b         // v21: b
.endif
        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
        sub             w5, w5, #8              // width -= 8
        add             x3, x3, #(16*\element)
        cmp             w5, #8                  // width >= 8 ?
        str             q16, [x0], #16          // store dst_u
        str             q17, [x1], #16          // store dst_v
        b.ge            1b
        cbz             w5, 3f
2:
.if \alpha_first
        rgb_load_add_half 1, 5, 2, 6, 3, 7
.else
    .if \element == 3
        rgb_load_add_half 0, 3, 1, 4, 2, 5
    .else
        rgb_load_add_half 0, 4, 1, 5, 2, 6
    .endif
.endif
        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
        smaddl          x8, w4, w11, x8         // dst_u += gu * g
        smaddl          x8, w7, w12, x8         // dst_u += bu * b
        asr             x8, x8, #10             // dst_u >>= 10
        strh            w8, [x0], #2            // store dst_u
        smaddl          x8, w2, w13, x9         // dst_v = rv * r + const_offset
        smaddl          x8, w4, w14, x8         // dst_v += gv * g
        smaddl          x8, w7, w15, x8         // dst_v += bv * b
        asr             x8, x8, #10             // dst_v >>= 10
        sub             w5, w5, #1
        add             x3, x3, #(2*\element)
        strh            w8, [x1], #2            // store dst_v
        cbnz            w5, 2b
3:
        ret
endfunc
.endm

rgbToUV_half_neon bgr24, rgb24, element=3
rgbToUV_half_neon bgra32, rgba32, element=4
rgbToUV_half_neon abgr32, argb32, element=4, alpha_first=1
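
// ff_<fmt>ToUV_neon: same as the _half variant but without horizontal
// subsampling; one U/V pair is produced per input pixel.
// Register usage: x0: dst_u, x1: dst_v, x3: src, w5: width, x6: rgb2yuv.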
.macro rgbToUV_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f
        ldp             w12, w11, [x6, #12]     // w12: ru, w11: gu
        ldp             w10, w15, [x6, #20]     // w10: bu, w15: rv
        ldp             w14, w13, [x6, #28]     // w14: gv, w13: bv
        b               4f
endfunc

function ff_\fmt_rgb\()ToUV_neon, export=1
        cmp             w5, #0                  // check width > 0
        b.le            3f
        ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
        ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
        ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
4:
        cmp             w5, #16
        rgb_set_uv_coeff half=0
        b.lt            2f
1:
.if \alpha_first
        argb_to_yuv_load_rgb x3
.else
        rgb_to_yuv_load_rgb x3, \element
.endif
        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
        sub             w5, w5, #16
        add             x3, x3, #(16*\element)
        cmp             w5, #16
        stp             q16, q17, [x0], #32     // store to dst_u
        stp             q18, q19, [x1], #32     // store to dst_v
        b.ge            1b
        cbz             w5, 3f
2:
.if \alpha_first
        ldrb            w16, [x3, #1]           // w16: r
        ldrb            w17, [x3, #2]           // w17: g
        ldrb            w4, [x3, #3]            // w4: b
.else
        ldrb            w16, [x3]               // w16: r
        ldrb            w17, [x3, #1]           // w17: g
        ldrb            w4, [x3, #2]            // w4: b
.endif
        smaddl          x8, w16, w10, x9        // x8 = ru * r + const_offset
        smaddl          x8, w17, w11, x8        // x8 += gu * g
        smaddl          x8, w4, w12, x8         // x8 += bu * b
        asr             w8, w8, #9              // w8 >>= 9
        strh            w8, [x0], #2            // store to dst_u
        smaddl          x8, w16, w13, x9        // x8 = rv * r + const_offset
        smaddl          x8, w17, w14, x8        // x8 += gv * g
        smaddl          x8, w4, w15, x8         // x8 += bv * b
        asr             w8, w8, #9              // w8 >>= 9
        sub             w5, w5, #1              // width--
        add             x3, x3, #\element
        strh            w8, [x1], #2            // store to dst_v
        cbnz            w5, 2b
3:
        ret
endfunc
.endm

rgbToUV_neon bgr24, rgb24, element=3
rgbToUV_neon bgra32, rgba32, element=4
rgbToUV_neon abgr32, argb32, element=4, alpha_first=1