rgb2rgb_neon.S

/*
 * Copyright (c) 2020 Martin Storsjo
 * Copyright (c) 2024 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"

#define RGB2YUV_COEFFS 16*4+16*32

#define BY v0.h[0]
#define GY v0.h[1]
#define RY v0.h[2]
#define BU v1.h[0]
#define GU v1.h[1]
#define RU v1.h[2]
#define BV v2.h[0]
#define GV v2.h[1]
#define RV v2.h[2]

#define Y_OFFSET  v22
#define UV_OFFSET v23
// convert rgb to 16-bit y, u, or v
// uses v3 and v4
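// the result is kept at 16-bit precision (>> 7); the callers add the
// Y/UV bias and perform the final >> 8 when narrowing with addhn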
.macro rgbconv16 dst, b, g, r, bc, gc, rc
        smull           v3.4s, \b\().4h, \bc
        smlal           v3.4s, \g\().4h, \gc
        smlal           v3.4s, \r\().4h, \rc
        smull2          v4.4s, \b\().8h, \bc
        smlal2          v4.4s, \g\().8h, \gc
        smlal2          v4.4s, \r\().8h, \rc    // v3:v4 = b * bc + g * gc + r * rc (32-bit)
        shrn            \dst\().4h, v3.4s, #7
        shrn2           \dst\().8h, v4.4s, #7   // dst = b * bc + g * gc + r * rc (16-bit)
.endm
// void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
//                          uint8_t *vdst, int width, int height, int lumStride,
//                          int chromStride, int srcStride, int32_t *rgb2yuv);
function ff_rgb24toyv12_neon, export=1
// x0  const uint8_t *src
// x1  uint8_t *ydst
// x2  uint8_t *udst
// x3  uint8_t *vdst
// w4  int width
// w5  int height
// w6  int lumStride
// w7  int chromStride
        ldrsw           x14, [sp]
        ldr             x15, [sp, #8]
// x14  int srcStride
// x15  int32_t *rgb2yuv

        // extend width and stride parameters
        uxtw            x4, w4
        sxtw            x6, w6
        sxtw            x7, w7

        // src1 = x0
        // src2 = x10
        add             x10, x0, x14            // x10 = src + srcStride
        lsl             x14, x14, #1            // srcStride *= 2
        add             x11, x4, x4, lsl #1     // x11 = 3 * width
        sub             x14, x14, x11           // srcPadding = (2 * srcStride) - (3 * width)

        // ydst1 = x1
        // ydst2 = x11
        add             x11, x1, x6             // x11 = ydst + lumStride
        lsl             x6, x6, #1              // lumStride *= 2
        sub             x6, x6, x4              // lumPadding = (2 * lumStride) - width

        sub             x7, x7, x4, lsr #1      // chromPadding = chromStride - (width / 2)

        // load rgb2yuv coefficients into v0, v1, and v2
        add             x15, x15, #RGB2YUV_COEFFS
        ld1             {v0.8h-v2.8h}, [x15]    // load 24 values

        // load offset constants
        movi            Y_OFFSET.8h,  #0x10, lsl #8
        movi            UV_OFFSET.8h, #0x80, lsl #8
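        // the offsets are stored pre-shifted left by 8 so that the addhn
        // narrowing below adds the bias and drops the low byte in one step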
1:
        mov             w15, w4                 // w15 = width
2:
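        // each iteration converts 16 pixels from two source rows into
        // 2x16 luma samples and 8 chroma samples for each of U and V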
        // load first line
        ld3             {v26.16b, v27.16b, v28.16b}, [x0], #48

        // widen first line to 16-bit
        uxtl            v16.8h, v26.8b          // v16 = B11
        uxtl            v17.8h, v27.8b          // v17 = G11
        uxtl            v18.8h, v28.8b          // v18 = R11
        uxtl2           v19.8h, v26.16b         // v19 = B12
        uxtl2           v20.8h, v27.16b         // v20 = G12
        uxtl2           v21.8h, v28.16b         // v21 = R12

        // calculate Y values for first line
        rgbconv16       v24, v16, v17, v18, BY, GY, RY // v24 = Y11
        rgbconv16       v25, v19, v20, v21, BY, GY, RY // v25 = Y12

        // load second line
        ld3             {v26.16b, v27.16b, v28.16b}, [x10], #48

        // pairwise add and save rgb values to calculate average
        addp            v5.8h, v16.8h, v19.8h
        addp            v6.8h, v17.8h, v20.8h
        addp            v7.8h, v18.8h, v21.8h

        // widen second line to 16-bit
        uxtl            v16.8h, v26.8b          // v16 = B21
        uxtl            v17.8h, v27.8b          // v17 = G21
        uxtl            v18.8h, v28.8b          // v18 = R21
        uxtl2           v19.8h, v26.16b         // v19 = B22
        uxtl2           v20.8h, v27.16b         // v20 = G22
        uxtl2           v21.8h, v28.16b         // v21 = R22

        // calculate Y values for second line
        rgbconv16       v26, v16, v17, v18, BY, GY, RY // v26 = Y21
        rgbconv16       v27, v19, v20, v21, BY, GY, RY // v27 = Y22

        // pairwise add rgb values to calculate average
        addp            v16.8h, v16.8h, v19.8h
        addp            v17.8h, v17.8h, v20.8h
        addp            v18.8h, v18.8h, v21.8h

        // calculate average
        add             v16.8h, v16.8h, v5.8h
        add             v17.8h, v17.8h, v6.8h
        add             v18.8h, v18.8h, v7.8h
        ushr            v16.8h, v16.8h, #2
        ushr            v17.8h, v17.8h, #2
        ushr            v18.8h, v18.8h, #2

        // calculate U and V values
        rgbconv16       v28, v16, v17, v18, BU, GU, RU // v28 = U
        rgbconv16       v29, v16, v17, v18, BV, GV, RV // v29 = V

        // add offsets and narrow all values
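        // the >> 7 from rgbconv16 plus the >> 8 from addhn give the final
        // >> 15 applied to the coefficient sums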
        addhn           v24.8b, v24.8h, Y_OFFSET.8h
        addhn           v25.8b, v25.8h, Y_OFFSET.8h
        addhn           v26.8b, v26.8h, Y_OFFSET.8h
        addhn           v27.8b, v27.8h, Y_OFFSET.8h
        addhn           v28.8b, v28.8h, UV_OFFSET.8h
        addhn           v29.8b, v29.8h, UV_OFFSET.8h

        subs            w15, w15, #16

        // store output
        st1             {v24.8b, v25.8b}, [x1], #16  // store ydst1
        st1             {v26.8b, v27.8b}, [x11], #16 // store ydst2
        st1             {v28.8b}, [x2], #8           // store udst
        st1             {v29.8b}, [x3], #8           // store vdst

        b.gt            2b

        subs            w5, w5, #2

        // row += 2
        add             x0, x0, x14             // src1  += srcPadding
        add             x10, x10, x14           // src2  += srcPadding
        add             x1, x1, x6              // ydst1 += lumPadding
        add             x11, x11, x6            // ydst2 += lumPadding
        add             x2, x2, x7              // udst  += chromPadding
        add             x3, x3, x7              // vdst  += chromPadding
        b.gt            1b

        ret
endfunc
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
//                               uint8_t *dest, int width, int height,
//                               int src1Stride, int src2Stride, int dstStride);
function ff_interleave_bytes_neon, export=1
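// x0  const uint8_t *src1
// x1  const uint8_t *src2
// x2  uint8_t *dest
// w3  int width
// w4  int height
// w5  int src1Stride
// w6  int src2Stride
// w7  int dstStride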
        sub             w5, w5, w3              // src1Padding = src1Stride - width
        sub             w6, w6, w3              // src2Padding = src2Stride - width
        sub             w7, w7, w3, lsl #1      // dstPadding  = dstStride - (2 * width)
1:
        ands            w8, w3, #0xfffffff0     // & ~15
        b.eq            3f
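        // interleave 16 bytes from each source per iteration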
2:
        ld1             {v0.16b}, [x0], #16
        ld1             {v1.16b}, [x1], #16
        subs            w8, w8, #16
        st2             {v0.16b, v1.16b}, [x2], #32
        b.gt            2b

        tst             w3, #15
        b.eq            9f
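        // handle the remaining 1-15 pixels in 8-, 4- and 1-wide steps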
3:
        tst             w3, #8
        b.eq            4f
        ld1             {v0.8b}, [x0], #8
        ld1             {v1.8b}, [x1], #8
        st2             {v0.8b, v1.8b}, [x2], #16
4:
        tst             w3, #4
        b.eq            5f
        ld1             {v0.s}[0], [x0], #4
        ld1             {v1.s}[0], [x1], #4
        zip1            v0.8b, v0.8b, v1.8b
        st1             {v0.8b}, [x2], #8
5:
        ands            w8, w3, #3
        b.eq            9f
6:
        ldrb            w9,  [x0], #1
        ldrb            w10, [x1], #1
        subs            w8, w8, #1
        bfi             w9, w10, #8, #8         // pack the src1/src2 bytes into one halfword
        strh            w9, [x2], #2
        b.gt            6b
9:
        subs            w4, w4, #1
        b.eq            0f
        add             x0, x0, w5, sxtw        // src1 += src1Padding
        add             x1, x1, w6, sxtw        // src2 += src2Padding
        add             x2, x2, w7, sxtw        // dest += dstPadding
        b               1b
0:
        ret
endfunc
// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
//                                 int width, int height, int srcStride,
//                                 int dst1Stride, int dst2Stride);
function ff_deinterleave_bytes_neon, export=1
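// x0  const uint8_t *src
// x1  uint8_t *dst1
// x2  uint8_t *dst2
// w3  int width
// w4  int height
// w5  int srcStride
// w6  int dst1Stride
// w7  int dst2Stride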
        sub             w5, w5, w3, lsl #1      // srcPadding  = srcStride - (2 * width)
        sub             w6, w6, w3              // dst1Padding = dst1Stride - width
        sub             w7, w7, w3              // dst2Padding = dst2Stride - width
1:
        ands            w8, w3, #0xfffffff0     // & ~15
        b.eq            3f
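        // deinterleave 32 bytes into 16 + 16 per iteration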
2:
        ld2             {v0.16b, v1.16b}, [x0], #32
        subs            w8, w8, #16
        st1             {v0.16b}, [x1], #16
        st1             {v1.16b}, [x2], #16
        b.gt            2b

        tst             w3, #15
        b.eq            9f
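        // handle the remaining 1-15 pixels in 8-, 4- and 1-wide steps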
3:
        tst             w3, #8
        b.eq            4f
        ld2             {v0.8b, v1.8b}, [x0], #16
        st1             {v0.8b}, [x1], #8
        st1             {v1.8b}, [x2], #8
4:
        tst             w3, #4
        b.eq            5f
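        // load 4 byte pairs as 8 bytes and split them by treating the vector
        // as 4 halfwords: on little-endian the low byte of each halfword goes
        // to dst1 and the high byte to dst2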
        ld1             {v0.8b}, [x0], #8
        shrn            v1.8b, v0.8h, #8
        xtn             v0.8b, v0.8h
        st1             {v0.s}[0], [x1], #4
        st1             {v1.s}[0], [x2], #4
5:
        ands            w8, w3, #3
        b.eq            9f
6:
        ldrh            w9,  [x0], #2
        subs            w8, w8, #1
        ubfx            w10, w9, #8, #8         // w10 = second byte of the pair
        strb            w9,  [x1], #1
        strb            w10, [x2], #1
        b.gt            6b
9:
        subs            w4, w4, #1
        b.eq            0f
        add             x0, x0, w5, sxtw        // src  += srcPadding
        add             x1, x1, w6, sxtw        // dst1 += dst1Padding
        add             x2, x2, w7, sxtw        // dst2 += dst2Padding
        b               1b
0:
        ret
endfunc