/*
 * Copyright (c) 2020 Martin Storsjo
 * Copyright (c) 2024 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"

#define RGB2YUV_COEFFS 16*4+16*32

#define BY v0.h[0]
#define GY v0.h[1]
#define RY v0.h[2]
#define BU v1.h[0]
#define GU v1.h[1]
#define RU v1.h[2]
#define BV v2.h[0]
#define GV v2.h[1]
#define RV v2.h[2]

#define Y_OFFSET  v22
#define UV_OFFSET v23
const shuf_0321_tbl, align=4
        .byte    0,  3,  2,  1
        .byte    4,  7,  6,  5
        .byte    8, 11, 10,  9
        .byte   12, 15, 14, 13
endconst

const shuf_1230_tbl, align=4
        .byte    1,  2,  3,  0
        .byte    5,  6,  7,  4
        .byte    9, 10, 11,  8
        .byte   13, 14, 15, 12
endconst

const shuf_2103_tbl, align=4
        .byte    2,  1,  0,  3
        .byte    6,  5,  4,  7
        .byte   10,  9,  8, 11
        .byte   14, 13, 12, 15
endconst

const shuf_3012_tbl, align=4
        .byte    3,  0,  1,  2
        .byte    7,  4,  5,  6
        .byte   11,  8,  9, 10
        .byte   15, 12, 13, 14
endconst

const shuf_3210_tbl, align=4
        .byte    3,  2,  1,  0
        .byte    7,  6,  5,  4
        .byte   11, 10,  9,  8
        .byte   15, 14, 13, 12
endconst

const shuf_3102_tbl, align=4
        .byte    3,  1,  0,  2
        .byte    7,  5,  4,  6
        .byte   11,  9,  8, 10
        .byte   15, 13, 12, 14
endconst

const shuf_2013_tbl, align=4
        .byte    2,  0,  1,  3
        .byte    6,  4,  5,  7
        .byte   10,  8,  9, 11
        .byte   14, 12, 13, 15
endconst

const shuf_1203_tbl, align=4
        .byte    1,  2,  0,  3
        .byte    5,  6,  4,  7
        .byte    9, 10,  8, 11
        .byte   13, 14, 12, 15
endconst

const shuf_2130_tbl, align=4
        .byte    2,  1,  3,  0
        .byte    6,  5,  7,  4
        .byte   10,  9, 11,  8
        .byte   14, 13, 15, 12
endconst

// convert rgb to 16-bit y, u, or v
// uses v3 and v4
.macro rgbconv16 dst, b, g, r, bc, gc, rc, shr_bits
        smull           v3.4s, \b\().4h, \bc
        smlal           v3.4s, \g\().4h, \gc
        smlal           v3.4s, \r\().4h, \rc
        smull2          v4.4s, \b\().8h, \bc
        smlal2          v4.4s, \g\().8h, \gc
        smlal2          v4.4s, \r\().8h, \rc        // v3:v4 = b * bc + g * gc + r * rc (32-bit)
        shrn            \dst\().4h, v3.4s, \shr_bits
        shrn2           \dst\().8h, v4.4s, \shr_bits // dst = b * bc + g * gc + r * rc (16-bit)
.endm
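
// Reference model for the macro above (a sketch, not part of the build):
// per 16-bit lane it computes, at 32-bit intermediate precision,
//     dst[i] = (int16_t)((b[i] * bc + g[i] * gc + r[i] * rc) >> shr_bits);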

// void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
//                          uint8_t *vdst, int width, int height, int lumStride,
//                          int chromStride, int srcStride, int32_t *rgb2yuv);
function ff_rgb24toyv12_neon, export=1
// x0  const uint8_t *src
// x1  uint8_t *ydst
// x2  uint8_t *udst
// x3  uint8_t *vdst
// w4  int width
// w5  int height
// w6  int lumStride
// w7  int chromStride
        ldrsw           x14, [sp]
        ldr             x15, [sp, #8]
// x14 int srcStride
// x15 int32_t *rgb2yuv

// extend width and stride parameters
        uxtw            x4, w4
        sxtw            x6, w6
        sxtw            x7, w7

// src1  = x0
// src2  = x10
        add             x10, x0, x14            // x10 = src + srcStride
        lsl             x14, x14, #1            // srcStride *= 2
        add             x11, x4, x4, lsl #1     // x11 = 3 * width
        sub             x14, x14, x11           // srcPadding = (2 * srcStride) - (3 * width)

// ydst1 = x1
// ydst2 = x11
        add             x11, x1, x6             // x11 = ydst + lumStride
        lsl             x6, x6, #1              // lumStride *= 2
        sub             x6, x6, x4              // lumPadding = (2 * lumStride) - width

        sub             x7, x7, x4, lsr #1      // chromPadding = chromStride - (width / 2)

// load rgb2yuv coefficients into v0, v1, and v2
        add             x15, x15, #RGB2YUV_COEFFS
        ld1             {v0.8h-v2.8h}, [x15]    // load 24 values

// load offset constants
        movi            Y_OFFSET.8h,  #0x10, lsl #8
        movi            UV_OFFSET.8h, #0x80, lsl #8
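
// Loop sketch: each pass of the inner loop converts a 16x2 block of pixels.
// Y is produced per pixel; U and V once per 2x2 block, from the sum of the
// four pixels' components (the extra >> 2 is folded into the #9 shift).
// After the #7/#9 shifts the values keep 8 fractional bits, so the final
// addhn computes (v + (bias << 8)) >> 8, applying the bias (16 for luma,
// 128 for chroma) while narrowing back to 8 bits.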
1:
        mov             w15, w4                 // w15 = width
2:
// load first line
        ld3             {v26.16b, v27.16b, v28.16b}, [x0], #48
// widen first line to 16-bit
        uxtl            v16.8h, v26.8b          // v16 = B11
        uxtl            v17.8h, v27.8b          // v17 = G11
        uxtl            v18.8h, v28.8b          // v18 = R11
        uxtl2           v19.8h, v26.16b         // v19 = B12
        uxtl2           v20.8h, v27.16b         // v20 = G12
        uxtl2           v21.8h, v28.16b         // v21 = R12
// calculate Y values for first line
        rgbconv16       v24, v16, v17, v18, BY, GY, RY, #7 // v24 = Y11
        rgbconv16       v25, v19, v20, v21, BY, GY, RY, #7 // v25 = Y12
// load second line
        ld3             {v26.16b, v27.16b, v28.16b}, [x10], #48
// pairwise add and save rgb values to calculate average
        addp            v5.8h, v16.8h, v19.8h
        addp            v6.8h, v17.8h, v20.8h
        addp            v7.8h, v18.8h, v21.8h
// widen second line to 16-bit
        uxtl            v16.8h, v26.8b          // v16 = B21
        uxtl            v17.8h, v27.8b          // v17 = G21
        uxtl            v18.8h, v28.8b          // v18 = R21
        uxtl2           v19.8h, v26.16b         // v19 = B22
        uxtl2           v20.8h, v27.16b         // v20 = G22
        uxtl2           v21.8h, v28.16b         // v21 = R22
// calculate Y values for second line
        rgbconv16       v26, v16, v17, v18, BY, GY, RY, #7 // v26 = Y21
        rgbconv16       v27, v19, v20, v21, BY, GY, RY, #7 // v27 = Y22
// pairwise add rgb values to calculate average
        addp            v16.8h, v16.8h, v19.8h
        addp            v17.8h, v17.8h, v20.8h
        addp            v18.8h, v18.8h, v21.8h
// calculate sum of r, g, b components in 2x2 blocks
        add             v16.8h, v16.8h, v5.8h
        add             v17.8h, v17.8h, v6.8h
        add             v18.8h, v18.8h, v7.8h
// calculate U and V values
        rgbconv16       v28, v16, v17, v18, BU, GU, RU, #9 // v28 = U
        rgbconv16       v29, v16, v17, v18, BV, GV, RV, #9 // v29 = V
// add offsets and narrow all values
        addhn           v24.8b, v24.8h, Y_OFFSET.8h
        addhn           v25.8b, v25.8h, Y_OFFSET.8h
        addhn           v26.8b, v26.8h, Y_OFFSET.8h
        addhn           v27.8b, v27.8h, Y_OFFSET.8h
        addhn           v28.8b, v28.8h, UV_OFFSET.8h
        addhn           v29.8b, v29.8h, UV_OFFSET.8h

        subs            w15, w15, #16

// store output
        st1             {v24.8b, v25.8b}, [x1], #16  // store ydst1
        st1             {v26.8b, v27.8b}, [x11], #16 // store ydst2
        st1             {v28.8b}, [x2], #8           // store udst
        st1             {v29.8b}, [x3], #8           // store vdst

        b.gt            2b

        subs            w5, w5, #2

// row += 2
        add             x0,  x0,  x14           // src1  += srcPadding
        add             x10, x10, x14           // src2  += srcPadding
        add             x1,  x1,  x6            // ydst1 += lumPadding
        add             x11, x11, x6            // ydst2 += lumPadding
        add             x2,  x2,  x7            // udst  += chromPadding
        add             x3,  x3,  x7            // vdst  += chromPadding
        b.gt            1b

        ret
endfunc

// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
//                               uint8_t *dest, int width, int height,
//                               int src1Stride, int src2Stride, int dstStride);
function ff_interleave_bytes_neon, export=1
        sub             w5, w5, w3              // src1Padding = src1Stride - width
        sub             w6, w6, w3              // src2Padding = src2Stride - width
        sub             w7, w7, w3, lsl #1      // dstPadding  = dstStride - 2 * width
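
// Informal scalar model of this function (a sketch, not part of the build):
//     for (int x = 0; x < width; x++) {
//         dest[2 * x + 0] = src1[x];
//         dest[2 * x + 1] = src2[x];
//     }
// repeated per row, with each pointer advanced by its stride.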
1:
        ands            w8, w3, #0xfffffff0     // & ~15
        b.eq            3f
2:
        ld1             {v0.16b}, [x0], #16
        ld1             {v1.16b}, [x1], #16
        subs            w8, w8, #16
        st2             {v0.16b, v1.16b}, [x2], #32
        b.gt            2b

        tst             w3, #15
        b.eq            9f

3:
        tst             w3, #8
        b.eq            4f
        ld1             {v0.8b}, [x0], #8
        ld1             {v1.8b}, [x1], #8
        st2             {v0.8b, v1.8b}, [x2], #16
4:
        tst             w3, #4
        b.eq            5f
        ld1             {v0.s}[0], [x0], #4
        ld1             {v1.s}[0], [x1], #4
        zip1            v0.8b, v0.8b, v1.8b     // interleave the two 4-byte groups
        st1             {v0.8b}, [x2], #8
5:
        ands            w8, w3, #3
        b.eq            9f
6:
        ldrb            w9,  [x0], #1
        ldrb            w10, [x1], #1
        subs            w8, w8, #1
        bfi             w9, w10, #8, #8         // pack the two bytes into one halfword
        strh            w9, [x2], #2
        b.gt            6b
9:
        subs            w4, w4, #1
        b.eq            0f
        add             x0, x0, w5, sxtw
        add             x1, x1, w6, sxtw
        add             x2, x2, w7, sxtw
        b               1b
0:
        ret
endfunc

// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
//                                 int width, int height, int srcStride,
//                                 int dst1Stride, int dst2Stride);
function ff_deinterleave_bytes_neon, export=1
        sub             w5, w5, w3, lsl #1      // srcPadding  = srcStride - 2 * width
        sub             w6, w6, w3              // dst1Padding = dst1Stride - width
        sub             w7, w7, w3              // dst2Padding = dst2Stride - width
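
// Informal scalar model (a sketch, not part of the build):
//     for (int x = 0; x < width; x++) {
//         dst1[x] = src[2 * x + 0];
//         dst2[x] = src[2 * x + 1];
//     }
// repeated per row, with each pointer advanced by its stride.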
1:
        ands            w8, w3, #0xfffffff0     // & ~15
        b.eq            3f
2:
        ld2             {v0.16b, v1.16b}, [x0], #32
        subs            w8, w8, #16
        st1             {v0.16b}, [x1], #16
        st1             {v1.16b}, [x2], #16
        b.gt            2b

        tst             w3, #15
        b.eq            9f

3:
        tst             w3, #8
        b.eq            4f
        ld2             {v0.8b, v1.8b}, [x0], #16
        st1             {v0.8b}, [x1], #8
        st1             {v1.8b}, [x2], #8
4:
        tst             w3, #4
        b.eq            5f
        ld1             {v0.8b}, [x0], #8
        shrn            v1.8b, v0.8h, #8        // v1 = odd source bytes (high half of each halfword)
        xtn             v0.8b, v0.8h            // v0 = even source bytes (low half of each halfword)
        st1             {v0.s}[0], [x1], #4
        st1             {v1.s}[0], [x2], #4
5:
        ands            w8, w3, #3
        b.eq            9f
6:
        ldrh            w9, [x0], #2
        subs            w8, w8, #1
        ubfx            w10, w9, #8, #8         // w10 = second source byte
        strb            w9,  [x1], #1
        strb            w10, [x2], #1
        b.gt            6b
9:
        subs            w4, w4, #1
        b.eq            0f
        add             x0, x0, w5, sxtw
        add             x1, x1, w6, sxtw
        add             x2, x2, w7, sxtw
        b               1b
0:
        ret
endfunc

.macro neon_shuf shuf
function ff_shuffle_bytes_\shuf\()_neon, export=1
        movrel          x9, shuf_\shuf\()_tbl
        ld1             {v1.16b}, [x9]
        and             w5, w2, #~15
        and             w3, w2, #8
        and             w4, w2, #4
        cbz             w5, 2f
1:
        ld1             {v0.16b}, [x0], #16
        subs            w5, w5, #16
        tbl             v0.16b, {v0.16b}, v1.16b
        st1             {v0.16b}, [x1], #16
        b.gt            1b
2:
        cbz             w3, 3f
        ld1             {v0.8b}, [x0], #8
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.8b}, [x1], #8
3:
// handle the final 4 bytes: permutations expressible as a byte reverse
// and/or rotate use scalar rev/ror; the rest reuse the tbl lookup
        cbz             w4, 4f
.if \shuf == 0321
        ldr             w5, [x0]
        rev             w5, w5                  // 0123 -> 3210
        ror             w5, w5, #24             // 3210 -> 0321
        str             w5, [x1]
.endif
.if \shuf == 1230
        ldr             w5, [x0]
        ror             w5, w5, #8              // 0123 -> 1230
        str             w5, [x1]
.endif
.if \shuf == 2103
        ldr             w5, [x0]
        rev             w5, w5                  // 0123 -> 3210
        ror             w5, w5, #8              // 3210 -> 2103
        str             w5, [x1]
.endif
.if \shuf == 3012
        ldr             w5, [x0]
        ror             w5, w5, #24             // 0123 -> 3012
        str             w5, [x1]
.endif
.if \shuf == 3210
        ldr             w5, [x0]
        rev             w5, w5                  // 0123 -> 3210
        str             w5, [x1]
.endif
.if \shuf == 3102 || \shuf == 2013 || \shuf == 1203 || \shuf == 2130
        ld1             {v0.s}[0], [x0]
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.s}[0], [x1]
.endif
4:
        ret
endfunc
.endm
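
// Reference model (informal): each table entry is the source index of a
// destination byte within its 4-byte group, so e.g. ff_shuffle_bytes_0321
// behaves like
//     dst[4*i + 0] = src[4*i + 0];
//     dst[4*i + 1] = src[4*i + 3];
//     dst[4*i + 2] = src[4*i + 2];
//     dst[4*i + 3] = src[4*i + 1];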

neon_shuf 0321
neon_shuf 1230
neon_shuf 2103
neon_shuf 3012
neon_shuf 3102
neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210

/*
  v0-v7 - two consecutive lines
  x0 - upper Y destination
  x1 - U destination
  x2 - V destination
  x3 - upper src line
  w5 - width/iteration counter: counts line pairs for yuv420, single lines for yuv422
  x6 - lum padding
  x7 - chrom padding
  x8 - src padding
  w9 - number of bytes remaining in the tail
  x10 - lower Y destination
  w12 - tmp
  x13 - lower src line
  w14 - tmp
  w17 - set to 1 if the last line has to be handled separately (odd height)
*/

// one fast path iteration processes 16 uyvy tuples
// is_line_tail is set to 1 when the final 16 tuples are being processed
// skip_storing_chroma is set to 1 when the final line is processed and the height is odd
.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
        ld4             {v0.16b - v3.16b}, [x3], #64
.if ! \is_line_tail
        subs            w14, w14, #32
.endif
.if ! \skip_storing_chroma
.ifc \dst_fmt, yuv420
        ld4             {v4.16b - v7.16b}, [x13], #64
.endif
.ifc \dst_fmt, yuv420 // store UV
.ifc \src_fmt, uyvy
        uhadd           v0.16b, v4.16b, v0.16b  // halving sum of U
        uhadd           v2.16b, v6.16b, v2.16b  // halving sum of V
.else
        uhadd           v1.16b, v5.16b, v1.16b  // halving sum of U
        uhadd           v3.16b, v7.16b, v3.16b  // halving sum of V
.endif
.endif
.ifc \src_fmt, uyvy
        st1             {v2.16b}, [x2], #16
        st1             {v0.16b}, [x1], #16
.else
        st1             {v3.16b}, [x2], #16
        st1             {v1.16b}, [x1], #16
.endif
.ifc \dst_fmt, yuv420 // store_y
.ifc \src_fmt, uyvy
        mov             v6.16b, v5.16b          // st2 needs consecutive registers
        st2             {v6.16b,v7.16b}, [x10], #32
.else
        mov             v5.16b, v4.16b          // st2 needs consecutive registers
        st2             {v5.16b,v6.16b}, [x10], #32
.endif
.endif
.endif // ! \skip_storing_chroma
.ifc \src_fmt, uyvy
        mov             v2.16b, v1.16b          // st2 needs consecutive registers
        st2             {v2.16b,v3.16b}, [x0], #32
.else
        mov             v1.16b, v0.16b          // st2 needs consecutive registers
        st2             {v1.16b,v2.16b}, [x0], #32
.endif
.endm
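
// Informal model of one iteration for uyvy -> yuv420 with both lines loaded
// (a sketch, not part of the build):
//     for (int i = 0; i < 16; i++) {            // 16 tuples = 32 pixels/line
//         udst[i] = (utop[i] + ubot[i]) >> 1;   // uhadd: truncating average
//         vdst[i] = (vtop[i] + vbot[i]) >> 1;
//         ytop[2*i] = y0top[i];  ytop[2*i + 1] = y1top[i];
//         ybot[2*i] = y0bot[i];  ybot[2*i + 1] = y1bot[i];
//     }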

// rewind the pointers so that the final fast path iteration processes the
// last 32 pixels of the line, possibly overlapping pixels already written
// if the height is odd, processing the final line is simplified
.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
        add             x3, x3, w9, sxtw #1     // advance by the remaining width (2 bytes/pixel)
        sub             x3, x3, #64             // then step back one full iteration
.if ! \is_final_odd_line
.ifc \dst_fmt, yuv420
        add             x13, x13, w9, sxtw #1
        sub             x13, x13, #64
        add             x10, x10, w9, sxtw
        sub             x10, x10, #32
.endif
.endif
        add             x0, x0, w9, sxtw
        sub             x0, x0, #32
.if ! \is_final_odd_line
        asr             w14, w9, #1
        add             x1, x1, w14, sxtw
        sub             x1, x1, #16
        add             x2, x2, w14, sxtw
        sub             x2, x2, #16
.endif
.endm

.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
.ifc \dst_fmt, yuv422
.ifc \src_fmt, uyvy
        ldrb            w12, [x3], #1           // U
        ldrb            w14, [x3], #1           // Y0
        strb            w12, [x1], #1
        strb            w14, [x0], #1
        ldrb            w12, [x3], #1           // V
        ldrb            w14, [x3], #1           // Y1
        strb            w12, [x2], #1
        strb            w14, [x0], #1
.else
        ldrb            w12, [x3], #1           // Y0
        ldrb            w14, [x3], #1           // U
        strb            w12, [x0], #1
        strb            w14, [x1], #1
        ldrb            w12, [x3], #1           // Y1
        ldrb            w14, [x3], #1           // V
        strb            w12, [x0], #1
        strb            w14, [x2], #1
.endif
.endif
.ifc \dst_fmt, yuv420
.ifc \src_fmt, uyvy
.if \skip_storing_chroma
        ldrb            w12, [x3], #2           // load Y values only, stepping over chroma
        ldrb            w14, [x3], #2
        strb            w12, [x0], #1
        strb            w14, [x0], #1
.else
        ldrb            w12, [x3], #1           // U (upper line)
        ldrb            w14, [x13], #1          // U (lower line)
        add             w12, w12, w14
        lsr             w12, w12, #1            // average the two U values
        strb            w12, [x1], #1
        ldrb            w14, [x3], #1           // Y0 (upper line)
        ldrb            w12, [x13], #1          // Y0 (lower line)
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        ldrb            w14, [x13], #1          // V (lower line)
        ldrb            w12, [x3], #1           // V (upper line)
        add             w12, w12, w14
        lsr             w12, w12, #1            // average the two V values
        strb            w12, [x2], #1
        ldrb            w14, [x3], #1           // Y1 (upper line)
        ldrb            w12, [x13], #1          // Y1 (lower line)
        strb            w14, [x0], #1
        strb            w12, [x10], #1
.endif
.else
.if \skip_storing_chroma
        ldrb            w12, [x3], #2           // load Y values only, stepping over chroma
        ldrb            w14, [x3], #2
        strb            w12, [x0], #1
        strb            w14, [x0], #1
.else
        ldrb            w12, [x3], #1           // Y0 (upper line)
        ldrb            w14, [x13], #1          // Y0 (lower line)
        strb            w12, [x0], #1
        strb            w14, [x10], #1
        ldrb            w12, [x3], #1           // U (upper line)
        ldrb            w14, [x13], #1          // U (lower line)
        add             w12, w12, w14
        lsr             w12, w12, #1            // average the two U values
        strb            w12, [x1], #1
        ldrb            w14, [x3], #1           // Y1 (upper line)
        ldrb            w12, [x13], #1          // Y1 (lower line)
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        ldrb            w14, [x13], #1          // V (lower line)
        ldrb            w12, [x3], #1           // V (upper line)
        add             w12, w12, w14
        lsr             w12, w12, #1            // average the two V values
        strb            w12, [x2], #1
.endif
.endif
.endif
.endm
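
// Informal model of one slow path step for yuyv -> yuv420 with both lines
// present (a sketch, not part of the build):
//     *ytop++ = s0[0];  *ybot++ = s1[0];
//     *udst++ = (s0[1] + s1[1]) >> 1;
//     *ytop++ = s0[2];  *ybot++ = s1[2];
//     *vdst++ = (s0[3] + s1[3]) >> 1;
//     s0 += 4;  s1 += 4;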

.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
        add             x3, x3, x8
        add             x0, x0, x6
.ifc \dst_fmt, yuv420
        add             x13, x13, x8
        add             x10, x10, x6
.endif
        add             x1, x1, x7
        add             x2, x2, x7
.endm
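
// The functions generated below are assumed to follow rgb2rgb's
// interleaved-to-planar prototype (see rgb2rgb.h for the authoritative form):
// void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
//                           const uint8_t *src, int width, int height,
//                           int lumStride, int chromStride, int srcStride);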
.macro interleaved_yuv_to_planar src_fmt, dst_fmt
function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
        sxtw            x6, w6
        sxtw            x7, w7
        ldrsw           x8, [sp]
        ands            w11, w4, #~31           // choose between fast and slow path
.ifc \dst_fmt, yuv420
        add             x10, x0, x6             // x10 = lower Y destination
        add             x13, x3, x8             // x13 = lower src line
        add             x8, x8, x8              // src stride covers two lines
        add             x6, x6, x6              // lum stride covers two lines
        and             w17, w5, #1             // w17 = 1 if the height is odd
        asr             w5, w5, #1              // count line pairs instead of lines
.endif
        asr             w9, w4, #1              // w9 = width / 2
        sub             x8, x8, w4, sxtw #1     // src offset
        sub             x6, x6, w4, sxtw        // lum offset
        sub             x7, x7, x9              // chr offset
        b.eq            6f
1:                                              // fast path - the width is at least 32
        and             w14, w4, #~31           // w14 is the main loop counter
        and             w9, w4, #31             // w9 holds the remaining width, 0 to 31
2:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
        b.ne            2b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt
        b.ne            1b
.ifc \dst_fmt, yuv420 // handle the last line in case the height is odd
        cbz             w17, 3f
        and             w14, w4, #~31
4:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
        b.ne            4b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
3:
.endif
        ret
6:                                              // slow path - width is at most 31
        and             w9, w4, #31
7:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 0
        b.ne            7b
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt
        b.ne            6b
.ifc \dst_fmt, yuv420
        cbz             w17, 8f
        and             w9, w4, #31
.ifc \src_fmt, uyvy
        add             x3, x3, #1              // point at the luma bytes
.endif
5:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 1
        b.ne            5b
8:
.endif
        ret
endfunc
.endm

interleaved_yuv_to_planar uyvy, yuv422
interleaved_yuv_to_planar uyvy, yuv420
interleaved_yuv_to_planar yuyv, yuv422
interleaved_yuv_to_planar yuyv, yuv420