swscale_unscaled_neon.S 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  /*
   * Copyright (c) 2024 Ramiro Polla
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  20. #include "libavutil/aarch64/asm.S"
  // ff_nv24_to_yuv420p_chroma_neon
  //
  // Reads an interleaved two-channel byte plane two rows at a time, averages
  // every 2x2 block of samples per channel (sum of four bytes, shifted right
  // by 2 with no rounding) and writes the two channels to separate planes.
  //
  // Each inner-loop pass consumes 32 bytes from each of the two source rows
  // and produces 8 bytes in dst1 and 8 bytes in dst2.
  //
  // Preconditions implied by the loop structure:
  //   - w (output bytes per destination row) should be a multiple of 8; the
  //     inner loop always handles 8 outputs, while the per-row padding is
  //     computed from w itself, so any other width overruns the row and
  //     leaves the pointers misplaced for the following rows.
  //   - h (source rows) should be even; rows are always consumed in pairs.
  // A non-positive w or h returns immediately without touching memory.
  //
  // Modifies: x0-x5, x7, x9, x10, v0-v3, condition flags.
  function ff_nv24_to_yuv420p_chroma_neon, export=1
  // x0  uint8_t *dst1
  // x1  int dstStride1
  // x2  uint8_t *dst2
  // x3  int dstStride2
  // x4  const uint8_t *src
  // x5  int srcStride
  // w6  int w
  // w7  int h

          // Both loops below test their counter at the bottom, so without
          // these checks an empty image would still load and store once.
          cmp             w6, #0
          b.le            3f                          // w <= 0: nothing to do
          cmp             w7, #0
          b.le            3f                          // h <= 0: nothing to do

          add             x9, x4, w5, sxtw            // x9 = src + srcStride (second row of the pair)
          lsl             w5, w5, #1                  // srcStride *= 2
          sub             w5, w5, w6, lsl #2          // srcPadding = (2 * srcStride) - (4 * w)
          sub             w1, w1, w6                  // dstPadding1 = dstStride1 - w
          sub             w3, w3, w6                  // dstPadding2 = dstStride2 - w

  1:
          mov             w10, w6                     // w10 = outputs left in this row

  2:
          ld2             {v0.16b, v1.16b}, [x4], #32 // row 1: v0 = even bytes (U1), v1 = odd bytes (V1)
          ld2             {v2.16b, v3.16b}, [x9], #32 // row 2: v2 = even bytes (U2), v3 = odd bytes (V2)
          uaddlp          v0.8h, v0.16b               // pairwise add U1 into 16-bit lanes of v0
          uaddlp          v1.8h, v1.16b               // pairwise add V1 into 16-bit lanes of v1
          uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
          uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
          shrn            v0.8b, v0.8h, #2            // divide the four-sample sum by 4 (truncating)
          shrn            v1.8b, v1.8h, #2            // divide the four-sample sum by 4 (truncating)
          st1             {v0.8b}, [x0], #8           // store 8 U bytes into dst1
          st1             {v1.8b}, [x2], #8           // store 8 V bytes into dst2
          subs            w10, w10, #8
          b.gt            2b

          // next row pair
          add             x4, x4, w5, sxtw            // src1 += srcPadding
          add             x9, x9, w5, sxtw            // src2 += srcPadding
          add             x0, x0, w1, sxtw            // dst1 += dstPadding1
          add             x2, x2, w3, sxtw            // dst2 += dstPadding2
          subs            w7, w7, #2                  // two source rows consumed
          b.gt            1b

  3:
          ret
  endfunc