/*
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
  21. function ff_yuv2planeX_8_neon, export=1
  22. // x0 - const int16_t *filter,
  23. // x1 - int filterSize,
  24. // x2 - const int16_t **src,
  25. // x3 - uint8_t *dest,
  26. // w4 - int dstW,
  27. // x5 - const uint8_t *dither,
  28. // w6 - int offset
  29. ld1 {v0.8b}, [x5] // load 8x8-bit dither
  30. and w6, w6, #7
  31. cbz w6, 1f // check if offsetting present
  32. ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
  33. 1: uxtl v0.8h, v0.8b // extend dither to 16-bit
  34. ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
  35. ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
  36. cmp w1, #8 // if filterSize == 8, branch to specialized version
  37. b.eq 6f
  38. cmp w1, #4 // if filterSize == 4, branch to specialized version
  39. b.eq 8f
  40. cmp w1, #2 // if filterSize == 2, branch to specialized version
  41. b.eq 10f
  42. // The filter size does not match of the of specialized implementations. It is either even or odd. If it is even
  43. // then use the first section below.
  44. mov x7, #0 // i = 0
  45. tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
  46. // fs % 2 == 0
  47. 2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
  48. mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
  49. mov w8, w1 // tmpfilterSize = filterSize
  50. mov x9, x2 // srcp = src
  51. mov x10, x0 // filterp = filter
  52. 3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1]
  53. ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
  54. add x11, x11, x7, lsl #1 // &src[j ][i]
  55. add x12, x12, x7, lsl #1 // &src[j+1][i]
  56. ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
  57. ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
  58. smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
  59. smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
  60. smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
  61. smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
  62. subs w8, w8, #2 // tmpfilterSize -= 2
  63. b.gt 3b // loop until filterSize consumed
  64. sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
  65. sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
  66. uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
  67. st1 {v3.8b}, [x3], #8 // write to destination
  68. subs w4, w4, #8 // dstW -= 8
  69. add x7, x7, #8 // i += 8
  70. b.gt 2b // loop until width consumed
  71. ret
  72. // If filter size is odd (most likely == 1), then use this section.
  73. // fs % 2 != 0
  74. 4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
  75. mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
  76. mov w8, w1 // tmpfilterSize = filterSize
  77. mov x9, x2 // srcp = src
  78. mov x10, x0 // filterp = filter
  79. 5: ldr x11, [x9], #8 // get 1 pointer: src[j]
  80. ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j]
  81. add x11, x11, x7, lsl #1 // &src[j ][i]
  82. ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
  83. smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
  84. smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
  85. subs w8, w8, #1 // tmpfilterSize -= 2
  86. b.gt 5b // loop until filterSize consumed
  87. sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
  88. sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
  89. uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
  90. st1 {v3.8b}, [x3], #8 // write to destination
  91. subs w4, w4, #8 // dstW -= 8
  92. add x7, x7, #8 // i += 8
  93. b.gt 4b // loop until width consumed
  94. ret
  95. 6: // fs=8
  96. ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
  97. ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
  98. ldp x10, x11, [x2, #32] // load 2 pointers: src[j+4] and src[j+5]
  99. ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
  100. // load 8x16-bit values for filter[j], where j=0..7
  101. ld1 {v6.8h}, [x0]
  102. 7:
  103. mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
  104. mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
  105. ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
  106. ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
  107. ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
  108. ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
  109. ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
  110. ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
  111. ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
  112. ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
  113. smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
  114. smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
  115. smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
  116. smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
  117. smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
  118. smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
  119. smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
  120. smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
  121. smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
  122. smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
  123. smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
  124. smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
  125. smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
  126. smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
  127. smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
  128. smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
  129. sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
  130. sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
  131. uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
  132. subs w4, w4, #8 // dstW -= 8
  133. st1 {v3.8b}, [x3], #8 // write to destination
  134. b.gt 7b // loop until width consumed
  135. ret
  136. 8: // fs=4
  137. ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
  138. ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
  139. // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
  140. ld1 {v6.4h}, [x0]
  141. 9:
  142. mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
  143. mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
  144. ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
  145. ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
  146. ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
  147. ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
  148. smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
  149. smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
  150. smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
  151. smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
  152. smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
  153. smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
  154. smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
  155. smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
  156. sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
  157. sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
  158. uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
  159. st1 {v3.8b}, [x3], #8 // write to destination
  160. subs w4, w4, #8 // dstW -= 8
  161. b.gt 9b // loop until width consumed
  162. ret
  163. 10: // fs=2
  164. ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1]
  165. // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
  166. ldr s6, [x0]
  167. 11:
  168. mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
  169. mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
  170. ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
  171. ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
  172. smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
  173. smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
  174. smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
  175. smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
  176. sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
  177. sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
  178. uqshrn v3.8b, v3.8h, #3 // clip8(val>>19)
  179. st1 {v3.8b}, [x3], #8 // write to destination
  180. subs w4, w4, #8 // dstW -= 8
  181. b.gt 11b // loop until width consumed
  182. ret
  183. endfunc
  184. function ff_yuv2plane1_8_neon, export=1
  185. // x0 - const int16_t *src,
  186. // x1 - uint8_t *dest,
  187. // w2 - int dstW,
  188. // x3 - const uint8_t *dither,
  189. // w4 - int offset
  190. ld1 {v0.8b}, [x3] // load 8x8-bit dither
  191. and w4, w4, #7
  192. cbz w4, 1f // check if offsetting present
  193. ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
  194. 1: uxtl v0.8h, v0.8b // extend dither to 32-bit
  195. uxtl v1.4s, v0.4h
  196. uxtl2 v2.4s, v0.8h
  197. 2:
  198. ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
  199. sxtl v4.4s, v3.4h
  200. sxtl2 v5.4s, v3.8h
  201. add v4.4s, v4.4s, v1.4s
  202. add v5.4s, v5.4s, v2.4s
  203. sqshrun v4.4h, v4.4s, #6
  204. sqshrun2 v4.8h, v5.4s, #6
  205. uqshrn v3.8b, v4.8h, #1 // clip8(val>>7)
  206. subs w2, w2, #8 // dstW -= 8
  207. st1 {v3.8b}, [x1], #8 // write to destination
  208. b.gt 2b // loop until width consumed
  209. ret
  210. endfunc