hscale.S 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449
  1. /*
  2. * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
  3. * Copyright (c) 2019-2021 Sebastian Pop <spop@amazon.com>
  4. * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "libavutil/aarch64/asm.S"
  23. /*
  24. ;-----------------------------------------------------------------------------
  25. ; horizontal line scaling
  26. ;
  27. ; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
  28. ; (SwsContext *c, int{16,32}_t *dst,
  29. ; int dstW, const uint{8,16}_t *src,
  30. ; const int16_t *filter,
  31. ; const int32_t *filterPos, int filterSize);
  32. ;
  33. ; Scale one horizontal line. Input is either 8-bit width or 16-bit width
  34. ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
  35. ; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
  36. ; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
  37. ; output pixel is generated from $filterSize input pixels, the position of
  38. ; the first pixel is given in filterPos[nOutputPixel].
  39. ;----------------------------------------------------------------------------- */
  40. function ff_hscale8to15_X8_neon, export=1
  // Horizontal scaling of 8-bit source pixels into int16_t intermediates.
  // Each outer iteration (label 1) produces 4 output pixels; each inner iteration (label 2)
  // consumes 8 filter taps for every one of those 4 pixels.
  // x0 SwsContext *c (never read; overwritten below and reused as a source pointer)
  // x1 int16_t *dst
  // w2 int dstW
  // x3 const uint8_t *src
  // x4 const int16_t *filter
  // x5 const int32_t *filterPos
  // w6 int filterSize
  // NOTE(review): both loops only exit once their counter is <= 0 after stepping by 8 (taps) and
  // 4 (pixels), so this presumably expects filterSize % 8 == 0 and dstW % 4 == 0 -- confirm against the caller.
  41. sbfiz x7, x6, #1, #32 // x7 = sign-extended filterSize*2: byte stride between two pixels' filter rows (int16 taps)
  42. 1: ldr w8, [x5], #4 // filterPos[idx]
  43. ldr w0, [x5], #4 // filterPos[idx + 1]
  44. ldr w11, [x5], #4 // filterPos[idx + 2]
  45. ldr w9, [x5], #4 // filterPos[idx + 3]
  46. mov x16, x4 // filter0 = filter
  47. add x12, x16, x7 // filter1 = filter0 + filterSize*2
  48. add x13, x12, x7 // filter2 = filter1 + filterSize*2
  49. add x4, x13, x7 // filter3 = filter2 + filterSize*2; x4 itself is post-incremented by the inner loop, so once all taps are consumed it sits on the next group's filter0
  50. movi v0.2d, #0 // val sum part 1 (for dst[0])
  51. movi v1.2d, #0 // val sum part 2 (for dst[1])
  52. movi v2.2d, #0 // val sum part 3 (for dst[2])
  53. movi v3.2d, #0 // val sum part 4 (for dst[3])
  54. add x17, x3, w8, uxtw // srcp + filterPos[0]
  55. add x8, x3, w0, uxtw // srcp + filterPos[1]
  56. add x0, x3, w11, uxtw // srcp + filterPos[2]
  57. add x11, x3, w9, uxtw // srcp + filterPos[3]
  58. mov w15, w6 // filterSize counter
  59. 2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
  60. ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
  61. ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
  62. ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
  63. uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
  64. smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
  65. smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
  66. ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
  67. ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
  68. uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
  69. smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
  70. uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
  71. smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
  72. smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
  73. ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
  74. smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
  75. ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
  76. subs w15, w15, #8 // j -= 8: processed 8/filterSize
  77. uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
  78. smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
  79. smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
  80. b.gt 2b // inner loop if filterSize not consumed completely (flags come from the subs above; NEON ops do not touch them)
  81. addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
  82. addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
  83. addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding: lane k now holds the full sum for dst[k]
  84. subs w2, w2, #4 // dstW -= 4
  85. sqshrn v0.4h, v0.4s, #7 // shift the four 32-bit sums right by 7 and narrow to 4x16-bit with signed saturation
  86. st1 {v0.4h}, [x1], #8 // write to destination part0123
  87. b.gt 1b // loop until end of line
  88. ret
  89. endfunc
  90. function ff_hscale8to15_X4_neon, export=1
  // Horizontal scaling of 8-bit source pixels into int16_t intermediates, 4 output pixels per
  // outer iteration: an 8-tap-wide inner loop followed by a fixed 4-tap tail.
  91. // x0 SwsContext *c (not used as an argument; w0 is reused as the remaining-taps counter)
  92. // x1 int16_t *dst
  93. // w2 int dstW
  94. // x3 const uint8_t *src
  95. // x4 const int16_t *filter
  96. // x5 const int32_t *filterPos
  97. // w6 int filterSize
  98. // This function for filter sizes that are 4 mod 8. In other words, anything that's 0 mod 4 but not
  99. // 0 mod 8. It also assumes that dstW is 0 mod 4.
  // NOTE(review): the 8-wide loop body executes once before its counter is tested, so filterSize
  // is presumably >= 12 whenever this routine is selected -- confirm against the dispatch code.
  100. lsl w7, w6, #1 // w7 = filterSize * 2 (bytes per pixel's filter row; the write to w7 zeroes the top half of x7)
  101. 1:
  102. ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
  103. ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
  104. movi v16.2d, #0 // initialize accumulator for idx + 0
  105. movi v17.2d, #0 // initialize accumulator for idx + 1
  106. movi v18.2d, #0 // initialize accumulator for idx + 2
  107. movi v19.2d, #0 // initialize accumulator for idx + 3
  108. mov x12, x4 // filter pointer for idx + 0
  109. add x13, x4, x7 // filter pointer for idx + 1
  110. add x8, x3, w8, uxtw // srcp + filterPos[idx + 0]
  111. add x9, x3, w9, uxtw // srcp + filterPos[idx + 1]
  112. add x14, x13, x7 // filter pointer for idx + 2
  113. add x10, x3, w10, uxtw // srcp + filterPos[idx + 2]
  114. add x11, x3, w11, uxtw // srcp + filterPos[idx + 3]
  115. mov w0, w6 // copy filterSize to a temp register, w0
  116. add x5, x5, #16 // advance the filterPos pointer
  117. add x15, x14, x7 // filter pointer for idx + 3
  118. mov x16, xzr // byte offset applied to all four filter pointers (they stay fixed; only x16 advances)
  119. 2:
  120. // This section loops over 8-wide chunks of filter size
  121. ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0
  122. ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0
  123. ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1
  124. ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1
  125. uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
  126. uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
  127. ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2
  128. ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2
  129. smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
  130. smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
  131. ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3
  132. ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3
  133. sub w0, w0, #8 // decrement the remaining filterSize counter
  134. smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
  135. smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
  136. uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
  137. uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
  138. smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
  139. smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
  140. cmp w0, #8 // are there at least 8 more elements in filter to consume?
  141. add x16, x16, #16 // advance the offsetting register for filter values
  142. smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
  143. smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3
  144. b.ge 2b // branch back to inner loop
  145. // complete the remaining 4 filter elements
  146. sub x17, x7, #8 // filterSize*2 - 8: byte offset of the last 4 taps within each pixel's filter row
  147. ldr s4, [x8] // load 4 bytes from srcp for idx + 0 (the rest of v4 is zeroed by the scalar load)
  148. ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0
  149. ldr s5, [x9] // load 4 bytes from srcp for idx + 1
  150. ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1
  151. uxtl v4.8h, v4.8b // unsigned extend long for idx + 0
  152. uxtl v5.8h, v5.8b // unsigned extend long for idx + 1
  153. ldr s6, [x10] // load 4 bytes from srcp for idx + 2
  154. ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2
  155. smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
  156. smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
  157. ldr s7, [x11] // load 4 bytes from srcp for idx + 3
  158. ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3
  159. uxtl v6.8h, v6.8b // unsigned extend long for idx + 2
  160. uxtl v7.8h, v7.8b // unsigned extend long for idx + 3
  161. addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1
  162. smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
  163. smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
  164. addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3
  165. addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3
  166. subs w2, w2, #4 // dstW -= 4
  167. sqshrn v0.4h, v16.4s, #7 // shift the four 32-bit sums right by 7 and narrow to 4x16-bit with signed saturation
  168. st1 {v0.4h}, [x1], #8 // write to destination idx 0..3
  169. add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4
  170. b.gt 1b // loop until end of line (flags still come from the subs above)
  171. ret
  172. endfunc
  173. function ff_hscale8to15_4_neon, export=1
  // Horizontal scaling of 8-bit source pixels into int16_t intermediates with exactly 4 filter
  // taps per output pixel. The vector path produces 8 output pixels per iteration; a scalar-style
  // tail (label 2) produces one output pixel at a time.
  174. // x0 SwsContext *c (not used)
  175. // x1 int16_t *dst
  176. // w2 int dstW
  177. // x3 const uint8_t *src
  178. // x4 const int16_t *filter
  179. // x5 const int32_t *filterPos
  180. // w6 int filterSize (never read; the code is hard-wired to 4 taps)
  181. // x8-x15 registers for gathering src data
  182. // v0 madd accumulator 4S
  183. // v1-v4 filter values (16 bit) 8H
  184. // v5 madd accumulator 4S
  185. // v16-v19 src values (8 bit) 8B
  186. // This implementation has 4 sections:
  187. // 1. Prefetch src data
  188. // 2. Interleaved prefetching src data and madd
  189. // 3. Complete madd
  190. // 4. Complete remaining iterations when dstW % 8 != 0 or dstW < 16
  191. sub sp, sp, #32 // allocate 32 bytes on the stack (scratch for 8 pixels x 4 source bytes; released on both exit paths)
  192. cmp w2, #16 // if dstW < 16, skip to the last block used for wrapping up
  193. b.lt 2f
  194. // load 8 values from filterPos to be used as offsets into src
  195. ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1]
  196. ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3]
  197. ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5]
  198. ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7]
  199. add x5, x5, #32 // advance filterPos
  200. // gather random access data from src into contiguous memory
  201. ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3]
  202. ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3]
  203. ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3]
  204. ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3]
  205. ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3]
  206. ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3]
  207. ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3]
  208. ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3]
  209. stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
  210. stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
  211. stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
  212. stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
  213. 1:
  214. ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // de-interleave the scratch bytes: v16+k lane i = source byte k of output pixel idx + i
  215. // load 8 values from filterPos to be used as offsets into src
  216. ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1], next iteration
  217. ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3], next iteration
  218. ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5], next iteration
  219. ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7], next iteration
  220. movi v0.2d, #0 // Clear madd accumulator for idx 0..3
  221. movi v5.2d, #0 // Clear madd accumulator for idx 4..7
  222. ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7, de-interleaved: v1+k lane i = tap k of output pixel idx + i
  223. add x5, x5, #32 // advance filterPos
  224. // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
  225. uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
  226. uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
  227. ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration
  228. ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration
  229. uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
  230. uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
  231. ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration
  232. ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration
  233. smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
  234. smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
  235. ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration
  236. ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration
  237. smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
  238. smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
  239. ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration
  240. ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration
  241. smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
  242. smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
  243. stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
  244. stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
  245. smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
  246. smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
  247. stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
  248. stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
  249. sub w2, w2, #8 // dstW -= 8
  250. sqshrn v0.4h, v0.4s, #7 // shift sums for idx 0..3 right by 7 and narrow to 4x16-bit with signed saturation
  251. sqshrn v1.4h, v5.4s, #7 // same for idx 4..7 (v1 is free to reuse: its filter taps have been consumed)
  252. st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
  253. cmp w2, #16 // continue on main loop if there are at least 16 iterations left
  254. b.ge 1b
  255. // last full iteration: consumes the source bytes already staged on the stack, without prefetching another group
  256. ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
  257. ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
  258. movi v0.2d, #0 // Clear madd accumulator for idx 0..3
  259. movi v5.2d, #0 // Clear madd accumulator for idx 4..7
  260. uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
  261. uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
  262. uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
  263. uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
  264. smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
  265. smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
  266. smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
  267. smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
  268. smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
  269. smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
  270. smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
  271. smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
  272. subs w2, w2, #8 // dstW -= 8
  273. sqshrn v0.4h, v0.4s, #7 // shift sums for idx 0..3 right by 7 and narrow to 4x16-bit with signed saturation
  274. sqshrn v1.4h, v5.4s, #7 // same for idx 4..7
  275. st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
  276. cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
  277. add sp, sp, #32 // clean up stack
  278. ret
  279. // finish up when dstW % 8 != 0 or dstW < 16
  280. 2:
  281. // load src
  282. ldr w8, [x5], #4 // filterPos[i]
  283. add x9, x3, w8, uxtw // calculate the address for src load
  284. ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
  285. // load filter
  286. ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
  287. uxtl v5.8h, v5.8b // unsigned extend long, convert src data to 16-bit (only the low 4 lanes are used below)
  288. smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
  289. addv s0, v0.4s // add up products of src and filter values
  290. sqshrn h0, s0, #7 // shift the single 32-bit sum right by 7 and narrow to 16-bit with signed saturation
  291. st1 {v0.h}[0], [x1], #2 // dst[i] = ...
  292. sub w2, w2, #1 // dstW--
  293. cbnz w2, 2b
  294. add sp, sp, #32 // clean up stack
  295. ret
  296. endfunc
  297. function ff_hscale8to19_4_neon, export=1
  // Horizontal scaling of 8-bit source pixels into int32_t intermediates with exactly 4 filter
  // taps per output pixel. Results are shifted right by 3 and clamped from above to (1 << 19) - 1.
  // The vector path produces 8 output pixels per iteration; the tail (label 2) produces one at a time.
  298. // x0 SwsContext *c (unused)
  299. // x1 int32_t *dst
  300. // w2 int dstW
  301. // x3 const uint8_t *src (read 4 bytes at a time: one 32-bit load per output pixel)
  302. // x4 const int16_t *filter (multiplied as signed 16-bit by smull/smlal)
  303. // x5 const int32_t *filterPos
  304. // w6 int filterSize (never read; the code is hard-wired to 4 taps)
  305. movi v18.4s, #1
  306. movi v17.4s, #1
  307. shl v18.4s, v18.4s, #19
  308. sub v18.4s, v18.4s, v17.4s // max allowed value: (1 << 19) - 1 in every lane
  309. cmp w2, #16
  310. b.lt 2f // move to last block (no stack has been allocated on this path)
  311. ldp w8, w9, [x5] // filterPos[0], filterPos[1]
  312. ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
  313. ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
  314. ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
  315. add x5, x5, #32
  316. // load 4 source bytes for each of the 8 output pixels
  317. ldr w8, [x3, w8, uxtw]
  318. ldr w9, [x3, w9, uxtw]
  319. ldr w10, [x3, w10, uxtw]
  320. ldr w11, [x3, w11, uxtw]
  321. ldr w12, [x3, w12, uxtw]
  322. ldr w13, [x3, w13, uxtw]
  323. ldr w14, [x3, w14, uxtw]
  324. ldr w15, [x3, w15, uxtw]
  325. sub sp, sp, #32 // 32-byte scratch area for the gathered source bytes (released before the tail below)
  326. stp w8, w9, [sp]
  327. stp w10, w11, [sp, #8]
  328. stp w12, w13, [sp, #16]
  329. stp w14, w15, [sp, #24]
  330. 1:
  331. ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] // de-interleave the scratch bytes: vk lane i = source byte k of output pixel i
  332. ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7], de-interleaved: v28+k lane i = tap k of output pixel i
  333. // load filterPositions into registers for next iteration
  334. ldp w8, w9, [x5] // filterPos[0], filterPos[1]
  335. ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3]
  336. ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5]
  337. ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
  338. add x5, x5, #32
  339. uxtl v0.8h, v0.8b // widen source byte 0 of each pixel to 16-bit
  340. ldr w8, [x3, w8, uxtw] // gather source bytes for the next iteration (interleaved with the multiplies)
  341. smull v5.4s, v0.4h, v28.4h // multiply first column of src
  342. ldr w9, [x3, w9, uxtw]
  343. smull2 v6.4s, v0.8h, v28.8h // same for output pixels 4..7
  344. stp w8, w9, [sp] // safe to overwrite: the ld4 at the top of the loop already consumed the scratch area
  345. uxtl v1.8h, v1.8b
  346. ldr w10, [x3, w10, uxtw]
  347. smlal v5.4s, v1.4h, v29.4h // multiply second column of src
  348. ldr w11, [x3, w11, uxtw]
  349. smlal2 v6.4s, v1.8h, v29.8h
  350. stp w10, w11, [sp, #8]
  351. uxtl v2.8h, v2.8b
  352. ldr w12, [x3, w12, uxtw]
  353. smlal v5.4s, v2.4h, v30.4h // multiply third column of src
  354. ldr w13, [x3, w13, uxtw]
  355. smlal2 v6.4s, v2.8h, v30.8h
  356. stp w12, w13, [sp, #16]
  357. uxtl v3.8h, v3.8b
  358. ldr w14, [x3, w14, uxtw]
  359. smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src
  360. ldr w15, [x3, w15, uxtw]
  361. smlal2 v6.4s, v3.8h, v31.8h
  362. stp w14, w15, [sp, #24]
  363. sub w2, w2, #8 // dstW -= 8
  364. sshr v5.4s, v5.4s, #3 // arithmetic shift right by 3, output pixels 0..3
  365. sshr v6.4s, v6.4s, #3 // arithmetic shift right by 3, output pixels 4..7
  366. smin v5.4s, v5.4s, v18.4s // clamp from above to (1 << 19) - 1
  367. smin v6.4s, v6.4s, v18.4s
  368. st1 {v5.4s, v6.4s}, [x1], #32 // write 8 x int32_t results
  369. cmp w2, #16 // stay in the loop while at least 16 outputs remain
  370. b.ge 1b
  371. // here we make last iteration, without updating the registers
  372. ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
  373. ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
  374. uxtl v0.8h, v0.8b
  375. uxtl v1.8h, v1.8b
  376. smull v5.4s, v0.4h, v28.4h // tap 0, output pixels 0..3
  377. smull2 v6.4s, v0.8h, v28.8h // tap 0, output pixels 4..7
  378. uxtl v2.8h, v2.8b
  379. smlal v5.4s, v1.4h, v29.4h // tap 1
  380. smlal2 v6.4s, v1.8h, v29.8h
  381. uxtl v3.8h, v3.8b
  382. smlal v5.4s, v2.4h, v30.4h // tap 2
  383. smlal2 v6.4s, v2.8h, v30.8h
  384. smlal v5.4s, v3.4h, v31.4h // tap 3
  385. smlal2 v6.4s, v3.8h, v31.8h
  386. sshr v5.4s, v5.4s, #3 // arithmetic shift right by 3
  387. sshr v6.4s, v6.4s, #3
  388. smin v5.4s, v5.4s, v18.4s // clamp from above to (1 << 19) - 1
  389. smin v6.4s, v6.4s, v18.4s
  390. sub w2, w2, #8 // dstW -= 8
  391. st1 {v5.4s, v6.4s}, [x1], #32
  392. add sp, sp, #32 // restore stack
  393. cbnz w2, 2f // leftover outputs are handled one at a time below
  394. ret
  395. 2:
  396. ldr w8, [x5], #4 // load filterPos
  397. add x9, x3, w8, uxtw // src + filterPos
  398. ld1 {v0.s}[0], [x9] // load 4 consecutive uint8_t source values into lane 0
  399. ld1 {v31.4h}, [x4], #8 // load the 4 filter taps for this output pixel
  400. uxtl v0.8h, v0.8b // widen to 16-bit (only the low 4 lanes are used below)
  401. smull v5.4s, v0.4h, v31.4h // 4 products src * filter
  402. saddlv d0, v5.4s // sum the 4 products into one 64-bit value
  403. sqshrn s0, d0, #3 // shift right by 3 and narrow to 32-bit with signed saturation
  404. smin v0.4s, v0.4s, v18.4s // clamp from above to (1 << 19) - 1 (only lane 0 is stored)
  405. st1 {v0.s}[0], [x1], #4
  406. sub w2, w2, #1 // dstW--
  407. cbnz w2, 2b // if iterations remain jump to beginning
  408. ret
  409. endfunc
  410. function ff_hscale8to19_X8_neon, export=1
  // Horizontal scaling of 8-bit source pixels into int32_t intermediates.
  // Each outer iteration (label 1) produces 4 output pixels; each inner iteration (label 2)
  // consumes 8 filter taps for every one of those 4 pixels. Sums are shifted right by 3 and
  // clamped from above to (1 << 19) - 1.
  // x0 SwsContext *c (never read; overwritten below and reused as a source pointer)
  // x1 int32_t *dst
  // w2 int dstW
  // x3 const uint8_t *src
  // x4 const int16_t *filter
  // x5 const int32_t *filterPos
  // w6 int filterSize
  // NOTE(review): both loops only exit once their counter is <= 0 after stepping by 8 (taps) and
  // 4 (pixels), so this presumably expects filterSize % 8 == 0 and dstW % 4 == 0 -- confirm against the caller.
  411. movi v20.4s, #1
  412. movi v17.4s, #1 // constant 1; v17 is reused for filter values inside the loop
  413. shl v20.4s, v20.4s, #19
  414. sub v20.4s, v20.4s, v17.4s // v20 = (1 << 19) - 1 in every lane: upper clamp for the results
  415. sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
  416. 1:
  417. mov x16, x4 // filter0 = filter
  418. ldr w8, [x5], #4 // filterPos[idx]
  419. add x12, x16, x7 // filter1 = filter0 + filterSize*2
  420. ldr w0, [x5], #4 // filterPos[idx + 1]
  421. add x13, x12, x7 // filter2 = filter1 + filterSize*2
  422. ldr w11, [x5], #4 // filterPos[idx + 2]
  423. add x4, x13, x7 // filter3 = filter2 + filterSize*2; x4 itself is post-incremented by the inner loop, so once all taps are consumed it sits on the next group's filter0
  424. ldr w9, [x5], #4 // filterPos[idx + 3]
  425. movi v0.2d, #0 // val sum part 1 (for dst[0])
  426. movi v1.2d, #0 // val sum part 2 (for dst[1])
  427. movi v2.2d, #0 // val sum part 3 (for dst[2])
  428. movi v3.2d, #0 // val sum part 4 (for dst[3])
  429. add x17, x3, w8, uxtw // srcp + filterPos[0]
  430. add x8, x3, w0, uxtw // srcp + filterPos[1]
  431. add x0, x3, w11, uxtw // srcp + filterPos[2]
  432. add x11, x3, w9, uxtw // srcp + filterPos[3]
  433. mov w15, w6 // filterSize counter
  434. 2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
  435. ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
  436. uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
  437. smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
  438. ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
  439. smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
  440. ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
  441. ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
  442. uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
  443. ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
  444. uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
  445. smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
  446. ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
  447. smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
  448. ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
  449. smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
  450. uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
  451. smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
  452. smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
  453. subs w15, w15, #8 // j -= 8: processed 8/filterSize
  454. smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
  455. b.gt 2b // inner loop if filterSize not consumed completely (flags come from the subs above; NEON ops do not touch them)
  456. addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
  457. addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
  458. addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding: lane k now holds the full sum for dst[k]
  459. subs w2, w2, #4 // dstW -= 4
  460. sshr v0.4s, v0.4s, #3 // arithmetic shift right by 3 on the four 32-bit sums (no narrowing)
  461. smin v0.4s, v0.4s, v20.4s // clamp from above to (1 << 19) - 1
  462. st1 {v0.4s}, [x1], #16 // write to destination part0123
  463. b.gt 1b // loop until end of line
  464. ret
  465. endfunc
  function ff_hscale8to19_X4_neon, export=1
          // Horizontal scale of 8-bit samples into 32-bit results clipped to (1 << 19) - 1.
          // Four outputs are produced per outer pass. Every output uses filterSize taps:
          // the inner loop eats 8 taps per pass and always runs at least once, then a
          // fixed 4-tap tail finishes the row, i.e. the code expects filterSize to be
          // 8*k + 4 with k >= 1.
          //
          //   x0  SwsContext *c            (never read; w0 is recycled as the tap counter)
          //   x1  dst                      (four 32-bit values are stored per outer pass)
          //   w2  int dstW
          //   x3  const uint8_t *src
          //   x4  const int16_t *filter
          //   x5  const int32_t *filterPos
          //   w6  int filterSize

          // v20 <- (1 << 19) - 1 in every 32-bit lane (upper clip bound)
          movi    v20.4s, #1
          movi    v17.4s, #1
          shl     v20.4s, v20.4s, #19
          sub     v20.4s, v20.4s, v17.4s

          lsl     w7, w6, #1                      // x7 = filterSize * 2 = bytes in one filter row

  1:
          // ---- outer loop: set up four outputs -------------------------------------
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          movi    v16.2d, #0                      // running sum, output 0
          movi    v17.2d, #0                      // running sum, output 1
          movi    v18.2d, #0                      // running sum, output 2
          movi    v19.2d, #0                      // running sum, output 3
          mov     x12, x4                         // x12 -> filter row 0
          add     x13, x4, x7                     // x13 -> filter row 1
          add     x8, x3, w8, uxtw                // x8  -> src + filterPos[0]
          add     x14, x13, x7                    // x14 -> filter row 2
          add     x9, x3, w9, uxtw                // x9  -> src + filterPos[1]
          add     x15, x14, x7                    // x15 -> filter row 3
          add     x10, x3, w10, uxtw              // x10 -> src + filterPos[2]
          mov     w0, w6                          // w0 = taps left for this group
          add     x11, x3, w11, uxtw              // x11 -> src + filterPos[3]
          add     x5, x5, #16                     // skip the four filterPos entries just consumed
          mov     x16, xzr                        // x16 = byte offset inside each filter row

  2:
          // ---- inner loop: 8 taps for each of the four outputs ---------------------
          ldr     d4, [x8], #8                    // 8 source bytes, output 0
          ldr     q31, [x12, x16]                 // 8 coefficients, row 0
          uxtl    v4.8h, v4.8b                    // zero-extend bytes to 16 bits
          ldr     d5, [x9], #8                    // 8 source bytes, output 1
          smlal   v16.4s, v4.4h, v31.4h           // output 0 += taps 0..3
          uxtl    v5.8h, v5.8b                    // zero-extend bytes to 16 bits
          ldr     q30, [x13, x16]                 // 8 coefficients, row 1
          smlal2  v16.4s, v4.8h, v31.8h           // output 0 += taps 4..7
          ldr     d6, [x10], #8                   // 8 source bytes, output 2
          ldr     q29, [x14, x16]                 // 8 coefficients, row 2
          smlal   v17.4s, v5.4h, v30.4h           // output 1 += taps 0..3
          ldr     d7, [x11], #8                   // 8 source bytes, output 3
          smlal2  v17.4s, v5.8h, v30.8h           // output 1 += taps 4..7
          uxtl    v6.8h, v6.8b                    // zero-extend bytes to 16 bits
          ldr     q28, [x15, x16]                 // 8 coefficients, row 3
          smlal   v18.4s, v6.4h, v29.4h           // output 2 += taps 0..3
          uxtl    v7.8h, v7.8b                    // zero-extend bytes to 16 bits
          smlal2  v18.4s, v6.8h, v29.8h           // output 2 += taps 4..7
          sub     w0, w0, #8                      // 8 taps done
          smlal   v19.4s, v7.4h, v28.4h           // output 3 += taps 0..3
          cmp     w0, #8                          // another full block of 8 available?
          smlal2  v19.4s, v7.8h, v28.8h           // output 3 += taps 4..7
          add     x16, x16, #16                   // next 8 coefficients in every row
          b.ge    2b

          // ---- tail: the final 4 taps of every row ---------------------------------
          sub     x17, x7, #8                     // byte offset of the last 4 coefficients in a row
          ldr     s4, [x8]                        // 4 source bytes, output 0
          ldr     d31, [x12, x17]                 // last 4 coefficients, row 0
          uxtl    v4.8h, v4.8b
          ldr     s5, [x9]                        // 4 source bytes, output 1
          smlal   v16.4s, v4.4h, v31.4h           // output 0 += last 4 taps
          ldr     d30, [x13, x17]                 // last 4 coefficients, row 1
          uxtl    v5.8h, v5.8b
          ldr     s6, [x10]                       // 4 source bytes, output 2
          smlal   v17.4s, v5.4h, v30.4h           // output 1 += last 4 taps
          uxtl    v6.8h, v6.8b
          ldr     d29, [x14, x17]                 // last 4 coefficients, row 2
          ldr     s7, [x11]                       // 4 source bytes, output 3
          addp    v16.4s, v16.4s, v17.4s          // pairwise partial sums of outputs 0 and 1
          uxtl    v7.8h, v7.8b
          ldr     d28, [x15, x17]                 // last 4 coefficients, row 3
          smlal   v18.4s, v6.4h, v29.4h           // output 2 += last 4 taps
          smlal   v19.4s, v7.4h, v28.4h           // output 3 += last 4 taps
          subs    w2, w2, #4                      // four outputs finished; flags drive the b.gt below
          addp    v18.4s, v18.4s, v19.4s          // pairwise partial sums of outputs 2 and 3
          addp    v16.4s, v16.4s, v18.4s          // lanes 0..3 = complete sums of outputs 0..3
          sshr    v16.4s, v16.4s, #3              // arithmetic shift right by 3
          smin    v16.4s, v16.4s, v20.4s          // clip against (1 << 19) - 1
          st1     {v16.4s}, [x1], #16             // store four 32-bit results
          add     x4, x4, x7, lsl #2              // filter += 4 rows
          b.gt    1b
          ret
  endfunc
  function ff_hscale16to15_4_neon_asm, export=1
          // Horizontal scale of 16-bit samples with exactly 4 taps per output. Each sum is
          // shifted right by `shift`, clipped to (1 << 15) - 1 and stored as a 16-bit value.
          //
          // Flow: with fewer than 16 outputs only the one-at-a-time loop (label 2) runs.
          // Otherwise outputs are produced 8 at a time; the samples for the *next* group
          // of 8 are fetched while the current group is being stored, the final group of
          // 8 is handled after the loop without fetching further, and whatever is left
          // (0..7 outputs) goes through the one-at-a-time loop.
          //
          //   w0  int shift
          //   x1  dst                      (16-bit elements are stored)
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  filter                   (16-bit coefficients, sign-extended before use)
          //   x5  const int32_t *filterPos
          //   w6  int filterSize           (not read by this routine)

          movi    v18.4s, #1
          movi    v17.4s, #1
          shl     v18.4s, v18.4s, #15
          sub     v18.4s, v18.4s, v17.4s          // v18 = (1 << 15) - 1, upper clip bound
          dup     v17.4s, w0                      // broadcast shift
          neg     v17.4s, v17.4s                  // v17 = -shift: sshl with a negative count shifts right

          cmp     w2, #16
          b.lt    2f                              // short line: one-at-a-time loop only

          // ---- prime the pipeline with the first group of 8 outputs ----------------
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          ldp     w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
          ldp     w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
          add     x5, x5, #32
          // positions count 16-bit samples: double them to obtain byte offsets
          lsl     x8, x8, #1
          lsl     x9, x9, #1
          lsl     x10, x10, #1
          lsl     x11, x11, #1
          lsl     x12, x12, #1
          lsl     x13, x13, #1
          lsl     x14, x14, #1
          lsl     x15, x15, #1
          // each 64-bit load grabs the 4 consecutive samples feeding one output
          ldr     x8, [x3, w8, uxtw]
          ldr     x9, [x3, w9, uxtw]
          ldr     x10, [x3, w10, uxtw]
          ldr     x11, [x3, w11, uxtw]
          ldr     x12, [x3, w12, uxtw]
          ldr     x13, [x3, w13, uxtw]
          ldr     x14, [x3, w14, uxtw]
          ldr     x15, [x3, w15, uxtw]

          sub     sp, sp, #64                     // 64-byte scratch: 8 outputs x 4 samples
          // park the samples in memory so ld4 can de-interleave them into vectors
          stp     x8, x9, [sp]
          stp     x10, x11, [sp, #16]
          stp     x12, x13, [sp, #32]
          stp     x14, x15, [sp, #48]

  1:
          // ---- main loop: 8 outputs per pass ---------------------------------------
          ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]          // vK lane i = sample K of output i
          ld4     {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // v(28+K) lane i = coefficient K of output i
          // Samples are unsigned 16-bit and coefficients signed 16-bit, so both are
          // widened to 32 bits (uxtl / sxtl) before the multiply.
          // v5 collects outputs 0..3, v6 collects outputs 4..7.
          uxtl    v26.4s, v0.4h
          sxtl    v27.4s, v28.4h
          uxtl2   v0.4s, v0.8h
          mul     v5.4s, v26.4s, v27.4s           // tap 0, outputs 0..3
          sxtl2   v28.4s, v28.8h
          uxtl    v26.4s, v1.4h
          mul     v6.4s, v0.4s, v28.4s            // tap 0, outputs 4..7
          sxtl    v27.4s, v29.4h
          uxtl2   v0.4s, v1.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 1, outputs 0..3
          sxtl2   v28.4s, v29.8h
          uxtl    v26.4s, v2.4h
          mla     v6.4s, v28.4s, v0.4s            // tap 1, outputs 4..7
          sxtl    v27.4s, v30.4h
          uxtl2   v0.4s, v2.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 2, outputs 0..3
          sxtl2   v28.4s, v30.8h
          uxtl    v26.4s, v3.4h
          mla     v6.4s, v28.4s, v0.4s            // tap 2, outputs 4..7
          sxtl    v27.4s, v31.4h
          uxtl2   v0.4s, v3.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 3, outputs 0..3
          sxtl2   v28.4s, v31.8h
          sub     w2, w2, #8                      // 8 outputs accounted for
          mla     v6.4s, v28.4s, v0.4s            // tap 3, outputs 4..7
          sshl    v5.4s, v5.4s, v17.4s            // >> shift
          sshl    v6.4s, v6.4s, v17.4s
          smin    v5.4s, v5.4s, v18.4s            // clip to (1 << 15) - 1
          smin    v6.4s, v6.4s, v18.4s
          xtn     v5.4h, v5.4s                    // narrow to 16 bits, outputs 0..3
          xtn2    v5.8h, v6.4s                    // narrow to 16 bits, outputs 4..7
          st1     {v5.8h}, [x1], #16
          cmp     w2, #16                         // flags survive until the b.ge below

          // fetch the samples of the next group of 8 outputs
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          ldp     w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
          ldp     w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
          add     x5, x5, #32
          lsl     x8, x8, #1
          lsl     x9, x9, #1
          lsl     x10, x10, #1
          lsl     x11, x11, #1
          lsl     x12, x12, #1
          lsl     x13, x13, #1
          lsl     x14, x14, #1
          lsl     x15, x15, #1
          ldr     x8, [x3, w8, uxtw]
          ldr     x9, [x3, w9, uxtw]
          ldr     x10, [x3, w10, uxtw]
          ldr     x11, [x3, w11, uxtw]
          ldr     x12, [x3, w12, uxtw]
          ldr     x13, [x3, w13, uxtw]
          ldr     x14, [x3, w14, uxtw]
          ldr     x15, [x3, w15, uxtw]
          stp     x8, x9, [sp]
          stp     x10, x11, [sp, #16]
          stp     x12, x13, [sp, #32]
          stp     x14, x15, [sp, #48]
          b.ge    1b

          // ---- last group of 8: samples are already on the stack, nothing is prefetched
          ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
          ld4     {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
          uxtl    v26.4s, v0.4h
          sxtl    v27.4s, v28.4h
          uxtl2   v0.4s, v0.8h
          mul     v5.4s, v26.4s, v27.4s           // tap 0, outputs 0..3
          sxtl2   v28.4s, v28.8h
          uxtl    v26.4s, v1.4h
          mul     v6.4s, v0.4s, v28.4s            // tap 0, outputs 4..7
          sxtl    v27.4s, v29.4h
          uxtl2   v0.4s, v1.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 1, outputs 0..3
          sxtl2   v28.4s, v29.8h
          uxtl    v26.4s, v2.4h
          mla     v6.4s, v0.4s, v28.4s            // tap 1, outputs 4..7
          sxtl    v27.4s, v30.4h
          uxtl2   v0.4s, v2.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 2, outputs 0..3
          sxtl2   v28.4s, v30.8h
          uxtl    v26.4s, v3.4h
          mla     v6.4s, v0.4s, v28.4s            // tap 2, outputs 4..7
          sxtl    v27.4s, v31.4h
          uxtl2   v0.4s, v3.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 3, outputs 0..3
          sxtl2   v28.4s, v31.8h
          subs    w2, w2, #8                      // w2 = outputs left for the scalar loop
          mla     v6.4s, v0.4s, v28.4s            // tap 3, outputs 4..7
          sshl    v5.4s, v5.4s, v17.4s
          sshl    v6.4s, v6.4s, v17.4s
          smin    v5.4s, v5.4s, v18.4s
          smin    v6.4s, v6.4s, v18.4s
          xtn     v5.4h, v5.4s
          xtn2    v5.8h, v6.4s
          st1     {v5.8h}, [x1], #16
          add     sp, sp, #64                     // release the scratch area
          cbnz    w2, 2f                          // leftovers -> one-at-a-time loop
          ret

  2:
          // ---- one output per pass --------------------------------------------------
          ldr     w8, [x5], #4                    // next filterPos
          lsl     w8, w8, #1                      // sample index -> byte offset
          add     x9, x3, w8, uxtw                // x9 -> first sample for this output
          ld1     {v0.4h}, [x9]                   // 4 samples
          ld1     {v31.4h}, [x4], #8              // 4 coefficients
          uxtl    v0.4s, v0.4h
          sxtl    v31.4s, v31.4h
          mul     v5.4s, v0.4s, v31.4s            // per-tap products
          addv    s0, v5.4s                       // horizontal sum -> lane 0 of v0
          sshl    v0.4s, v0.4s, v17.4s            // >> shift
          smin    v0.4s, v0.4s, v18.4s            // clip
          st1     {v0.h}[0], [x1], #2             // store the low 16 bits of lane 0
          sub     w2, w2, #1
          cbnz    w2, 2b                          // repeat until no outputs remain
          ret
  endfunc
  function ff_hscale16to15_X8_neon_asm, export=1
          // Horizontal scale of 16-bit samples; taps are consumed 8 at a time, four
          // outputs per outer pass. Each sum is shifted right by `shift`, clipped to
          // (1 << 15) - 1, narrowed and stored as a 16-bit value.
          //
          //   w0  int shift
          //   x1  dst                      (16-bit elements are stored)
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  filter                   (16-bit coefficients, sign-extended before use)
          //   x5  const int32_t *filterPos
          //   w6  int filterSize

          movi    v20.4s, #1
          movi    v21.4s, #1
          shl     v20.4s, v20.4s, #15
          sub     v20.4s, v20.4s, v21.4s          // v20 = (1 << 15) - 1, upper clip bound
          dup     v21.4s, w0
          neg     v21.4s, v21.4s                  // v21 = -shift, so sshl performs a right shift
          sbfiz   x7, x6, #1, #32                 // x7 = sign-extended filterSize * 2 (bytes per filter row)

  1:
          // ---- outer loop: four outputs ---------------------------------------------
          ldr     w8, [x5], #4                    // filterPos[idx]
          lsl     w8, w8, #1                      // sample index -> byte offset
          ldr     w10, [x5], #4                   // filterPos[idx + 1]
          lsl     w10, w10, #1
          ldr     w11, [x5], #4                   // filterPos[idx + 2]
          lsl     w11, w11, #1
          ldr     w9, [x5], #4                    // filterPos[idx + 3]
          lsl     w9, w9, #1
          mov     x16, x4                         // x16 -> filter row 0
          add     x12, x16, x7                    // x12 -> filter row 1
          add     x13, x12, x7                    // x13 -> filter row 2
          add     x4, x13, x7                     // x4  -> filter row 3 (walks on to the next group's row 0)
          movi    v0.2d, #0                       // running sum, output 0
          movi    v1.2d, #0                       // running sum, output 1
          movi    v2.2d, #0                       // running sum, output 2
          movi    v3.2d, #0                       // running sum, output 3
          add     x17, x3, w8, uxtw               // x17 -> samples for output 0
          add     x8, x3, w10, uxtw               // x8  -> samples for output 1
          add     x10, x3, w11, uxtw              // x10 -> samples for output 2
          add     x11, x3, w9, uxtw               // x11 -> samples for output 3
          mov     w15, w6                         // w15 = taps left

  2:
          // ---- inner loop: 8 taps for each of the four outputs ----------------------
          ld1     {v4.8h}, [x17], #16             // 8 samples, output 0
          ld1     {v5.8h}, [x16], #16             // 8 coefficients, row 0
          ld1     {v6.8h}, [x8], #16              // 8 samples, output 1
          ld1     {v7.8h}, [x12], #16             // 8 coefficients, row 1
          uxtl    v24.4s, v4.4h                   // samples are unsigned: zero-extend to 32 bits
          sxtl    v25.4s, v5.4h                   // coefficients are signed: sign-extend to 32 bits
          uxtl2   v4.4s, v4.8h
          mla     v0.4s, v24.4s, v25.4s           // output 0 += taps 0..3
          sxtl2   v5.4s, v5.8h
          uxtl    v26.4s, v6.4h
          mla     v0.4s, v4.4s, v5.4s             // output 0 += taps 4..7
          sxtl    v27.4s, v7.4h
          uxtl2   v6.4s, v6.8h
          sxtl2   v7.4s, v7.8h
          ld1     {v16.8h}, [x10], #16            // 8 samples, output 2
          mla     v1.4s, v26.4s, v27.4s           // output 1 += taps 0..3
          ld1     {v17.8h}, [x13], #16            // 8 coefficients, row 2
          uxtl    v22.4s, v16.4h
          sxtl    v23.4s, v17.4h
          uxtl2   v16.4s, v16.8h
          sxtl2   v17.4s, v17.8h
          mla     v2.4s, v22.4s, v23.4s           // output 2 += taps 0..3
          mla     v2.4s, v16.4s, v17.4s           // output 2 += taps 4..7
          ld1     {v18.8h}, [x11], #16            // 8 samples, output 3
          mla     v1.4s, v6.4s, v7.4s             // output 1 += taps 4..7
          ld1     {v19.8h}, [x4], #16             // 8 coefficients, row 3
          subs    w15, w15, #8                    // 8 taps done; flags feed the b.gt below
          uxtl    v28.4s, v18.4h
          sxtl    v29.4s, v19.4h
          uxtl2   v18.4s, v18.8h
          sxtl2   v19.4s, v19.8h
          mla     v3.4s, v28.4s, v29.4s           // output 3 += taps 0..3
          mla     v3.4s, v18.4s, v19.4s           // output 3 += taps 4..7
          b.gt    2b                              // more taps in this row

          // ---- reduce, scale, clip, store -------------------------------------------
          addp    v0.4s, v0.4s, v1.4s             // pairwise partial sums of outputs 0 and 1
          addp    v2.4s, v2.4s, v3.4s             // pairwise partial sums of outputs 2 and 3
          addp    v0.4s, v0.4s, v2.4s             // lanes 0..3 = complete sums of outputs 0..3
          subs    w2, w2, #4                      // four outputs finished
          sshl    v0.4s, v0.4s, v21.4s            // >> shift (count is negative)
          smin    v0.4s, v0.4s, v20.4s            // clip with smin after the plain shift
          xtn     v0.4h, v0.4s                    // keep the low 16 bits of each lane
          st1     {v0.4h}, [x1], #8               // store four 16-bit results
          b.gt    1b                              // continue until the line is done
          ret
  endfunc
  function ff_hscale16to15_X4_neon_asm, export=1
          // Horizontal scale of 16-bit samples, four outputs per outer pass. Every output
          // uses filterSize taps: the inner loop handles 8 taps per pass and always runs
          // at least once, then a fixed 4-tap tail finishes the row, i.e. the code expects
          // filterSize to be 8*k + 4 with k >= 1. Each sum is shifted right by `shift`,
          // clipped to (1 << 15) - 1, narrowed and stored as a 16-bit value.
          //
          // Only caller-saved vector registers (v0-v2, v4-v7, v16-v26, v28-v31) are used,
          // so the routine needs no stack frame and no save/restore of v8-v15.
          //
          //   w0  int shift                (consumed up front; w0 is then the tap counter)
          //   x1  int16_t *dst
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  const int16_t *filter
          //   x5  const int32_t *filterPos
          //   w6  int filterSize

          movi    v18.4s, #1
          movi    v17.4s, #1
          shl     v18.4s, v18.4s, #15
          sub     v21.4s, v18.4s, v17.4s          // v21 = (1 << 15) - 1, upper clip bound
          dup     v17.4s, w0                      // broadcast shift
          neg     v20.4s, v17.4s                  // v20 = -shift, so sshl performs a right shift

          lsl     w7, w6, #1                      // x7 = filterSize * 2 = bytes in one filter row

  1:
          // ---- outer loop: set up four outputs --------------------------------------
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          movi    v16.2d, #0                      // running sum, output 0
          movi    v17.2d, #0                      // running sum, output 1
          movi    v18.2d, #0                      // running sum, output 2
          movi    v19.2d, #0                      // running sum, output 3
          mov     x12, x4                         // x12 -> filter row 0
          add     x13, x4, x7                     // x13 -> filter row 1
          add     x8, x3, x8, lsl #1              // x8  -> src + 2 * filterPos[0]
          add     x14, x13, x7                    // x14 -> filter row 2
          add     x9, x3, x9, lsl #1              // x9  -> src + 2 * filterPos[1]
          add     x15, x14, x7                    // x15 -> filter row 3
          add     x10, x3, x10, lsl #1            // x10 -> src + 2 * filterPos[2]
          mov     w0, w6                          // w0 = taps left for this group
          add     x11, x3, x11, lsl #1            // x11 -> src + 2 * filterPos[3]
          add     x5, x5, #16                     // skip the four filterPos entries just consumed
          mov     x16, xzr                        // x16 = byte offset inside each filter row

  2:
          // ---- inner loop: 8 taps for each of the four outputs ----------------------
          // Samples are zero-extended (uxtl) and coefficients sign-extended (sxtl) to
          // 32 bits before the multiply-accumulate.
          ldr     q4, [x8], #16                   // 8 samples, output 0
          ldr     q5, [x9], #16                   // 8 samples, output 1
          uxtl    v26.4s, v4.4h
          uxtl2   v4.4s, v4.8h
          ldr     q31, [x12, x16]                 // 8 coefficients, row 0
          ldr     q6, [x10], #16                  // 8 samples, output 2
          sxtl    v22.4s, v31.4h
          sxtl2   v31.4s, v31.8h
          mla     v16.4s, v26.4s, v22.4s          // output 0 += taps 0..3
          uxtl    v25.4s, v5.4h
          uxtl2   v5.4s, v5.8h
          ldr     q30, [x13, x16]                 // 8 coefficients, row 1
          ldr     q7, [x11], #16                  // 8 samples, output 3
          mla     v16.4s, v4.4s, v31.4s           // output 0 += taps 4..7
          uxtl    v24.4s, v6.4h
          sxtl    v0.4s, v30.4h
          sxtl2   v30.4s, v30.8h
          mla     v17.4s, v25.4s, v0.4s           // output 1 += taps 0..3
          ldr     q29, [x14, x16]                 // 8 coefficients, row 2
          uxtl2   v6.4s, v6.8h
          sxtl    v1.4s, v29.4h
          sxtl2   v29.4s, v29.8h
          mla     v17.4s, v5.4s, v30.4s           // output 1 += taps 4..7
          mla     v18.4s, v24.4s, v1.4s           // output 2 += taps 0..3
          ldr     q28, [x15, x16]                 // 8 coefficients, row 3
          uxtl    v23.4s, v7.4h
          sxtl    v2.4s, v28.4h
          mla     v18.4s, v6.4s, v29.4s           // output 2 += taps 4..7
          uxtl2   v7.4s, v7.8h
          sxtl2   v28.4s, v28.8h
          mla     v19.4s, v23.4s, v2.4s           // output 3 += taps 0..3
          sub     w0, w0, #8                      // 8 taps done
          cmp     w0, #8                          // another full block of 8 available?
          mla     v19.4s, v7.4s, v28.4s           // output 3 += taps 4..7
          add     x16, x16, #16                   // next 8 coefficients in every row
          b.ge    2b

          // ---- tail: the final 4 taps of every row ----------------------------------
          sub     x17, x7, #8                     // byte offset of the last 4 coefficients in a row
          ldr     d4, [x8]                        // 4 samples, output 0
          ldr     d31, [x12, x17]                 // last 4 coefficients, row 0
          uxtl    v4.4s, v4.4h
          sxtl    v31.4s, v31.4h
          ldr     d5, [x9]                        // 4 samples, output 1
          mla     v16.4s, v4.4s, v31.4s           // output 0 += last 4 taps
          ldr     d30, [x13, x17]                 // last 4 coefficients, row 1
          uxtl    v5.4s, v5.4h
          sxtl    v30.4s, v30.4h
          ldr     d6, [x10]                       // 4 samples, output 2
          mla     v17.4s, v5.4s, v30.4s           // output 1 += last 4 taps
          ldr     d29, [x14, x17]                 // last 4 coefficients, row 2
          uxtl    v6.4s, v6.4h
          sxtl    v29.4s, v29.4h
          ldr     d7, [x11]                       // 4 samples, output 3
          ldr     d28, [x15, x17]                 // last 4 coefficients, row 3
          mla     v18.4s, v6.4s, v29.4s           // output 2 += last 4 taps
          uxtl    v7.4s, v7.4h
          sxtl    v28.4s, v28.4h
          addp    v16.4s, v16.4s, v17.4s          // pairwise partial sums of outputs 0 and 1
          mla     v19.4s, v7.4s, v28.4s           // output 3 += last 4 taps
          subs    w2, w2, #4                      // four outputs finished; flags drive the b.gt below
          addp    v18.4s, v18.4s, v19.4s          // pairwise partial sums of outputs 2 and 3
          addp    v16.4s, v16.4s, v18.4s          // lanes 0..3 = complete sums of outputs 0..3
          sshl    v16.4s, v16.4s, v20.4s          // >> shift (count is negative)
          smin    v16.4s, v16.4s, v21.4s          // clip against (1 << 15) - 1
          xtn     v16.4h, v16.4s                  // keep the low 16 bits of each lane
          st1     {v16.4h}, [x1], #8              // store four 16-bit results
          add     x4, x4, x7, lsl #2              // filter += 4 rows
          b.gt    1b
          ret
  endfunc
  function ff_hscale16to19_4_neon_asm, export=1
          // Horizontal scale of 16-bit samples with exactly 4 taps per output. Each sum is
          // shifted right by `shift`, clipped to (1 << 19) - 1 and stored as a 32-bit value.
          //
          // Flow: with fewer than 16 outputs only the one-at-a-time loop (label 2) runs.
          // Otherwise outputs are produced 8 at a time; the samples for the *next* group
          // of 8 are fetched while the current group is being stored, the final group of
          // 8 is handled after the loop without fetching further, and whatever is left
          // (0..7 outputs) goes through the one-at-a-time loop.
          //
          //   w0  int shift
          //   x1  int32_t *dst
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  filter                   (16-bit coefficients, sign-extended before use)
          //   x5  const int32_t *filterPos
          //   w6  int filterSize           (not read by this routine)

          movi    v18.4s, #1
          movi    v17.4s, #1
          shl     v18.4s, v18.4s, #19
          sub     v18.4s, v18.4s, v17.4s          // v18 = (1 << 19) - 1, upper clip bound
          dup     v17.4s, w0                      // broadcast shift
          neg     v17.4s, v17.4s                  // v17 = -shift: sshl with a negative count shifts right

          cmp     w2, #16
          b.lt    2f                              // short line: one-at-a-time loop only

          // ---- prime the pipeline with the first group of 8 outputs ----------------
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          ldp     w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
          ldp     w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
          add     x5, x5, #32
          // positions count 16-bit samples: double them to obtain byte offsets
          lsl     x8, x8, #1
          lsl     x9, x9, #1
          lsl     x10, x10, #1
          lsl     x11, x11, #1
          lsl     x12, x12, #1
          lsl     x13, x13, #1
          lsl     x14, x14, #1
          lsl     x15, x15, #1
          // each 64-bit load grabs the 4 consecutive samples feeding one output
          ldr     x8, [x3, w8, uxtw]
          ldr     x9, [x3, w9, uxtw]
          ldr     x10, [x3, w10, uxtw]
          ldr     x11, [x3, w11, uxtw]
          ldr     x12, [x3, w12, uxtw]
          ldr     x13, [x3, w13, uxtw]
          ldr     x14, [x3, w14, uxtw]
          ldr     x15, [x3, w15, uxtw]

          sub     sp, sp, #64                     // 64-byte scratch: 8 outputs x 4 samples
          // park the samples in memory so ld4 can de-interleave them into vectors
          stp     x8, x9, [sp]
          stp     x10, x11, [sp, #16]
          stp     x12, x13, [sp, #32]
          stp     x14, x15, [sp, #48]

  1:
          // ---- main loop: 8 outputs per pass ---------------------------------------
          ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]          // vK lane i = sample K of output i
          ld4     {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // v(28+K) lane i = coefficient K of output i
          // Samples are unsigned 16-bit and coefficients signed 16-bit, so both are
          // widened to 32 bits (uxtl / sxtl) before the multiply.
          // v5 collects outputs 0..3, v6 collects outputs 4..7.
          uxtl    v26.4s, v0.4h
          sxtl    v27.4s, v28.4h
          uxtl2   v0.4s, v0.8h
          mul     v5.4s, v26.4s, v27.4s           // tap 0, outputs 0..3
          sxtl2   v28.4s, v28.8h
          uxtl    v26.4s, v1.4h
          mul     v6.4s, v0.4s, v28.4s            // tap 0, outputs 4..7
          sxtl    v27.4s, v29.4h
          uxtl2   v0.4s, v1.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 1, outputs 0..3
          sxtl2   v28.4s, v29.8h
          uxtl    v26.4s, v2.4h
          mla     v6.4s, v28.4s, v0.4s            // tap 1, outputs 4..7
          sxtl    v27.4s, v30.4h
          uxtl2   v0.4s, v2.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 2, outputs 0..3
          sxtl2   v28.4s, v30.8h
          uxtl    v26.4s, v3.4h
          mla     v6.4s, v28.4s, v0.4s            // tap 2, outputs 4..7
          sxtl    v27.4s, v31.4h
          uxtl2   v0.4s, v3.8h
          mla     v5.4s, v27.4s, v26.4s           // tap 3, outputs 0..3
          sxtl2   v28.4s, v31.8h
          sub     w2, w2, #8                      // 8 outputs accounted for
          mla     v6.4s, v28.4s, v0.4s            // tap 3, outputs 4..7
          sshl    v5.4s, v5.4s, v17.4s            // >> shift
          sshl    v6.4s, v6.4s, v17.4s
          smin    v5.4s, v5.4s, v18.4s            // clip to (1 << 19) - 1
          smin    v6.4s, v6.4s, v18.4s
          st1     {v5.4s, v6.4s}, [x1], #32       // store eight 32-bit results
          cmp     w2, #16                         // flags survive until the b.ge below

          // fetch the samples of the next group of 8 outputs
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          ldp     w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
          ldp     w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
          add     x5, x5, #32
          lsl     x8, x8, #1
          lsl     x9, x9, #1
          lsl     x10, x10, #1
          lsl     x11, x11, #1
          lsl     x12, x12, #1
          lsl     x13, x13, #1
          lsl     x14, x14, #1
          lsl     x15, x15, #1
          ldr     x8, [x3, w8, uxtw]
          ldr     x9, [x3, w9, uxtw]
          ldr     x10, [x3, w10, uxtw]
          ldr     x11, [x3, w11, uxtw]
          ldr     x12, [x3, w12, uxtw]
          ldr     x13, [x3, w13, uxtw]
          ldr     x14, [x3, w14, uxtw]
          ldr     x15, [x3, w15, uxtw]
          stp     x8, x9, [sp]
          stp     x10, x11, [sp, #16]
          stp     x12, x13, [sp, #32]
          stp     x14, x15, [sp, #48]
          b.ge    1b

          // ---- last group of 8: samples are already on the stack, nothing is prefetched
          ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
          ld4     {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
          uxtl    v26.4s, v0.4h
          sxtl    v27.4s, v28.4h
          uxtl2   v0.4s, v0.8h
          mul     v5.4s, v26.4s, v27.4s           // tap 0, outputs 0..3
          sxtl2   v28.4s, v28.8h
          uxtl    v26.4s, v1.4h
          mul     v6.4s, v0.4s, v28.4s            // tap 0, outputs 4..7
          sxtl    v27.4s, v29.4h
          uxtl2   v0.4s, v1.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 1, outputs 0..3
          sxtl2   v28.4s, v29.8h
          uxtl    v26.4s, v2.4h
          mla     v6.4s, v0.4s, v28.4s            // tap 1, outputs 4..7
          sxtl    v27.4s, v30.4h
          uxtl2   v0.4s, v2.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 2, outputs 0..3
          sxtl2   v28.4s, v30.8h
          uxtl    v26.4s, v3.4h
          mla     v6.4s, v0.4s, v28.4s            // tap 2, outputs 4..7
          sxtl    v27.4s, v31.4h
          uxtl2   v0.4s, v3.8h
          mla     v5.4s, v26.4s, v27.4s           // tap 3, outputs 0..3
          sxtl2   v28.4s, v31.8h
          subs    w2, w2, #8                      // w2 = outputs left for the scalar loop
          mla     v6.4s, v0.4s, v28.4s            // tap 3, outputs 4..7
          sshl    v5.4s, v5.4s, v17.4s
          sshl    v6.4s, v6.4s, v17.4s
          smin    v5.4s, v5.4s, v18.4s
          smin    v6.4s, v6.4s, v18.4s
          st1     {v5.4s, v6.4s}, [x1], #32
          add     sp, sp, #64                     // release the scratch area
          cbnz    w2, 2f                          // leftovers -> one-at-a-time loop
          ret

  2:
          // ---- one output per pass --------------------------------------------------
          ldr     w8, [x5], #4                    // next filterPos
          lsl     w8, w8, #1                      // sample index -> byte offset
          add     x9, x3, w8, uxtw                // x9 -> first sample for this output
          ld1     {v0.4h}, [x9]                   // 4 samples
          ld1     {v31.4h}, [x4], #8              // 4 coefficients
          uxtl    v0.4s, v0.4h
          sxtl    v31.4s, v31.4h
          subs    w2, w2, #1                      // one output accounted for
          mul     v5.4s, v0.4s, v31.4s            // per-tap products
          addv    s0, v5.4s                       // horizontal sum -> lane 0 of v0
          sshl    v0.4s, v0.4s, v17.4s            // >> shift
          smin    v0.4s, v0.4s, v18.4s            // clip
          st1     {v0.s}[0], [x1], #4             // store lane 0 as one 32-bit result
          cbnz    w2, 2b                          // repeat until no outputs remain
          ret
  endfunc
  function ff_hscale16to19_X8_neon_asm, export=1
          // Horizontal scale of 16-bit samples; taps are consumed 8 at a time, four
          // outputs per outer pass. Each sum is shifted right by `shift`, clipped to
          // (1 << 19) - 1 and stored as a 32-bit value.
          //
          //   w0  int shift
          //   x1  int32_t *dst
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  filter                   (16-bit coefficients, sign-extended before use)
          //   x5  const int32_t *filterPos
          //   w6  int filterSize

          movi    v20.4s, #1
          movi    v21.4s, #1
          shl     v20.4s, v20.4s, #19
          sub     v20.4s, v20.4s, v21.4s          // v20 = (1 << 19) - 1, upper clip bound
          dup     v21.4s, w0
          neg     v21.4s, v21.4s                  // v21 = -shift, so sshl performs a right shift
          sbfiz   x7, x6, #1, #32                 // x7 = sign-extended filterSize * 2 (bytes per filter row)

  1:
          // ---- outer loop: four outputs ---------------------------------------------
          ldr     w8, [x5], #4                    // filterPos[idx]
          ldr     w10, [x5], #4                   // filterPos[idx + 1]
          lsl     w8, w8, #1                      // sample index -> byte offset
          ldr     w11, [x5], #4                   // filterPos[idx + 2]
          ldr     w9, [x5], #4                    // filterPos[idx + 3]
          mov     x16, x4                         // x16 -> filter row 0
          lsl     w11, w11, #1
          add     x12, x16, x7                    // x12 -> filter row 1
          lsl     w9, w9, #1
          add     x13, x12, x7                    // x13 -> filter row 2
          lsl     w10, w10, #1
          add     x4, x13, x7                     // x4  -> filter row 3 (walks on to the next group's row 0)
          movi    v0.2d, #0                       // running sum, output 0
          movi    v1.2d, #0                       // running sum, output 1
          movi    v2.2d, #0                       // running sum, output 2
          movi    v3.2d, #0                       // running sum, output 3
          add     x17, x3, w8, uxtw               // x17 -> samples for output 0
          add     x8, x3, w10, uxtw               // x8  -> samples for output 1
          add     x10, x3, w11, uxtw              // x10 -> samples for output 2
          add     x11, x3, w9, uxtw               // x11 -> samples for output 3
          mov     w15, w6                         // w15 = taps left

  2:
          // ---- inner loop: 8 taps for each of the four outputs ----------------------
          ld1     {v4.8h}, [x17], #16             // 8 samples, output 0
          ld1     {v5.8h}, [x16], #16             // 8 coefficients, row 0
          ld1     {v6.8h}, [x8], #16              // 8 samples, output 1
          ld1     {v7.8h}, [x12], #16             // 8 coefficients, row 1
          uxtl    v24.4s, v4.4h                   // samples are unsigned: zero-extend to 32 bits
          sxtl    v25.4s, v5.4h                   // coefficients are signed: sign-extend to 32 bits
          uxtl2   v4.4s, v4.8h
          mla     v0.4s, v24.4s, v25.4s           // output 0 += taps 0..3
          sxtl2   v5.4s, v5.8h
          uxtl    v26.4s, v6.4h
          mla     v0.4s, v4.4s, v5.4s             // output 0 += taps 4..7
          sxtl    v27.4s, v7.4h
          uxtl2   v6.4s, v6.8h
          sxtl2   v7.4s, v7.8h
          ld1     {v16.8h}, [x10], #16            // 8 samples, output 2
          mla     v1.4s, v26.4s, v27.4s           // output 1 += taps 0..3
          ld1     {v17.8h}, [x13], #16            // 8 coefficients, row 2
          uxtl    v22.4s, v16.4h
          sxtl    v23.4s, v17.4h
          uxtl2   v16.4s, v16.8h
          sxtl2   v17.4s, v17.8h
          mla     v2.4s, v22.4s, v23.4s           // output 2 += taps 0..3
          mla     v2.4s, v16.4s, v17.4s           // output 2 += taps 4..7
          ld1     {v18.8h}, [x11], #16            // 8 samples, output 3
          mla     v1.4s, v6.4s, v7.4s             // output 1 += taps 4..7
          ld1     {v19.8h}, [x4], #16             // 8 coefficients, row 3
          subs    w15, w15, #8                    // 8 taps done; flags feed the b.gt below
          uxtl    v28.4s, v18.4h
          sxtl    v29.4s, v19.4h
          uxtl2   v18.4s, v18.8h
          sxtl2   v19.4s, v19.8h
          mla     v3.4s, v28.4s, v29.4s           // output 3 += taps 0..3
          mla     v3.4s, v18.4s, v19.4s           // output 3 += taps 4..7
          b.gt    2b                              // more taps in this row

          // ---- reduce, scale, clip, store -------------------------------------------
          addp    v0.4s, v0.4s, v1.4s             // pairwise partial sums of outputs 0 and 1
          addp    v2.4s, v2.4s, v3.4s             // pairwise partial sums of outputs 2 and 3
          addp    v0.4s, v0.4s, v2.4s             // lanes 0..3 = complete sums of outputs 0..3
          subs    w2, w2, #4                      // four outputs finished
          sshl    v0.4s, v0.4s, v21.4s            // >> shift (count is negative)
          smin    v0.4s, v0.4s, v20.4s            // clip with smin after the plain shift
          st1     {v0.4s}, [x1], #16              // store four 32-bit results
          b.gt    1b                              // continue until the line is done
          ret
  endfunc
  function ff_hscale16to19_X4_neon_asm, export=1
          // Horizontal scale of 16-bit samples, four outputs per outer pass. Every output
          // uses filterSize taps: the inner loop handles 8 taps per pass and always runs
          // at least once, then a fixed 4-tap tail finishes the row, i.e. the code expects
          // filterSize to be 8*k + 4 with k >= 1. Each sum is shifted right by `shift`,
          // clipped to (1 << 19) - 1 and stored as a 32-bit value.
          //
          // Only caller-saved vector registers (v0-v2, v4-v7, v16-v26, v28-v31) are used,
          // so the routine needs no stack frame and no save/restore of v8-v15.
          //
          //   w0  int shift                (consumed up front; w0 is then the tap counter)
          //   x1  dst                      (four 32-bit values are stored per outer pass)
          //   w2  int dstW
          //   x3  const uint8_t *src       (read as uint16_t samples)
          //   x4  const int16_t *filter
          //   x5  const int32_t *filterPos
          //   w6  int filterSize

          movi    v18.4s, #1
          movi    v17.4s, #1
          shl     v18.4s, v18.4s, #19
          sub     v21.4s, v18.4s, v17.4s          // v21 = (1 << 19) - 1, upper clip bound
          dup     v17.4s, w0                      // broadcast shift
          neg     v20.4s, v17.4s                  // v20 = -shift, so sshl performs a right shift

          lsl     w7, w6, #1                      // x7 = filterSize * 2 = bytes in one filter row

  1:
          // ---- outer loop: set up four outputs --------------------------------------
          ldp     w8, w9, [x5]                    // filterPos[0], filterPos[1]
          ldp     w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
          movi    v16.2d, #0                      // running sum, output 0
          movi    v17.2d, #0                      // running sum, output 1
          movi    v18.2d, #0                      // running sum, output 2
          movi    v19.2d, #0                      // running sum, output 3
          mov     x12, x4                         // x12 -> filter row 0
          add     x13, x4, x7                     // x13 -> filter row 1
          add     x8, x3, x8, lsl #1              // x8  -> src + 2 * filterPos[0]
          add     x14, x13, x7                    // x14 -> filter row 2
          add     x9, x3, x9, lsl #1              // x9  -> src + 2 * filterPos[1]
          add     x15, x14, x7                    // x15 -> filter row 3
          add     x10, x3, x10, lsl #1            // x10 -> src + 2 * filterPos[2]
          mov     w0, w6                          // w0 = taps left for this group
          add     x11, x3, x11, lsl #1            // x11 -> src + 2 * filterPos[3]
          add     x5, x5, #16                     // skip the four filterPos entries just consumed
          mov     x16, xzr                        // x16 = byte offset inside each filter row

  2:
          // ---- inner loop: 8 taps for each of the four outputs ----------------------
          // Samples are zero-extended (uxtl) and coefficients sign-extended (sxtl) to
          // 32 bits before the multiply-accumulate.
          ldr     q4, [x8], #16                   // 8 samples, output 0
          ldr     q5, [x9], #16                   // 8 samples, output 1
          uxtl    v26.4s, v4.4h
          uxtl2   v4.4s, v4.8h
          ldr     q31, [x12, x16]                 // 8 coefficients, row 0
          ldr     q6, [x10], #16                  // 8 samples, output 2
          sxtl    v22.4s, v31.4h
          sxtl2   v31.4s, v31.8h
          mla     v16.4s, v26.4s, v22.4s          // output 0 += taps 0..3
          uxtl    v25.4s, v5.4h
          uxtl2   v5.4s, v5.8h
          ldr     q30, [x13, x16]                 // 8 coefficients, row 1
          ldr     q7, [x11], #16                  // 8 samples, output 3
          mla     v16.4s, v4.4s, v31.4s           // output 0 += taps 4..7
          uxtl    v24.4s, v6.4h
          sxtl    v0.4s, v30.4h
          sxtl2   v30.4s, v30.8h
          mla     v17.4s, v25.4s, v0.4s           // output 1 += taps 0..3
          ldr     q29, [x14, x16]                 // 8 coefficients, row 2
          uxtl2   v6.4s, v6.8h
          sxtl    v1.4s, v29.4h
          sxtl2   v29.4s, v29.8h
          mla     v17.4s, v5.4s, v30.4s           // output 1 += taps 4..7
          ldr     q28, [x15, x16]                 // 8 coefficients, row 3
          mla     v18.4s, v24.4s, v1.4s           // output 2 += taps 0..3
          uxtl    v23.4s, v7.4h
          sxtl    v2.4s, v28.4h
          mla     v18.4s, v6.4s, v29.4s           // output 2 += taps 4..7
          uxtl2   v7.4s, v7.8h
          sxtl2   v28.4s, v28.8h
          mla     v19.4s, v23.4s, v2.4s           // output 3 += taps 0..3
          sub     w0, w0, #8                      // 8 taps done
          cmp     w0, #8                          // another full block of 8 available?
          mla     v19.4s, v7.4s, v28.4s           // output 3 += taps 4..7
          add     x16, x16, #16                   // next 8 coefficients in every row
          b.ge    2b

          // ---- tail: the final 4 taps of every row ----------------------------------
          sub     x17, x7, #8                     // byte offset of the last 4 coefficients in a row
          ldr     d4, [x8]                        // 4 samples, output 0
          ldr     d31, [x12, x17]                 // last 4 coefficients, row 0
          uxtl    v4.4s, v4.4h
          sxtl    v31.4s, v31.4h
          ldr     d5, [x9]                        // 4 samples, output 1
          mla     v16.4s, v4.4s, v31.4s           // output 0 += last 4 taps
          ldr     d30, [x13, x17]                 // last 4 coefficients, row 1
          uxtl    v5.4s, v5.4h
          sxtl    v30.4s, v30.4h
          ldr     d6, [x10]                       // 4 samples, output 2
          mla     v17.4s, v5.4s, v30.4s           // output 1 += last 4 taps
          ldr     d29, [x14, x17]                 // last 4 coefficients, row 2
          uxtl    v6.4s, v6.4h
          sxtl    v29.4s, v29.4h
          ldr     d7, [x11]                       // 4 samples, output 3
          ldr     d28, [x15, x17]                 // last 4 coefficients, row 3
          mla     v18.4s, v6.4s, v29.4s           // output 2 += last 4 taps
          uxtl    v7.4s, v7.4h
          sxtl    v28.4s, v28.4h
          addp    v16.4s, v16.4s, v17.4s          // pairwise partial sums of outputs 0 and 1
          mla     v19.4s, v7.4s, v28.4s           // output 3 += last 4 taps
          subs    w2, w2, #4                      // four outputs finished; flags drive the b.gt below
          addp    v18.4s, v18.4s, v19.4s          // pairwise partial sums of outputs 2 and 3
          addp    v16.4s, v16.4s, v18.4s          // lanes 0..3 = complete sums of outputs 0..3
          sshl    v16.4s, v16.4s, v20.4s          // >> shift (count is negative)
          smin    v16.4s, v16.4s, v21.4s          // clip against (1 << 19) - 1
          st1     {v16.4s}, [x1], #16             // store four 32-bit results
          add     x4, x4, x7, lsl #2              // filter += 4 rows
          b.gt    1b
          ret
  endfunc