gf_5vect_mad_neon.S 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. /**************************************************************
  2. Copyright (c) 2019 Huawei Technologies Co., Ltd.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions
  5. are met:
  6. * Redistributions of source code must retain the above copyright
  7. notice, this list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright
  9. notice, this list of conditions and the following disclaimer in
  10. the documentation and/or other materials provided with the
  11. distribution.
  12. * Neither the name of Huawei Corporation nor the names of its
  13. contributors may be used to endorse or promote products derived
  14. from this software without specific prior written permission.
  15. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19. OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************/
  27. #include "../include/aarch64_label.h"
  /* Emit the routine into the code section and export its entry symbol.
     cdecl() is not defined in this file; presumably it comes from the
     included aarch64_label.h and adapts the symbol name per platform
     (confirm against that header). */
  28. .text
  29. .global cdecl(gf_5vect_mad_neon)
  /* The .type directive is only emitted when __APPLE__ is not defined. */
  30. #ifndef __APPLE__
  31. .type gf_5vect_mad_neon, %function
  32. #endif
  /*
   * Register aliases used by gf_5vect_mad_neon.
   *
   * Several aliases deliberately name the same physical register; the
   * routine body relies on those overlaps, so they are called out below.
   */
  33. /* arguments */
  /* x0..x5 hold the six incoming arguments in this order. */
  34. x_len .req x0
  35. x_vec .req x1
  36. x_vec_i .req x2
  37. x_tbl .req x3
  38. x_src .req x4
  39. x_dest .req x5
  40. /* returns */
  /* w0 carries the status written before each ret (0 or 1). */
  41. w_ret .req w0
  42. /* local variables */
  43. x_src_end .req x6
  44. x_dest1 .req x7
  45. x_dest2 .req x8
  46. x_dest3 .req x9
  47. x_dest4 .req x10
  /* x_dest5 is the same register as x_dest (x5): loading the fifth
     destination pointer overwrites the pointer-array base. */
  48. x_dest5 .req x_dest
  49. x_tmp .req x11
  50. x_tbl1 .req x12
  51. x_tbl2 .req x13
  52. x_tbl3 .req x14
  53. x_tbl4 .req x15
  54. x_tbl5 .req x16
  55. x_const .req x17
  56. /* vectors */
  /* v0: per-byte 0x0f mask; v1/v2: table lookup results; v3/q3: tail mask. */
  57. v_mask0f .req v0
  58. v_tmp_lo .req v1
  59. v_tmp_hi .req v2
  60. v_tmp .req v3
  61. q_tmp .req q3
  /* Lookup tables: one lo/hi pair of 16-byte vectors per destination. */
  62. v_gft1_lo .req v4
  63. v_gft1_hi .req v5
  64. v_gft2_lo .req v6
  65. v_gft2_hi .req v7
  66. v_gft3_lo .req v16
  67. v_gft3_hi .req v17
  68. q_gft1_lo .req q4
  69. q_gft1_hi .req q5
  70. q_gft2_lo .req q6
  71. q_gft2_hi .req q7
  72. q_gft3_lo .req q16
  73. q_gft3_hi .req q17
  74. v_gft4_lo .req v18
  75. v_gft4_hi .req v19
  76. q_gft4_lo .req q18
  77. q_gft4_hi .req q19
  /* Table 5 shares v6/v7 with table 2, so the two are never live at the
     same time and each must be reloaded before use. */
  78. v_gft5_lo .req v_gft2_lo
  79. v_gft5_hi .req v_gft2_hi
  80. q_gft5_lo .req q_gft2_lo
  81. q_gft5_hi .req q_gft2_hi
  /* Source data for the 64-byte loop lives in v8..v15. Under AAPCS64 the
     low 64 bits of v8..v15 are callee-saved, which is why that path spills
     d8..d15 around the loop. */
  82. v_data_0 .req v8
  83. v_data_1 .req v9
  84. v_data_2 .req v10
  85. v_data_3 .req v11
  86. q_data_0 .req q8
  87. q_data_1 .req q9
  88. q_data_2 .req q10
  89. q_data_3 .req q11
  90. v_data_0_lo .req v12
  91. v_data_1_lo .req v13
  92. v_data_2_lo .req v14
  93. v_data_3_lo .req v15
  /* The high-nibble vectors alias the raw data registers: the shifted
     value replaces the original bytes in place. */
  94. v_data_0_hi .req v_data_0
  95. v_data_1_hi .req v_data_1
  96. v_data_2_hi .req v_data_2
  97. v_data_3_hi .req v_data_3
  /* Destination accumulators: four 16-byte vectors per destination. */
  98. v_d1_0 .req v20
  99. v_d1_1 .req v21
  100. v_d1_2 .req v22
  101. v_d1_3 .req v23
  102. v_d2_0 .req v24
  103. v_d2_1 .req v25
  104. v_d2_2 .req v26
  105. v_d2_3 .req v27
  106. v_d3_0 .req v28
  107. v_d3_1 .req v29
  108. v_d3_2 .req v30
  109. v_d3_3 .req v31
  110. q_d1_0 .req q20
  111. q_d1_1 .req q21
  112. q_d1_2 .req q22
  113. q_d1_3 .req q23
  114. q_d2_0 .req q24
  115. q_d2_1 .req q25
  116. q_d2_2 .req q26
  117. q_d2_3 .req q27
  118. q_d3_0 .req q28
  119. q_d3_1 .req q29
  120. q_d3_2 .req q30
  121. q_d3_3 .req q31
  /* Destination 4 reuses destination 1's registers and destination 5
     reuses destination 2's, so d1/d2 must be stored before d4/d5 load. */
  122. v_d4_0 .req v_d1_0
  123. v_d4_1 .req v_d1_1
  124. v_d4_2 .req v_d1_2
  125. v_d4_3 .req v_d1_3
  126. q_d4_0 .req q_d1_0
  127. q_d4_1 .req q_d1_1
  128. q_d4_2 .req q_d1_2
  129. q_d4_3 .req q_d1_3
  130. v_d5_0 .req v_d2_0
  131. v_d5_1 .req v_d2_1
  132. v_d5_2 .req v_d2_2
  133. v_d5_3 .req v_d2_3
  134. q_d5_0 .req q_d2_0
  135. q_d5_1 .req q_d2_1
  136. q_d5_2 .req q_d2_2
  137. q_d5_3 .req q_d2_3
  /* Data registers for the 16-byte and tail paths. They overlap
     v_d1_1..v_d1_3 (v21..v23), which those paths do not use, and avoid the
     callee-saved v8..v15 range. */
  138. v_data .req v21
  139. q_data .req q21
  140. v_data_lo .req v22
  141. v_data_hi .req v23
  /*
   * gf_5vect_mad_neon
   *
   * For every source byte, splits it into a low and a high nibble, looks
   * each nibble up in a per-destination pair of 16-byte tables, and XORs
   * both lookup results into the corresponding byte of five destination
   * buffers. (By its name this is presumably a GF(2^8) multiply-accumulate
   * into five outputs - confirm against the caller.)
   *
   * In:   x0 = len    byte count; must be at least 16
   *       x1 = vec    scaled by 32 to form the byte stride between the
   *                   five consecutive table entries
   *       x2 = vec_i  scaled by 32 to form the offset of the first table
   *       x3 = tbl    table base; each entry is 32 bytes (lo table at +0,
   *                   hi table at +16)
   *       x4 = src    source bytes
   *       x5 = dest   array of five destination pointers
   * Out:  w0 = 0 on success, 1 if len < 16
   * Uses: x6..x17, v0..v7, v16..v31; d8..d15 are saved and restored on the
   *       64-byte path. No calls are made.
   */
  142. cdecl(gf_5vect_mad_neon):
  143. /* less than 16 bytes, return_fail */
  /* NOTE(review): this is a signed compare of the full 64-bit x0. If the C
     prototype passes len as a 32-bit int, the upper half of x0 is not
     guaranteed to be extended - confirm against the prototype. */
  144. cmp x_len, #16
  145. blt .return_fail
  /* Every byte lane of the mask is 0x0f, used to isolate low nibbles. */
  146. movi v_mask0f.16b, #0x0f
  /* Scale both indices by 32, the size of one lo+hi table entry. */
  147. lsl x_vec_i, x_vec_i, #5
  148. lsl x_vec, x_vec, #5
  /* Table k+1 sits (vec * 32) bytes after table k. */
  149. add x_tbl1, x_tbl, x_vec_i
  150. add x_tbl2, x_tbl1, x_vec
  151. add x_tbl3, x_tbl2, x_vec
  152. add x_tbl4, x_tbl3, x_vec
  153. add x_tbl5, x_tbl4, x_vec
  /* x_src_end = one past the last source byte. */
  154. add x_src_end, x_src, x_len
  /* Fetch the five destination pointers. The last load must stay last:
     x_dest5 is x_dest, so it overwrites the array base. */
  155. ldr x_dest1, [x_dest, #8*0]
  156. ldr x_dest2, [x_dest, #8*1]
  157. ldr x_dest3, [x_dest, #8*2]
  158. ldr x_dest4, [x_dest, #8*3]
  159. ldr x_dest5, [x_dest, #8*4]
  /* Tables 1, 3 and 4 stay resident for the whole routine. Tables 2 and 5
     share v6/v7 and are reloaded where they are used. */
  160. ldr q_gft1_lo, [x_tbl1]
  161. ldr q_gft1_hi, [x_tbl1, #16]
  162. ldr q_gft3_lo, [x_tbl3]
  163. ldr q_gft3_hi, [x_tbl3, #16]
  164. ldr q_gft4_lo, [x_tbl4]
  165. ldr q_gft4_hi, [x_tbl4, #16]
  166. .Lloop64_init:
  167. /* less than 64 bytes, goto Lloop16_init */
  168. cmp x_len, #64
  169. blt .Lloop16_init
  170. /* save d8 ~ d15 to stack */
  /* The 64-byte loop keeps source data in v8..v15, whose low halves are
     callee-saved. sp moves by 64, so it stays 16-byte aligned. */
  171. sub sp, sp, #64
  172. stp d8, d9, [sp]
  173. stp d10, d11, [sp, #16]
  174. stp d12, d13, [sp, #32]
  175. stp d14, d15, [sp, #48]
  /* Bias the end pointer so "x_src <= x_src_end" means that at least
     64 bytes remain. */
  176. sub x_src_end, x_src_end, #64
  177. .Lloop64:
  /* Load 64 source bytes and advance the source pointer. */
  178. ldr q_data_0, [x_src, #16*0]
  179. ldr q_data_1, [x_src, #16*1]
  180. ldr q_data_2, [x_src, #16*2]
  181. ldr q_data_3, [x_src, #16*3]
  182. add x_src, x_src, #64
  /* Load the current contents of destinations 1..3 (64 bytes each). */
  183. ldr q_d1_0, [x_dest1, #16*0]
  184. ldr q_d1_1, [x_dest1, #16*1]
  185. ldr q_d1_2, [x_dest1, #16*2]
  186. ldr q_d1_3, [x_dest1, #16*3]
  187. ldr q_d2_0, [x_dest2, #16*0]
  188. ldr q_d2_1, [x_dest2, #16*1]
  189. ldr q_d2_2, [x_dest2, #16*2]
  190. ldr q_d2_3, [x_dest2, #16*3]
  191. ldr q_d3_0, [x_dest3, #16*0]
  192. ldr q_d3_1, [x_dest3, #16*1]
  193. ldr q_d3_2, [x_dest3, #16*2]
  194. ldr q_d3_3, [x_dest3, #16*3]
  /* Reload table 2: v6/v7 held table 5 at the end of the previous
     iteration. */
  195. ldr q_gft2_lo, [x_tbl2]
  196. ldr q_gft2_hi, [x_tbl2, #16]
  /* Low nibbles go to v12..v15. The ands must come before the shifts,
     because the shifts overwrite the data registers with the high
     nibbles. */
  197. and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
  198. and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
  199. and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
  200. and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
  201. ushr v_data_0_hi.16b, v_data_0.16b, #4
  202. ushr v_data_1_hi.16b, v_data_1.16b, #4
  203. ushr v_data_2_hi.16b, v_data_2.16b, #4
  204. ushr v_data_3_hi.16b, v_data_3.16b, #4
  /* Each group of four instructions below handles one 16-byte vector:
     dest ^= tbl_lo[low nibble] ^ tbl_hi[high nibble]. */
  205. /* dest1 */
  206. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
  207. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
  208. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  209. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  210. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
  211. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
  212. eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
  213. eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
  214. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
  215. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
  216. eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
  217. eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
  218. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
  219. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
  220. eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
  221. eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
  222. /* dest2 */
  223. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
  224. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
  225. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  226. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  227. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
  228. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
  229. eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
  230. eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
  231. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
  232. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
  233. eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
  234. eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
  235. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
  236. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
  237. eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
  238. eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
  239. /* dest3 */
  240. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
  241. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
  242. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  243. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  244. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
  245. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
  246. eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
  247. eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
  248. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
  249. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
  250. eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
  251. eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
  252. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
  253. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
  254. eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
  255. eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
  /* Write back destinations 1..3 and advance their pointers. This frees
     v20..v27 for destinations 4 and 5. */
  256. str q_d1_0, [x_dest1, #16*0]
  257. str q_d1_1, [x_dest1, #16*1]
  258. str q_d1_2, [x_dest1, #16*2]
  259. str q_d1_3, [x_dest1, #16*3]
  260. add x_dest1, x_dest1, #64
  261. str q_d2_0, [x_dest2, #16*0]
  262. str q_d2_1, [x_dest2, #16*1]
  263. str q_d2_2, [x_dest2, #16*2]
  264. str q_d2_3, [x_dest2, #16*3]
  265. add x_dest2, x_dest2, #64
  266. str q_d3_0, [x_dest3, #16*0]
  267. str q_d3_1, [x_dest3, #16*1]
  268. str q_d3_2, [x_dest3, #16*2]
  269. str q_d3_3, [x_dest3, #16*3]
  270. add x_dest3, x_dest3, #64
  /* Destinations 4 and 5 reuse the registers of destinations 1 and 2. */
  271. ldr q_d4_0, [x_dest4, #16*0]
  272. ldr q_d4_1, [x_dest4, #16*1]
  273. ldr q_d4_2, [x_dest4, #16*2]
  274. ldr q_d4_3, [x_dest4, #16*3]
  275. ldr q_d5_0, [x_dest5, #16*0]
  276. ldr q_d5_1, [x_dest5, #16*1]
  277. ldr q_d5_2, [x_dest5, #16*2]
  278. ldr q_d5_3, [x_dest5, #16*3]
  /* Table 5 replaces table 2 in v6/v7. */
  279. ldr q_gft5_lo, [x_tbl5]
  280. ldr q_gft5_hi, [x_tbl5, #16]
  281. /* dest4 */
  282. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
  283. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
  284. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  285. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  286. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
  287. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
  288. eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
  289. eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
  290. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
  291. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
  292. eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
  293. eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
  294. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
  295. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
  296. eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
  297. eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
  298. /* dest5 */
  299. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
  300. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
  301. eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
  302. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  303. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
  304. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
  305. eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
  306. eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
  307. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
  308. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
  309. eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
  310. eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
  311. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
  312. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
  313. eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
  314. eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
  /* Write back destinations 4 and 5 and advance their pointers. */
  315. str q_d4_0, [x_dest4, #16*0]
  316. str q_d4_1, [x_dest4, #16*1]
  317. str q_d4_2, [x_dest4, #16*2]
  318. str q_d4_3, [x_dest4, #16*3]
  319. add x_dest4, x_dest4, #64
  320. str q_d5_0, [x_dest5, #16*0]
  321. str q_d5_1, [x_dest5, #16*1]
  322. str q_d5_2, [x_dest5, #16*2]
  323. str q_d5_3, [x_dest5, #16*3]
  324. add x_dest5, x_dest5, #64
  /* Unsigned compare against the biased end: loop while another full
     64-byte block is available. */
  325. cmp x_src, x_src_end
  326. bls .Lloop64
  327. .Lloop64_end:
  328. /* restore d8 ~ d15 */
  329. ldp d8, d9, [sp]
  330. ldp d10, d11, [sp, #16]
  331. ldp d12, d13, [sp, #32]
  332. ldp d14, d15, [sp, #48]
  333. add sp, sp, #64
  /* Undo the 64-byte bias; x_src_end is the true end of the source again. */
  334. add x_src_end, x_src_end, #64
  335. .Lloop16_init:
  /* Bias the end pointer by 16 so "x_src <= x_src_end" means a full
     16-byte block remains; otherwise go to the tail handling. */
  336. sub x_src_end, x_src_end, #16
  337. cmp x_src, x_src_end
  338. bhi .lessthan16_init
  339. .Lloop16:
  /* One 16-byte block per iteration, in the same two phases as the
     64-byte loop: destinations 1..3 first, then 4..5 in reused registers. */
  340. ldr q_data, [x_src]
  341. ldr q_d1_0, [x_dest1]
  342. ldr q_d2_0, [x_dest2]
  343. ldr q_d3_0, [x_dest3]
  /* Table 2 is reloaded because v6/v7 may currently hold table 5. */
  344. ldr q_gft2_lo, [x_tbl2]
  345. ldr q_gft2_hi, [x_tbl2, #16]
  /* Split each source byte into low and high nibbles. */
  346. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  347. ushr v_data_hi.16b, v_data.16b, #4
  348. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  349. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  350. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  351. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  352. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  353. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  354. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  355. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  356. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
  357. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  358. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  359. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  360. str q_d1_0, [x_dest1]
  361. str q_d2_0, [x_dest2]
  362. str q_d3_0, [x_dest3]
  /* v_d4_0/v_d5_0 overwrite v_d1_0/v_d2_0, which have just been stored. */
  363. ldr q_d4_0, [x_dest4]
  364. ldr q_d5_0, [x_dest5]
  365. ldr q_gft5_lo, [x_tbl5]
  366. ldr q_gft5_hi, [x_tbl5, #16]
  367. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
  368. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  369. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  370. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  371. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
  372. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
  373. eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
  374. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  375. str q_d4_0, [x_dest4]
  376. str q_d5_0, [x_dest5]
  /* Advance the source and all five destinations by one block. */
  377. add x_src, x_src, #16
  378. add x_dest1, x_dest1, #16
  379. add x_dest2, x_dest2, #16
  380. add x_dest3, x_dest3, #16
  381. add x_dest4, x_dest4, #16
  382. add x_dest5, x_dest5, #16
  383. cmp x_src, x_src_end
  384. bls .Lloop16
  385. .lessthan16_init:
  /* x_src_end is (true end - 16) here, so x_tmp = 16 - bytes remaining.
     A value of 16 means nothing is left to process. */
  386. sub x_tmp, x_src, x_src_end
  387. cmp x_tmp, #16
  388. beq .return_pass
  389. .lessthan16:
  /* Tail: process the final 16 bytes of the buffers as one block. Its first
     x_tmp bytes were already handled, so step the source and destination
     pointers back by x_tmp and mask those lanes out of the update. */
  390. mov x_src, x_src_end
  391. sub x_dest1, x_dest1, x_tmp
  392. sub x_dest2, x_dest2, x_tmp
  393. sub x_dest3, x_dest3, x_tmp
  394. sub x_dest4, x_dest4, x_tmp
  395. sub x_dest5, x_dest5, x_tmp
  /* Form the address of const_tbl, using the relocation syntax that matches
     the target. */
  396. #ifndef __APPLE__
  397. adrp x_const, const_tbl
  398. add x_const, x_const, :lo12:const_tbl
  399. #else
  400. adrp x_const, const_tbl@PAGE
  401. add x_const, x_const, const_tbl@PAGEOFF
  402. #endif
  /* Load 16 bytes starting at const_tbl + 16 - x_tmp: the first x_tmp lanes
     come from the zero half of the table and the rest from the 0xff half. */
  403. sub x_const, x_const, x_tmp
  404. ldr q_tmp, [x_const, #16]
  405. ldr q_data, [x_src]
  406. ldr q_d1_0, [x_dest1]
  407. ldr q_d2_0, [x_dest2]
  408. ldr q_d3_0, [x_dest3]
  409. ldr q_gft2_lo, [x_tbl2]
  410. ldr q_gft2_hi, [x_tbl2, #16]
  411. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  412. ushr v_data_hi.16b, v_data.16b, #4
  /* For each destination: combine both lookups, zero the already-processed
     lanes with the mask, then XOR into the destination. Masked lanes are
     therefore written back unchanged. */
  413. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  414. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  415. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  416. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  417. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  418. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  419. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  420. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  421. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  422. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  423. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
  424. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  425. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  426. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  427. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  428. str q_d1_0, [x_dest1]
  429. str q_d2_0, [x_dest2]
  430. str q_d3_0, [x_dest3]
  /* Second phase: destinations 4 and 5 in the registers just stored. */
  431. ldr q_d4_0, [x_dest4]
  432. ldr q_d5_0, [x_dest5]
  433. ldr q_gft5_lo, [x_tbl5]
  434. ldr q_gft5_hi, [x_tbl5, #16]
  435. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
  436. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  437. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  438. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  439. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  440. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
  441. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
  442. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  443. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  444. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  445. str q_d4_0, [x_dest4]
  446. str q_d5_0, [x_dest5]
  447. .return_pass:
  /* Success: w0 = 0. */
  448. mov w_ret, #0
  449. ret
  450. .return_fail:
  /* Failure (len < 16): w0 = 1; nothing has been read or written. */
  451. mov w_ret, #1
  452. ret
  /* Mask source for the sub-16-byte tail of gf_5vect_mad_neon: two 64-bit
     zero words followed by two 64-bit all-ones words, i.e. 16 bytes of 0x00
     then 16 bytes of 0xff. The routine loads 16 bytes at
     const_tbl + 16 - x_tmp, which yields x_tmp zero lanes followed by 0xff
     lanes. That load is not 16-byte aligned in general.
     ASM_DEF_RODATA is not defined in this file; presumably it selects a
     read-only data section (confirm in aarch64_label.h). */
  453. ASM_DEF_RODATA
  454. .balign 8
  455. const_tbl:
  456. .dword 0x0000000000000000, 0x0000000000000000
  457. .dword 0xffffffffffffffff, 0xffffffffffffffff