gf_6vect_mad_neon.S 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618
  1. /**************************************************************
  2. Copyright (c) 2019 Huawei Technologies Co., Ltd.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions
  5. are met:
  6. * Redistributions of source code must retain the above copyright
  7. notice, this list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright
  9. notice, this list of conditions and the following disclaimer in
  10. the documentation and/or other materials provided with the
  11. distribution.
  12. * Neither the name of Huawei Corporation nor the names of its
  13. contributors may be used to endorse or promote products derived
  14. from this software without specific prior written permission.
  15. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19. OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************/
  27. #include "../include/aarch64_label.h"
  28. .text /* everything up to ASM_DEF_RODATA is emitted into the code section */
  29. .global cdecl(gf_6vect_mad_neon) /* export the entry point; cdecl() is not defined in this file (presumably supplied by aarch64_label.h -- confirm) */
  30. #ifndef __APPLE__
  31. .type gf_6vect_mad_neon, %function /* function symbol-type annotation, omitted when __APPLE__ is defined */
  32. #endif
  33. /* arguments */
  34. x_len .req x0 /* byte count: compared against 16 and 64, added to x_src to form x_src_end */
  35. x_vec .req x1 /* multiplied by 32 at entry, then used as the byte stride between the six tables */
  36. x_vec_i .req x2 /* multiplied by 32 at entry, then used as the byte offset of table 1 from x_tbl */
  37. x_tbl .req x3 /* base address of the lookup tables */
  38. x_src .req x4 /* source pointer, advanced as bytes are consumed */
  39. x_dest .req x5 /* array of six destination pointers, read at offsets 8*0 .. 8*5 */
  40. /* returns */
  41. w_ret .req w0 /* 0 on the normal path, 1 when x_len < 16 */
  42. /* local variables */
  43. x_src_end .req x6 /* x_src + x_len, temporarily biased by -64 / -16 inside the loops */
  44. x_dest1 .req x7
  45. x_dest2 .req x8
  46. x_dest3 .req x9
  47. x_dest4 .req x10
  48. x_dest5 .req x11
  49. x_dest6 .req x_dest /* reuses the argument register: must be the last pointer loaded from the array */
  50. x_tmp .req x12 /* overshoot of x_src past the biased end, used by the tail path */
  51. x_tbl1 .req x13
  52. x_tbl2 .req x14
  53. x_tbl3 .req x15
  54. x_tbl4 .req x16
  55. x_tbl5 .req x17
  56. x_tbl6 .req x_tbl /* reuses the argument register: must be the last table address computed */
  /*
   * x_const holds the address of const_tbl in the tail path only.
   * It must not live in x18: AAPCS64 designates x18 as the platform
   * register, and it is reserved on Apple platforms, which this file
   * otherwise supports through its __APPLE__ branches.  x0 is free at that
   * point: x_len is last read by the "cmp x_len, #64" at .Lloop64_init,
   * and w_ret (w0) is only written after the last use of x_const.
   */
  57. x_const .req x0
  58. /* vectors */
  59. v_mask0f .req v0 /* 0x0f in every byte, selects the low nibble */
  60. v_tmp_lo .req v1 /* result of the low-nibble table lookup */
  61. v_tmp_hi .req v2 /* result of the high-nibble table lookup */
  62. v_tmp .req v3 /* byte mask loaded from const_tbl in the tail path */
  63. q_tmp .req q3
  64. v_gft1_lo .req v4 /* table 1, loaded once at entry and kept live */
  65. v_gft1_hi .req v5
  66. v_gft2_lo .req v6 /* tables 2 and 3 are reloaded on every pass (see the aliases below) */
  67. v_gft2_hi .req v7
  68. v_gft3_lo .req v16
  69. v_gft3_hi .req v17
  70. q_gft1_lo .req q4
  71. q_gft1_hi .req q5
  72. q_gft2_lo .req q6
  73. q_gft2_hi .req q7
  74. q_gft3_lo .req q16
  75. q_gft3_hi .req q17
  76. v_gft4_lo .req v18 /* table 4, loaded once at entry and kept live */
  77. v_gft4_hi .req v19
  78. q_gft4_lo .req q18
  79. q_gft4_hi .req q19
  80. v_gft5_lo .req v_gft2_lo /* table 5 shares registers with table 2 */
  81. v_gft5_hi .req v_gft2_hi
  82. q_gft5_lo .req q_gft2_lo
  83. q_gft5_hi .req q_gft2_hi
  84. v_gft6_lo .req v_gft3_lo /* table 6 shares registers with table 3 */
  85. v_gft6_hi .req v_gft3_hi
  86. q_gft6_lo .req q_gft3_lo
  87. q_gft6_hi .req q_gft3_hi
  88. v_data_0 .req v8 /* 64-byte loop: four 16-byte source vectors; v8-v15 are why d8-d15 are spilled there */
  89. v_data_1 .req v9
  90. v_data_2 .req v10
  91. v_data_3 .req v11
  92. q_data_0 .req q8
  93. q_data_1 .req q9
  94. q_data_2 .req q10
  95. q_data_3 .req q11
  96. v_data_0_lo .req v12 /* low nibbles of each source vector */
  97. v_data_1_lo .req v13
  98. v_data_2_lo .req v14
  99. v_data_3_lo .req v15
  100. v_data_0_hi .req v_data_0 /* high nibbles overwrite the raw data in place */
  101. v_data_1_hi .req v_data_1
  102. v_data_2_hi .req v_data_2
  103. v_data_3_hi .req v_data_3
  104. v_d1_0 .req v20 /* destination accumulators: v_d<k>_<n> is 16-byte lane n of destination k */
  105. v_d1_1 .req v21
  106. v_d1_2 .req v22
  107. v_d1_3 .req v23
  108. v_d2_0 .req v24
  109. v_d2_1 .req v25
  110. v_d2_2 .req v26
  111. v_d2_3 .req v27
  112. v_d3_0 .req v28
  113. v_d3_1 .req v29
  114. v_d3_2 .req v30
  115. v_d3_3 .req v31
  116. q_d1_0 .req q20
  117. q_d1_1 .req q21
  118. q_d1_2 .req q22
  119. q_d1_3 .req q23
  120. q_d2_0 .req q24
  121. q_d2_1 .req q25
  122. q_d2_2 .req q26
  123. q_d2_3 .req q27
  124. q_d3_0 .req q28
  125. q_d3_1 .req q29
  126. q_d3_2 .req q30
  127. q_d3_3 .req q31
  128. v_d4_0 .req v_d1_0 /* destinations 4-6 reuse the registers of 1-3, so 1-3 are stored before 4-6 are loaded */
  129. v_d4_1 .req v_d1_1
  130. v_d4_2 .req v_d1_2
  131. v_d4_3 .req v_d1_3
  132. q_d4_0 .req q_d1_0
  133. q_d4_1 .req q_d1_1
  134. q_d4_2 .req q_d1_2
  135. q_d4_3 .req q_d1_3
  136. v_d5_0 .req v_d2_0
  137. v_d5_1 .req v_d2_1
  138. v_d5_2 .req v_d2_2
  139. v_d5_3 .req v_d2_3
  140. q_d5_0 .req q_d2_0
  141. q_d5_1 .req q_d2_1
  142. q_d5_2 .req q_d2_2
  143. q_d5_3 .req q_d2_3
  144. v_d6_0 .req v_d3_0
  145. v_d6_1 .req v_d3_1
  146. v_d6_2 .req v_d3_2
  147. v_d6_3 .req v_d3_3
  148. q_d6_0 .req q_d3_0
  149. q_d6_1 .req q_d3_1
  150. q_d6_2 .req q_d3_2
  151. q_d6_3 .req q_d3_3
  152. v_data .req v21 /* 16-byte and tail paths only: v21-v23 overlap v_d1_1..v_d1_3, which those paths never touch */
  153. q_data .req q21
  154. v_data_lo .req v22
  155. v_data_hi .req v23
  /*
   * gf_6vect_mad_neon: entry and one-time setup.
   *
   * For each of six destination buffers k = 1..6 the code below performs,
   * byte by byte over x_len source bytes:
   *     dest_k[i] ^= tbl_k_lo[src[i] & 0x0f] ^ tbl_k_hi[src[i] >> 4]
   * where tbl_k is a 32-byte table (lo half at +0, hi half at +16) located
   * at x_tbl + 32*x_vec_i + 32*x_vec*(k-1).
   *
   * Returns 1 in w_ret without touching memory when x_len < 16, otherwise 0.
   * Work is done in 64-byte passes, then 16-byte passes, then one masked
   * overlapping 16-byte pass for any remainder.
   */
  156. cdecl(gf_6vect_mad_neon):
  157. /* less than 16 bytes, return_fail */
  158. cmp x_len, #16
  159. blt .return_fail /* signed compare, so a negative length also fails */
  160. movi v_mask0f.16b, #0x0f
  161. lsl x_vec_i, x_vec_i, #5 /* x_vec_i *= 32 (one table is 32 bytes) */
  162. lsl x_vec, x_vec, #5 /* x_vec *= 32: stride between consecutive destinations' tables */
  163. add x_tbl1, x_tbl, x_vec_i
  164. add x_tbl2, x_tbl1, x_vec
  165. add x_tbl3, x_tbl2, x_vec
  166. add x_tbl4, x_tbl3, x_vec
  167. add x_tbl5, x_tbl4, x_vec
  168. add x_tbl6, x_tbl5, x_vec /* x_tbl6 aliases x_tbl: the base is overwritten only after its last use */
  169. add x_src_end, x_src, x_len
  170. ldr x_dest1, [x_dest, #8*0]
  171. ldr x_dest2, [x_dest, #8*1]
  172. ldr x_dest3, [x_dest, #8*2]
  173. ldr x_dest4, [x_dest, #8*3]
  174. ldr x_dest5, [x_dest, #8*4]
  175. ldr x_dest6, [x_dest, #8*5] /* x_dest6 aliases x_dest: this must stay the final load from the array */
  176. ldr q_gft1_lo, [x_tbl1] /* tables 1 and 4 have dedicated registers and stay resident */
  177. ldr q_gft1_hi, [x_tbl1, #16]
  178. ldr q_gft4_lo, [x_tbl4]
  179. ldr q_gft4_hi, [x_tbl4, #16]
  /*
   * 64-byte main loop.  Each pass reads 64 source bytes, updates
   * destinations 1-3 and stores them, then reuses the same accumulator and
   * table registers for destinations 4-6.  v8-v15 are used as scratch, so
   * d8-d15 are spilled to the stack around the loop.
   */
  180. .Lloop64_init:
  181. /* less than 64 bytes, goto Lloop16_init */
  182. cmp x_len, #64
  183. blt .Lloop16_init
  184. /* save d8 ~ d15 to stack */
  185. sub sp, sp, #64
  186. stp d8, d9, [sp]
  187. stp d10, d11, [sp, #16]
  188. stp d12, d13, [sp, #32]
  189. stp d14, d15, [sp, #48]
  190. sub x_src_end, x_src_end, #64 /* bias the end so "x_src <= x_src_end" means a full 64 bytes remain */
  191. .Lloop64:
  192. ldr q_data_0, [x_src, #16*0]
  193. ldr q_data_1, [x_src, #16*1]
  194. ldr q_data_2, [x_src, #16*2]
  195. ldr q_data_3, [x_src, #16*3]
  196. add x_src, x_src, #64
  197. ldr q_d1_0, [x_dest1, #16*0]
  198. ldr q_d1_1, [x_dest1, #16*1]
  199. ldr q_d1_2, [x_dest1, #16*2]
  200. ldr q_d1_3, [x_dest1, #16*3]
  201. ldr q_d2_0, [x_dest2, #16*0]
  202. ldr q_d2_1, [x_dest2, #16*1]
  203. ldr q_d2_2, [x_dest2, #16*2]
  204. ldr q_d2_3, [x_dest2, #16*3]
  205. ldr q_d3_0, [x_dest3, #16*0]
  206. ldr q_d3_1, [x_dest3, #16*1]
  207. ldr q_d3_2, [x_dest3, #16*2]
  208. ldr q_d3_3, [x_dest3, #16*3]
  209. ldr q_gft2_lo, [x_tbl2] /* reloaded every pass: tables 5/6 overwrite these registers below */
  210. ldr q_gft2_hi, [x_tbl2, #16]
  211. ldr q_gft3_lo, [x_tbl3]
  212. ldr q_gft3_hi, [x_tbl3, #16]
  213. and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b /* low nibbles, taken before the in-place shift below */
  214. and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
  215. and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
  216. and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
  217. ushr v_data_0_hi.16b, v_data_0.16b, #4 /* high nibbles; v_data_N_hi aliases v_data_N, raw data is gone after this */
  218. ushr v_data_1_hi.16b, v_data_1.16b, #4
  219. ushr v_data_2_hi.16b, v_data_2.16b, #4
  220. ushr v_data_3_hi.16b, v_data_3.16b, #4
  221. /* dest1: d1_n ^= tbl1_lo[data_n_lo] ^ tbl1_hi[data_n_hi], n = 0..3 */
  222. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
  223. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
  224. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  225. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  226. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
  227. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
  228. eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
  229. eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
  230. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
  231. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
  232. eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
  233. eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
  234. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
  235. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
  236. eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
  237. eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
  238. /* dest2: same update using table 2 */
  239. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
  240. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
  241. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  242. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  243. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
  244. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
  245. eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
  246. eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
  247. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
  248. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
  249. eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
  250. eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
  251. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
  252. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
  253. eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
  254. eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
  255. /* dest3: same update using table 3 */
  256. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
  257. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
  258. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  259. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  260. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
  261. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
  262. eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
  263. eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
  264. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
  265. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
  266. eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
  267. eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
  268. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
  269. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
  270. eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
  271. eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
  272. str q_d1_0, [x_dest1, #16*0] /* flush destinations 1-3 before their registers are reused for 4-6 */
  273. str q_d1_1, [x_dest1, #16*1]
  274. str q_d1_2, [x_dest1, #16*2]
  275. str q_d1_3, [x_dest1, #16*3]
  276. add x_dest1, x_dest1, #64
  277. str q_d2_0, [x_dest2, #16*0]
  278. str q_d2_1, [x_dest2, #16*1]
  279. str q_d2_2, [x_dest2, #16*2]
  280. str q_d2_3, [x_dest2, #16*3]
  281. add x_dest2, x_dest2, #64
  282. str q_d3_0, [x_dest3, #16*0]
  283. str q_d3_1, [x_dest3, #16*1]
  284. str q_d3_2, [x_dest3, #16*2]
  285. str q_d3_3, [x_dest3, #16*3]
  286. add x_dest3, x_dest3, #64
  287. ldr q_d4_0, [x_dest4, #16*0] /* q_d4..q_d6 occupy the registers just stored from q_d1..q_d3 */
  288. ldr q_d4_1, [x_dest4, #16*1]
  289. ldr q_d4_2, [x_dest4, #16*2]
  290. ldr q_d4_3, [x_dest4, #16*3]
  291. ldr q_d5_0, [x_dest5, #16*0]
  292. ldr q_d5_1, [x_dest5, #16*1]
  293. ldr q_d5_2, [x_dest5, #16*2]
  294. ldr q_d5_3, [x_dest5, #16*3]
  295. ldr q_d6_0, [x_dest6, #16*0]
  296. ldr q_d6_1, [x_dest6, #16*1]
  297. ldr q_d6_2, [x_dest6, #16*2]
  298. ldr q_d6_3, [x_dest6, #16*3]
  299. ldr q_gft5_lo, [x_tbl5] /* overwrites table 2's registers */
  300. ldr q_gft5_hi, [x_tbl5, #16]
  301. ldr q_gft6_lo, [x_tbl6] /* overwrites table 3's registers */
  302. ldr q_gft6_hi, [x_tbl6, #16]
  303. /* dest4: same update using the resident table 4 */
  304. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
  305. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
  306. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  307. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  308. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
  309. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
  310. eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
  311. eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
  312. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
  313. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
  314. eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
  315. eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
  316. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
  317. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
  318. eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
  319. eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
  320. /* dest5: same update using table 5 */
  321. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
  322. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
  323. eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
  324. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  325. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
  326. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
  327. eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
  328. eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
  329. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
  330. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
  331. eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
  332. eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
  333. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
  334. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
  335. eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
  336. eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
  337. /* dest6: same update using table 6 */
  338. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
  339. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
  340. eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
  341. eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
  342. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
  343. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
  344. eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b
  345. eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b
  346. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
  347. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
  348. eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b
  349. eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b
  350. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
  351. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
  352. eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b
  353. eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b
  354. str q_d4_0, [x_dest4, #16*0]
  355. str q_d4_1, [x_dest4, #16*1]
  356. str q_d4_2, [x_dest4, #16*2]
  357. str q_d4_3, [x_dest4, #16*3]
  358. add x_dest4, x_dest4, #64
  359. str q_d5_0, [x_dest5, #16*0]
  360. str q_d5_1, [x_dest5, #16*1]
  361. str q_d5_2, [x_dest5, #16*2]
  362. str q_d5_3, [x_dest5, #16*3]
  363. add x_dest5, x_dest5, #64
  364. str q_d6_0, [x_dest6, #16*0]
  365. str q_d6_1, [x_dest6, #16*1]
  366. str q_d6_2, [x_dest6, #16*2]
  367. str q_d6_3, [x_dest6, #16*3]
  368. add x_dest6, x_dest6, #64
  369. cmp x_src, x_src_end
  370. bls .Lloop64 /* unsigned <=: another full 64-byte pass still fits */
  371. .Lloop64_end:
  372. /* restore d8 ~ d15 */
  373. ldp d8, d9, [sp]
  374. ldp d10, d11, [sp, #16]
  375. ldp d12, d13, [sp, #32]
  376. ldp d14, d15, [sp, #48]
  377. add sp, sp, #64
  378. add x_src_end, x_src_end, #64 /* undo the -64 bias: x_src_end is the true end again */
  /*
   * 16-byte loop.  Same update as the 64-byte loop, one source vector per
   * pass.  Only the _0 accumulators and v_data/v_data_lo/v_data_hi
   * (v21-v23) are used, so no callee-saved vector register is touched.
   */
  379. .Lloop16_init:
  380. sub x_src_end, x_src_end, #16 /* bias the end so "x_src <= x_src_end" means a full 16 bytes remain */
  381. cmp x_src, x_src_end
  382. bhi .lessthan16_init /* fewer than 16 bytes left (possibly none): skip the loop */
  383. .Lloop16:
  384. ldr q_data, [x_src]
  385. ldr q_d1_0, [x_dest1]
  386. ldr q_d2_0, [x_dest2]
  387. ldr q_d3_0, [x_dest3]
  388. ldr q_gft2_lo, [x_tbl2] /* reloaded every pass: tables 5/6 overwrite these registers below */
  389. ldr q_gft2_hi, [x_tbl2, #16]
  390. ldr q_gft3_lo, [x_tbl3]
  391. ldr q_gft3_hi, [x_tbl3, #16]
  392. and v_data_lo.16b, v_data.16b, v_mask0f.16b /* low nibbles */
  393. ushr v_data_hi.16b, v_data.16b, #4 /* high nibbles */
  394. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* dest1 ^= tbl1_lo[lo] ^ tbl1_hi[hi] */
  395. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  396. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  397. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  398. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b /* dest2 */
  399. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  400. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  401. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  402. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b /* dest3 */
  403. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  404. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  405. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  406. str q_d1_0, [x_dest1] /* flush destinations 1-3 before their registers are reused for 4-6 */
  407. str q_d2_0, [x_dest2]
  408. str q_d3_0, [x_dest3]
  409. ldr q_d4_0, [x_dest4]
  410. ldr q_d5_0, [x_dest5]
  411. ldr q_d6_0, [x_dest6]
  412. ldr q_gft5_lo, [x_tbl5] /* overwrites table 2's registers */
  413. ldr q_gft5_hi, [x_tbl5, #16]
  414. ldr q_gft6_lo, [x_tbl6] /* overwrites table 3's registers */
  415. ldr q_gft6_hi, [x_tbl6, #16]
  416. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b /* dest4 */
  417. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  418. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  419. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  420. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b /* dest5 */
  421. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
  422. eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
  423. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  424. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b /* dest6 */
  425. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
  426. eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
  427. eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
  428. str q_d4_0, [x_dest4]
  429. str q_d5_0, [x_dest5]
  430. str q_d6_0, [x_dest6]
  431. add x_src, x_src, #16
  432. add x_dest1, x_dest1, #16
  433. add x_dest2, x_dest2, #16
  434. add x_dest3, x_dest3, #16
  435. add x_dest4, x_dest4, #16
  436. add x_dest5, x_dest5, #16
  437. add x_dest6, x_dest6, #16
  438. cmp x_src, x_src_end
  439. bls .Lloop16 /* unsigned <=: another full 16-byte pass still fits */
  /*
   * Tail: handle the final 1..15 bytes with one overlapping 16-byte pass.
   *
   * On entry x_src_end is still biased by -16, i.e. it points at the last
   * 16 bytes of the source.  x_tmp = x_src - x_src_end is how far x_src has
   * run past that point: 16 means nothing is left; otherwise x_tmp bytes of
   * the final 16-byte window were already processed.  The window is
   * re-read, every destination pointer is moved back by x_tmp, and the
   * looked-up value is ANDed with a mask whose first x_tmp bytes are 0x00
   * so already-updated destination bytes are XORed with zero (unchanged).
   */
  440. .lessthan16_init:
  441. sub x_tmp, x_src, x_src_end
  442. cmp x_tmp, #16
  443. beq .return_pass /* length was a multiple of 16: no tail */
  444. .lessthan16:
  445. mov x_src, x_src_end /* rewind to the last full 16-byte window */
  446. sub x_dest1, x_dest1, x_tmp
  447. sub x_dest2, x_dest2, x_tmp
  448. sub x_dest3, x_dest3, x_tmp
  449. sub x_dest4, x_dest4, x_tmp
  450. sub x_dest5, x_dest5, x_tmp
  451. sub x_dest6, x_dest6, x_tmp
  452. #ifndef __APPLE__
  453. adrp x_const, const_tbl
  454. add x_const, x_const, :lo12:const_tbl
  455. #else
  456. adrp x_const, const_tbl@PAGE
  457. add x_const, x_const, const_tbl@PAGEOFF
  458. #endif
  459. sub x_const, x_const, x_tmp
  460. ldr q_tmp, [x_const, #16] /* bytes const_tbl[16-x_tmp .. 31-x_tmp]: x_tmp zero bytes, then 0xff bytes */
  461. ldr q_data, [x_src]
  462. ldr q_d1_0, [x_dest1]
  463. ldr q_d2_0, [x_dest2]
  464. ldr q_d3_0, [x_dest3]
  465. ldr q_gft2_lo, [x_tbl2]
  466. ldr q_gft2_hi, [x_tbl2, #16]
  467. ldr q_gft3_lo, [x_tbl3]
  468. ldr q_gft3_hi, [x_tbl3, #16]
  469. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  470. ushr v_data_hi.16b, v_data.16b, #4
  471. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b /* dest1: combine lookups, mask, then accumulate */
  472. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  473. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  474. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b /* zero the lanes that were already processed */
  475. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  476. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b /* dest2 */
  477. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  478. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  479. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  480. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  481. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b /* dest3 */
  482. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  483. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  484. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  485. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  486. str q_d1_0, [x_dest1] /* flush destinations 1-3 before their registers are reused for 4-6 */
  487. str q_d2_0, [x_dest2]
  488. str q_d3_0, [x_dest3]
  489. ldr q_d4_0, [x_dest4]
  490. ldr q_d5_0, [x_dest5]
  491. ldr q_d6_0, [x_dest6]
  492. ldr q_gft5_lo, [x_tbl5] /* overwrites table 2's registers */
  493. ldr q_gft5_hi, [x_tbl5, #16]
  494. ldr q_gft6_lo, [x_tbl6] /* overwrites table 3's registers */
  495. ldr q_gft6_hi, [x_tbl6, #16]
  496. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b /* dest4 */
  497. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  498. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  499. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  500. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  501. tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b /* dest5 */
  502. tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
  503. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  504. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  505. eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
  506. tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b /* dest6 */
  507. tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
  508. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  509. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  510. eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
  511. str q_d4_0, [x_dest4]
  512. str q_d5_0, [x_dest5]
  513. str q_d6_0, [x_dest6]
  514. .return_pass: /* reached by fall-through from the tail pass, or directly when no tail remains */
  515. mov w_ret, #0
  516. ret
  517. .return_fail: /* reached only from the entry check, before any memory has been accessed */
  518. mov w_ret, #1
  519. ret
  520. ASM_DEF_RODATA /* not defined in this file; presumably a macro from aarch64_label.h that switches to a read-only data section -- confirm */
  521. .balign 8
  /*
   * Tail mask source: 16 bytes of 0x00 followed by 16 bytes of 0xff.
   * The tail path loads 16 bytes starting at const_tbl + 16 - x_tmp, which
   * yields x_tmp leading zero bytes followed by (16 - x_tmp) 0xff bytes.
   */
  522. const_tbl:
  523. .dword 0x0000000000000000, 0x0000000000000000
  524. .dword 0xffffffffffffffff, 0xffffffffffffffff