/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
  27. #include "../include/aarch64_label.h"
  28. .text
  29. .global cdecl(gf_2vect_mad_neon)
  30. #ifndef __APPLE__
  31. .type gf_2vect_mad_neon, %function
  32. #endif
  33. /* arguments */
  34. x_len .req x0
  35. x_vec .req x1
  36. x_vec_i .req x2
  37. x_tbl .req x3
  38. x_src .req x4
  39. x_dest .req x5
  40. /* returns */
  41. w_ret .req w0
  42. /* local variables */
  43. x_src_end .req x6
  44. x_dest1 .req x7
  45. x_dest2 .req x8
  46. x_tmp .req x9
  47. x_tbl1 .req x10
  48. x_tbl2 .req x11
  49. x_const .req x12
  50. /* vectors */
  51. v_mask0f .req v0
  52. v_tmp_lo .req v1
  53. v_tmp_hi .req v2
  54. v_tmp .req v3
  55. q_tmp .req q3
  56. v_gft1_lo .req v4
  57. v_gft1_hi .req v5
  58. v_gft2_lo .req v6
  59. v_gft2_hi .req v7
  60. q_gft1_lo .req q4
  61. q_gft1_hi .req q5
  62. q_gft2_lo .req q6
  63. q_gft2_hi .req q7
  64. v_data_0 .req v8
  65. v_data_1 .req v9
  66. v_data_2 .req v10
  67. v_data_3 .req v11
  68. v_data_4 .req v12
  69. v_data_5 .req v13
  70. v_data_6 .req v14
  71. v_data_7 .req v15
  72. q_data_0 .req q8
  73. q_data_1 .req q9
  74. q_data_2 .req q10
  75. q_data_3 .req q11
  76. q_data_4 .req q12
  77. q_data_5 .req q13
  78. q_data_6 .req q14
  79. q_data_7 .req q15
  80. v_data_0_lo .req v16
  81. v_data_1_lo .req v17
  82. v_data_2_lo .req v18
  83. v_data_3_lo .req v19
  84. v_data_4_lo .req v20
  85. v_data_5_lo .req v21
  86. v_data_6_lo .req v22
  87. v_data_7_lo .req v23
  88. v_data_0_hi .req v_data_0
  89. v_data_1_hi .req v_data_1
  90. v_data_2_hi .req v_data_2
  91. v_data_3_hi .req v_data_3
  92. v_data_4_hi .req v_data_4
  93. v_data_5_hi .req v_data_5
  94. v_data_6_hi .req v_data_6
  95. v_data_7_hi .req v_data_7
  96. v_d0 .req v24
  97. v_d1 .req v25
  98. v_d2 .req v26
  99. v_d3 .req v27
  100. v_d4 .req v28
  101. v_d5 .req v29
  102. v_d6 .req v30
  103. v_d7 .req v31
  104. q_d0 .req q24
  105. q_d1 .req q25
  106. q_d2 .req q26
  107. q_d3 .req q27
  108. q_d4 .req q28
  109. q_d5 .req q29
  110. q_d6 .req q30
  111. q_d7 .req q31
  112. v_data .req v16
  113. q_data .req q16
  114. v_data_lo .req v17
  115. v_data_hi .req v18
  116. cdecl(gf_2vect_mad_neon):
  117. /* less than 16 bytes, return_fail */
  118. cmp x_len, #16
  119. blt .return_fail
  120. movi v_mask0f.16b, #0x0f
  121. lsl x_vec_i, x_vec_i, #5
  122. lsl x_vec, x_vec, #5
  123. add x_tbl1, x_tbl, x_vec_i
  124. add x_tbl2, x_tbl1, x_vec
  125. add x_src_end, x_src, x_len
  126. ldr x_dest1, [x_dest]
  127. ldr x_dest2, [x_dest, #8]
  128. ldr q_gft1_lo, [x_tbl1]
  129. ldr q_gft1_hi, [x_tbl1, #16]
  130. ldr q_gft2_lo, [x_tbl2]
  131. ldr q_gft2_hi, [x_tbl2, #16]
  132. .Lloop128_init:
  133. /* less than 128 bytes, goto Lloop16_init */
  134. cmp x_len, #128
  135. blt .Lloop16_init
  136. /* save d8 ~ d15 to stack */
  137. sub sp, sp, #64
  138. stp d8, d9, [sp]
  139. stp d10, d11, [sp, #16]
  140. stp d12, d13, [sp, #32]
  141. stp d14, d15, [sp, #48]
  142. sub x_src_end, x_src_end, #128
  143. .Lloop128:
  144. ldr q_data_0, [x_src, #16*0]
  145. ldr q_data_1, [x_src, #16*1]
  146. ldr q_data_2, [x_src, #16*2]
  147. ldr q_data_3, [x_src, #16*3]
  148. ldr q_data_4, [x_src, #16*4]
  149. ldr q_data_5, [x_src, #16*5]
  150. ldr q_data_6, [x_src, #16*6]
  151. ldr q_data_7, [x_src, #16*7]
  152. ldr q_d0, [x_dest1, #16*0]
  153. ldr q_d1, [x_dest1, #16*1]
  154. ldr q_d2, [x_dest1, #16*2]
  155. ldr q_d3, [x_dest1, #16*3]
  156. ldr q_d4, [x_dest1, #16*4]
  157. ldr q_d5, [x_dest1, #16*5]
  158. ldr q_d6, [x_dest1, #16*6]
  159. ldr q_d7, [x_dest1, #16*7]
  160. and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
  161. and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
  162. and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
  163. and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
  164. and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
  165. and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
  166. and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
  167. and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
  168. ushr v_data_0_hi.16b, v_data_0.16b, #4
  169. ushr v_data_1_hi.16b, v_data_1.16b, #4
  170. ushr v_data_2_hi.16b, v_data_2.16b, #4
  171. ushr v_data_3_hi.16b, v_data_3.16b, #4
  172. ushr v_data_4_hi.16b, v_data_4.16b, #4
  173. ushr v_data_5_hi.16b, v_data_5.16b, #4
  174. ushr v_data_6_hi.16b, v_data_6.16b, #4
  175. ushr v_data_7_hi.16b, v_data_7.16b, #4
  176. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
  177. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
  178. eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
  179. eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
  180. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
  181. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
  182. eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
  183. eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
  184. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
  185. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
  186. eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
  187. eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
  188. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
  189. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
  190. eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
  191. eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
  192. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
  193. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
  194. eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
  195. eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
  196. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
  197. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
  198. eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
  199. eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
  200. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
  201. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
  202. eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
  203. eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
  204. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
  205. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
  206. eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
  207. eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
  208. str q_d0, [x_dest1, #16*0]
  209. str q_d1, [x_dest1, #16*1]
  210. str q_d2, [x_dest1, #16*2]
  211. str q_d3, [x_dest1, #16*3]
  212. str q_d4, [x_dest1, #16*4]
  213. str q_d5, [x_dest1, #16*5]
  214. str q_d6, [x_dest1, #16*6]
  215. str q_d7, [x_dest1, #16*7]
  216. ldr q_d0, [x_dest2, #16*0]
  217. ldr q_d1, [x_dest2, #16*1]
  218. ldr q_d2, [x_dest2, #16*2]
  219. ldr q_d3, [x_dest2, #16*3]
  220. ldr q_d4, [x_dest2, #16*4]
  221. ldr q_d5, [x_dest2, #16*5]
  222. ldr q_d6, [x_dest2, #16*6]
  223. ldr q_d7, [x_dest2, #16*7]
  224. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
  225. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
  226. eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
  227. eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
  228. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
  229. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
  230. eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
  231. eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
  232. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
  233. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
  234. eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
  235. eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
  236. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
  237. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
  238. eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
  239. eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
  240. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
  241. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
  242. eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
  243. eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
  244. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
  245. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
  246. eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
  247. eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
  248. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
  249. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
  250. eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
  251. eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
  252. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
  253. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
  254. eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
  255. eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
  256. str q_d0, [x_dest2, #16*0]
  257. str q_d1, [x_dest2, #16*1]
  258. str q_d2, [x_dest2, #16*2]
  259. str q_d3, [x_dest2, #16*3]
  260. str q_d4, [x_dest2, #16*4]
  261. str q_d5, [x_dest2, #16*5]
  262. str q_d6, [x_dest2, #16*6]
  263. str q_d7, [x_dest2, #16*7]
  264. add x_src, x_src, #128
  265. add x_dest1, x_dest1, #128
  266. add x_dest2, x_dest2, #128
  267. cmp x_src, x_src_end
  268. bls .Lloop128
  269. .Lloop128_end:
  270. /* restore d8 ~ d15 */
  271. ldp d8, d9, [sp]
  272. ldp d10, d11, [sp, #16]
  273. ldp d12, d13, [sp, #32]
  274. ldp d14, d15, [sp, #48]
  275. add sp, sp, #64
  276. add x_src_end, x_src_end, #128
  277. .Lloop16_init:
  278. sub x_src_end, x_src_end, #16
  279. cmp x_src, x_src_end
  280. bhi .lessthan16_init
  281. .Lloop16:
  282. ldr q_data, [x_src]
  283. ldr q_d0, [x_dest1]
  284. ldr q_d1, [x_dest2]
  285. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  286. ushr v_data_hi.16b, v_data.16b, #4
  287. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  288. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  289. eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
  290. eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
  291. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  292. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  293. eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
  294. eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
  295. str q_d0, [x_dest1]
  296. str q_d1, [x_dest2]
  297. add x_dest1, x_dest1, #16
  298. add x_dest2, x_dest2, #16
  299. add x_src, x_src, #16
  300. cmp x_src, x_src_end
  301. bls .Lloop16
  302. .lessthan16_init:
  303. sub x_tmp, x_src, x_src_end
  304. cmp x_tmp, #16
  305. beq .return_pass
  306. .lessthan16:
  307. mov x_src, x_src_end
  308. sub x_dest1, x_dest1, x_tmp
  309. sub x_dest2, x_dest2, x_tmp
  310. #ifndef __APPLE__
  311. adrp x_const, const_tbl
  312. add x_const, x_const, :lo12:const_tbl
  313. #else
  314. adrp x_const, const_tbl@PAGE
  315. add x_const, x_const, const_tbl@PAGEOFF
  316. #endif
  317. sub x_const, x_const, x_tmp
  318. ldr q_tmp, [x_const, #16]
  319. ldr q_data, [x_src]
  320. ldr q_d0, [x_dest1]
  321. ldr q_d1, [x_dest2]
  322. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  323. ushr v_data_hi.16b, v_data.16b, #4
  324. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  325. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  326. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  327. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  328. eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
  329. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  330. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  331. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  332. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  333. eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
  334. str q_d0, [x_dest1]
  335. str q_d1, [x_dest2]
  336. .return_pass:
  337. mov w_ret, #0
  338. ret
  339. .return_fail:
  340. mov w_ret, #1
  341. ret
  342. ASM_DEF_RODATA
  343. .balign 8
  344. const_tbl:
  345. .dword 0x0000000000000000, 0x0000000000000000
  346. .dword 0xffffffffffffffff, 0xffffffffffffffff