/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include "../include/aarch64_label.h"

.text
.global cdecl(gf_3vect_mad_neon)
#ifndef __APPLE__
.type gf_3vect_mad_neon, %function
#endif
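
/* Reference C prototype (declared in ISA-L's include/erasure_code.h):
 *
 *   void gf_3vect_mad_neon(int len, int vec, int vec_i,
 *                          unsigned char *gftbls, unsigned char *src,
 *                          unsigned char **dest);
 *
 * Multiplies one source buffer by three GF(2^8) constants and XORs the
 * products into three destination buffers (multiply-and-add).
 */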

/* arguments */
x_len           .req    x0
x_vec           .req    x1
x_vec_i         .req    x2
x_tbl           .req    x3
x_src           .req    x4
x_dest          .req    x5

/* returns */
w_ret           .req    w0

/* local variables */
x_src_end       .req    x6
x_dest1         .req    x7
x_dest2         .req    x8
x_dest3         .req    x_dest
x_tmp           .req    x10
x_tbl1          .req    x11
x_tbl2          .req    x12
x_tbl3          .req    x13
x_const         .req    x14

/* vectors */
v_mask0f        .req    v0
v_tmp_lo        .req    v1
v_tmp_hi        .req    v2
v_tmp           .req    v3
q_tmp           .req    q3

v_gft1_lo       .req    v4
v_gft1_hi       .req    v5
v_gft2_lo       .req    v6
v_gft2_hi       .req    v7
v_gft3_lo       .req    v16
v_gft3_hi       .req    v17
q_gft1_lo       .req    q4
q_gft1_hi       .req    q5
q_gft2_lo       .req    q6
q_gft2_hi       .req    q7
q_gft3_lo       .req    q16
q_gft3_hi       .req    q17

v_data_0        .req    v8
v_data_1        .req    v9
v_data_2        .req    v10
v_data_3        .req    v11
q_data_0        .req    q8
q_data_1        .req    q9
q_data_2        .req    q10
q_data_3        .req    q11

v_data_0_lo     .req    v12
v_data_1_lo     .req    v13
v_data_2_lo     .req    v14
v_data_3_lo     .req    v15
v_data_0_hi     .req    v_data_0
v_data_1_hi     .req    v_data_1
v_data_2_hi     .req    v_data_2
v_data_3_hi     .req    v_data_3

v_d1_0          .req    v20
v_d1_1          .req    v21
v_d1_2          .req    v22
v_d1_3          .req    v23
v_d2_0          .req    v24
v_d2_1          .req    v25
v_d2_2          .req    v26
v_d2_3          .req    v27
v_d3_0          .req    v28
v_d3_1          .req    v29
v_d3_2          .req    v30
v_d3_3          .req    v31
q_d1_0          .req    q20
q_d1_1          .req    q21
q_d1_2          .req    q22
q_d1_3          .req    q23
q_d2_0          .req    q24
q_d2_1          .req    q25
q_d2_2          .req    q26
q_d2_3          .req    q27
q_d3_0          .req    q28
q_d3_1          .req    q29
q_d3_2          .req    q30
q_d3_3          .req    q31

v_data          .req    v21
q_data          .req    q21
v_data_lo       .req    v22
v_data_hi       .req    v23
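
/*
 * GF(2^8) multiply via nibble table lookups: each 32-byte gftbls entry
 * holds two 16-byte tables for one constant c -- the products of c with
 * every low nibble, then with every high nibble. A byte product is
 * tbl(lo, data & 0x0f) ^ tbl(hi, data >> 4), and the result is XORed
 * into the destination.
 */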

cdecl(gf_3vect_mad_neon):
        /* less than 16 bytes, return_fail */
        cmp     x_len, #16
        blt     .return_fail

        movi    v_mask0f.16b, #0x0f
        lsl     x_vec_i, x_vec_i, #5
        lsl     x_vec, x_vec, #5
        add     x_tbl1, x_tbl, x_vec_i
        add     x_tbl2, x_tbl1, x_vec
        add     x_tbl3, x_tbl2, x_vec
        add     x_src_end, x_src, x_len

        ldr     x_dest1, [x_dest]
        ldr     x_dest2, [x_dest, #8]
        ldr     x_dest3, [x_dest, #16]
        ldr     q_gft1_lo, [x_tbl1]
        ldr     q_gft1_hi, [x_tbl1, #16]
        ldr     q_gft2_lo, [x_tbl2]
        ldr     q_gft2_hi, [x_tbl2, #16]
        ldr     q_gft3_lo, [x_tbl3]
        ldr     q_gft3_hi, [x_tbl3, #16]
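
        /*
         * The buffer is processed in three stages: a 64-byte unrolled
         * main loop, a 16-byte loop for the remaining full vectors,
         * and a masked final vector for a tail shorter than 16 bytes.
         */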

.Lloop64_init:
        /* less than 64 bytes, goto Lloop16_init */
        cmp     x_len, #64
        blt     .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub     sp, sp, #64
        stp     d8, d9, [sp]
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]

        sub     x_src_end, x_src_end, #64
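
        /*
         * x_src_end now marks the last address from which a full
         * 64-byte block can be read; the loop below runs while
         * x_src <= x_src_end.
         */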

.Lloop64:
        ldr     q_data_0, [x_src, #16*0]
        ldr     q_data_1, [x_src, #16*1]
        ldr     q_data_2, [x_src, #16*2]
        ldr     q_data_3, [x_src, #16*3]
        add     x_src, x_src, #64

        ldr     q_d1_0, [x_dest1, #16*0]
        ldr     q_d1_1, [x_dest1, #16*1]
        ldr     q_d1_2, [x_dest1, #16*2]
        ldr     q_d1_3, [x_dest1, #16*3]
        ldr     q_d2_0, [x_dest2, #16*0]
        ldr     q_d2_1, [x_dest2, #16*1]
        ldr     q_d2_2, [x_dest2, #16*2]
        ldr     q_d2_3, [x_dest2, #16*3]
        ldr     q_d3_0, [x_dest3, #16*0]
        ldr     q_d3_1, [x_dest3, #16*1]
        ldr     q_d3_2, [x_dest3, #16*2]
        ldr     q_d3_3, [x_dest3, #16*3]

        and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and     v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and     v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

        ushr    v_data_0_hi.16b, v_data_0.16b, #4
        ushr    v_data_1_hi.16b, v_data_1.16b, #4
        ushr    v_data_2_hi.16b, v_data_2.16b, #4
        ushr    v_data_3_hi.16b, v_data_3.16b, #4
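
        /* dest1[i] ^= gft1_lo[data_lo] ^ gft1_hi[data_hi] */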
        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        eor     v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        eor     v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
        eor     v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        eor     v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
        eor     v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        eor     v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
        eor     v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
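
        /* dest2[i] ^= gft2_lo[data_lo] ^ gft2_hi[data_hi] */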
        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
        eor     v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
        eor     v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
        eor     v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
        eor     v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
        eor     v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
        eor     v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
        eor     v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
        eor     v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
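
        /* dest3[i] ^= gft3_lo[data_lo] ^ gft3_hi[data_hi] */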
        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
        eor     v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
        eor     v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
        eor     v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
        eor     v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
        eor     v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
        eor     v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
        eor     v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
        eor     v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

        str     q_d1_0, [x_dest1, #16*0]
        str     q_d1_1, [x_dest1, #16*1]
        str     q_d1_2, [x_dest1, #16*2]
        str     q_d1_3, [x_dest1, #16*3]
        add     x_dest1, x_dest1, #64

        str     q_d2_0, [x_dest2, #16*0]
        str     q_d2_1, [x_dest2, #16*1]
        str     q_d2_2, [x_dest2, #16*2]
        str     q_d2_3, [x_dest2, #16*3]
        add     x_dest2, x_dest2, #64

        str     q_d3_0, [x_dest3, #16*0]
        str     q_d3_1, [x_dest3, #16*1]
        str     q_d3_2, [x_dest3, #16*2]
        str     q_d3_3, [x_dest3, #16*3]
        add     x_dest3, x_dest3, #64

        cmp     x_src, x_src_end
        bls     .Lloop64

.Lloop64_end:
        /* restore d8 ~ d15 */
        ldp     d8, d9, [sp]
        ldp     d10, d11, [sp, #16]
        ldp     d12, d13, [sp, #32]
        ldp     d14, d15, [sp, #48]
        add     sp, sp, #64

        add     x_src_end, x_src_end, #64
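        /* undo the 64-byte bias so x_src_end points at src + len again */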

.Lloop16_init:
        sub     x_src_end, x_src_end, #16
        cmp     x_src, x_src_end
        bhi     .lessthan16_init
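
        /* process the remaining full 16-byte blocks one vector at a time */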
.Lloop16:
        ldr     q_data, [x_src]

        ldr     q_d1_0, [x_dest1]
        ldr     q_d2_0, [x_dest2]
        ldr     q_d3_0, [x_dest3]

        and     v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr    v_data_hi.16b, v_data.16b, #4

        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor     v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor     v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
        eor     v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor     v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
        eor     v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str     q_d1_0, [x_dest1]
        str     q_d2_0, [x_dest2]
        str     q_d3_0, [x_dest3]

        add     x_src, x_src, #16
        add     x_dest1, x_dest1, #16
        add     x_dest2, x_dest2, #16
        add     x_dest3, x_dest3, #16

        cmp     x_src, x_src_end
        bls     .Lloop16

.lessthan16_init:
        sub     x_tmp, x_src, x_src_end
        cmp     x_tmp, #16
        beq     .return_pass
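
        /*
         * Tail of 1..15 bytes: reload the last 16 bytes of src (which
         * overlap already-processed data), back the dest pointers up by
         * the overlap x_tmp, and mask the products so that only the
         * tail bytes of each destination are updated.
         */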
.lessthan16:
        mov     x_src, x_src_end
        sub     x_dest1, x_dest1, x_tmp
        sub     x_dest2, x_dest2, x_tmp
        sub     x_dest3, x_dest3, x_tmp

#ifndef __APPLE__
        adrp    x_const, const_tbl
        add     x_const, x_const, :lo12:const_tbl
#else
        adrp    x_const, const_tbl@PAGE
        add     x_const, x_const, const_tbl@PAGEOFF
#endif
        sub     x_const, x_const, x_tmp
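
        /* q_tmp: 0x00 over the x_tmp overlap bytes, 0xff over the tail */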
        ldr     q_tmp, [x_const, #16]

        ldr     q_data, [x_src]
        ldr     q_d1_0, [x_dest1]
        ldr     q_d2_0, [x_dest2]
        ldr     q_d3_0, [x_dest3]

        and     v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr    v_data_hi.16b, v_data.16b, #4

        tbl     v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor     v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and     v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor     v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor     v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and     v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor     v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl     v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl     v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor     v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and     v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor     v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str     q_d1_0, [x_dest1]
        str     q_d2_0, [x_dest2]
        str     q_d3_0, [x_dest3]

.return_pass:
        mov     w_ret, #0
        ret

.return_fail:
        mov     w_ret, #1
        ret

ASM_DEF_RODATA
.balign 8
const_tbl:
        .dword 0x0000000000000000, 0x0000000000000000
        .dword 0xffffffffffffffff, 0xffffffffffffffff
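        /* 16 zero bytes then 16 0xff bytes; the 16-byte window loaded at
         * offset (16 - x_tmp) yields the tail mask used above. */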