/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
  27. #include "../include/aarch64_label.h"
  28. .text
  29. .global cdecl(gf_4vect_mad_neon)
  30. #ifndef __APPLE__
  31. .type gf_4vect_mad_neon, %function
  32. #endif
  33. /* arguments */
  34. x_len .req x0
  35. x_vec .req x1
  36. x_vec_i .req x2
  37. x_tbl .req x3
  38. x_src .req x4
  39. x_dest .req x5
  40. /* returns */
  41. w_ret .req w0
  42. /* local variables */
  43. x_src_end .req x6
  44. x_dest1 .req x7
  45. x_dest2 .req x8
  46. x_dest3 .req x9
  47. x_dest4 .req x_dest
  48. x_tmp .req x10
  49. x_tbl1 .req x11
  50. x_tbl2 .req x12
  51. x_tbl3 .req x13
  52. x_tbl4 .req x14
  53. x_const .req x15
  54. /* vectors */
  55. v_mask0f .req v0
  56. v_tmp_lo .req v1
  57. v_tmp_hi .req v2
  58. v_tmp .req v3
  59. q_tmp .req q3
  60. v_gft1_lo .req v4
  61. v_gft1_hi .req v5
  62. v_gft2_lo .req v6
  63. v_gft2_hi .req v7
  64. v_gft3_lo .req v16
  65. v_gft3_hi .req v17
  66. v_gft4_lo .req v18
  67. v_gft4_hi .req v19
  68. q_gft1_lo .req q4
  69. q_gft1_hi .req q5
  70. q_gft2_lo .req q6
  71. q_gft2_hi .req q7
  72. q_gft3_lo .req q16
  73. q_gft3_hi .req q17
  74. q_gft4_lo .req q18
  75. q_gft4_hi .req q19
  76. v_data_0 .req v8
  77. v_data_1 .req v9
  78. v_data_2 .req v10
  79. v_data_3 .req v11
  80. q_data_0 .req q8
  81. q_data_1 .req q9
  82. q_data_2 .req q10
  83. q_data_3 .req q11
  84. v_data_0_lo .req v12
  85. v_data_1_lo .req v13
  86. v_data_2_lo .req v14
  87. v_data_3_lo .req v15
  88. v_data_0_hi .req v_data_0
  89. v_data_1_hi .req v_data_1
  90. v_data_2_hi .req v_data_2
  91. v_data_3_hi .req v_data_3
  92. v_d1_0 .req v20
  93. v_d1_1 .req v21
  94. v_d1_2 .req v22
  95. v_d1_3 .req v23
  96. v_d2_0 .req v24
  97. v_d2_1 .req v25
  98. v_d2_2 .req v26
  99. v_d2_3 .req v27
  100. v_d3_0 .req v28
  101. v_d3_1 .req v29
  102. v_d3_2 .req v30
  103. v_d3_3 .req v31
  104. q_d1_0 .req q20
  105. q_d1_1 .req q21
  106. q_d1_2 .req q22
  107. q_d1_3 .req q23
  108. q_d2_0 .req q24
  109. q_d2_1 .req q25
  110. q_d2_2 .req q26
  111. q_d2_3 .req q27
  112. q_d3_0 .req q28
  113. q_d3_1 .req q29
  114. q_d3_2 .req q30
  115. q_d3_3 .req q31
  116. v_d4_0 .req v_d1_0
  117. v_d4_1 .req v_d1_1
  118. v_d4_2 .req v_d1_2
  119. v_d4_3 .req v_d1_3
  120. q_d4_0 .req q_d1_0
  121. q_d4_1 .req q_d1_1
  122. q_d4_2 .req q_d1_2
  123. q_d4_3 .req q_d1_3
  124. v_data .req v21
  125. q_data .req q21
  126. v_data_lo .req v22
  127. v_data_hi .req v23
  128. cdecl(gf_4vect_mad_neon):
  129. /* less than 16 bytes, return_fail */
  130. cmp x_len, #16
  131. blt .return_fail
  132. movi v_mask0f.16b, #0x0f
  133. lsl x_vec_i, x_vec_i, #5
  134. lsl x_vec, x_vec, #5
  135. add x_tbl1, x_tbl, x_vec_i
  136. add x_tbl2, x_tbl1, x_vec
  137. add x_tbl3, x_tbl2, x_vec
  138. add x_tbl4, x_tbl3, x_vec
  139. add x_src_end, x_src, x_len
  140. ldr x_dest1, [x_dest, #8*0]
  141. ldr x_dest2, [x_dest, #8*1]
  142. ldr x_dest3, [x_dest, #8*2]
  143. ldr x_dest4, [x_dest, #8*3]
  144. ldr q_gft1_lo, [x_tbl1]
  145. ldr q_gft1_hi, [x_tbl1, #16]
  146. ldr q_gft2_lo, [x_tbl2]
  147. ldr q_gft2_hi, [x_tbl2, #16]
  148. ldr q_gft3_lo, [x_tbl3]
  149. ldr q_gft3_hi, [x_tbl3, #16]
  150. ldr q_gft4_lo, [x_tbl4]
  151. ldr q_gft4_hi, [x_tbl4, #16]
  152. .Lloop64_init:
  153. /* less than 64 bytes, goto Lloop16_init */
  154. cmp x_len, #64
  155. blt .Lloop16_init
  156. /* save d8 ~ d15 to stack */
  157. sub sp, sp, #64
  158. stp d8, d9, [sp]
  159. stp d10, d11, [sp, #16]
  160. stp d12, d13, [sp, #32]
  161. stp d14, d15, [sp, #48]
  162. sub x_src_end, x_src_end, #64
  163. .Lloop64:
  164. ldr q_data_0, [x_src, #16*0]
  165. ldr q_data_1, [x_src, #16*1]
  166. ldr q_data_2, [x_src, #16*2]
  167. ldr q_data_3, [x_src, #16*3]
  168. add x_src, x_src, #64
  169. ldr q_d1_0, [x_dest1, #16*0]
  170. ldr q_d1_1, [x_dest1, #16*1]
  171. ldr q_d1_2, [x_dest1, #16*2]
  172. ldr q_d1_3, [x_dest1, #16*3]
  173. ldr q_d2_0, [x_dest2, #16*0]
  174. ldr q_d2_1, [x_dest2, #16*1]
  175. ldr q_d2_2, [x_dest2, #16*2]
  176. ldr q_d2_3, [x_dest2, #16*3]
  177. and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
  178. and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
  179. and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
  180. and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
  181. ushr v_data_0_hi.16b, v_data_0.16b, #4
  182. ushr v_data_1_hi.16b, v_data_1.16b, #4
  183. ushr v_data_2_hi.16b, v_data_2.16b, #4
  184. ushr v_data_3_hi.16b, v_data_3.16b, #4
  185. /* dest1 */
  186. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
  187. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
  188. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  189. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  190. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
  191. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
  192. eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
  193. eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
  194. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
  195. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
  196. eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
  197. eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
  198. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
  199. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
  200. eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
  201. eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
  202. /* dest2 */
  203. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
  204. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
  205. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  206. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  207. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
  208. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
  209. eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
  210. eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
  211. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
  212. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
  213. eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
  214. eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
  215. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
  216. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
  217. eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
  218. eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
  219. str q_d1_0, [x_dest1, #16*0]
  220. str q_d1_1, [x_dest1, #16*1]
  221. str q_d1_2, [x_dest1, #16*2]
  222. str q_d1_3, [x_dest1, #16*3]
  223. add x_dest1, x_dest1, #64
  224. str q_d2_0, [x_dest2, #16*0]
  225. str q_d2_1, [x_dest2, #16*1]
  226. str q_d2_2, [x_dest2, #16*2]
  227. str q_d2_3, [x_dest2, #16*3]
  228. add x_dest2, x_dest2, #64
  229. ldr q_d3_0, [x_dest3, #16*0]
  230. ldr q_d3_1, [x_dest3, #16*1]
  231. ldr q_d3_2, [x_dest3, #16*2]
  232. ldr q_d3_3, [x_dest3, #16*3]
  233. ldr q_d4_0, [x_dest4, #16*0]
  234. ldr q_d4_1, [x_dest4, #16*1]
  235. ldr q_d4_2, [x_dest4, #16*2]
  236. ldr q_d4_3, [x_dest4, #16*3]
  237. /* dest3 */
  238. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
  239. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
  240. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  241. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  242. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
  243. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
  244. eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
  245. eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
  246. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
  247. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
  248. eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
  249. eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
  250. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
  251. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
  252. eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
  253. eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
  254. /* dest4 */
  255. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
  256. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
  257. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  258. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  259. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
  260. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
  261. eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
  262. eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
  263. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
  264. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
  265. eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
  266. eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
  267. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
  268. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
  269. eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
  270. eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
  271. str q_d3_0, [x_dest3, #16*0]
  272. str q_d3_1, [x_dest3, #16*1]
  273. str q_d3_2, [x_dest3, #16*2]
  274. str q_d3_3, [x_dest3, #16*3]
  275. add x_dest3, x_dest3, #64
  276. str q_d4_0, [x_dest4, #16*0]
  277. str q_d4_1, [x_dest4, #16*1]
  278. str q_d4_2, [x_dest4, #16*2]
  279. str q_d4_3, [x_dest4, #16*3]
  280. add x_dest4, x_dest4, #64
  281. cmp x_src, x_src_end
  282. bls .Lloop64
  283. .Lloop64_end:
  284. /* restore d8 ~ d15 */
  285. ldp d8, d9, [sp]
  286. ldp d10, d11, [sp, #16]
  287. ldp d12, d13, [sp, #32]
  288. ldp d14, d15, [sp, #48]
  289. add sp, sp, #64
  290. add x_src_end, x_src_end, #64
  291. .Lloop16_init:
  292. sub x_src_end, x_src_end, #16
  293. cmp x_src, x_src_end
  294. bhi .lessthan16_init
  295. .Lloop16:
  296. ldr q_data, [x_src]
  297. ldr q_d1_0, [x_dest1]
  298. ldr q_d2_0, [x_dest2]
  299. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  300. ushr v_data_hi.16b, v_data.16b, #4
  301. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  302. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  303. eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
  304. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  305. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  306. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  307. eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
  308. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  309. str q_d1_0, [x_dest1]
  310. str q_d2_0, [x_dest2]
  311. ldr q_d3_0, [x_dest3]
  312. ldr q_d4_0, [x_dest4]
  313. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
  314. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  315. eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
  316. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  317. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
  318. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  319. eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
  320. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  321. str q_d3_0, [x_dest3]
  322. str q_d4_0, [x_dest4]
  323. add x_src, x_src, #16
  324. add x_dest1, x_dest1, #16
  325. add x_dest2, x_dest2, #16
  326. add x_dest3, x_dest3, #16
  327. add x_dest4, x_dest4, #16
  328. cmp x_src, x_src_end
  329. bls .Lloop16
  330. .lessthan16_init:
  331. sub x_tmp, x_src, x_src_end
  332. cmp x_tmp, #16
  333. beq .return_pass
  334. .lessthan16:
  335. mov x_src, x_src_end
  336. sub x_dest1, x_dest1, x_tmp
  337. sub x_dest2, x_dest2, x_tmp
  338. sub x_dest3, x_dest3, x_tmp
  339. sub x_dest4, x_dest4, x_tmp
  340. #ifndef __APPLE__
  341. adrp x_const, const_tbl
  342. add x_const, x_const, :lo12:const_tbl
  343. #else
  344. adrp x_const, const_tbl@PAGE
  345. add x_const, x_const, const_tbl@PAGEOFF
  346. #endif
  347. sub x_const, x_const, x_tmp
  348. ldr q_tmp, [x_const, #16]
  349. ldr q_data, [x_src]
  350. ldr q_d1_0, [x_dest1]
  351. ldr q_d2_0, [x_dest2]
  352. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  353. ushr v_data_hi.16b, v_data.16b, #4
  354. tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  355. tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  356. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  357. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  358. eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
  359. tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
  360. tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
  361. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  362. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  363. eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
  364. str q_d1_0, [x_dest1]
  365. str q_d2_0, [x_dest2]
  366. ldr q_d3_0, [x_dest3]
  367. ldr q_d4_0, [x_dest4]
  368. tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
  369. tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
  370. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  371. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  372. eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
  373. tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
  374. tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
  375. eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
  376. and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
  377. eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
  378. str q_d3_0, [x_dest3]
  379. str q_d4_0, [x_dest4]
  380. .return_pass:
  381. mov w_ret, #0
  382. ret
  383. .return_fail:
  384. mov w_ret, #1
  385. ret
  386. ASM_DEF_RODATA
  387. .balign 8
  388. const_tbl:
  389. .dword 0x0000000000000000, 0x0000000000000000
  390. .dword 0xffffffffffffffff, 0xffffffffffffffff