// gf_vect_mad_neon.S
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
  27. #include "../include/aarch64_label.h"
  28. .text
  29. .global cdecl(gf_vect_mad_neon)
  30. #ifndef __APPLE__
  31. .type gf_vect_mad_neon, %function
  32. #endif
  33. /* arguments */
  34. x_len .req x0
  35. x_vec .req x1
  36. x_vec_i .req x2
  37. x_tbl .req x3
  38. x_src .req x4
  39. x_dest .req x5
  40. /* returns */
  41. w_ret .req w0
  42. /* local variables */
  43. x_src_end .req x6
  44. x_dest1 .req x_dest
  45. x_tmp .req x7
  46. x_const .req x8
  47. /* vectors */
  48. v_mask0f .req v0
  49. v_tmp .req v1
  50. q_tmp .req q1
  51. v_tmp1_lo .req v2
  52. v_tmp1_hi .req v3
  53. v_tmp2_lo .req v4
  54. v_tmp2_hi .req v5
  55. v_gft1_lo .req v6
  56. v_gft1_hi .req v7
  57. q_gft1_lo .req q6
  58. q_gft1_hi .req q7
  59. v_data_0 .req v8
  60. v_data_1 .req v9
  61. v_data_2 .req v10
  62. v_data_3 .req v11
  63. v_data_4 .req v12
  64. v_data_5 .req v13
  65. v_data_6 .req v14
  66. v_data_7 .req v15
  67. q_data_0 .req q8
  68. q_data_1 .req q9
  69. q_data_2 .req q10
  70. q_data_3 .req q11
  71. q_data_4 .req q12
  72. q_data_5 .req q13
  73. q_data_6 .req q14
  74. q_data_7 .req q15
  75. v_data_0_lo .req v16
  76. v_data_1_lo .req v17
  77. v_data_2_lo .req v18
  78. v_data_3_lo .req v19
  79. v_data_4_lo .req v20
  80. v_data_5_lo .req v21
  81. v_data_6_lo .req v22
  82. v_data_7_lo .req v23
  83. v_data_0_hi .req v_data_0
  84. v_data_1_hi .req v_data_1
  85. v_data_2_hi .req v_data_2
  86. v_data_3_hi .req v_data_3
  87. v_data_4_hi .req v_data_4
  88. v_data_5_hi .req v_data_5
  89. v_data_6_hi .req v_data_6
  90. v_data_7_hi .req v_data_7
  91. v_d1_0 .req v24
  92. v_d1_1 .req v25
  93. v_d1_2 .req v26
  94. v_d1_3 .req v27
  95. v_d1_4 .req v28
  96. v_d1_5 .req v29
  97. v_d1_6 .req v30
  98. v_d1_7 .req v31
  99. q_d1_0 .req q24
  100. q_d1_1 .req q25
  101. q_d1_2 .req q26
  102. q_d1_3 .req q27
  103. q_d1_4 .req q28
  104. q_d1_5 .req q29
  105. q_d1_6 .req q30
  106. q_d1_7 .req q31
  107. v_data .req v_d1_1
  108. q_data .req q_d1_1
  109. v_data_lo .req v_d1_2
  110. v_data_hi .req v_d1_3
  111. cdecl(gf_vect_mad_neon):
  112. /* less than 16 bytes, return_fail */
  113. cmp x_len, #16
  114. blt .return_fail
  115. movi v_mask0f.16b, #0x0f
  116. lsl x_vec_i, x_vec_i, #5
  117. add x_tbl, x_tbl, x_vec_i
  118. add x_src_end, x_src, x_len
  119. ldr q_gft1_lo, [x_tbl]
  120. ldr q_gft1_hi, [x_tbl, #16]
  121. .Lloop128_init:
  122. /* less than 128 bytes, goto Lloop16_init */
  123. cmp x_len, #128
  124. blt .Lloop16_init
  125. /* save d8 ~ d15 to stack */
  126. sub sp, sp, #64
  127. stp d8, d9, [sp]
  128. stp d10, d11, [sp, #16]
  129. stp d12, d13, [sp, #32]
  130. stp d14, d15, [sp, #48]
  131. sub x_src_end, x_src_end, #128
  132. .Lloop128:
  133. ldr q_data_0, [x_src, #16*0]
  134. ldr q_data_1, [x_src, #16*1]
  135. ldr q_data_2, [x_src, #16*2]
  136. ldr q_data_3, [x_src, #16*3]
  137. ldr q_data_4, [x_src, #16*4]
  138. ldr q_data_5, [x_src, #16*5]
  139. ldr q_data_6, [x_src, #16*6]
  140. ldr q_data_7, [x_src, #16*7]
  141. ldr q_d1_0, [x_dest1, #16*0]
  142. ldr q_d1_1, [x_dest1, #16*1]
  143. ldr q_d1_2, [x_dest1, #16*2]
  144. ldr q_d1_3, [x_dest1, #16*3]
  145. ldr q_d1_4, [x_dest1, #16*4]
  146. ldr q_d1_5, [x_dest1, #16*5]
  147. ldr q_d1_6, [x_dest1, #16*6]
  148. ldr q_d1_7, [x_dest1, #16*7]
  149. and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
  150. and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
  151. and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
  152. and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
  153. and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
  154. and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
  155. and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
  156. and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
  157. ushr v_data_0_hi.16b, v_data_0.16b, #4
  158. ushr v_data_1_hi.16b, v_data_1.16b, #4
  159. ushr v_data_2_hi.16b, v_data_2.16b, #4
  160. ushr v_data_3_hi.16b, v_data_3.16b, #4
  161. ushr v_data_4_hi.16b, v_data_4.16b, #4
  162. ushr v_data_5_hi.16b, v_data_5.16b, #4
  163. ushr v_data_6_hi.16b, v_data_6.16b, #4
  164. ushr v_data_7_hi.16b, v_data_7.16b, #4
  165. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
  166. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
  167. tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
  168. tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
  169. eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
  170. eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
  171. eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
  172. eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b
  173. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
  174. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
  175. tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
  176. tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
  177. eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
  178. eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
  179. eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
  180. eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b
  181. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
  182. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
  183. tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
  184. tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
  185. eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
  186. eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
  187. eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
  188. eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b
  189. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
  190. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
  191. tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
  192. tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
  193. eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
  194. eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
  195. eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
  196. eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b
  197. str q_d1_0, [x_dest1, #16*0]
  198. str q_d1_1, [x_dest1, #16*1]
  199. str q_d1_2, [x_dest1, #16*2]
  200. str q_d1_3, [x_dest1, #16*3]
  201. str q_d1_4, [x_dest1, #16*4]
  202. str q_d1_5, [x_dest1, #16*5]
  203. str q_d1_6, [x_dest1, #16*6]
  204. str q_d1_7, [x_dest1, #16*7]
  205. add x_src, x_src, #128
  206. add x_dest1, x_dest1, #128
  207. cmp x_src, x_src_end
  208. bls .Lloop128
  209. .Lloop128_end:
  210. /* restore d8 ~ d15 */
  211. ldp d8, d9, [sp]
  212. ldp d10, d11, [sp, #16]
  213. ldp d12, d13, [sp, #32]
  214. ldp d14, d15, [sp, #48]
  215. add sp, sp, #64
  216. add x_src_end, x_src_end, #128
  217. .Lloop16_init:
  218. sub x_src_end, x_src_end, #16
  219. cmp x_src, x_src_end
  220. bhi .lessthan16_init
  221. .Lloop16:
  222. ldr q_data, [x_src]
  223. ldr q_d1_0, [x_dest1]
  224. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  225. ushr v_data_hi.16b, v_data.16b, #4
  226. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  227. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  228. eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
  229. eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
  230. str q_d1_0, [x_dest1]
  231. add x_dest1, x_dest1, #16
  232. add x_src, x_src, #16
  233. cmp x_src, x_src_end
  234. bls .Lloop16
  235. .lessthan16_init:
  236. sub x_tmp, x_src, x_src_end
  237. cmp x_tmp, #16
  238. beq .return_pass
  239. .lessthan16:
  240. mov x_src, x_src_end
  241. sub x_dest1, x_dest1, x_tmp
  242. #ifndef __APPLE__
  243. adrp x_const, const_tbl
  244. add x_const, x_const, :lo12:const_tbl
  245. #else
  246. adrp x_const, const_tbl@PAGE
  247. add x_const, x_const, const_tbl@PAGEOFF
  248. #endif
  249. sub x_const, x_const, x_tmp
  250. ldr q_tmp, [x_const, #16]
  251. ldr q_data, [x_src]
  252. ldr q_d1_0, [x_dest1]
  253. and v_data_lo.16b, v_data.16b, v_mask0f.16b
  254. ushr v_data_hi.16b, v_data.16b, #4
  255. tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
  256. tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
  257. eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
  258. and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b
  259. eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
  260. str q_d1_0, [x_dest1]
  261. .return_pass:
  262. mov w_ret, #0
  263. ret
  264. .return_fail:
  265. mov w_ret, #1
  266. ret
  267. ASM_DEF_RODATA
  268. .balign 8
  269. const_tbl:
  270. .dword 0x0000000000000000, 0x0000000000000000
  271. .dword 0xffffffffffffffff, 0xffffffffffffffff