gf_vect_mul_neon.S 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /**************************************************************
  2. Copyright (c) 2019 Huawei Technologies Co., Ltd.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions
  5. are met:
  6. * Redistributions of source code must retain the above copyright
  7. notice, this list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright
  9. notice, this list of conditions and the following disclaimer in
  10. the documentation and/or other materials provided with the
  11. distribution.
  12. * Neither the name of Huawei Corporation nor the names of its
  13. contributors may be used to endorse or promote products derived
  14. from this software without specific prior written permission.
  15. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19. OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************/
  27. #include "../include/aarch64_label.h"
  /*
   * Section and symbol set-up for the routine below.
   * cdecl() is presumably supplied by the aarch64_label.h include to apply
   * the platform's symbol decoration; the ELF-style .type directive is
   * skipped when building for Apple targets.
   */
          .text
          .global cdecl(gf_vect_mul_neon)
  #ifndef __APPLE__
          .type   gf_vect_mul_neon, %function
  #endif

  /* ------------------------------------------------------------------
   * General-purpose register names
   * ------------------------------------------------------------------ */

  /* incoming arguments, in argument-register order */
  x_len           .req    x0
  x_tbl           .req    x1
  x_src           .req    x2
  x_dest          .req    x3

  /* result; this is the 32-bit view of the register that carried x_len */
  w_ret           .req    w0

  /* working registers; x_dest1 is simply a second name for x_dest */
  x_dest1         .req    x_dest
  x_src_end       .req    x4
  x_tmp           .req    x5

  /* ------------------------------------------------------------------
   * Vector register names
   * v_* names take an arrangement suffix (e.g. .16b); the matching q_*
   * name is the same register as a 128-bit load/store operand.
   * ------------------------------------------------------------------ */

  /* nibble mask (the routine fills it with 0x0f in every byte) */
  v_mask0f        .req    v0

  /* the two 16-byte lookup tables loaded from x_tbl */
  v_gft1_lo       .req    v2
  q_gft1_lo       .req    q2
  v_gft1_hi       .req    v3
  q_gft1_hi       .req    q3

  /*
   * Eight data lanes.  For lane N:
   *   v_data_N / q_data_N  input block, later the result block
   *   v_data_N_lo          separate register for the low-nibble half
   *   v_data_N_hi          same register as v_data_N (high nibbles are
   *                        computed in place)
   */
  v_data_0        .req    v16
  q_data_0        .req    q16
  v_data_0_lo     .req    v24
  v_data_0_hi     .req    v_data_0

  v_data_1        .req    v17
  q_data_1        .req    q17
  v_data_1_lo     .req    v25
  v_data_1_hi     .req    v_data_1

  v_data_2        .req    v18
  q_data_2        .req    q18
  v_data_2_lo     .req    v26
  v_data_2_hi     .req    v_data_2

  v_data_3        .req    v19
  q_data_3        .req    q19
  v_data_3_lo     .req    v27
  v_data_3_hi     .req    v_data_3

  v_data_4        .req    v20
  q_data_4        .req    q20
  v_data_4_lo     .req    v28
  v_data_4_hi     .req    v_data_4

  v_data_5        .req    v21
  q_data_5        .req    q21
  v_data_5_lo     .req    v29
  v_data_5_hi     .req    v_data_5

  v_data_6        .req    v22
  q_data_6        .req    q22
  v_data_6_lo     .req    v30
  v_data_6_hi     .req    v_data_6

  v_data_7        .req    v23
  q_data_7        .req    q23
  v_data_7_lo     .req    v31
  v_data_7_hi     .req    v_data_7
  /*
   * gf_vect_mul_neon -- byte-wise table multiply of a buffer (AArch64 NEON)
   *
   * For every source byte b the routine stores
   *         tbl[b & 0x0f] ^ tbl[16 + (b >> 4)]
   * at the same offset in the destination.  The two 16-byte table halves
   * are presumably the low-/high-nibble GF(2^8) products of one constant
   * (suggested by the gf_ naming; not verifiable from this file).
   *
   * In:   x0 (x_len)   number of bytes to process
   *       x1 (x_tbl)   32-byte table: bytes 0-15 are indexed by the low
   *                    nibble, bytes 16-31 by the high nibble
   *       x2 (x_src)   source buffer
   *       x3 (x_dest)  destination buffer
   * Out:  w0 = 0  all x_len bytes were processed
   *       w0 = 1  x_len < 32 (signed compare, so negative values fail
   *               too), or x_len is not a multiple of 32.  In the second
   *               case every whole 128-byte and 32-byte block that fits
   *               in x_len has already been written to the destination.
   *
   * Clobbers: x0, x2-x5, v0, v2, v3, v16-v31, condition flags.
   * Stack:    untouched.  Only v0, v2, v3 and v16-v31 are written, all of
   *           which are caller-saved under AAPCS64, so the callee-saved
   *           d8-d15 need no spill.
   *
   * NOTE(review): x_len is consumed as a full 64-bit register.  If the C
   * prototype declares the length as a 32-bit int, AAPCS64 leaves the
   * upper 32 bits of x0 unspecified -- confirm against the header.
   */
  cdecl(gf_vect_mul_neon):
          /* fewer than 32 bytes: nothing can be processed, report failure */
          cmp     x_len, #32
          blt     .return_fail

          movi    v_mask0f.16b, #0x0f             /* low-nibble mask        */
          add     x_src_end, x_src, x_len         /* one past the last byte */
          ldr     q_gft1_lo, [x_tbl]              /* low-nibble table       */
          ldr     q_gft1_hi, [x_tbl, #16]         /* high-nibble table      */

  .Lloop128_init:
          /* fewer than 128 bytes: go straight to .Lloop32_init */
          cmp     x_len, #128
          blt     .Lloop32_init

          /*
           * Bias the end pointer by one block so that "x_src <= x_src_end"
           * means a complete 128-byte block is still available.
           */
          sub     x_src_end, x_src_end, #128

  .Lloop128:
          /* load eight 16-byte lanes */
          ldr     q_data_0, [x_src, #16*0]
          ldr     q_data_1, [x_src, #16*1]
          ldr     q_data_2, [x_src, #16*2]
          ldr     q_data_3, [x_src, #16*3]
          ldr     q_data_4, [x_src, #16*4]
          ldr     q_data_5, [x_src, #16*5]
          ldr     q_data_6, [x_src, #16*6]
          ldr     q_data_7, [x_src, #16*7]

          /* low nibbles go to the separate *_lo registers ... */
          and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
          and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
          and     v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
          and     v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
          and     v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
          and     v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
          and     v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
          and     v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

          /*
           * ... and only then are the high nibbles formed in place
           * (*_hi aliases the data register, so this must follow the AND).
           */
          ushr    v_data_0_hi.16b, v_data_0.16b, #4
          ushr    v_data_1_hi.16b, v_data_1.16b, #4
          ushr    v_data_2_hi.16b, v_data_2.16b, #4
          ushr    v_data_3_hi.16b, v_data_3.16b, #4
          ushr    v_data_4_hi.16b, v_data_4.16b, #4
          ushr    v_data_5_hi.16b, v_data_5.16b, #4
          ushr    v_data_6_hi.16b, v_data_6.16b, #4
          ushr    v_data_7_hi.16b, v_data_7.16b, #4

          /* look each nibble up in its 16-entry table */
          tbl     v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
          tbl     v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
          tbl     v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
          tbl     v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
          tbl     v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
          tbl     v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
          tbl     v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
          tbl     v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

          tbl     v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
          tbl     v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
          tbl     v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
          tbl     v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
          tbl     v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
          tbl     v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
          tbl     v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
          tbl     v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

          /* result = high-nibble lookup XOR low-nibble lookup */
          eor     v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
          eor     v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
          eor     v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
          eor     v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
          eor     v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
          eor     v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
          eor     v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
          eor     v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b

          str     q_data_0, [x_dest1, #16*0]
          str     q_data_1, [x_dest1, #16*1]
          str     q_data_2, [x_dest1, #16*2]
          str     q_data_3, [x_dest1, #16*3]
          str     q_data_4, [x_dest1, #16*4]
          str     q_data_5, [x_dest1, #16*5]
          str     q_data_6, [x_dest1, #16*6]
          str     q_data_7, [x_dest1, #16*7]

          add     x_src, x_src, #128
          add     x_dest1, x_dest1, #128
          cmp     x_src, x_src_end
          bls     .Lloop128                       /* unsigned: src <= biased end */

  .Lloop128_end:
          /* remove the bias; done if the 128-byte blocks consumed everything */
          add     x_src_end, x_src_end, #128
          cmp     x_src, x_src_end
          beq     .return_pass

  .Lloop32_init:
          /* same end-pointer bias as above, now for 32-byte blocks */
          sub     x_src_end, x_src_end, #32
          cmp     x_src, x_src_end
          bhi     .return_fail                    /* fewer than 32 bytes remain */

  .Lloop32:
          ldr     q_data_0, [x_src, #16*0]
          ldr     q_data_1, [x_src, #16*1]

          and     v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
          and     v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
          ushr    v_data_0_hi.16b, v_data_0.16b, #4
          ushr    v_data_1_hi.16b, v_data_1.16b, #4

          tbl     v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
          tbl     v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
          tbl     v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
          tbl     v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b

          eor     v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
          eor     v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b

          str     q_data_0, [x_dest1, #16*0]
          str     q_data_1, [x_dest1, #16*1]

          add     x_dest1, x_dest1, #32
          add     x_src, x_src, #32
          cmp     x_src, x_src_end
          bls     .Lloop32

  .Lloop32_end:
          /*
           * x_src_end is still biased by 32, so the buffer was consumed
           * exactly only when x_src sits 32 bytes past it; any other
           * distance means a tail shorter than 32 bytes was left over.
           */
          sub     x_tmp, x_src, x_src_end
          cmp     x_tmp, #32
          bne     .return_fail

  .return_pass:
          mov     w_ret, #0
          ret

  .return_fail:
          mov     w_ret, #1
          ret