armv4-gf2m.S

#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
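@ mul_1x1_ialu: 32x32->64-bit carry-less (GF(2)[x]) multiplication.
@ (Comment added for clarity; register roles are inferred from the code below.)
@   in:  r0 = b, r1 = a, r12 = 7<<2 (3-bit index mask pre-scaled to a word
@        offset), sp -> 32-byte scratch area used as tab[0..7]
@   out: r5 = low 32 bits, r4 = high 32 bits of a*b
@ tab[i] is built as the GF(2) product of i and a1 = a & 0x3fffffff; b is then
@ consumed three bits at a time, each tab[] entry being shifted into place and
@ XORed into the r5:r4 accumulator.  Clobbers r4-r9.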
    mov r4,#0
    bic r5,r1,#3<<30 @ a1=a&0x3fffffff
    str r4,[sp,#0] @ tab[0]=0
    add r6,r5,r5 @ a2=a1<<1
    str r5,[sp,#4] @ tab[1]=a1
    eor r7,r5,r6 @ a1^a2
    str r6,[sp,#8] @ tab[2]=a2
    mov r8,r5,lsl#2 @ a4=a1<<2
    str r7,[sp,#12] @ tab[3]=a1^a2
    eor r9,r5,r8 @ a1^a4
    str r8,[sp,#16] @ tab[4]=a4
    eor r4,r6,r8 @ a2^a4
    str r9,[sp,#20] @ tab[5]=a1^a4
    eor r7,r7,r8 @ a1^a2^a4
    str r4,[sp,#24] @ tab[6]=a2^a4
    and r8,r12,r0,lsl#2
    str r7,[sp,#28] @ tab[7]=a1^a2^a4
    and r9,r12,r0,lsr#1
    ldr r5,[sp,r8] @ tab[b & 0x7]
    and r8,r12,r0,lsr#4
    ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
    and r9,r12,r0,lsr#7
    ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
    eor r5,r5,r7,lsl#3 @ stall
    mov r4,r7,lsr#29
    ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
    and r8,r12,r0,lsr#10
    eor r5,r5,r6,lsl#6
    eor r4,r4,r6,lsr#26
    ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
    and r9,r12,r0,lsr#13
    eor r5,r5,r7,lsl#9
    eor r4,r4,r7,lsr#23
    ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
    and r8,r12,r0,lsr#16
    eor r5,r5,r6,lsl#12
    eor r4,r4,r6,lsr#20
    ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
    and r9,r12,r0,lsr#19
    eor r5,r5,r7,lsl#15
    eor r4,r4,r7,lsr#17
    ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
    and r8,r12,r0,lsr#22
    eor r5,r5,r6,lsl#18
    eor r4,r4,r6,lsr#14
    ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
    and r9,r12,r0,lsr#25
    eor r5,r5,r7,lsl#21
    eor r4,r4,r7,lsr#11
    ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
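    @ tab[] was built from a & 0x3fffffff, so bits 30 and 31 of a are not in
    @ the table; they are folded in below with conditional XORs of b shifted
    @ into place (comment added for clarity).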
    tst r1,#1<<30
    and r8,r12,r0,lsr#28
    eor r5,r5,r6,lsl#24
    eor r4,r4,r6,lsr#8
    ldr r6,[sp,r8] @ tab[b >> 30 ]
#ifdef __thumb2__
    itt ne
#endif
    eorne r5,r5,r0,lsl#30
    eorne r4,r4,r0,lsr#2
    tst r1,#1<<31
    eor r5,r5,r7,lsl#27
    eor r4,r4,r7,lsr#5
#ifdef __thumb2__
    itt ne
#endif
    eorne r5,r5,r0,lsl#31
    eorne r4,r4,r0,lsr#1
    eor r5,r5,r6,lsl#30
    eor r4,r4,r6,lsr#2
    mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu

.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
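@ bn_GF2m_mul_2x2: 64x64->128-bit carry-less multiplication,
@ r[0..3] = (a1:a0)*(b1:b0) over GF(2)[x].
@ (Comment added for clarity; the OpenSSL C prototype is believed to be
@  void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0,
@                       BN_ULONG b1, BN_ULONG b0).)
@ Arguments arrive as r0=r, r1=a1, r2=a0, r3=b1, [sp]=b0.  On builds with
@ __ARM_MAX_ARCH__>=7, if OPENSSL_armcap_P advertises ARMV7_NEON the routine
@ branches to the NEON path at .LNEON; otherwise it falls through to three
@ calls of mul_1x1_ialu arranged as a Karatsuba multiplication.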
#if __ARM_MAX_ARCH__>=7
    stmdb sp!,{r10,lr}
    ldr r12,.LOPENSSL_armcap
    adr r10,.LOPENSSL_armcap
    ldr r12,[r12,r10]
#ifdef __APPLE__
    ldr r12,[r12]
#endif
    tst r12,#ARMV7_NEON
    itt ne
    ldrne r10,[sp],#8
    bne .LNEON
    stmdb sp!,{r4-r9}
#else
    stmdb sp!,{r4-r10,lr}
#endif
    mov r10,r0 @ reassign 1st argument
    mov r0,r3 @ r0=b1
    sub r7,sp,#36
    mov r8,sp
    and r7,r7,#-32
    ldr r3,[sp,#32] @ load b0
    mov r12,#7<<2
    mov sp,r7 @ allocate tab[8]
    str r8,[r7,#32]
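    @ tab[8] is a 32-byte-aligned scratch area carved out below the current
    @ frame; the original sp is stashed at [sp,#32] and restored once the last
    @ mul_1x1_ialu call is done.  r12 = 7<<2 is the 3-bit lookup-index mask,
    @ pre-scaled by 4 so it serves directly as a byte offset into tab[]
    @ (comment added for clarity).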
    bl mul_1x1_ialu @ a1·b1
    str r5,[r10,#8]
    str r4,[r10,#12]
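    @ The six XORs below swap (r0,r3) and (r1,r2) without a scratch register,
    @ so the next call multiplies a0 by b0 (comment added for clarity).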
    eor r0,r0,r3 @ flip b0 and b1
    eor r1,r1,r2 @ flip a0 and a1
    eor r3,r3,r0
    eor r2,r2,r1
    eor r0,r0,r3
    eor r1,r1,r2
    bl mul_1x1_ialu @ a0·b0
    str r5,[r10]
    str r4,[r10,#4]
    eor r1,r1,r2
    eor r0,r0,r3
    bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
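    @ Karatsuba recombination: the middle 64 bits of the result are
    @ (a1+a0)·(b1+b0) + a1·b1 + a0·b0, folded into r[1] and r[2] below
    @ (comment added for clarity).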
    ldmia r10,{r6-r9}
    eor r5,r5,r4
    ldr sp,[sp,#32] @ destroy tab[8]
    eor r4,r4,r7
    eor r5,r5,r6
    eor r4,r4,r8
    eor r5,r5,r9
    eor r4,r4,r9
    str r4,[r10,#8]
    eor r5,r5,r4
    str r5,[r10,#4]
#if __ARM_ARCH__>=5
    ldmia sp!,{r4-r10,pc}
#else
    ldmia sp!,{r4-r10,lr}
    tst lr,#1
    moveq pc,lr @ be binary compatible with V4, yet
    .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif

#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.align 5
.LNEON:
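    @ NEON path (comment added for clarity): NEON has no 64x64-bit polynomial
    @ multiply, so the product is assembled from vmull.p8 8x8-bit polynomial
    @ multiplies of A and B against byte-rotated copies of each other (A1..A3,
    @ B1..B4 below).  The vand masks clear the bytes that wrap around when the
    @ rotated partial products are shifted back into position, and everything
    @ is finally XORed into the plain A*B term.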
    ldr r12, [sp] @ 5th argument
    vmov d26, r2, r1
    vmov d27, r12, r3
    vmov.i64 d28, #0x0000ffffffffffff
    vmov.i64 d29, #0x00000000ffffffff
    vmov.i64 d30, #0x000000000000ffff
    vext.8 d2, d26, d26, #1 @ A1
    vmull.p8 q1, d2, d27 @ F = A1*B
    vext.8 d0, d27, d27, #1 @ B1
    vmull.p8 q0, d26, d0 @ E = A*B1
    vext.8 d4, d26, d26, #2 @ A2
    vmull.p8 q2, d4, d27 @ H = A2*B
    vext.8 d16, d27, d27, #2 @ B2
    vmull.p8 q8, d26, d16 @ G = A*B2
    vext.8 d6, d26, d26, #3 @ A3
    veor q1, q1, q0 @ L = E + F
    vmull.p8 q3, d6, d27 @ J = A3*B
    vext.8 d0, d27, d27, #3 @ B3
    veor q2, q2, q8 @ M = G + H
    vmull.p8 q0, d26, d0 @ I = A*B3
    veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
    vand d3, d3, d28
    vext.8 d16, d27, d27, #4 @ B4
    veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
    vand d5, d5, d29
    vmull.p8 q8, d26, d16 @ K = A*B4
    veor q3, q3, q0 @ N = I + J
    veor d2, d2, d3
    veor d4, d4, d5
    veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
    vand d7, d7, d30
    vext.8 q1, q1, q1, #15
    veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
    vmov.i64 d17, #0
    vext.8 q2, q2, q2, #14
    veor d6, d6, d7
    vmull.p8 q0, d26, d27 @ D = A*B
    vext.8 q8, q8, q8, #12
    vext.8 q3, q3, q3, #13
    veor q1, q1, q2
    veor q3, q3, q8
    veor q0, q0, q1
    veor q0, q0, q3
    vst1.32 {q0}, [r0]
    bx lr @ bx lr
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2

#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.
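@ PC-relative offset to OPENSSL_armcap_P, resolved by the ldr/adr pair at the
@ top of bn_GF2m_mul_2x2 (comment added for clarity).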
#endif

.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 5

#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif