/* ed25519-donna-32bit-sse2.h */
  #if defined(ED25519_GCC_32BIT_SSE_CHOOSE)
  #define HAVE_GE25519_SCALARMULT_BASE_CHOOSE_NIELS

  /*
   * Branch-free lookup of one precomputed table entry, written in 32-bit x86
   * SSE2 inline assembly.
   *
   * Parameters
   *   t      output; 144 bytes are written as three 48-byte records at byte
   *          offsets 0, 48 and 96 (labelled ysubx, xaddy and t2d below).
   *   table  precomputed entries of 96 bytes each. An entry holds three packed
   *          32-byte values: bytes 0..31 (ysubx), 32..63 (xaddy), 64..95 (t2d).
   *   pos    selects the group of 8 consecutive entries starting at
   *          table[pos * 8]; the entry for magnitude k (1..8) is at byte
   *          offset (k - 1) * 96 from that address.
   *   b      signed selector. |b| picks the entry, the sign picks the variant.
   *
   * Behaviour
   *   - |b| == 0 yields ysubx = 1, xaddy = 1, t2d = 0 without reading the table.
   *   - |b| in 1..8 yields the corresponding entry.
   *   - Any other magnitude matches no mask, so all three outputs are zero.
   *     NOTE(review): callers presumably guarantee -8 <= b <= 8 -- confirm.
   *   - If b is negative, ysubx and xaddy are exchanged and t2d is replaced by
   *     (2p - t2d) limb-wise, with 2p held as the constants 0x7ffffda /
   *     0x3fffffe / 0x7fffffe.
   *
   * Each packed 32-byte value is expanded into ten 32-bit limbs of alternating
   * 26 and 25 bits (masks 0x3ffffff / 0x1ffffff), followed by two zero words,
   * which gives the 48-byte record size.
   *
   * All eight entries are always loaded and combined with compare masks, so
   * the instruction stream and the addresses touched do not depend on b.
   *
   * Requirements visible from the code
   *   - 32-bit x86 only: t is loaded with movl and dereferenced through %eax.
   *   - movdqa is applied to the table entries and to t + 96, so both
   *     &table[pos * 8] and t must be 16-byte aligned.
   *
   * Asm operands: %0 = u (|b|), %1 = &table[pos * 8], %2 = t, %3 = sign.
   * Clobbers eax, ecx, edx, xmm0..xmm7, flags and memory.
   */
  DONNA_NOINLINE static void
  ge25519_scalarmult_base_choose_niels(ge25519_niels *t, const uint8_t table[256][96], uint32_t pos, signed char b) {
    int32_t breg = (int32_t)b;
    uint32_t sign = (uint32_t)breg >> 31;  /* 1 when b is negative, else 0 */
    uint32_t mask = ~(sign - 1);           /* all ones when b is negative, else 0 */
    uint32_t u = (breg + mask) ^ mask;     /* |b| computed without a branch */

    __asm__ __volatile__ (
      /* ysubx+xaddy */
      /* xmm6 = |b| broadcast to all four lanes; xmm0:xmm1 accumulate ysubx
         and xmm2:xmm3 accumulate xaddy, both starting at zero */
      "movl %0, %%eax ;\n"
      "movd %%eax, %%xmm6 ;\n"
      "pshufd $0x00, %%xmm6, %%xmm6 ;\n"
      "pxor %%xmm0, %%xmm0 ;\n"
      "pxor %%xmm1, %%xmm1 ;\n"
      "pxor %%xmm2, %%xmm2 ;\n"
      "pxor %%xmm3, %%xmm3 ;\n"

      /* 0 : not read from the table; contributes the value 1 to both
         ysubx and xaddy when |b| == 0 */
      "movl $0, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movl $1, %%ecx ;\n"
      "movd %%ecx, %%xmm4 ;\n"
      "pxor %%xmm5, %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 1..8 : xmm7 = (|b| == k) as an all-ones/all-zero mask; the entry is
         ANDed with the mask and ORed into the accumulators */
      /* 1 */
      "movl $1, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 0(%1), %%xmm4 ;\n"
      "movdqa 16(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 32(%1), %%xmm4 ;\n"
      "movdqa 48(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 2 */
      "movl $2, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 96(%1), %%xmm4 ;\n"
      "movdqa 112(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 128(%1), %%xmm4 ;\n"
      "movdqa 144(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 3 */
      "movl $3, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 192(%1), %%xmm4 ;\n"
      "movdqa 208(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 224(%1), %%xmm4 ;\n"
      "movdqa 240(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 4 */
      "movl $4, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 288(%1), %%xmm4 ;\n"
      "movdqa 304(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 320(%1), %%xmm4 ;\n"
      "movdqa 336(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 5 */
      "movl $5, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 384(%1), %%xmm4 ;\n"
      "movdqa 400(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 416(%1), %%xmm4 ;\n"
      "movdqa 432(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 6 */
      "movl $6, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 480(%1), %%xmm4 ;\n"
      "movdqa 496(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 512(%1), %%xmm4 ;\n"
      "movdqa 528(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 7 */
      "movl $7, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 576(%1), %%xmm4 ;\n"
      "movdqa 592(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 608(%1), %%xmm4 ;\n"
      "movdqa 624(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* 8 */
      "movl $8, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 672(%1), %%xmm4 ;\n"
      "movdqa 688(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm0 ;\n"
      "por %%xmm5, %%xmm1 ;\n"
      "movdqa 704(%1), %%xmm4 ;\n"
      "movdqa 720(%1), %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "por %%xmm4, %%xmm2 ;\n"
      "por %%xmm5, %%xmm3 ;\n"

      /* conditional swap based on sign */
      /* xmm7 = ((sign ^ 1) == 0), i.e. all ones when b is negative. The
         masked XOR-swap exchanges xmm0:xmm1 with xmm2:xmm3 only in that
         case. eax is loaded with t for the stores that follow. */
      "movl %3, %%ecx ;\n"
      "movl %2, %%eax ;\n"
      "xorl $1, %%ecx ;\n"
      "movd %%ecx, %%xmm6 ;\n"
      "pxor %%xmm7, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm6, %%xmm6 ;\n"
      "pxor %%xmm0, %%xmm2 ;\n"
      "pxor %%xmm1, %%xmm3 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa %%xmm2, %%xmm4 ;\n"
      "movdqa %%xmm3, %%xmm5 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "pand %%xmm7, %%xmm5 ;\n"
      "pxor %%xmm4, %%xmm0 ;\n"
      "pxor %%xmm5, %%xmm1 ;\n"
      "pxor %%xmm0, %%xmm2 ;\n"
      "pxor %%xmm1, %%xmm3 ;\n"

      /* store ysubx */
      /* Unpack xmm0:xmm1 (eight 32-bit words) into ten limbs at t + 0.
         pshufd $0x39 rotates the next word into lane 0; shrdl stitches a
         limb that straddles two words; the masks keep 26 or 25 bits. */
      "movd %%xmm0, %%ecx ;\n"
      "movl %%ecx, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 0(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $26, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 4(%%eax) ;\n"
      "movd %%xmm0, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $19, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 8(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "shrdl $13, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 12(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrl $6, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 16(%%eax) ;\n"
      "movl %%edx, %%ecx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 20(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $25, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 24(%%eax) ;\n"
      "movd %%xmm1, %%ecx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $19, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 28(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "shrdl $12, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 32(%%eax) ;\n"
      "shrl $6, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "xorl %%ecx, %%ecx ;\n"
      "movl %%edx, 36(%%eax) ;\n"
      /* two zero words pad the record to 48 bytes */
      "movl %%ecx, 40(%%eax) ;\n"
      "movl %%ecx, 44(%%eax) ;\n"

      /* store xaddy */
      /* same unpacking as above, applied to xmm2:xmm3 and written at t + 48 */
      "addl $48, %%eax ;\n"
      "movdqa %%xmm2, %%xmm0 ;\n"
      "movdqa %%xmm3, %%xmm1 ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "movl %%ecx, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 0(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $26, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 4(%%eax) ;\n"
      "movd %%xmm0, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $19, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 8(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "shrdl $13, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 12(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrl $6, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 16(%%eax) ;\n"
      "movl %%edx, %%ecx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 20(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $25, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 24(%%eax) ;\n"
      "movd %%xmm1, %%ecx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $19, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 28(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "shrdl $12, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 32(%%eax) ;\n"
      "shrl $6, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "xorl %%ecx, %%ecx ;\n"
      "movl %%edx, 36(%%eax) ;\n"
      "movl %%ecx, 40(%%eax) ;\n"
      "movl %%ecx, 44(%%eax) ;\n"

      /* t2d */
      /* second selection pass over bytes 64..95 of each entry, accumulated
         in xmm0:xmm1 */
      "movl %0, %%eax ;\n"
      "movd %%eax, %%xmm6 ;\n"
      "pshufd $0x00, %%xmm6, %%xmm6 ;\n"
      "pxor %%xmm0, %%xmm0 ;\n"
      "pxor %%xmm1, %%xmm1 ;\n"

      /* 0 : t2d is zero for |b| == 0, so nothing is merged here; the
         compare result is not consumed and the accumulators are only
         cleared again */
      "movl $0, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "pxor %%xmm0, %%xmm0 ;\n"
      "pxor %%xmm1, %%xmm1 ;\n"

      /* 1 */
      "movl $1, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 64(%1), %%xmm3 ;\n"
      "movdqa 80(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 2 */
      "movl $2, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 160(%1), %%xmm3 ;\n"
      "movdqa 176(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 3 */
      "movl $3, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 256(%1), %%xmm3 ;\n"
      "movdqa 272(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 4 */
      "movl $4, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 352(%1), %%xmm3 ;\n"
      "movdqa 368(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 5 */
      "movl $5, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 448(%1), %%xmm3 ;\n"
      "movdqa 464(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 6 */
      "movl $6, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 544(%1), %%xmm3 ;\n"
      "movdqa 560(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 7 */
      "movl $7, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 640(%1), %%xmm3 ;\n"
      "movdqa 656(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* 8 */
      "movl $8, %%eax ;\n"
      "movd %%eax, %%xmm7 ;\n"
      "pshufd $0x00, %%xmm7, %%xmm7 ;\n"
      "pcmpeqd %%xmm6, %%xmm7 ;\n"
      "movdqa 736(%1), %%xmm3 ;\n"
      "movdqa 752(%1), %%xmm4 ;\n"
      "pand %%xmm7, %%xmm3 ;\n"
      "pand %%xmm7, %%xmm4 ;\n"
      "por %%xmm3, %%xmm0 ;\n"
      "por %%xmm4, %%xmm1 ;\n"

      /* store t2d */
      /* same unpacking as for ysubx/xaddy, written at t + 96 */
      "movl %2, %%eax ;\n"
      "addl $96, %%eax ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "movl %%ecx, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 0(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $26, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 4(%%eax) ;\n"
      "movd %%xmm0, %%edx ;\n"
      "pshufd $0x39, %%xmm0, %%xmm0 ;\n"
      "shrdl $19, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 8(%%eax) ;\n"
      "movd %%xmm0, %%ecx ;\n"
      "shrdl $13, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 12(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrl $6, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 16(%%eax) ;\n"
      "movl %%edx, %%ecx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 20(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $25, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 24(%%eax) ;\n"
      "movd %%xmm1, %%ecx ;\n"
      "pshufd $0x39, %%xmm1, %%xmm1 ;\n"
      "shrdl $19, %%ecx, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "movl %%edx, 28(%%eax) ;\n"
      "movd %%xmm1, %%edx ;\n"
      "shrdl $12, %%edx, %%ecx ;\n"
      "andl $0x3ffffff, %%ecx ;\n"
      "movl %%ecx, 32(%%eax) ;\n"
      "shrl $6, %%edx ;\n"
      "andl $0x1ffffff, %%edx ;\n"
      "xorl %%ecx, %%ecx ;\n"
      "movl %%edx, 36(%%eax) ;\n"
      "movl %%ecx, 40(%%eax) ;\n"
      "movl %%ecx, 44(%%eax) ;\n"
      /* reload the unpacked limbs (plus the two zero pad words) as three
         vectors for the conditional negation */
      "movdqa 0(%%eax), %%xmm0 ;\n"
      "movdqa 16(%%eax), %%xmm1 ;\n"
      "movdqa 32(%%eax), %%xmm2 ;\n"

      /* conditionally negate t2d */
      /* set up 2p in to 3/4 */
      /* xmm3 = {0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe}
         xmm4 = {0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe}
         xmm5 = {0x7fffffe, 0x3fffffe, 0, 0}  (upper lanes line up with the
                                               zero pad words) */
      "movl $0x7ffffda, %%ecx ;\n"
      "movl $0x3fffffe, %%edx ;\n"
      "movd %%ecx, %%xmm3 ;\n"
      "movd %%edx, %%xmm5 ;\n"
      "movl $0x7fffffe, %%ecx ;\n"
      "movd %%ecx, %%xmm4 ;\n"
      "punpckldq %%xmm5, %%xmm3 ;\n"
      "punpckldq %%xmm5, %%xmm4 ;\n"
      "punpcklqdq %%xmm4, %%xmm3 ;\n"
      "movdqa %%xmm4, %%xmm5 ;\n"
      "punpcklqdq %%xmm4, %%xmm4 ;\n"

      /* subtract and conditionally move */
      /* xmm6 = xmm7 = (sign - 1): all ones when b >= 0, zero when b < 0.
         result = (t2d & mask) | ((2p - t2d) & ~mask) */
      "movl %3, %%ecx ;\n"
      "sub $1, %%ecx ;\n"
      "movd %%ecx, %%xmm6 ;\n"
      "pshufd $0x00, %%xmm6, %%xmm6 ;\n"
      "movdqa %%xmm6, %%xmm7 ;\n"
      "psubd %%xmm0, %%xmm3 ;\n"
      "psubd %%xmm1, %%xmm4 ;\n"
      "psubd %%xmm2, %%xmm5 ;\n"
      "pand %%xmm6, %%xmm0 ;\n"
      "pand %%xmm6, %%xmm1 ;\n"
      "pand %%xmm6, %%xmm2 ;\n"
      "pandn %%xmm3, %%xmm6 ;\n"
      "movdqa %%xmm7, %%xmm3 ;\n"
      "pandn %%xmm4, %%xmm7 ;\n"
      "pandn %%xmm5, %%xmm3 ;\n"
      "por %%xmm6, %%xmm0 ;\n"
      "por %%xmm7, %%xmm1 ;\n"
      "por %%xmm3, %%xmm2 ;\n"

      /* store */
      "movdqa %%xmm0, 0(%%eax) ;\n"
      "movdqa %%xmm1, 16(%%eax) ;\n"
      "movdqa %%xmm2, 32(%%eax) ;\n"
      :
      : "m"(u), "r"(&table[pos * 8]), "m"(t), "m"(sign) /* %0 = u, %1 = table, %2 = t, %3 = sign */
      : "%eax", "%ecx", "%edx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "cc", "memory"
    );
  }
  #endif /* defined(ED25519_GCC_32BIT_SSE_CHOOSE) */