bn_asm.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049
  1. /*
  2. * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. *
  4. * Licensed under the OpenSSL license (the "License"). You may not use
  5. * this file except in compliance with the License. You can obtain a copy
  6. * in the file LICENSE in the source distribution or at
  7. * https://www.openssl.org/source/license.html
  8. */
  9. #include <assert.h>
  10. #include <openssl/crypto.h>
  11. #include "internal/cryptlib.h"
  12. #include "bn_local.h"
  13. #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  14. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  15. BN_ULONG w)
  16. {
  17. BN_ULONG c1 = 0;
  18. assert(num >= 0);
  19. if (num <= 0)
  20. return c1;
  21. # ifndef OPENSSL_SMALL_FOOTPRINT
  22. while (num & ~3) {
  23. mul_add(rp[0], ap[0], w, c1);
  24. mul_add(rp[1], ap[1], w, c1);
  25. mul_add(rp[2], ap[2], w, c1);
  26. mul_add(rp[3], ap[3], w, c1);
  27. ap += 4;
  28. rp += 4;
  29. num -= 4;
  30. }
  31. # endif
  32. while (num) {
  33. mul_add(rp[0], ap[0], w, c1);
  34. ap++;
  35. rp++;
  36. num--;
  37. }
  38. return c1;
  39. }
  40. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  41. {
  42. BN_ULONG c1 = 0;
  43. assert(num >= 0);
  44. if (num <= 0)
  45. return c1;
  46. # ifndef OPENSSL_SMALL_FOOTPRINT
  47. while (num & ~3) {
  48. mul(rp[0], ap[0], w, c1);
  49. mul(rp[1], ap[1], w, c1);
  50. mul(rp[2], ap[2], w, c1);
  51. mul(rp[3], ap[3], w, c1);
  52. ap += 4;
  53. rp += 4;
  54. num -= 4;
  55. }
  56. # endif
  57. while (num) {
  58. mul(rp[0], ap[0], w, c1);
  59. ap++;
  60. rp++;
  61. num--;
  62. }
  63. return c1;
  64. }
  65. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  66. {
  67. assert(n >= 0);
  68. if (n <= 0)
  69. return;
  70. # ifndef OPENSSL_SMALL_FOOTPRINT
  71. while (n & ~3) {
  72. sqr(r[0], r[1], a[0]);
  73. sqr(r[2], r[3], a[1]);
  74. sqr(r[4], r[5], a[2]);
  75. sqr(r[6], r[7], a[3]);
  76. a += 4;
  77. r += 8;
  78. n -= 4;
  79. }
  80. # endif
  81. while (n) {
  82. sqr(r[0], r[1], a[0]);
  83. a++;
  84. r += 2;
  85. n--;
  86. }
  87. }
  88. #else /* !(defined(BN_LLONG) ||
  89. * defined(BN_UMULT_HIGH)) */
  90. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  91. BN_ULONG w)
  92. {
  93. BN_ULONG c = 0;
  94. BN_ULONG bl, bh;
  95. assert(num >= 0);
  96. if (num <= 0)
  97. return (BN_ULONG)0;
  98. bl = LBITS(w);
  99. bh = HBITS(w);
  100. # ifndef OPENSSL_SMALL_FOOTPRINT
  101. while (num & ~3) {
  102. mul_add(rp[0], ap[0], bl, bh, c);
  103. mul_add(rp[1], ap[1], bl, bh, c);
  104. mul_add(rp[2], ap[2], bl, bh, c);
  105. mul_add(rp[3], ap[3], bl, bh, c);
  106. ap += 4;
  107. rp += 4;
  108. num -= 4;
  109. }
  110. # endif
  111. while (num) {
  112. mul_add(rp[0], ap[0], bl, bh, c);
  113. ap++;
  114. rp++;
  115. num--;
  116. }
  117. return c;
  118. }
  119. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  120. {
  121. BN_ULONG carry = 0;
  122. BN_ULONG bl, bh;
  123. assert(num >= 0);
  124. if (num <= 0)
  125. return (BN_ULONG)0;
  126. bl = LBITS(w);
  127. bh = HBITS(w);
  128. # ifndef OPENSSL_SMALL_FOOTPRINT
  129. while (num & ~3) {
  130. mul(rp[0], ap[0], bl, bh, carry);
  131. mul(rp[1], ap[1], bl, bh, carry);
  132. mul(rp[2], ap[2], bl, bh, carry);
  133. mul(rp[3], ap[3], bl, bh, carry);
  134. ap += 4;
  135. rp += 4;
  136. num -= 4;
  137. }
  138. # endif
  139. while (num) {
  140. mul(rp[0], ap[0], bl, bh, carry);
  141. ap++;
  142. rp++;
  143. num--;
  144. }
  145. return carry;
  146. }
  147. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  148. {
  149. assert(n >= 0);
  150. if (n <= 0)
  151. return;
  152. # ifndef OPENSSL_SMALL_FOOTPRINT
  153. while (n & ~3) {
  154. sqr64(r[0], r[1], a[0]);
  155. sqr64(r[2], r[3], a[1]);
  156. sqr64(r[4], r[5], a[2]);
  157. sqr64(r[6], r[7], a[3]);
  158. a += 4;
  159. r += 8;
  160. n -= 4;
  161. }
  162. # endif
  163. while (n) {
  164. sqr64(r[0], r[1], a[0]);
  165. a++;
  166. r += 2;
  167. n--;
  168. }
  169. }
  170. #endif /* !(defined(BN_LLONG) ||
  171. * defined(BN_UMULT_HIGH)) */
  172. #if defined(BN_LLONG) && defined(BN_DIV2W)
  173. BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
  174. {
  175. return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
  176. }
  177. #else
  178. /* Divide h,l by d and return the result. */
  179. /* I need to test this some more :-( */
  180. BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
  181. {
  182. BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  183. int i, count = 2;
  184. if (d == 0)
  185. return BN_MASK2;
  186. i = BN_num_bits_word(d);
  187. assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
  188. i = BN_BITS2 - i;
  189. if (h >= d)
  190. h -= d;
  191. if (i) {
  192. d <<= i;
  193. h = (h << i) | (l >> (BN_BITS2 - i));
  194. l <<= i;
  195. }
  196. dh = (d & BN_MASK2h) >> BN_BITS4;
  197. dl = (d & BN_MASK2l);
  198. for (;;) {
  199. if ((h >> BN_BITS4) == dh)
  200. q = BN_MASK2l;
  201. else
  202. q = h / dh;
  203. th = q * dh;
  204. tl = dl * q;
  205. for (;;) {
  206. t = h - th;
  207. if ((t & BN_MASK2h) ||
  208. ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
  209. break;
  210. q--;
  211. th -= dh;
  212. tl -= dl;
  213. }
  214. t = (tl >> BN_BITS4);
  215. tl = (tl << BN_BITS4) & BN_MASK2h;
  216. th += t;
  217. if (l < tl)
  218. th++;
  219. l -= tl;
  220. if (h < th) {
  221. h += d;
  222. q--;
  223. }
  224. h -= th;
  225. if (--count == 0)
  226. break;
  227. ret = q << BN_BITS4;
  228. h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
  229. l = (l & BN_MASK2l) << BN_BITS4;
  230. }
  231. ret |= q;
  232. return ret;
  233. }
  234. #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
  235. #ifdef BN_LLONG
  236. BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  237. int n)
  238. {
  239. BN_ULLONG ll = 0;
  240. assert(n >= 0);
  241. if (n <= 0)
  242. return (BN_ULONG)0;
  243. # ifndef OPENSSL_SMALL_FOOTPRINT
  244. while (n & ~3) {
  245. ll += (BN_ULLONG) a[0] + b[0];
  246. r[0] = (BN_ULONG)ll & BN_MASK2;
  247. ll >>= BN_BITS2;
  248. ll += (BN_ULLONG) a[1] + b[1];
  249. r[1] = (BN_ULONG)ll & BN_MASK2;
  250. ll >>= BN_BITS2;
  251. ll += (BN_ULLONG) a[2] + b[2];
  252. r[2] = (BN_ULONG)ll & BN_MASK2;
  253. ll >>= BN_BITS2;
  254. ll += (BN_ULLONG) a[3] + b[3];
  255. r[3] = (BN_ULONG)ll & BN_MASK2;
  256. ll >>= BN_BITS2;
  257. a += 4;
  258. b += 4;
  259. r += 4;
  260. n -= 4;
  261. }
  262. # endif
  263. while (n) {
  264. ll += (BN_ULLONG) a[0] + b[0];
  265. r[0] = (BN_ULONG)ll & BN_MASK2;
  266. ll >>= BN_BITS2;
  267. a++;
  268. b++;
  269. r++;
  270. n--;
  271. }
  272. return (BN_ULONG)ll;
  273. }
  274. #else /* !BN_LLONG */
  275. BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  276. int n)
  277. {
  278. BN_ULONG c, l, t;
  279. assert(n >= 0);
  280. if (n <= 0)
  281. return (BN_ULONG)0;
  282. c = 0;
  283. # ifndef OPENSSL_SMALL_FOOTPRINT
  284. while (n & ~3) {
  285. t = a[0];
  286. t = (t + c) & BN_MASK2;
  287. c = (t < c);
  288. l = (t + b[0]) & BN_MASK2;
  289. c += (l < t);
  290. r[0] = l;
  291. t = a[1];
  292. t = (t + c) & BN_MASK2;
  293. c = (t < c);
  294. l = (t + b[1]) & BN_MASK2;
  295. c += (l < t);
  296. r[1] = l;
  297. t = a[2];
  298. t = (t + c) & BN_MASK2;
  299. c = (t < c);
  300. l = (t + b[2]) & BN_MASK2;
  301. c += (l < t);
  302. r[2] = l;
  303. t = a[3];
  304. t = (t + c) & BN_MASK2;
  305. c = (t < c);
  306. l = (t + b[3]) & BN_MASK2;
  307. c += (l < t);
  308. r[3] = l;
  309. a += 4;
  310. b += 4;
  311. r += 4;
  312. n -= 4;
  313. }
  314. # endif
  315. while (n) {
  316. t = a[0];
  317. t = (t + c) & BN_MASK2;
  318. c = (t < c);
  319. l = (t + b[0]) & BN_MASK2;
  320. c += (l < t);
  321. r[0] = l;
  322. a++;
  323. b++;
  324. r++;
  325. n--;
  326. }
  327. return (BN_ULONG)c;
  328. }
  329. #endif /* !BN_LLONG */
  330. BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
  331. int n)
  332. {
  333. BN_ULONG t1, t2;
  334. int c = 0;
  335. assert(n >= 0);
  336. if (n <= 0)
  337. return (BN_ULONG)0;
  338. #ifndef OPENSSL_SMALL_FOOTPRINT
  339. while (n & ~3) {
  340. t1 = a[0];
  341. t2 = (t1 - c) & BN_MASK2;
  342. c = (t2 > t1);
  343. t1 = b[0];
  344. t1 = (t2 - t1) & BN_MASK2;
  345. r[0] = t1;
  346. c += (t1 > t2);
  347. t1 = a[1];
  348. t2 = (t1 - c) & BN_MASK2;
  349. c = (t2 > t1);
  350. t1 = b[1];
  351. t1 = (t2 - t1) & BN_MASK2;
  352. r[1] = t1;
  353. c += (t1 > t2);
  354. t1 = a[2];
  355. t2 = (t1 - c) & BN_MASK2;
  356. c = (t2 > t1);
  357. t1 = b[2];
  358. t1 = (t2 - t1) & BN_MASK2;
  359. r[2] = t1;
  360. c += (t1 > t2);
  361. t1 = a[3];
  362. t2 = (t1 - c) & BN_MASK2;
  363. c = (t2 > t1);
  364. t1 = b[3];
  365. t1 = (t2 - t1) & BN_MASK2;
  366. r[3] = t1;
  367. c += (t1 > t2);
  368. a += 4;
  369. b += 4;
  370. r += 4;
  371. n -= 4;
  372. }
  373. #endif
  374. while (n) {
  375. t1 = a[0];
  376. t2 = (t1 - c) & BN_MASK2;
  377. c = (t2 > t1);
  378. t1 = b[0];
  379. t1 = (t2 - t1) & BN_MASK2;
  380. r[0] = t1;
  381. c += (t1 > t2);
  382. a++;
  383. b++;
  384. r++;
  385. n--;
  386. }
  387. return c;
  388. }
  389. #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
  390. # undef bn_mul_comba8
  391. # undef bn_mul_comba4
  392. # undef bn_sqr_comba8
  393. # undef bn_sqr_comba4
  394. /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
  395. /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
  396. /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
  397. /*
  398. * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
  399. * c=(c2,c1,c0)
  400. */
  401. # ifdef BN_LLONG
  402. /*
  403. * Keep in mind that additions to multiplication result can not
  404. * overflow, because its high half cannot be all-ones.
  405. */
  406. # define mul_add_c(a,b,c0,c1,c2) do { \
  407. BN_ULONG hi; \
  408. BN_ULLONG t = (BN_ULLONG)(a)*(b); \
  409. t += c0; /* no carry */ \
  410. c0 = (BN_ULONG)Lw(t); \
  411. hi = (BN_ULONG)Hw(t); \
  412. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  413. } while(0)
  414. # define mul_add_c2(a,b,c0,c1,c2) do { \
  415. BN_ULONG hi; \
  416. BN_ULLONG t = (BN_ULLONG)(a)*(b); \
  417. BN_ULLONG tt = t+c0; /* no carry */ \
  418. c0 = (BN_ULONG)Lw(tt); \
  419. hi = (BN_ULONG)Hw(tt); \
  420. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  421. t += c0; /* no carry */ \
  422. c0 = (BN_ULONG)Lw(t); \
  423. hi = (BN_ULONG)Hw(t); \
  424. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  425. } while(0)
  426. # define sqr_add_c(a,i,c0,c1,c2) do { \
  427. BN_ULONG hi; \
  428. BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
  429. t += c0; /* no carry */ \
  430. c0 = (BN_ULONG)Lw(t); \
  431. hi = (BN_ULONG)Hw(t); \
  432. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  433. } while(0)
  434. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  435. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  436. # elif defined(BN_UMULT_LOHI)
  437. /*
  438. * Keep in mind that additions to hi can not overflow, because
  439. * the high word of a multiplication result cannot be all-ones.
  440. */
  441. # define mul_add_c(a,b,c0,c1,c2) do { \
  442. BN_ULONG ta = (a), tb = (b); \
  443. BN_ULONG lo, hi; \
  444. BN_UMULT_LOHI(lo,hi,ta,tb); \
  445. c0 += lo; hi += (c0<lo); \
  446. c1 += hi; c2 += (c1<hi); \
  447. } while(0)
  448. # define mul_add_c2(a,b,c0,c1,c2) do { \
  449. BN_ULONG ta = (a), tb = (b); \
  450. BN_ULONG lo, hi, tt; \
  451. BN_UMULT_LOHI(lo,hi,ta,tb); \
  452. c0 += lo; tt = hi + (c0<lo); \
  453. c1 += tt; c2 += (c1<tt); \
  454. c0 += lo; hi += (c0<lo); \
  455. c1 += hi; c2 += (c1<hi); \
  456. } while(0)
  457. # define sqr_add_c(a,i,c0,c1,c2) do { \
  458. BN_ULONG ta = (a)[i]; \
  459. BN_ULONG lo, hi; \
  460. BN_UMULT_LOHI(lo,hi,ta,ta); \
  461. c0 += lo; hi += (c0<lo); \
  462. c1 += hi; c2 += (c1<hi); \
  463. } while(0)
  464. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  465. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  466. # elif defined(BN_UMULT_HIGH)
  467. /*
  468. * Keep in mind that additions to hi can not overflow, because
  469. * the high word of a multiplication result cannot be all-ones.
  470. */
  471. # define mul_add_c(a,b,c0,c1,c2) do { \
  472. BN_ULONG ta = (a), tb = (b); \
  473. BN_ULONG lo = ta * tb; \
  474. BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
  475. c0 += lo; hi += (c0<lo); \
  476. c1 += hi; c2 += (c1<hi); \
  477. } while(0)
  478. # define mul_add_c2(a,b,c0,c1,c2) do { \
  479. BN_ULONG ta = (a), tb = (b), tt; \
  480. BN_ULONG lo = ta * tb; \
  481. BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
  482. c0 += lo; tt = hi + (c0<lo); \
  483. c1 += tt; c2 += (c1<tt); \
  484. c0 += lo; hi += (c0<lo); \
  485. c1 += hi; c2 += (c1<hi); \
  486. } while(0)
  487. # define sqr_add_c(a,i,c0,c1,c2) do { \
  488. BN_ULONG ta = (a)[i]; \
  489. BN_ULONG lo = ta * ta; \
  490. BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
  491. c0 += lo; hi += (c0<lo); \
  492. c1 += hi; c2 += (c1<hi); \
  493. } while(0)
  494. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  495. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  496. # else /* !BN_LLONG */
  497. /*
  498. * Keep in mind that additions to hi can not overflow, because
  499. * the high word of a multiplication result cannot be all-ones.
  500. */
  501. # define mul_add_c(a,b,c0,c1,c2) do { \
  502. BN_ULONG lo = LBITS(a), hi = HBITS(a); \
  503. BN_ULONG bl = LBITS(b), bh = HBITS(b); \
  504. mul64(lo,hi,bl,bh); \
  505. c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
  506. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  507. } while(0)
  508. # define mul_add_c2(a,b,c0,c1,c2) do { \
  509. BN_ULONG tt; \
  510. BN_ULONG lo = LBITS(a), hi = HBITS(a); \
  511. BN_ULONG bl = LBITS(b), bh = HBITS(b); \
  512. mul64(lo,hi,bl,bh); \
  513. tt = hi; \
  514. c0 = (c0+lo)&BN_MASK2; tt += (c0<lo); \
  515. c1 = (c1+tt)&BN_MASK2; c2 += (c1<tt); \
  516. c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
  517. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  518. } while(0)
  519. # define sqr_add_c(a,i,c0,c1,c2) do { \
  520. BN_ULONG lo, hi; \
  521. sqr64(lo,hi,(a)[i]); \
  522. c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \
  523. c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \
  524. } while(0)
  525. # define sqr_add_c2(a,i,j,c0,c1,c2) \
  526. mul_add_c2((a)[i],(a)[j],c0,c1,c2)
  527. # endif /* !BN_LLONG */
  528. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  529. {
  530. BN_ULONG c1, c2, c3;
  531. c1 = 0;
  532. c2 = 0;
  533. c3 = 0;
  534. mul_add_c(a[0], b[0], c1, c2, c3);
  535. r[0] = c1;
  536. c1 = 0;
  537. mul_add_c(a[0], b[1], c2, c3, c1);
  538. mul_add_c(a[1], b[0], c2, c3, c1);
  539. r[1] = c2;
  540. c2 = 0;
  541. mul_add_c(a[2], b[0], c3, c1, c2);
  542. mul_add_c(a[1], b[1], c3, c1, c2);
  543. mul_add_c(a[0], b[2], c3, c1, c2);
  544. r[2] = c3;
  545. c3 = 0;
  546. mul_add_c(a[0], b[3], c1, c2, c3);
  547. mul_add_c(a[1], b[2], c1, c2, c3);
  548. mul_add_c(a[2], b[1], c1, c2, c3);
  549. mul_add_c(a[3], b[0], c1, c2, c3);
  550. r[3] = c1;
  551. c1 = 0;
  552. mul_add_c(a[4], b[0], c2, c3, c1);
  553. mul_add_c(a[3], b[1], c2, c3, c1);
  554. mul_add_c(a[2], b[2], c2, c3, c1);
  555. mul_add_c(a[1], b[3], c2, c3, c1);
  556. mul_add_c(a[0], b[4], c2, c3, c1);
  557. r[4] = c2;
  558. c2 = 0;
  559. mul_add_c(a[0], b[5], c3, c1, c2);
  560. mul_add_c(a[1], b[4], c3, c1, c2);
  561. mul_add_c(a[2], b[3], c3, c1, c2);
  562. mul_add_c(a[3], b[2], c3, c1, c2);
  563. mul_add_c(a[4], b[1], c3, c1, c2);
  564. mul_add_c(a[5], b[0], c3, c1, c2);
  565. r[5] = c3;
  566. c3 = 0;
  567. mul_add_c(a[6], b[0], c1, c2, c3);
  568. mul_add_c(a[5], b[1], c1, c2, c3);
  569. mul_add_c(a[4], b[2], c1, c2, c3);
  570. mul_add_c(a[3], b[3], c1, c2, c3);
  571. mul_add_c(a[2], b[4], c1, c2, c3);
  572. mul_add_c(a[1], b[5], c1, c2, c3);
  573. mul_add_c(a[0], b[6], c1, c2, c3);
  574. r[6] = c1;
  575. c1 = 0;
  576. mul_add_c(a[0], b[7], c2, c3, c1);
  577. mul_add_c(a[1], b[6], c2, c3, c1);
  578. mul_add_c(a[2], b[5], c2, c3, c1);
  579. mul_add_c(a[3], b[4], c2, c3, c1);
  580. mul_add_c(a[4], b[3], c2, c3, c1);
  581. mul_add_c(a[5], b[2], c2, c3, c1);
  582. mul_add_c(a[6], b[1], c2, c3, c1);
  583. mul_add_c(a[7], b[0], c2, c3, c1);
  584. r[7] = c2;
  585. c2 = 0;
  586. mul_add_c(a[7], b[1], c3, c1, c2);
  587. mul_add_c(a[6], b[2], c3, c1, c2);
  588. mul_add_c(a[5], b[3], c3, c1, c2);
  589. mul_add_c(a[4], b[4], c3, c1, c2);
  590. mul_add_c(a[3], b[5], c3, c1, c2);
  591. mul_add_c(a[2], b[6], c3, c1, c2);
  592. mul_add_c(a[1], b[7], c3, c1, c2);
  593. r[8] = c3;
  594. c3 = 0;
  595. mul_add_c(a[2], b[7], c1, c2, c3);
  596. mul_add_c(a[3], b[6], c1, c2, c3);
  597. mul_add_c(a[4], b[5], c1, c2, c3);
  598. mul_add_c(a[5], b[4], c1, c2, c3);
  599. mul_add_c(a[6], b[3], c1, c2, c3);
  600. mul_add_c(a[7], b[2], c1, c2, c3);
  601. r[9] = c1;
  602. c1 = 0;
  603. mul_add_c(a[7], b[3], c2, c3, c1);
  604. mul_add_c(a[6], b[4], c2, c3, c1);
  605. mul_add_c(a[5], b[5], c2, c3, c1);
  606. mul_add_c(a[4], b[6], c2, c3, c1);
  607. mul_add_c(a[3], b[7], c2, c3, c1);
  608. r[10] = c2;
  609. c2 = 0;
  610. mul_add_c(a[4], b[7], c3, c1, c2);
  611. mul_add_c(a[5], b[6], c3, c1, c2);
  612. mul_add_c(a[6], b[5], c3, c1, c2);
  613. mul_add_c(a[7], b[4], c3, c1, c2);
  614. r[11] = c3;
  615. c3 = 0;
  616. mul_add_c(a[7], b[5], c1, c2, c3);
  617. mul_add_c(a[6], b[6], c1, c2, c3);
  618. mul_add_c(a[5], b[7], c1, c2, c3);
  619. r[12] = c1;
  620. c1 = 0;
  621. mul_add_c(a[6], b[7], c2, c3, c1);
  622. mul_add_c(a[7], b[6], c2, c3, c1);
  623. r[13] = c2;
  624. c2 = 0;
  625. mul_add_c(a[7], b[7], c3, c1, c2);
  626. r[14] = c3;
  627. r[15] = c1;
  628. }
  629. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  630. {
  631. BN_ULONG c1, c2, c3;
  632. c1 = 0;
  633. c2 = 0;
  634. c3 = 0;
  635. mul_add_c(a[0], b[0], c1, c2, c3);
  636. r[0] = c1;
  637. c1 = 0;
  638. mul_add_c(a[0], b[1], c2, c3, c1);
  639. mul_add_c(a[1], b[0], c2, c3, c1);
  640. r[1] = c2;
  641. c2 = 0;
  642. mul_add_c(a[2], b[0], c3, c1, c2);
  643. mul_add_c(a[1], b[1], c3, c1, c2);
  644. mul_add_c(a[0], b[2], c3, c1, c2);
  645. r[2] = c3;
  646. c3 = 0;
  647. mul_add_c(a[0], b[3], c1, c2, c3);
  648. mul_add_c(a[1], b[2], c1, c2, c3);
  649. mul_add_c(a[2], b[1], c1, c2, c3);
  650. mul_add_c(a[3], b[0], c1, c2, c3);
  651. r[3] = c1;
  652. c1 = 0;
  653. mul_add_c(a[3], b[1], c2, c3, c1);
  654. mul_add_c(a[2], b[2], c2, c3, c1);
  655. mul_add_c(a[1], b[3], c2, c3, c1);
  656. r[4] = c2;
  657. c2 = 0;
  658. mul_add_c(a[2], b[3], c3, c1, c2);
  659. mul_add_c(a[3], b[2], c3, c1, c2);
  660. r[5] = c3;
  661. c3 = 0;
  662. mul_add_c(a[3], b[3], c1, c2, c3);
  663. r[6] = c1;
  664. r[7] = c2;
  665. }
  666. void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
  667. {
  668. BN_ULONG c1, c2, c3;
  669. c1 = 0;
  670. c2 = 0;
  671. c3 = 0;
  672. sqr_add_c(a, 0, c1, c2, c3);
  673. r[0] = c1;
  674. c1 = 0;
  675. sqr_add_c2(a, 1, 0, c2, c3, c1);
  676. r[1] = c2;
  677. c2 = 0;
  678. sqr_add_c(a, 1, c3, c1, c2);
  679. sqr_add_c2(a, 2, 0, c3, c1, c2);
  680. r[2] = c3;
  681. c3 = 0;
  682. sqr_add_c2(a, 3, 0, c1, c2, c3);
  683. sqr_add_c2(a, 2, 1, c1, c2, c3);
  684. r[3] = c1;
  685. c1 = 0;
  686. sqr_add_c(a, 2, c2, c3, c1);
  687. sqr_add_c2(a, 3, 1, c2, c3, c1);
  688. sqr_add_c2(a, 4, 0, c2, c3, c1);
  689. r[4] = c2;
  690. c2 = 0;
  691. sqr_add_c2(a, 5, 0, c3, c1, c2);
  692. sqr_add_c2(a, 4, 1, c3, c1, c2);
  693. sqr_add_c2(a, 3, 2, c3, c1, c2);
  694. r[5] = c3;
  695. c3 = 0;
  696. sqr_add_c(a, 3, c1, c2, c3);
  697. sqr_add_c2(a, 4, 2, c1, c2, c3);
  698. sqr_add_c2(a, 5, 1, c1, c2, c3);
  699. sqr_add_c2(a, 6, 0, c1, c2, c3);
  700. r[6] = c1;
  701. c1 = 0;
  702. sqr_add_c2(a, 7, 0, c2, c3, c1);
  703. sqr_add_c2(a, 6, 1, c2, c3, c1);
  704. sqr_add_c2(a, 5, 2, c2, c3, c1);
  705. sqr_add_c2(a, 4, 3, c2, c3, c1);
  706. r[7] = c2;
  707. c2 = 0;
  708. sqr_add_c(a, 4, c3, c1, c2);
  709. sqr_add_c2(a, 5, 3, c3, c1, c2);
  710. sqr_add_c2(a, 6, 2, c3, c1, c2);
  711. sqr_add_c2(a, 7, 1, c3, c1, c2);
  712. r[8] = c3;
  713. c3 = 0;
  714. sqr_add_c2(a, 7, 2, c1, c2, c3);
  715. sqr_add_c2(a, 6, 3, c1, c2, c3);
  716. sqr_add_c2(a, 5, 4, c1, c2, c3);
  717. r[9] = c1;
  718. c1 = 0;
  719. sqr_add_c(a, 5, c2, c3, c1);
  720. sqr_add_c2(a, 6, 4, c2, c3, c1);
  721. sqr_add_c2(a, 7, 3, c2, c3, c1);
  722. r[10] = c2;
  723. c2 = 0;
  724. sqr_add_c2(a, 7, 4, c3, c1, c2);
  725. sqr_add_c2(a, 6, 5, c3, c1, c2);
  726. r[11] = c3;
  727. c3 = 0;
  728. sqr_add_c(a, 6, c1, c2, c3);
  729. sqr_add_c2(a, 7, 5, c1, c2, c3);
  730. r[12] = c1;
  731. c1 = 0;
  732. sqr_add_c2(a, 7, 6, c2, c3, c1);
  733. r[13] = c2;
  734. c2 = 0;
  735. sqr_add_c(a, 7, c3, c1, c2);
  736. r[14] = c3;
  737. r[15] = c1;
  738. }
  739. void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
  740. {
  741. BN_ULONG c1, c2, c3;
  742. c1 = 0;
  743. c2 = 0;
  744. c3 = 0;
  745. sqr_add_c(a, 0, c1, c2, c3);
  746. r[0] = c1;
  747. c1 = 0;
  748. sqr_add_c2(a, 1, 0, c2, c3, c1);
  749. r[1] = c2;
  750. c2 = 0;
  751. sqr_add_c(a, 1, c3, c1, c2);
  752. sqr_add_c2(a, 2, 0, c3, c1, c2);
  753. r[2] = c3;
  754. c3 = 0;
  755. sqr_add_c2(a, 3, 0, c1, c2, c3);
  756. sqr_add_c2(a, 2, 1, c1, c2, c3);
  757. r[3] = c1;
  758. c1 = 0;
  759. sqr_add_c(a, 2, c2, c3, c1);
  760. sqr_add_c2(a, 3, 1, c2, c3, c1);
  761. r[4] = c2;
  762. c2 = 0;
  763. sqr_add_c2(a, 3, 2, c3, c1, c2);
  764. r[5] = c3;
  765. c3 = 0;
  766. sqr_add_c(a, 3, c1, c2, c3);
  767. r[6] = c1;
  768. r[7] = c2;
  769. }
  770. # ifdef OPENSSL_NO_ASM
  771. # ifdef OPENSSL_BN_ASM_MONT
  772. # include <alloca.h>
  773. /*
  774. * This is essentially reference implementation, which may or may not
  775. * result in performance improvement. E.g. on IA-32 this routine was
  776. * observed to give 40% faster rsa1024 private key operations and 10%
  777. * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
  778. * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
  779. * reference implementation, one to be used as starting point for
  780. * platform-specific assembler. Mentioned numbers apply to compiler
  781. * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
  782. * can vary not only from platform to platform, but even for compiler
  783. * versions. Assembler vs. assembler improvement coefficients can
  784. * [and are known to] differ and are to be documented elsewhere.
  785. */
  786. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  787. const BN_ULONG *np, const BN_ULONG *n0p, int num)
  788. {
  789. BN_ULONG c0, c1, ml, *tp, n0;
  790. # ifdef mul64
  791. BN_ULONG mh;
  792. # endif
  793. volatile BN_ULONG *vp;
  794. int i = 0, j;
  795. # if 0 /* template for platform-specific
  796. * implementation */
  797. if (ap == bp)
  798. return bn_sqr_mont(rp, ap, np, n0p, num);
  799. # endif
  800. vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
  801. n0 = *n0p;
  802. c0 = 0;
  803. ml = bp[0];
  804. # ifdef mul64
  805. mh = HBITS(ml);
  806. ml = LBITS(ml);
  807. for (j = 0; j < num; ++j)
  808. mul(tp[j], ap[j], ml, mh, c0);
  809. # else
  810. for (j = 0; j < num; ++j)
  811. mul(tp[j], ap[j], ml, c0);
  812. # endif
  813. tp[num] = c0;
  814. tp[num + 1] = 0;
  815. goto enter;
  816. for (i = 0; i < num; i++) {
  817. c0 = 0;
  818. ml = bp[i];
  819. # ifdef mul64
  820. mh = HBITS(ml);
  821. ml = LBITS(ml);
  822. for (j = 0; j < num; ++j)
  823. mul_add(tp[j], ap[j], ml, mh, c0);
  824. # else
  825. for (j = 0; j < num; ++j)
  826. mul_add(tp[j], ap[j], ml, c0);
  827. # endif
  828. c1 = (tp[num] + c0) & BN_MASK2;
  829. tp[num] = c1;
  830. tp[num + 1] = (c1 < c0 ? 1 : 0);
  831. enter:
  832. c1 = tp[0];
  833. ml = (c1 * n0) & BN_MASK2;
  834. c0 = 0;
  835. # ifdef mul64
  836. mh = HBITS(ml);
  837. ml = LBITS(ml);
  838. mul_add(c1, np[0], ml, mh, c0);
  839. # else
  840. mul_add(c1, ml, np[0], c0);
  841. # endif
  842. for (j = 1; j < num; j++) {
  843. c1 = tp[j];
  844. # ifdef mul64
  845. mul_add(c1, np[j], ml, mh, c0);
  846. # else
  847. mul_add(c1, ml, np[j], c0);
  848. # endif
  849. tp[j - 1] = c1 & BN_MASK2;
  850. }
  851. c1 = (tp[num] + c0) & BN_MASK2;
  852. tp[num - 1] = c1;
  853. tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  854. }
  855. if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
  856. c0 = bn_sub_words(rp, tp, np, num);
  857. if (tp[num] != 0 || c0 == 0) {
  858. for (i = 0; i < num + 2; i++)
  859. vp[i] = 0;
  860. return 1;
  861. }
  862. }
  863. for (i = 0; i < num; i++)
  864. rp[i] = tp[i], vp[i] = 0;
  865. vp[num] = 0;
  866. vp[num + 1] = 0;
  867. return 1;
  868. }
  869. # else
  870. /*
  871. * Return value of 0 indicates that multiplication/convolution was not
  872. * performed to signal the caller to fall down to alternative/original
  873. * code-path.
  874. */
  875. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  876. const BN_ULONG *np, const BN_ULONG *n0, int num)
  877. {
  878. return 0;
  879. }
  880. # endif /* OPENSSL_BN_ASM_MONT */
  881. # endif
  882. #else /* !BN_MUL_COMBA */
  883. /* hmm... is it faster just to do a multiply? */
  884. # undef bn_sqr_comba4
  885. # undef bn_sqr_comba8
  886. void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
  887. {
  888. BN_ULONG t[8];
  889. bn_sqr_normal(r, a, 4, t);
  890. }
  891. void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
  892. {
  893. BN_ULONG t[16];
  894. bn_sqr_normal(r, a, 8, t);
  895. }
  896. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  897. {
  898. r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
  899. r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
  900. r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
  901. r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
  902. }
  903. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  904. {
  905. r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
  906. r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
  907. r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
  908. r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
  909. r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
  910. r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
  911. r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
  912. r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
  913. }
  914. # ifdef OPENSSL_NO_ASM
  915. # ifdef OPENSSL_BN_ASM_MONT
  916. # include <alloca.h>
  917. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  918. const BN_ULONG *np, const BN_ULONG *n0p, int num)
  919. {
  920. BN_ULONG c0, c1, *tp, n0 = *n0p;
  921. volatile BN_ULONG *vp;
  922. int i = 0, j;
  923. vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
  924. for (i = 0; i <= num; i++)
  925. tp[i] = 0;
  926. for (i = 0; i < num; i++) {
  927. c0 = bn_mul_add_words(tp, ap, num, bp[i]);
  928. c1 = (tp[num] + c0) & BN_MASK2;
  929. tp[num] = c1;
  930. tp[num + 1] = (c1 < c0 ? 1 : 0);
  931. c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
  932. c1 = (tp[num] + c0) & BN_MASK2;
  933. tp[num] = c1;
  934. tp[num + 1] += (c1 < c0 ? 1 : 0);
  935. for (j = 0; j <= num; j++)
  936. tp[j] = tp[j + 1];
  937. }
  938. if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
  939. c0 = bn_sub_words(rp, tp, np, num);
  940. if (tp[num] != 0 || c0 == 0) {
  941. for (i = 0; i < num + 2; i++)
  942. vp[i] = 0;
  943. return 1;
  944. }
  945. }
  946. for (i = 0; i < num; i++)
  947. rp[i] = tp[i], vp[i] = 0;
  948. vp[num] = 0;
  949. vp[num + 1] = 0;
  950. return 1;
  951. }
  952. # else
  953. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  954. const BN_ULONG *np, const BN_ULONG *n0, int num)
  955. {
  956. return 0;
  957. }
  958. # endif /* OPENSSL_BN_ASM_MONT */
  959. # endif
  960. #endif /* !BN_MUL_COMBA */