@ poly1305-armv4.S

#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
stmdb sp!,{r4-r11}
eor r3,r3,r3
cmp r1,#0
str r3,[r0,#0] @ zero hash value
str r3,[r0,#4]
str r3,[r0,#8]
str r3,[r0,#12]
str r3,[r0,#16]
str r3,[r0,#36] @ is_base2_26
add r0,r0,#20
#ifdef __thumb2__
it eq
#endif
moveq r0,#0
beq .Lno_key
#if __ARM_MAX_ARCH__>=7
adr r11,.Lpoly1305_init
ldr r12,.LOPENSSL_armcap
#endif
ldrb r4,[r1,#0]
mov r10,#0x0fffffff
ldrb r5,[r1,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[r1,#2]
ldrb r7,[r1,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[r1,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[r1,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[r1,#6]
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
ldr r12,[r11,r12] @ OPENSSL_armcap_P
# ifdef __APPLE__
ldr r12,[r12]
# endif
#endif
ldrb r8,[r1,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[r1,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[r1,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[r1,#10]
and r5,r5,r3
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
# ifdef __APPLE__
adr r9,poly1305_blocks_neon
adr r11,poly1305_blocks
# ifdef __thumb2__
it ne
# endif
movne r11,r9
adr r12,poly1305_emit
adr r10,poly1305_emit_neon
# ifdef __thumb2__
it ne
# endif
movne r12,r10
# else
# ifdef __thumb2__
itete eq
# endif
addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
orr r12,r12,#1 @ thumb-ify address
orr r11,r11,#1
# endif
#endif
ldrb r9,[r1,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[r1,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[r1,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[r1,#14]
and r6,r6,r3
ldrb r10,[r1,#15]
orr r7,r7,r8,lsl#8
str r4,[r0,#0]
orr r7,r7,r9,lsl#16
str r5,[r0,#4]
orr r7,r7,r10,lsl#24
str r6,[r0,#8]
and r7,r7,r3
str r7,[r0,#12]
#if __ARM_MAX_ARCH__>=7
stmia r2,{r11,r12} @ fill functions table
mov r0,#1
#else
mov r0,#0
#endif
.Lno_key:
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
stmdb sp!,{r3-r11,lr}
ands r2,r2,#-16
beq .Lno_data
cmp r3,#0
add r2,r2,r1 @ end pointer
sub sp,sp,#32
ldmia r0,{r4-r12} @ load context
str r0,[sp,#12] @ offload stuff
mov lr,r1
str r2,[sp,#16]
str r10,[sp,#20]
str r11,[sp,#24]
str r12,[sp,#28]
b .Loop
.Loop:
#if __ARM_ARCH__<7
ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi r8,r8,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
ldrb r3,[lr,#-13]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-12]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-11]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-10]
adds r4,r4,r3 @ accumulate input
ldrb r3,[lr,#-9]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-8]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-7]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-6]
adcs r5,r5,r3
ldrb r3,[lr,#-5]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-4]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-3]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-2]
adcs r6,r6,r3
ldrb r3,[lr,#-1]
orr r1,r0,r1,lsl#8
str lr,[sp,#8] @ offload input pointer
orr r2,r1,r2,lsl#16
add r10,r10,r10,lsr#2
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi r8,r8,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
# ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
# endif
adds r4,r4,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs r5,r5,r1
add r10,r10,r10,lsr#2
adcs r6,r6,r2
#endif
add r11,r11,r11,lsr#2
adcs r7,r7,r3
add r12,r12,r12,lsr#2
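@ note: r10-r12 now hold r1+(r1>>2), r2+(r2>>2) and r3+(r3>>2), i.e.
@ 5/4*r1..5/4*r3 (exact, because the key clamping above cleared the
@ low two bits of r1-r3).  The factor 5 folds the wrap-around
@ 2^130 = 5 mod 2^130-5 into the multiplication below, and the /4
@ compensates for the 32*4-130 = -2 bit weight offset of the
@ wrapped partial products.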
umull r2,r3,r5,r9
adc r8,r8,#0
umull r0,r1,r4,r9
umlal r2,r3,r8,r10
umlal r0,r1,r7,r10
ldr r10,[sp,#20] @ reload r10
umlal r2,r3,r6,r12
umlal r0,r1,r5,r12
umlal r2,r3,r7,r11
umlal r0,r1,r6,r11
umlal r2,r3,r4,r10
str r0,[sp,#0] @ future r4
mul r0,r11,r8
ldr r11,[sp,#24] @ reload r11
adds r2,r2,r1 @ d1+=d0>>32
eor r1,r1,r1
adc lr,r3,#0 @ future r6
str r2,[sp,#4] @ future r5
mul r2,r12,r8
eor r3,r3,r3
umlal r0,r1,r7,r12
ldr r12,[sp,#28] @ reload r12
umlal r2,r3,r7,r9
umlal r0,r1,r6,r9
umlal r2,r3,r6,r10
umlal r0,r1,r5,r10
umlal r2,r3,r5,r11
umlal r0,r1,r4,r11
umlal r2,r3,r4,r12
ldr r4,[sp,#0]
mul r8,r9,r8
ldr r5,[sp,#4]
adds r6,lr,r0 @ d2+=d1>>32
ldr lr,[sp,#8] @ reload input pointer
adc r1,r1,#0
adds r7,r2,r1 @ d3+=d2>>32
ldr r0,[sp,#16] @ reload end pointer
adc r3,r3,#0
add r8,r8,r3 @ h4+=d3>>32
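@ partial reduction: r8 (h4) holds everything above bit 128.  The part
@ at or above 2^130, i.e. (h4>>2)*2^130, is congruent to 5*(h4>>2)
@ mod 2^130-5, which the following instructions add back into h0:
@   r1 = h4&~3 = 4*(h4>>2),  r1 += r1>>2 = 5*(h4>>2),  h4 &= 3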
and r1,r8,#-4
and r8,r8,#3
add r1,r1,r1,lsr#2 @ *=5
adds r4,r4,r1
adcs r5,r5,#0
adcs r6,r6,#0
adcs r7,r7,#0
adc r8,r8,#0
cmp r0,lr @ done yet?
bhi .Loop
ldr r0,[sp,#12]
add sp,sp,#32
stmia r0,{r4-r8} @ store the result
.Lno_data:
#if __ARM_ARCH__>=5
ldmia sp!,{r3-r11,pc}
#else
ldmia sp!,{r3-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
ldmia r0,{r3-r7}
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
adcs r10,r5,#0
adcs r11,r6,#0
adc r7,r7,#0
tst r7,#4 @ did it carry/borrow?
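@ if h+5 carries into bit 130 (the tst above), then h >= 2^130-5 and
@ the reduced value is the low bits of h+5, i.e. r8-r11; otherwise
@ h (r3-r6) is kept as is.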
#ifdef __thumb2__
it ne
#endif
movne r3,r8
ldr r8,[r2,#0]
#ifdef __thumb2__
it ne
#endif
movne r4,r9
ldr r9,[r2,#4]
#ifdef __thumb2__
it ne
#endif
movne r5,r10
ldr r10,[r2,#8]
#ifdef __thumb2__
it ne
#endif
movne r6,r11
ldr r11,[r2,#12]
adds r3,r3,r8
adcs r4,r4,r9
adcs r5,r5,r10
adc r6,r6,r11
#if __ARM_ARCH__>=7
# ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
# endif
str r3,[r1,#0]
str r4,[r1,#4]
str r5,[r1,#8]
str r6,[r1,#12]
#else
strb r3,[r1,#0]
mov r3,r3,lsr#8
strb r4,[r1,#4]
mov r4,r4,lsr#8
strb r5,[r1,#8]
mov r5,r5,lsr#8
strb r6,[r1,#12]
mov r6,r6,lsr#8
strb r3,[r1,#1]
mov r3,r3,lsr#8
strb r4,[r1,#5]
mov r4,r4,lsr#8
strb r5,[r1,#9]
mov r5,r5,lsr#8
strb r6,[r1,#13]
mov r6,r6,lsr#8
strb r3,[r1,#2]
mov r3,r3,lsr#8
strb r4,[r1,#6]
mov r4,r4,lsr#8
strb r5,[r1,#10]
mov r5,r5,lsr#8
strb r6,[r1,#14]
mov r6,r6,lsr#8
strb r3,[r1,#3]
strb r4,[r1,#7]
strb r5,[r1,#11]
strb r6,[r1,#15]
#endif
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
#if __ARM_MAX_ARCH__>=7
.fpu neon
.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
ldr r4,[r0,#20] @ load key base 2^32
ldr r5,[r0,#24]
ldr r6,[r0,#28]
ldr r7,[r0,#32]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
and r3,r3,#0x03ffffff
and r4,r4,#0x03ffffff
and r5,r5,#0x03ffffff
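@ the key is now split into five 26-bit limbs held in r2-r6:
@   key = k0 + k1*2^26 + k2*2^52 + k3*2^78 + k4*2^104
@ (k4 needs no masking: clamping keeps the top limb well below 2^26)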
vdup.32 d0,r2 @ r^1 in both lanes
add r2,r3,r3,lsl#2 @ *5
vdup.32 d1,r3
add r3,r4,r4,lsl#2
vdup.32 d2,r2
vdup.32 d3,r4
add r4,r5,r5,lsl#2
vdup.32 d4,r3
vdup.32 d5,r5
add r5,r6,r6,lsl#2
vdup.32 d6,r4
vdup.32 d7,r6
vdup.32 d8,r5
mov r5,#2 @ counter
.Lsquare_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
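@ the 5*r_i factors come from the modulus: limbs carry weights 2^(26*i),
@ so a product h_i*r_j with i+j >= 5 lands at weight 2^130*2^(26*(i+j-5)),
@ and 2^130 = 5 mod 2^130-5 folds it back in multiplied by 5.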
vmull.u32 q5,d0,d0[1]
vmull.u32 q6,d1,d0[1]
vmull.u32 q7,d3,d0[1]
vmull.u32 q8,d5,d0[1]
vmull.u32 q9,d7,d0[1]
vmlal.u32 q5,d7,d2[1]
vmlal.u32 q6,d0,d1[1]
vmlal.u32 q7,d1,d1[1]
vmlal.u32 q8,d3,d1[1]
vmlal.u32 q9,d5,d1[1]
vmlal.u32 q5,d5,d4[1]
vmlal.u32 q6,d7,d4[1]
vmlal.u32 q8,d1,d3[1]
vmlal.u32 q7,d0,d3[1]
vmlal.u32 q9,d3,d3[1]
vmlal.u32 q5,d3,d6[1]
vmlal.u32 q8,d0,d5[1]
vmlal.u32 q6,d5,d6[1]
vmlal.u32 q7,d7,d6[1]
vmlal.u32 q9,d1,d5[1]
vmlal.u32 q8,d7,d8[1]
vmlal.u32 q5,d1,d8[1]
vmlal.u32 q6,d3,d8[1]
vmlal.u32 q7,d5,d8[1]
vmlal.u32 q9,d0,d7[1]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
@
@ H0>>+H1>>+H2>>+H3>>+H4
@ H3>>+H4>>*5+H0>>+H1
@
@ Trivia.
@
@ Result of multiplication of n-bit number by m-bit number is
@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
@ m-bit number multiplied by 2^n is still n+m bits wide.
@
@ Sum of two n-bit numbers is n+1 bits wide, sum of three is n+2,
@ and so is sum of four. Sum of 2^m (n-m)-bit numbers and one
@ n-bit number is n+1 bits wide.
@
@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
@ can be 27. However! In cases when their width exceeds 26 bits
@ they are limited by 2^26+2^6. This in turn means that *sum*
@ of the products with these values can still be viewed as sum
@ of 52-bit numbers as long as the amount of addends is not a
@ power of 2. For example,
@
@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
@
@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
@ 8 * (2^52) or 2^55. However, the value is then multiplied
@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
@ which is less than 32 * (2^52) or 2^57. And when processing
@ data we are looking at triple as many addends...
@
@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
@ instruction accepts 2x32-bit input and writes 2x64-bit result.
@ This means that the result of reduction has to be compressed upon
@ loop wrap-around. This can be done in the process of reduction
@ to minimize amount of instructions [as well as amount of
@ 128-bit instructions, which benefits low-end processors], but
@ one has to watch for H2 (which is narrower than H0) and 5*H4
@ not being wider than 58 bits, so that result of right shift
@ by 26 bits fits in 32 bits. This is also useful on x86,
@ because it allows paddd to be used in place of paddq, which
@ benefits Atom, where paddq is ridiculously slow.
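@ in scalar terms, each "Hnext += Hn>>26, Hn &= 0x3ffffff" step below is
@   c = h[n] >> 26; h[n] &= 0x03ffffff;
@   h[n+1] += c;           (for n = 0..3)
@   h[0]   += c*5;         (for n = 4, again because 2^130 = 5 mod 2^130-5)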
vshr.u64 q15,q8,#26
vmovn.i64 d16,q8
vshr.u64 q4,q5,#26
vmovn.i64 d10,q5
vadd.i64 q9,q9,q15 @ h3 -> h4
vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
vadd.i64 q6,q6,q4 @ h0 -> h1
vbic.i32 d10,#0xfc000000
vshrn.u64 d30,q9,#26
vmovn.i64 d18,q9
vshr.u64 q4,q6,#26
vmovn.i64 d12,q6
vadd.i64 q7,q7,q4 @ h1 -> h2
vbic.i32 d18,#0xfc000000
vbic.i32 d12,#0xfc000000
vadd.i32 d10,d10,d30
vshl.u32 d30,d30,#2
vshrn.u64 d8,q7,#26
vmovn.i64 d14,q7
vadd.i32 d10,d10,d30 @ h4 -> h0
vadd.i32 d16,d16,d8 @ h2 -> h3
vbic.i32 d14,#0xfc000000
vshr.u32 d30,d10,#26
vbic.i32 d10,#0xfc000000
vshr.u32 d8,d16,#26
vbic.i32 d16,#0xfc000000
vadd.i32 d12,d12,d30 @ h0 -> h1
vadd.i32 d18,d18,d8 @ h3 -> h4
subs r5,r5,#1
beq .Lsquare_break_neon
add r6,r0,#(48+0*9*4)
add r7,r0,#(48+1*9*4)
vtrn.32 d0,d10 @ r^2:r^1
vtrn.32 d3,d14
vtrn.32 d5,d16
vtrn.32 d1,d12
vtrn.32 d7,d18
vshl.u32 d4,d3,#2 @ *5
vshl.u32 d6,d5,#2
vshl.u32 d2,d1,#2
vshl.u32 d8,d7,#2
vadd.i32 d4,d4,d3
vadd.i32 d2,d2,d1
vadd.i32 d6,d6,d5
vadd.i32 d8,d8,d7
vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vst1.32 {d8[0]},[r6,:32]
vst1.32 {d8[1]},[r7,:32]
b .Lsquare_neon
.align 4
.Lsquare_break_neon:
add r6,r0,#(48+2*4*9)
add r7,r0,#(48+3*4*9)
vmov d0,d10 @ r^4:r^3
vshl.u32 d2,d12,#2 @ *5
vmov d1,d12
vshl.u32 d4,d14,#2
vmov d3,d14
vshl.u32 d6,d16,#2
vmov d5,d16
vshl.u32 d8,d18,#2
vmov d7,d18
vadd.i32 d2,d2,d12
vadd.i32 d4,d4,d14
vadd.i32 d6,d6,d16
vadd.i32 d8,d8,d18
vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vst1.32 {d8[0]},[r6]
vst1.32 {d8[1]},[r7]
bx lr @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr ip,[r0,#36] @ is_base2_26
ands r2,r2,#-16
beq .Lno_data_neon
cmp r2,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
beq .Lpoly1305_blocks
.Lenter_neon:
stmdb sp!,{r4-r7}
vstmdb sp!,{d8-d15} @ ABI specification says so
tst ip,ip @ is_base2_26?
bne .Lbase2_26_neon
stmdb sp!,{r1-r3,lr}
bl poly1305_init_neon
ldr r4,[r0,#0] @ load hash value base 2^32
ldr r5,[r0,#4]
ldr r6,[r0,#8]
ldr r7,[r0,#12]
ldr ip,[r0,#16]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
veor d10,d10,d10
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
veor d12,d12,d12
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
veor d14,d14,d14
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
veor d16,d16,d16
and r3,r3,#0x03ffffff
orr r6,r6,ip,lsl#24
veor d18,d18,d18
and r4,r4,#0x03ffffff
mov r1,#1
and r5,r5,#0x03ffffff
str r1,[r0,#36] @ is_base2_26
vmov.32 d10[0],r2
vmov.32 d12[0],r3
vmov.32 d14[0],r4
vmov.32 d16[0],r5
vmov.32 d18[0],r6
adr r5,.Lzeros
ldmia sp!,{r1-r3,lr}
b .Lbase2_32_neon
.align 4
.Lbase2_26_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load hash value
veor d10,d10,d10
veor d12,d12,d12
veor d14,d14,d14
veor d16,d16,d16
veor d18,d18,d18
vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
adr r5,.Lzeros
vld1.32 {d18[0]},[r0]
sub r0,r0,#16 @ rewind
.Lbase2_32_neon:
add r4,r1,#32
mov r3,r3,lsl#24
tst r2,#31
beq .Leven
vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
vmov.32 d28[0],r3
sub r2,r2,#16
add r4,r1,#32
# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
# endif
vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
vshl.u32 d26,d26,#18
vsri.u32 d26,d24,#14
vshl.u32 d24,d24,#12
vadd.i32 d29,d28,d18 @ add hash value and move to #hi
vbic.i32 d26,#0xfc000000
vsri.u32 d24,d22,#20
vshl.u32 d22,d22,#6
vbic.i32 d24,#0xfc000000
vsri.u32 d22,d20,#26
vadd.i32 d27,d26,d16
vbic.i32 d20,#0xfc000000
vbic.i32 d22,#0xfc000000
vadd.i32 d25,d24,d14
vadd.i32 d21,d20,d10
vadd.i32 d23,d22,d12
mov r7,r5
add r6,r0,#48
cmp r2,r2
b .Long_tail
.align 4
.Leven:
subs r2,r2,#64
it lo
movlo r4,r5
vmov.i32 q14,#1<<24 @ padbit, yes, always
vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
add r4,r4,#64
itt hi
addhi r7,r0,#(48+1*9*4)
addhi r6,r0,#(48+3*9*4)
# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
# endif
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vshl.u32 q13,q13,#18
vsri.u32 q13,q12,#14
vshl.u32 q12,q12,#12
vbic.i32 q13,#0xfc000000
vsri.u32 q12,q11,#20
vshl.u32 q11,q11,#6
vbic.i32 q12,#0xfc000000
vsri.u32 q11,q10,#26
vbic.i32 q10,#0xfc000000
vbic.i32 q11,#0xfc000000
bls .Lskip_loop
vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
b .Loop_neon
.align 5
.Loop_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ ___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@ ___________________/ ____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on reduction in previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2
vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
vmull.u32 q7,d25,d0[1]
vadd.i32 d20,d20,d10
vmull.u32 q5,d21,d0[1]
vadd.i32 d26,d26,d16
vmull.u32 q8,d27,d0[1]
vmlal.u32 q7,d23,d1[1]
vadd.i32 d22,d22,d12
vmull.u32 q6,d23,d0[1]
vadd.i32 d28,d28,d18
vmull.u32 q9,d29,d0[1]
subs r2,r2,#64
vmlal.u32 q5,d29,d2[1]
it lo
movlo r4,r5
vmlal.u32 q8,d25,d1[1]
vld1.32 d8[1],[r7,:32]
vmlal.u32 q6,d21,d1[1]
vmlal.u32 q9,d27,d1[1]
vmlal.u32 q5,d27,d4[1]
vmlal.u32 q8,d23,d3[1]
vmlal.u32 q9,d25,d3[1]
vmlal.u32 q6,d29,d4[1]
vmlal.u32 q7,d21,d3[1]
vmlal.u32 q8,d21,d5[1]
vmlal.u32 q5,d25,d6[1]
vmlal.u32 q9,d23,d5[1]
vmlal.u32 q6,d27,d6[1]
vmlal.u32 q7,d29,d6[1]
vmlal.u32 q8,d29,d8[1]
vmlal.u32 q5,d23,d8[1]
vmlal.u32 q9,d21,d7[1]
vmlal.u32 q6,d25,d8[1]
vmlal.u32 q7,d27,d8[1]
vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
add r4,r4,#64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate
vmlal.u32 q8,d26,d0[0]
vmlal.u32 q5,d20,d0[0]
vmlal.u32 q9,d28,d0[0]
vmlal.u32 q6,d22,d0[0]
vmlal.u32 q7,d24,d0[0]
vld1.32 d8[0],[r6,:32]
vmlal.u32 q8,d24,d1[0]
vmlal.u32 q5,d28,d2[0]
vmlal.u32 q9,d26,d1[0]
vmlal.u32 q6,d20,d1[0]
vmlal.u32 q7,d22,d1[0]
vmlal.u32 q8,d22,d3[0]
vmlal.u32 q5,d26,d4[0]
vmlal.u32 q9,d24,d3[0]
vmlal.u32 q6,d28,d4[0]
vmlal.u32 q7,d20,d3[0]
vmlal.u32 q8,d20,d5[0]
vmlal.u32 q5,d24,d6[0]
vmlal.u32 q9,d22,d5[0]
vmlal.u32 q6,d26,d6[0]
vmlal.u32 q8,d28,d8[0]
vmlal.u32 q7,d28,d6[0]
vmlal.u32 q5,d22,d8[0]
vmlal.u32 q9,d20,d7[0]
vmov.i32 q14,#1<<24 @ padbit, yes, always
vmlal.u32 q6,d24,d8[0]
vmlal.u32 q7,d26,d8[0]
vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q11,q11
vrev32.8 q12,q12
vrev32.8 q13,q13
# endif
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
vshr.u64 q15,q8,#26
vmovn.i64 d16,q8
vshr.u64 q4,q5,#26
vmovn.i64 d10,q5
vadd.i64 q9,q9,q15 @ h3 -> h4
vbic.i32 d16,#0xfc000000
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vadd.i64 q6,q6,q4 @ h0 -> h1
vshl.u32 q13,q13,#18
vbic.i32 d10,#0xfc000000
vshrn.u64 d30,q9,#26
vmovn.i64 d18,q9
vshr.u64 q4,q6,#26
vmovn.i64 d12,q6
vadd.i64 q7,q7,q4 @ h1 -> h2
vsri.u32 q13,q12,#14
vbic.i32 d18,#0xfc000000
vshl.u32 q12,q12,#12
vbic.i32 d12,#0xfc000000
vadd.i32 d10,d10,d30
vshl.u32 d30,d30,#2
vbic.i32 q13,#0xfc000000
vshrn.u64 d8,q7,#26
vmovn.i64 d14,q7
vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
vsri.u32 q12,q11,#20
vadd.i32 d16,d16,d8 @ h2 -> h3
vshl.u32 q11,q11,#6
vbic.i32 d14,#0xfc000000
vbic.i32 q12,#0xfc000000
vshrn.u64 d30,q5,#26 @ re-narrow
vmovn.i64 d10,q5
vsri.u32 q11,q10,#26
vbic.i32 q10,#0xfc000000
vshr.u32 d8,d16,#26
vbic.i32 d16,#0xfc000000
vbic.i32 d10,#0xfc000000
vadd.i32 d12,d12,d30 @ h0 -> h1
vadd.i32 d18,d18,d8 @ h3 -> h4
vbic.i32 q11,#0xfc000000
bhi .Loop_neon
.Lskip_loop:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
add r7,r0,#(48+0*9*4)
add r6,r0,#(48+1*9*4)
adds r2,r2,#32
it ne
movne r2,#0
bne .Long_tail
vadd.i32 d25,d24,d14 @ add hash value and move to #hi
vadd.i32 d21,d20,d10
vadd.i32 d27,d26,d16
vadd.i32 d23,d22,d12
vadd.i32 d29,d28,d18
.Long_tail:
vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
vadd.i32 d24,d24,d14 @ can be redundant
vmull.u32 q7,d25,d0
vadd.i32 d20,d20,d10
vmull.u32 q5,d21,d0
vadd.i32 d26,d26,d16
vmull.u32 q8,d27,d0
vadd.i32 d22,d22,d12
vmull.u32 q6,d23,d0
vadd.i32 d28,d28,d18
vmull.u32 q9,d29,d0
vmlal.u32 q5,d29,d2
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vmlal.u32 q8,d25,d1
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vmlal.u32 q6,d21,d1
vmlal.u32 q9,d27,d1
vmlal.u32 q7,d23,d1
vmlal.u32 q8,d23,d3
vld1.32 d8[1],[r7,:32]
vmlal.u32 q5,d27,d4
vld1.32 d8[0],[r6,:32]
vmlal.u32 q9,d25,d3
vmlal.u32 q6,d29,d4
vmlal.u32 q7,d21,d3
vmlal.u32 q8,d21,d5
it ne
addne r7,r0,#(48+2*9*4)
vmlal.u32 q5,d25,d6
it ne
addne r6,r0,#(48+3*9*4)
vmlal.u32 q9,d23,d5
vmlal.u32 q6,d27,d6
vmlal.u32 q7,d29,d6
vmlal.u32 q8,d29,d8
vorn q0,q0,q0 @ all-ones, can be redundant
vmlal.u32 q5,d23,d8
vshr.u64 q0,q0,#38
vmlal.u32 q9,d21,d7
vmlal.u32 q6,d25,d8
vmlal.u32 q7,d27,d8
beq .Lshort_tail
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate
vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
vmlal.u32 q7,d24,d0
vmlal.u32 q5,d20,d0
vmlal.u32 q8,d26,d0
vmlal.u32 q6,d22,d0
vmlal.u32 q9,d28,d0
vmlal.u32 q5,d28,d2
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vmlal.u32 q8,d24,d1
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vmlal.u32 q6,d20,d1
vmlal.u32 q9,d26,d1
vmlal.u32 q7,d22,d1
vmlal.u32 q8,d22,d3
vld1.32 d8[1],[r7,:32]
vmlal.u32 q5,d26,d4
vld1.32 d8[0],[r6,:32]
vmlal.u32 q9,d24,d3
vmlal.u32 q6,d28,d4
vmlal.u32 q7,d20,d3
vmlal.u32 q8,d20,d5
vmlal.u32 q5,d24,d6
vmlal.u32 q9,d22,d5
vmlal.u32 q6,d26,d6
vmlal.u32 q7,d28,d6
vmlal.u32 q8,d28,d8
vorn q0,q0,q0 @ all-ones
vmlal.u32 q5,d22,d8
vshr.u64 q0,q0,#38
vmlal.u32 q9,d20,d7
vmlal.u32 q6,d24,d8
vmlal.u32 q7,d26,d8
.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
vadd.i64 d16,d16,d17
vadd.i64 d10,d10,d11
vadd.i64 d18,d18,d19
vadd.i64 d12,d12,d13
vadd.i64 d14,d14,d15
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
vshr.u64 q15,q8,#26
vand.i64 q8,q8,q0
vshr.u64 q4,q5,#26
vand.i64 q5,q5,q0
vadd.i64 q9,q9,q15 @ h3 -> h4
vadd.i64 q6,q6,q4 @ h0 -> h1
vshr.u64 q15,q9,#26
vand.i64 q9,q9,q0
vshr.u64 q4,q6,#26
vand.i64 q6,q6,q0
vadd.i64 q7,q7,q4 @ h1 -> h2
vadd.i64 q5,q5,q15
vshl.u64 q15,q15,#2
vshr.u64 q4,q7,#26
vand.i64 q7,q7,q0
vadd.i64 q5,q5,q15 @ h4 -> h0
vadd.i64 q8,q8,q4 @ h2 -> h3
vshr.u64 q15,q5,#26
vand.i64 q5,q5,q0
vshr.u64 q4,q8,#26
vand.i64 q8,q8,q0
vadd.i64 q6,q6,q15 @ h0 -> h1
vadd.i64 q9,q9,q4 @ h3 -> h4
cmp r2,#0
bne .Leven
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store hash value
vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
vst1.32 {d18[0]},[r0]
vldmia sp!,{d8-d15} @ epilogue
ldmia sp!,{r4-r7}
.Lno_data_neon:
bx lr @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
ldr ip,[r0,#36] @ is_base2_26
stmdb sp!,{r4-r11}
tst ip,ip
beq .Lpoly1305_emit_enter
ldmia r0,{r3-r7}
eor r8,r8,r8
adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
mov r4,r4,lsr#6
adcs r4,r4,r5,lsl#20
mov r5,r5,lsr#12
adcs r5,r5,r6,lsl#14
mov r6,r6,lsr#18
adcs r6,r6,r7,lsl#8
adc r7,r8,r7,lsr#24 @ can be partially reduced ...
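@ the five 26-bit limbs have been repacked into four 32-bit words r3-r6,
@ with whatever sits above 2^128 left in r7; the next lines fold the
@ excess above 2^130 back in as *5, as in poly1305_blocks.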
and r8,r7,#-4 @ ... so reduce
and r7,r7,#3
add r8,r8,r8,lsr#2 @ *= 5
adds r3,r3,r8
adcs r4,r4,#0
adcs r5,r5,#0
adcs r6,r6,#0
adc r7,r7,#0
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
adcs r10,r5,#0
adcs r11,r6,#0
adc r7,r7,#0
tst r7,#4 @ did it carry/borrow?
it ne
movne r3,r8
ldr r8,[r2,#0]
it ne
movne r4,r9
ldr r9,[r2,#4]
it ne
movne r5,r10
ldr r10,[r2,#8]
it ne
movne r6,r11
ldr r11,[r2,#12]
adds r3,r3,r8 @ accumulate nonce
adcs r4,r4,r9
adcs r5,r5,r10
adc r6,r6,r11
# ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
# endif
str r3,[r1,#0] @ store the result
str r4,[r1,#4]
str r5,[r1,#8]
str r6,[r1,#12]
ldmia sp!,{r4-r11}
bx lr @ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lpoly1305_init
#endif
.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif