@ chacha-armv4.S
@ ChaCha20 for ARMv4..ARMv7 — integer-only path plus a NEON path for
@ processors that advertise ARMV7_NEON in OPENSSL_armcap_P.
@ NOTE(review): a file-viewer banner and baked-in line numbers from the
@ extraction have been removed; the code below is the actual source.
  1. #include "arm_arch.h"
  2. .text
  3. #if defined(__thumb2__) || defined(__clang__)
  4. .syntax unified
  5. #endif
  6. #if defined(__thumb2__)
  7. .thumb
  8. #else
  9. .code 32
  10. #endif
  11. #if defined(__thumb2__) || defined(__clang__)
  12. #define ldrhsb ldrbhs
  13. #endif
  14. .align 5
  15. .Lsigma:
  16. .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
  17. .Lone:
  18. .long 1,0,0,0
  19. #if __ARM_MAX_ARCH__>=7
  20. .LOPENSSL_armcap:
  21. .word OPENSSL_armcap_P-.LChaCha20_ctr32
  22. #else
  23. .word -1
  24. #endif
  25. .globl ChaCha20_ctr32
  26. .type ChaCha20_ctr32,%function
  27. .align 5
  28. ChaCha20_ctr32:
  29. .LChaCha20_ctr32:
  30. ldr r12,[sp,#0] @ pull pointer to counter and nonce
  31. stmdb sp!,{r0-r2,r4-r11,lr}
  32. #if __ARM_ARCH__<7 && !defined(__thumb2__)
  33. sub r14,pc,#16 @ ChaCha20_ctr32
  34. #else
  35. adr r14,.LChaCha20_ctr32
  36. #endif
  37. cmp r2,#0 @ len==0?
  38. #ifdef __thumb2__
  39. itt eq
  40. #endif
  41. addeq sp,sp,#4*3
  42. beq .Lno_data
  43. #if __ARM_MAX_ARCH__>=7
  44. cmp r2,#192 @ test len
  45. bls .Lshort
  46. ldr r4,[r14,#-32]
  47. ldr r4,[r14,r4]
  48. # ifdef __APPLE__
  49. ldr r4,[r4]
  50. # endif
  51. tst r4,#ARMV7_NEON
  52. bne .LChaCha20_neon
  53. .Lshort:
  54. #endif
  55. ldmia r12,{r4-r7} @ load counter and nonce
  56. sub sp,sp,#4*(16) @ off-load area
  57. sub r14,r14,#64 @ .Lsigma
  58. stmdb sp!,{r4-r7} @ copy counter and nonce
  59. ldmia r3,{r4-r11} @ load key
  60. ldmia r14,{r0-r3} @ load sigma
  61. stmdb sp!,{r4-r11} @ copy key
  62. stmdb sp!,{r0-r3} @ copy sigma
  63. str r10,[sp,#4*(16+10)] @ off-load "rx"
  64. str r11,[sp,#4*(16+11)] @ off-load "rx"
  65. b .Loop_outer_enter
  66. .align 4
  67. .Loop_outer:
  68. ldmia sp,{r0-r9} @ load key material
  69. str r11,[sp,#4*(32+2)] @ save len
  70. str r12, [sp,#4*(32+1)] @ save inp
  71. str r14, [sp,#4*(32+0)] @ save out
  72. .Loop_outer_enter:
  73. ldr r11, [sp,#4*(15)]
  74. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  75. ldr r10, [sp,#4*(13)]
  76. ldr r14,[sp,#4*(14)]
  77. str r11, [sp,#4*(16+15)]
  78. mov r11,#10
  79. b .Loop
  80. .align 4
  81. .Loop:
  82. subs r11,r11,#1
  83. add r0,r0,r4
  84. mov r12,r12,ror#16
  85. add r1,r1,r5
  86. mov r10,r10,ror#16
  87. eor r12,r12,r0,ror#16
  88. eor r10,r10,r1,ror#16
  89. add r8,r8,r12
  90. mov r4,r4,ror#20
  91. add r9,r9,r10
  92. mov r5,r5,ror#20
  93. eor r4,r4,r8,ror#20
  94. eor r5,r5,r9,ror#20
  95. add r0,r0,r4
  96. mov r12,r12,ror#24
  97. add r1,r1,r5
  98. mov r10,r10,ror#24
  99. eor r12,r12,r0,ror#24
  100. eor r10,r10,r1,ror#24
  101. add r8,r8,r12
  102. mov r4,r4,ror#25
  103. add r9,r9,r10
  104. mov r5,r5,ror#25
  105. str r10,[sp,#4*(16+13)]
  106. ldr r10,[sp,#4*(16+15)]
  107. eor r4,r4,r8,ror#25
  108. eor r5,r5,r9,ror#25
  109. str r8,[sp,#4*(16+8)]
  110. ldr r8,[sp,#4*(16+10)]
  111. add r2,r2,r6
  112. mov r14,r14,ror#16
  113. str r9,[sp,#4*(16+9)]
  114. ldr r9,[sp,#4*(16+11)]
  115. add r3,r3,r7
  116. mov r10,r10,ror#16
  117. eor r14,r14,r2,ror#16
  118. eor r10,r10,r3,ror#16
  119. add r8,r8,r14
  120. mov r6,r6,ror#20
  121. add r9,r9,r10
  122. mov r7,r7,ror#20
  123. eor r6,r6,r8,ror#20
  124. eor r7,r7,r9,ror#20
  125. add r2,r2,r6
  126. mov r14,r14,ror#24
  127. add r3,r3,r7
  128. mov r10,r10,ror#24
  129. eor r14,r14,r2,ror#24
  130. eor r10,r10,r3,ror#24
  131. add r8,r8,r14
  132. mov r6,r6,ror#25
  133. add r9,r9,r10
  134. mov r7,r7,ror#25
  135. eor r6,r6,r8,ror#25
  136. eor r7,r7,r9,ror#25
  137. add r0,r0,r5
  138. mov r10,r10,ror#16
  139. add r1,r1,r6
  140. mov r12,r12,ror#16
  141. eor r10,r10,r0,ror#16
  142. eor r12,r12,r1,ror#16
  143. add r8,r8,r10
  144. mov r5,r5,ror#20
  145. add r9,r9,r12
  146. mov r6,r6,ror#20
  147. eor r5,r5,r8,ror#20
  148. eor r6,r6,r9,ror#20
  149. add r0,r0,r5
  150. mov r10,r10,ror#24
  151. add r1,r1,r6
  152. mov r12,r12,ror#24
  153. eor r10,r10,r0,ror#24
  154. eor r12,r12,r1,ror#24
  155. add r8,r8,r10
  156. mov r5,r5,ror#25
  157. str r10,[sp,#4*(16+15)]
  158. ldr r10,[sp,#4*(16+13)]
  159. add r9,r9,r12
  160. mov r6,r6,ror#25
  161. eor r5,r5,r8,ror#25
  162. eor r6,r6,r9,ror#25
  163. str r8,[sp,#4*(16+10)]
  164. ldr r8,[sp,#4*(16+8)]
  165. add r2,r2,r7
  166. mov r10,r10,ror#16
  167. str r9,[sp,#4*(16+11)]
  168. ldr r9,[sp,#4*(16+9)]
  169. add r3,r3,r4
  170. mov r14,r14,ror#16
  171. eor r10,r10,r2,ror#16
  172. eor r14,r14,r3,ror#16
  173. add r8,r8,r10
  174. mov r7,r7,ror#20
  175. add r9,r9,r14
  176. mov r4,r4,ror#20
  177. eor r7,r7,r8,ror#20
  178. eor r4,r4,r9,ror#20
  179. add r2,r2,r7
  180. mov r10,r10,ror#24
  181. add r3,r3,r4
  182. mov r14,r14,ror#24
  183. eor r10,r10,r2,ror#24
  184. eor r14,r14,r3,ror#24
  185. add r8,r8,r10
  186. mov r7,r7,ror#25
  187. add r9,r9,r14
  188. mov r4,r4,ror#25
  189. eor r7,r7,r8,ror#25
  190. eor r4,r4,r9,ror#25
  191. bne .Loop
  192. ldr r11,[sp,#4*(32+2)] @ load len
  193. str r8, [sp,#4*(16+8)] @ modulo-scheduled store
  194. str r9, [sp,#4*(16+9)]
  195. str r12,[sp,#4*(16+12)]
  196. str r10, [sp,#4*(16+13)]
  197. str r14,[sp,#4*(16+14)]
  198. @ at this point we have first half of 512-bit result in
  199. @ rx and second half at sp+4*(16+8)
  200. cmp r11,#64 @ done yet?
  201. #ifdef __thumb2__
  202. itete lo
  203. #endif
  204. addlo r12,sp,#4*(0) @ shortcut or ...
  205. ldrhs r12,[sp,#4*(32+1)] @ ... load inp
  206. addlo r14,sp,#4*(0) @ shortcut or ...
  207. ldrhs r14,[sp,#4*(32+0)] @ ... load out
  208. ldr r8,[sp,#4*(0)] @ load key material
  209. ldr r9,[sp,#4*(1)]
  210. #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
  211. # if __ARM_ARCH__<7
  212. orr r10,r12,r14
  213. tst r10,#3 @ are input and output aligned?
  214. ldr r10,[sp,#4*(2)]
  215. bne .Lunaligned
  216. cmp r11,#64 @ restore flags
  217. # else
  218. ldr r10,[sp,#4*(2)]
  219. # endif
  220. ldr r11,[sp,#4*(3)]
  221. add r0,r0,r8 @ accumulate key material
  222. add r1,r1,r9
  223. # ifdef __thumb2__
  224. itt hs
  225. # endif
  226. ldrhs r8,[r12],#16 @ load input
  227. ldrhs r9,[r12,#-12]
  228. add r2,r2,r10
  229. add r3,r3,r11
  230. # ifdef __thumb2__
  231. itt hs
  232. # endif
  233. ldrhs r10,[r12,#-8]
  234. ldrhs r11,[r12,#-4]
  235. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  236. rev r0,r0
  237. rev r1,r1
  238. rev r2,r2
  239. rev r3,r3
  240. # endif
  241. # ifdef __thumb2__
  242. itt hs
  243. # endif
  244. eorhs r0,r0,r8 @ xor with input
  245. eorhs r1,r1,r9
  246. add r8,sp,#4*(4)
  247. str r0,[r14],#16 @ store output
  248. # ifdef __thumb2__
  249. itt hs
  250. # endif
  251. eorhs r2,r2,r10
  252. eorhs r3,r3,r11
  253. ldmia r8,{r8-r11} @ load key material
  254. str r1,[r14,#-12]
  255. str r2,[r14,#-8]
  256. str r3,[r14,#-4]
  257. add r4,r4,r8 @ accumulate key material
  258. add r5,r5,r9
  259. # ifdef __thumb2__
  260. itt hs
  261. # endif
  262. ldrhs r8,[r12],#16 @ load input
  263. ldrhs r9,[r12,#-12]
  264. add r6,r6,r10
  265. add r7,r7,r11
  266. # ifdef __thumb2__
  267. itt hs
  268. # endif
  269. ldrhs r10,[r12,#-8]
  270. ldrhs r11,[r12,#-4]
  271. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  272. rev r4,r4
  273. rev r5,r5
  274. rev r6,r6
  275. rev r7,r7
  276. # endif
  277. # ifdef __thumb2__
  278. itt hs
  279. # endif
  280. eorhs r4,r4,r8
  281. eorhs r5,r5,r9
  282. add r8,sp,#4*(8)
  283. str r4,[r14],#16 @ store output
  284. # ifdef __thumb2__
  285. itt hs
  286. # endif
  287. eorhs r6,r6,r10
  288. eorhs r7,r7,r11
  289. str r5,[r14,#-12]
  290. ldmia r8,{r8-r11} @ load key material
  291. str r6,[r14,#-8]
  292. add r0,sp,#4*(16+8)
  293. str r7,[r14,#-4]
  294. ldmia r0,{r0-r7} @ load second half
  295. add r0,r0,r8 @ accumulate key material
  296. add r1,r1,r9
  297. # ifdef __thumb2__
  298. itt hs
  299. # endif
  300. ldrhs r8,[r12],#16 @ load input
  301. ldrhs r9,[r12,#-12]
  302. # ifdef __thumb2__
  303. itt hi
  304. # endif
  305. strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
  306. strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
  307. add r2,r2,r10
  308. add r3,r3,r11
  309. # ifdef __thumb2__
  310. itt hs
  311. # endif
  312. ldrhs r10,[r12,#-8]
  313. ldrhs r11,[r12,#-4]
  314. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  315. rev r0,r0
  316. rev r1,r1
  317. rev r2,r2
  318. rev r3,r3
  319. # endif
  320. # ifdef __thumb2__
  321. itt hs
  322. # endif
  323. eorhs r0,r0,r8
  324. eorhs r1,r1,r9
  325. add r8,sp,#4*(12)
  326. str r0,[r14],#16 @ store output
  327. # ifdef __thumb2__
  328. itt hs
  329. # endif
  330. eorhs r2,r2,r10
  331. eorhs r3,r3,r11
  332. str r1,[r14,#-12]
  333. ldmia r8,{r8-r11} @ load key material
  334. str r2,[r14,#-8]
  335. str r3,[r14,#-4]
  336. add r4,r4,r8 @ accumulate key material
  337. add r5,r5,r9
  338. # ifdef __thumb2__
  339. itt hi
  340. # endif
  341. addhi r8,r8,#1 @ next counter value
  342. strhi r8,[sp,#4*(12)] @ save next counter value
  343. # ifdef __thumb2__
  344. itt hs
  345. # endif
  346. ldrhs r8,[r12],#16 @ load input
  347. ldrhs r9,[r12,#-12]
  348. add r6,r6,r10
  349. add r7,r7,r11
  350. # ifdef __thumb2__
  351. itt hs
  352. # endif
  353. ldrhs r10,[r12,#-8]
  354. ldrhs r11,[r12,#-4]
  355. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  356. rev r4,r4
  357. rev r5,r5
  358. rev r6,r6
  359. rev r7,r7
  360. # endif
  361. # ifdef __thumb2__
  362. itt hs
  363. # endif
  364. eorhs r4,r4,r8
  365. eorhs r5,r5,r9
  366. # ifdef __thumb2__
  367. it ne
  368. # endif
  369. ldrne r8,[sp,#4*(32+2)] @ re-load len
  370. # ifdef __thumb2__
  371. itt hs
  372. # endif
  373. eorhs r6,r6,r10
  374. eorhs r7,r7,r11
  375. str r4,[r14],#16 @ store output
  376. str r5,[r14,#-12]
  377. # ifdef __thumb2__
  378. it hs
  379. # endif
  380. subhs r11,r8,#64 @ len-=64
  381. str r6,[r14,#-8]
  382. str r7,[r14,#-4]
  383. bhi .Loop_outer
  384. beq .Ldone
  385. # if __ARM_ARCH__<7
  386. b .Ltail
  387. .align 4
  388. .Lunaligned: @ unaligned endian-neutral path
  389. cmp r11,#64 @ restore flags
  390. # endif
  391. #endif
  392. #if __ARM_ARCH__<7
  393. ldr r11,[sp,#4*(3)]
  394. add r0,r0,r8 @ accumulate key material
  395. add r1,r1,r9
  396. add r2,r2,r10
  397. # ifdef __thumb2__
  398. itete lo
  399. # endif
  400. eorlo r8,r8,r8 @ zero or ...
  401. ldrhsb r8,[r12],#16 @ ... load input
  402. eorlo r9,r9,r9
  403. ldrhsb r9,[r12,#-12]
  404. add r3,r3,r11
  405. # ifdef __thumb2__
  406. itete lo
  407. # endif
  408. eorlo r10,r10,r10
  409. ldrhsb r10,[r12,#-8]
  410. eorlo r11,r11,r11
  411. ldrhsb r11,[r12,#-4]
  412. eor r0,r8,r0 @ xor with input (or zero)
  413. eor r1,r9,r1
  414. # ifdef __thumb2__
  415. itt hs
  416. # endif
  417. ldrhsb r8,[r12,#-15] @ load more input
  418. ldrhsb r9,[r12,#-11]
  419. eor r2,r10,r2
  420. strb r0,[r14],#16 @ store output
  421. eor r3,r11,r3
  422. # ifdef __thumb2__
  423. itt hs
  424. # endif
  425. ldrhsb r10,[r12,#-7]
  426. ldrhsb r11,[r12,#-3]
  427. strb r1,[r14,#-12]
  428. eor r0,r8,r0,lsr#8
  429. strb r2,[r14,#-8]
  430. eor r1,r9,r1,lsr#8
  431. # ifdef __thumb2__
  432. itt hs
  433. # endif
  434. ldrhsb r8,[r12,#-14] @ load more input
  435. ldrhsb r9,[r12,#-10]
  436. strb r3,[r14,#-4]
  437. eor r2,r10,r2,lsr#8
  438. strb r0,[r14,#-15]
  439. eor r3,r11,r3,lsr#8
  440. # ifdef __thumb2__
  441. itt hs
  442. # endif
  443. ldrhsb r10,[r12,#-6]
  444. ldrhsb r11,[r12,#-2]
  445. strb r1,[r14,#-11]
  446. eor r0,r8,r0,lsr#8
  447. strb r2,[r14,#-7]
  448. eor r1,r9,r1,lsr#8
  449. # ifdef __thumb2__
  450. itt hs
  451. # endif
  452. ldrhsb r8,[r12,#-13] @ load more input
  453. ldrhsb r9,[r12,#-9]
  454. strb r3,[r14,#-3]
  455. eor r2,r10,r2,lsr#8
  456. strb r0,[r14,#-14]
  457. eor r3,r11,r3,lsr#8
  458. # ifdef __thumb2__
  459. itt hs
  460. # endif
  461. ldrhsb r10,[r12,#-5]
  462. ldrhsb r11,[r12,#-1]
  463. strb r1,[r14,#-10]
  464. strb r2,[r14,#-6]
  465. eor r0,r8,r0,lsr#8
  466. strb r3,[r14,#-2]
  467. eor r1,r9,r1,lsr#8
  468. strb r0,[r14,#-13]
  469. eor r2,r10,r2,lsr#8
  470. strb r1,[r14,#-9]
  471. eor r3,r11,r3,lsr#8
  472. strb r2,[r14,#-5]
  473. strb r3,[r14,#-1]
  474. add r8,sp,#4*(4+0)
  475. ldmia r8,{r8-r11} @ load key material
  476. add r0,sp,#4*(16+8)
  477. add r4,r4,r8 @ accumulate key material
  478. add r5,r5,r9
  479. add r6,r6,r10
  480. # ifdef __thumb2__
  481. itete lo
  482. # endif
  483. eorlo r8,r8,r8 @ zero or ...
  484. ldrhsb r8,[r12],#16 @ ... load input
  485. eorlo r9,r9,r9
  486. ldrhsb r9,[r12,#-12]
  487. add r7,r7,r11
  488. # ifdef __thumb2__
  489. itete lo
  490. # endif
  491. eorlo r10,r10,r10
  492. ldrhsb r10,[r12,#-8]
  493. eorlo r11,r11,r11
  494. ldrhsb r11,[r12,#-4]
  495. eor r4,r8,r4 @ xor with input (or zero)
  496. eor r5,r9,r5
  497. # ifdef __thumb2__
  498. itt hs
  499. # endif
  500. ldrhsb r8,[r12,#-15] @ load more input
  501. ldrhsb r9,[r12,#-11]
  502. eor r6,r10,r6
  503. strb r4,[r14],#16 @ store output
  504. eor r7,r11,r7
  505. # ifdef __thumb2__
  506. itt hs
  507. # endif
  508. ldrhsb r10,[r12,#-7]
  509. ldrhsb r11,[r12,#-3]
  510. strb r5,[r14,#-12]
  511. eor r4,r8,r4,lsr#8
  512. strb r6,[r14,#-8]
  513. eor r5,r9,r5,lsr#8
  514. # ifdef __thumb2__
  515. itt hs
  516. # endif
  517. ldrhsb r8,[r12,#-14] @ load more input
  518. ldrhsb r9,[r12,#-10]
  519. strb r7,[r14,#-4]
  520. eor r6,r10,r6,lsr#8
  521. strb r4,[r14,#-15]
  522. eor r7,r11,r7,lsr#8
  523. # ifdef __thumb2__
  524. itt hs
  525. # endif
  526. ldrhsb r10,[r12,#-6]
  527. ldrhsb r11,[r12,#-2]
  528. strb r5,[r14,#-11]
  529. eor r4,r8,r4,lsr#8
  530. strb r6,[r14,#-7]
  531. eor r5,r9,r5,lsr#8
  532. # ifdef __thumb2__
  533. itt hs
  534. # endif
  535. ldrhsb r8,[r12,#-13] @ load more input
  536. ldrhsb r9,[r12,#-9]
  537. strb r7,[r14,#-3]
  538. eor r6,r10,r6,lsr#8
  539. strb r4,[r14,#-14]
  540. eor r7,r11,r7,lsr#8
  541. # ifdef __thumb2__
  542. itt hs
  543. # endif
  544. ldrhsb r10,[r12,#-5]
  545. ldrhsb r11,[r12,#-1]
  546. strb r5,[r14,#-10]
  547. strb r6,[r14,#-6]
  548. eor r4,r8,r4,lsr#8
  549. strb r7,[r14,#-2]
  550. eor r5,r9,r5,lsr#8
  551. strb r4,[r14,#-13]
  552. eor r6,r10,r6,lsr#8
  553. strb r5,[r14,#-9]
  554. eor r7,r11,r7,lsr#8
  555. strb r6,[r14,#-5]
  556. strb r7,[r14,#-1]
  557. add r8,sp,#4*(4+4)
  558. ldmia r8,{r8-r11} @ load key material
  559. ldmia r0,{r0-r7} @ load second half
  560. # ifdef __thumb2__
  561. itt hi
  562. # endif
  563. strhi r10,[sp,#4*(16+10)] @ copy "rx"
  564. strhi r11,[sp,#4*(16+11)] @ copy "rx"
  565. add r0,r0,r8 @ accumulate key material
  566. add r1,r1,r9
  567. add r2,r2,r10
  568. # ifdef __thumb2__
  569. itete lo
  570. # endif
  571. eorlo r8,r8,r8 @ zero or ...
  572. ldrhsb r8,[r12],#16 @ ... load input
  573. eorlo r9,r9,r9
  574. ldrhsb r9,[r12,#-12]
  575. add r3,r3,r11
  576. # ifdef __thumb2__
  577. itete lo
  578. # endif
  579. eorlo r10,r10,r10
  580. ldrhsb r10,[r12,#-8]
  581. eorlo r11,r11,r11
  582. ldrhsb r11,[r12,#-4]
  583. eor r0,r8,r0 @ xor with input (or zero)
  584. eor r1,r9,r1
  585. # ifdef __thumb2__
  586. itt hs
  587. # endif
  588. ldrhsb r8,[r12,#-15] @ load more input
  589. ldrhsb r9,[r12,#-11]
  590. eor r2,r10,r2
  591. strb r0,[r14],#16 @ store output
  592. eor r3,r11,r3
  593. # ifdef __thumb2__
  594. itt hs
  595. # endif
  596. ldrhsb r10,[r12,#-7]
  597. ldrhsb r11,[r12,#-3]
  598. strb r1,[r14,#-12]
  599. eor r0,r8,r0,lsr#8
  600. strb r2,[r14,#-8]
  601. eor r1,r9,r1,lsr#8
  602. # ifdef __thumb2__
  603. itt hs
  604. # endif
  605. ldrhsb r8,[r12,#-14] @ load more input
  606. ldrhsb r9,[r12,#-10]
  607. strb r3,[r14,#-4]
  608. eor r2,r10,r2,lsr#8
  609. strb r0,[r14,#-15]
  610. eor r3,r11,r3,lsr#8
  611. # ifdef __thumb2__
  612. itt hs
  613. # endif
  614. ldrhsb r10,[r12,#-6]
  615. ldrhsb r11,[r12,#-2]
  616. strb r1,[r14,#-11]
  617. eor r0,r8,r0,lsr#8
  618. strb r2,[r14,#-7]
  619. eor r1,r9,r1,lsr#8
  620. # ifdef __thumb2__
  621. itt hs
  622. # endif
  623. ldrhsb r8,[r12,#-13] @ load more input
  624. ldrhsb r9,[r12,#-9]
  625. strb r3,[r14,#-3]
  626. eor r2,r10,r2,lsr#8
  627. strb r0,[r14,#-14]
  628. eor r3,r11,r3,lsr#8
  629. # ifdef __thumb2__
  630. itt hs
  631. # endif
  632. ldrhsb r10,[r12,#-5]
  633. ldrhsb r11,[r12,#-1]
  634. strb r1,[r14,#-10]
  635. strb r2,[r14,#-6]
  636. eor r0,r8,r0,lsr#8
  637. strb r3,[r14,#-2]
  638. eor r1,r9,r1,lsr#8
  639. strb r0,[r14,#-13]
  640. eor r2,r10,r2,lsr#8
  641. strb r1,[r14,#-9]
  642. eor r3,r11,r3,lsr#8
  643. strb r2,[r14,#-5]
  644. strb r3,[r14,#-1]
  645. add r8,sp,#4*(4+8)
  646. ldmia r8,{r8-r11} @ load key material
  647. add r4,r4,r8 @ accumulate key material
  648. # ifdef __thumb2__
  649. itt hi
  650. # endif
  651. addhi r8,r8,#1 @ next counter value
  652. strhi r8,[sp,#4*(12)] @ save next counter value
  653. add r5,r5,r9
  654. add r6,r6,r10
  655. # ifdef __thumb2__
  656. itete lo
  657. # endif
  658. eorlo r8,r8,r8 @ zero or ...
  659. ldrhsb r8,[r12],#16 @ ... load input
  660. eorlo r9,r9,r9
  661. ldrhsb r9,[r12,#-12]
  662. add r7,r7,r11
  663. # ifdef __thumb2__
  664. itete lo
  665. # endif
  666. eorlo r10,r10,r10
  667. ldrhsb r10,[r12,#-8]
  668. eorlo r11,r11,r11
  669. ldrhsb r11,[r12,#-4]
  670. eor r4,r8,r4 @ xor with input (or zero)
  671. eor r5,r9,r5
  672. # ifdef __thumb2__
  673. itt hs
  674. # endif
  675. ldrhsb r8,[r12,#-15] @ load more input
  676. ldrhsb r9,[r12,#-11]
  677. eor r6,r10,r6
  678. strb r4,[r14],#16 @ store output
  679. eor r7,r11,r7
  680. # ifdef __thumb2__
  681. itt hs
  682. # endif
  683. ldrhsb r10,[r12,#-7]
  684. ldrhsb r11,[r12,#-3]
  685. strb r5,[r14,#-12]
  686. eor r4,r8,r4,lsr#8
  687. strb r6,[r14,#-8]
  688. eor r5,r9,r5,lsr#8
  689. # ifdef __thumb2__
  690. itt hs
  691. # endif
  692. ldrhsb r8,[r12,#-14] @ load more input
  693. ldrhsb r9,[r12,#-10]
  694. strb r7,[r14,#-4]
  695. eor r6,r10,r6,lsr#8
  696. strb r4,[r14,#-15]
  697. eor r7,r11,r7,lsr#8
  698. # ifdef __thumb2__
  699. itt hs
  700. # endif
  701. ldrhsb r10,[r12,#-6]
  702. ldrhsb r11,[r12,#-2]
  703. strb r5,[r14,#-11]
  704. eor r4,r8,r4,lsr#8
  705. strb r6,[r14,#-7]
  706. eor r5,r9,r5,lsr#8
  707. # ifdef __thumb2__
  708. itt hs
  709. # endif
  710. ldrhsb r8,[r12,#-13] @ load more input
  711. ldrhsb r9,[r12,#-9]
  712. strb r7,[r14,#-3]
  713. eor r6,r10,r6,lsr#8
  714. strb r4,[r14,#-14]
  715. eor r7,r11,r7,lsr#8
  716. # ifdef __thumb2__
  717. itt hs
  718. # endif
  719. ldrhsb r10,[r12,#-5]
  720. ldrhsb r11,[r12,#-1]
  721. strb r5,[r14,#-10]
  722. strb r6,[r14,#-6]
  723. eor r4,r8,r4,lsr#8
  724. strb r7,[r14,#-2]
  725. eor r5,r9,r5,lsr#8
  726. strb r4,[r14,#-13]
  727. eor r6,r10,r6,lsr#8
  728. strb r5,[r14,#-9]
  729. eor r7,r11,r7,lsr#8
  730. strb r6,[r14,#-5]
  731. strb r7,[r14,#-1]
  732. # ifdef __thumb2__
  733. it ne
  734. # endif
  735. ldrne r8,[sp,#4*(32+2)] @ re-load len
  736. # ifdef __thumb2__
  737. it hs
  738. # endif
  739. subhs r11,r8,#64 @ len-=64
  740. bhi .Loop_outer
  741. beq .Ldone
  742. #endif
  743. .Ltail:
  744. ldr r12,[sp,#4*(32+1)] @ load inp
  745. add r9,sp,#4*(0)
  746. ldr r14,[sp,#4*(32+0)] @ load out
  747. .Loop_tail:
  748. ldrb r10,[r9],#1 @ read buffer on stack
  749. ldrb r11,[r12],#1 @ read input
  750. subs r8,r8,#1
  751. eor r11,r11,r10
  752. strb r11,[r14],#1 @ store output
  753. bne .Loop_tail
  754. .Ldone:
  755. add sp,sp,#4*(32+3)
  756. .Lno_data:
  757. ldmia sp!,{r4-r11,pc}
  758. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  759. #if __ARM_MAX_ARCH__>=7
  760. .arch armv7-a
  761. .fpu neon
  762. .type ChaCha20_neon,%function
  763. .align 5
  764. ChaCha20_neon:
  765. ldr r12,[sp,#0] @ pull pointer to counter and nonce
  766. stmdb sp!,{r0-r2,r4-r11,lr}
  767. .LChaCha20_neon:
  768. adr r14,.Lsigma
  769. vstmdb sp!,{d8-d15} @ ABI spec says so
  770. stmdb sp!,{r0-r3}
  771. vld1.32 {q1-q2},[r3] @ load key
  772. ldmia r3,{r4-r11} @ load key
  773. sub sp,sp,#4*(16+16)
  774. vld1.32 {q3},[r12] @ load counter and nonce
  775. add r12,sp,#4*8
  776. ldmia r14,{r0-r3} @ load sigma
  777. vld1.32 {q0},[r14]! @ load sigma
  778. vld1.32 {q12},[r14] @ one
  779. vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
  780. vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
  781. str r10,[sp,#4*(16+10)] @ off-load "rx"
  782. str r11,[sp,#4*(16+11)] @ off-load "rx"
  783. vshl.i32 d26,d24,#1 @ two
  784. vstr d24,[sp,#4*(16+0)]
  785. vshl.i32 d28,d24,#2 @ four
  786. vstr d26,[sp,#4*(16+2)]
  787. vmov q4,q0
  788. vstr d28,[sp,#4*(16+4)]
  789. vmov q8,q0
  790. vmov q5,q1
  791. vmov q9,q1
  792. b .Loop_neon_enter
  793. .align 4
  794. .Loop_neon_outer:
  795. ldmia sp,{r0-r9} @ load key material
  796. cmp r11,#64*2 @ if len<=64*2
  797. bls .Lbreak_neon @ switch to integer-only
  798. vmov q4,q0
  799. str r11,[sp,#4*(32+2)] @ save len
  800. vmov q8,q0
  801. str r12, [sp,#4*(32+1)] @ save inp
  802. vmov q5,q1
  803. str r14, [sp,#4*(32+0)] @ save out
  804. vmov q9,q1
  805. .Loop_neon_enter:
  806. ldr r11, [sp,#4*(15)]
  807. vadd.i32 q7,q3,q12 @ counter+1
  808. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  809. vmov q6,q2
  810. ldr r10, [sp,#4*(13)]
  811. vmov q10,q2
  812. ldr r14,[sp,#4*(14)]
  813. vadd.i32 q11,q7,q12 @ counter+2
  814. str r11, [sp,#4*(16+15)]
  815. mov r11,#10
  816. add r12,r12,#3 @ counter+3
  817. b .Loop_neon
  818. .align 4
  819. .Loop_neon:
  820. subs r11,r11,#1
  821. vadd.i32 q0,q0,q1
  822. add r0,r0,r4
  823. vadd.i32 q4,q4,q5
  824. mov r12,r12,ror#16
  825. vadd.i32 q8,q8,q9
  826. add r1,r1,r5
  827. veor q3,q3,q0
  828. mov r10,r10,ror#16
  829. veor q7,q7,q4
  830. eor r12,r12,r0,ror#16
  831. veor q11,q11,q8
  832. eor r10,r10,r1,ror#16
  833. vrev32.16 q3,q3
  834. add r8,r8,r12
  835. vrev32.16 q7,q7
  836. mov r4,r4,ror#20
  837. vrev32.16 q11,q11
  838. add r9,r9,r10
  839. vadd.i32 q2,q2,q3
  840. mov r5,r5,ror#20
  841. vadd.i32 q6,q6,q7
  842. eor r4,r4,r8,ror#20
  843. vadd.i32 q10,q10,q11
  844. eor r5,r5,r9,ror#20
  845. veor q12,q1,q2
  846. add r0,r0,r4
  847. veor q13,q5,q6
  848. mov r12,r12,ror#24
  849. veor q14,q9,q10
  850. add r1,r1,r5
  851. vshr.u32 q1,q12,#20
  852. mov r10,r10,ror#24
  853. vshr.u32 q5,q13,#20
  854. eor r12,r12,r0,ror#24
  855. vshr.u32 q9,q14,#20
  856. eor r10,r10,r1,ror#24
  857. vsli.32 q1,q12,#12
  858. add r8,r8,r12
  859. vsli.32 q5,q13,#12
  860. mov r4,r4,ror#25
  861. vsli.32 q9,q14,#12
  862. add r9,r9,r10
  863. vadd.i32 q0,q0,q1
  864. mov r5,r5,ror#25
  865. vadd.i32 q4,q4,q5
  866. str r10,[sp,#4*(16+13)]
  867. vadd.i32 q8,q8,q9
  868. ldr r10,[sp,#4*(16+15)]
  869. veor q12,q3,q0
  870. eor r4,r4,r8,ror#25
  871. veor q13,q7,q4
  872. eor r5,r5,r9,ror#25
  873. veor q14,q11,q8
  874. str r8,[sp,#4*(16+8)]
  875. vshr.u32 q3,q12,#24
  876. ldr r8,[sp,#4*(16+10)]
  877. vshr.u32 q7,q13,#24
  878. add r2,r2,r6
  879. vshr.u32 q11,q14,#24
  880. mov r14,r14,ror#16
  881. vsli.32 q3,q12,#8
  882. str r9,[sp,#4*(16+9)]
  883. vsli.32 q7,q13,#8
  884. ldr r9,[sp,#4*(16+11)]
  885. vsli.32 q11,q14,#8
  886. add r3,r3,r7
  887. vadd.i32 q2,q2,q3
  888. mov r10,r10,ror#16
  889. vadd.i32 q6,q6,q7
  890. eor r14,r14,r2,ror#16
  891. vadd.i32 q10,q10,q11
  892. eor r10,r10,r3,ror#16
  893. veor q12,q1,q2
  894. add r8,r8,r14
  895. veor q13,q5,q6
  896. mov r6,r6,ror#20
  897. veor q14,q9,q10
  898. add r9,r9,r10
  899. vshr.u32 q1,q12,#25
  900. mov r7,r7,ror#20
  901. vshr.u32 q5,q13,#25
  902. eor r6,r6,r8,ror#20
  903. vshr.u32 q9,q14,#25
  904. eor r7,r7,r9,ror#20
  905. vsli.32 q1,q12,#7
  906. add r2,r2,r6
  907. vsli.32 q5,q13,#7
  908. mov r14,r14,ror#24
  909. vsli.32 q9,q14,#7
  910. add r3,r3,r7
  911. vext.8 q2,q2,q2,#8
  912. mov r10,r10,ror#24
  913. vext.8 q6,q6,q6,#8
  914. eor r14,r14,r2,ror#24
  915. vext.8 q10,q10,q10,#8
  916. eor r10,r10,r3,ror#24
  917. vext.8 q1,q1,q1,#4
  918. add r8,r8,r14
  919. vext.8 q5,q5,q5,#4
  920. mov r6,r6,ror#25
  921. vext.8 q9,q9,q9,#4
  922. add r9,r9,r10
  923. vext.8 q3,q3,q3,#12
  924. mov r7,r7,ror#25
  925. vext.8 q7,q7,q7,#12
  926. eor r6,r6,r8,ror#25
  927. vext.8 q11,q11,q11,#12
  928. eor r7,r7,r9,ror#25
  929. vadd.i32 q0,q0,q1
  930. add r0,r0,r5
  931. vadd.i32 q4,q4,q5
  932. mov r10,r10,ror#16
  933. vadd.i32 q8,q8,q9
  934. add r1,r1,r6
  935. veor q3,q3,q0
  936. mov r12,r12,ror#16
  937. veor q7,q7,q4
  938. eor r10,r10,r0,ror#16
  939. veor q11,q11,q8
  940. eor r12,r12,r1,ror#16
  941. vrev32.16 q3,q3
  942. add r8,r8,r10
  943. vrev32.16 q7,q7
  944. mov r5,r5,ror#20
  945. vrev32.16 q11,q11
  946. add r9,r9,r12
  947. vadd.i32 q2,q2,q3
  948. mov r6,r6,ror#20
  949. vadd.i32 q6,q6,q7
  950. eor r5,r5,r8,ror#20
  951. vadd.i32 q10,q10,q11
  952. eor r6,r6,r9,ror#20
  953. veor q12,q1,q2
  954. add r0,r0,r5
  955. veor q13,q5,q6
  956. mov r10,r10,ror#24
  957. veor q14,q9,q10
  958. add r1,r1,r6
  959. vshr.u32 q1,q12,#20
  960. mov r12,r12,ror#24
  961. vshr.u32 q5,q13,#20
  962. eor r10,r10,r0,ror#24
  963. vshr.u32 q9,q14,#20
  964. eor r12,r12,r1,ror#24
  965. vsli.32 q1,q12,#12
  966. add r8,r8,r10
  967. vsli.32 q5,q13,#12
  968. mov r5,r5,ror#25
  969. vsli.32 q9,q14,#12
  970. str r10,[sp,#4*(16+15)]
  971. vadd.i32 q0,q0,q1
  972. ldr r10,[sp,#4*(16+13)]
  973. vadd.i32 q4,q4,q5
  974. add r9,r9,r12
  975. vadd.i32 q8,q8,q9
  976. mov r6,r6,ror#25
  977. veor q12,q3,q0
  978. eor r5,r5,r8,ror#25
  979. veor q13,q7,q4
  980. eor r6,r6,r9,ror#25
  981. veor q14,q11,q8
  982. str r8,[sp,#4*(16+10)]
  983. vshr.u32 q3,q12,#24
  984. ldr r8,[sp,#4*(16+8)]
  985. vshr.u32 q7,q13,#24
  986. add r2,r2,r7
  987. vshr.u32 q11,q14,#24
  988. mov r10,r10,ror#16
  989. vsli.32 q3,q12,#8
  990. str r9,[sp,#4*(16+11)]
  991. vsli.32 q7,q13,#8
  992. ldr r9,[sp,#4*(16+9)]
  993. vsli.32 q11,q14,#8
  994. add r3,r3,r4
  995. vadd.i32 q2,q2,q3
  996. mov r14,r14,ror#16
  997. vadd.i32 q6,q6,q7
  998. eor r10,r10,r2,ror#16
  999. vadd.i32 q10,q10,q11
  1000. eor r14,r14,r3,ror#16
  1001. veor q12,q1,q2
  1002. add r8,r8,r10
  1003. veor q13,q5,q6
  1004. mov r7,r7,ror#20
  1005. veor q14,q9,q10
  1006. add r9,r9,r14
  1007. vshr.u32 q1,q12,#25
  1008. mov r4,r4,ror#20
  1009. vshr.u32 q5,q13,#25
  1010. eor r7,r7,r8,ror#20
  1011. vshr.u32 q9,q14,#25
  1012. eor r4,r4,r9,ror#20
  1013. vsli.32 q1,q12,#7
  1014. add r2,r2,r7
  1015. vsli.32 q5,q13,#7
  1016. mov r10,r10,ror#24
  1017. vsli.32 q9,q14,#7
  1018. add r3,r3,r4
  1019. vext.8 q2,q2,q2,#8
  1020. mov r14,r14,ror#24
  1021. vext.8 q6,q6,q6,#8
  1022. eor r10,r10,r2,ror#24
  1023. vext.8 q10,q10,q10,#8
  1024. eor r14,r14,r3,ror#24
  1025. vext.8 q1,q1,q1,#12
  1026. add r8,r8,r10
  1027. vext.8 q5,q5,q5,#12
  1028. mov r7,r7,ror#25
  1029. vext.8 q9,q9,q9,#12
  1030. add r9,r9,r14
  1031. vext.8 q3,q3,q3,#4
  1032. mov r4,r4,ror#25
  1033. vext.8 q7,q7,q7,#4
  1034. eor r7,r7,r8,ror#25
  1035. vext.8 q11,q11,q11,#4
  1036. eor r4,r4,r9,ror#25
  1037. bne .Loop_neon
@ Rounds done.  Reload the saved key/nonce material (q12-q15 from the
@ bottom of the frame), add it into all three NEON blocks, and bump
@ the counter lanes of blocks 1 and 2 by +1/+2 so the three blocks
@ cover consecutive counter values.  The scalar block's second half is
@ spilled to sp+4*(16+8..14) (modulo-scheduled with the loop above).
  1038. add r11,sp,#32
  1039. vld1.32 {q12-q13},[sp] @ load key material
  1040. vld1.32 {q14-q15},[r11]
  1041. ldr r11,[sp,#4*(32+2)] @ load len
  1042. str r8, [sp,#4*(16+8)] @ modulo-scheduled store
  1043. str r9, [sp,#4*(16+9)]
  1044. str r12,[sp,#4*(16+12)]
  1045. str r10, [sp,#4*(16+13)]
  1046. str r14,[sp,#4*(16+14)]
  1047. @ at this point we have first half of 512-bit result in
  1048. @ rx and second half at sp+4*(16+8)
  1049. ldr r12,[sp,#4*(32+1)] @ load inp
  1050. ldr r14,[sp,#4*(32+0)] @ load out
  1051. vadd.i32 q0,q0,q12 @ accumulate key material
  1052. vadd.i32 q4,q4,q12
  1053. vadd.i32 q8,q8,q12
  1054. vldr d24,[sp,#4*(16+0)] @ one
  1055. vadd.i32 q1,q1,q13
  1056. vadd.i32 q5,q5,q13
  1057. vadd.i32 q9,q9,q13
  1058. vldr d26,[sp,#4*(16+2)] @ two
  1059. vadd.i32 q2,q2,q14
  1060. vadd.i32 q6,q6,q14
  1061. vadd.i32 q10,q10,q14
  1062. vadd.i32 d14,d14,d24 @ counter+1
  1063. vadd.i32 d22,d22,d26 @ counter+2
  1064. vadd.i32 q3,q3,q15
  1065. vadd.i32 q7,q7,q15
  1066. vadd.i32 q11,q11,q15
@ Need a full 4*64 bytes (3 NEON blocks + 1 scalar block) for the fast
@ path below; otherwise fall through to the partial-block handling.
  1067. cmp r11,#64*4
  1068. blo .Ltail_neon
@ Fast path: stream 192 bytes through NEON (load input, xor with
@ keystream, store) while simultaneously reloading state for the next
@ outer iteration and finishing the scalar 4th block 16 bytes at a time.
  1069. vld1.8 {q12-q13},[r12]! @ load input
  1070. mov r11,sp
  1071. vld1.8 {q14-q15},[r12]!
  1072. veor q0,q0,q12 @ xor with input
  1073. veor q1,q1,q13
  1074. vld1.8 {q12-q13},[r12]!
  1075. veor q2,q2,q14
  1076. veor q3,q3,q15
  1077. vld1.8 {q14-q15},[r12]!
  1078. veor q4,q4,q12
  1079. vst1.8 {q0-q1},[r14]! @ store output
  1080. veor q5,q5,q13
  1081. vld1.8 {q12-q13},[r12]!
  1082. veor q6,q6,q14
  1083. vst1.8 {q2-q3},[r14]!
  1084. veor q7,q7,q15
  1085. vld1.8 {q14-q15},[r12]!
  1086. veor q8,q8,q12
  1087. vld1.32 {q0-q1},[r11]! @ load for next iteration
  1088. veor d25,d25,d25   @ zero high half so d24 holds a clean constant
  1089. vldr d24,[sp,#4*(16+4)] @ four
  1090. veor q9,q9,q13
  1091. vld1.32 {q2-q3},[r11]
  1092. veor q10,q10,q14
  1093. vst1.8 {q4-q5},[r14]!
  1094. veor q11,q11,q15
  1095. vst1.8 {q6-q7},[r14]!
  1096. vadd.i32 d6,d6,d24 @ next counter value
  1097. vldr d24,[sp,#4*(16+0)] @ one
@ Scalar block, words 0-3: add key, (BE: byte-swap), xor input, store.
  1098. ldmia sp,{r8-r11} @ load key material
  1099. add r0,r0,r8 @ accumulate key material
  1100. ldr r8,[r12],#16 @ load input
  1101. vst1.8 {q8-q9},[r14]!
  1102. add r1,r1,r9
  1103. ldr r9,[r12,#-12]
  1104. vst1.8 {q10-q11},[r14]!
  1105. add r2,r2,r10
  1106. ldr r10,[r12,#-8]
  1107. add r3,r3,r11
  1108. ldr r11,[r12,#-4]
  1109. # ifdef __ARMEB__
  1110. rev r0,r0
  1111. rev r1,r1
  1112. rev r2,r2
  1113. rev r3,r3
  1114. # endif
  1115. eor r0,r0,r8 @ xor with input
  1116. add r8,sp,#4*(4)
  1117. eor r1,r1,r9
  1118. str r0,[r14],#16 @ store output
  1119. eor r2,r2,r10
  1120. str r1,[r14,#-12]
  1121. eor r3,r3,r11
  1122. ldmia r8,{r8-r11} @ load key material
  1123. str r2,[r14,#-8]
  1124. str r3,[r14,#-4]
@ Scalar block, words 4-7.
  1125. add r4,r4,r8 @ accumulate key material
  1126. ldr r8,[r12],#16 @ load input
  1127. add r5,r5,r9
  1128. ldr r9,[r12,#-12]
  1129. add r6,r6,r10
  1130. ldr r10,[r12,#-8]
  1131. add r7,r7,r11
  1132. ldr r11,[r12,#-4]
  1133. # ifdef __ARMEB__
  1134. rev r4,r4
  1135. rev r5,r5
  1136. rev r6,r6
  1137. rev r7,r7
  1138. # endif
  1139. eor r4,r4,r8
  1140. add r8,sp,#4*(8)
  1141. eor r5,r5,r9
  1142. str r4,[r14],#16 @ store output
  1143. eor r6,r6,r10
  1144. str r5,[r14,#-12]
  1145. eor r7,r7,r11
  1146. ldmia r8,{r8-r11} @ load key material
  1147. str r6,[r14,#-8]
  1148. add r0,sp,#4*(16+8)
  1149. str r7,[r14,#-4]
@ Scalar block, words 8-15 (spilled half).  The strhi/ldrhi below are
@ conditional on "more data remains" (flags from the len compare),
@ pre-staging state for the next outer iteration.
  1150. ldmia r0,{r0-r7} @ load second half
  1151. add r0,r0,r8 @ accumulate key material
  1152. ldr r8,[r12],#16 @ load input
  1153. add r1,r1,r9
  1154. ldr r9,[r12,#-12]
  1155. # ifdef __thumb2__
  1156. it hi
  1157. # endif
  1158. strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
  1159. add r2,r2,r10
  1160. ldr r10,[r12,#-8]
  1161. # ifdef __thumb2__
  1162. it hi
  1163. # endif
  1164. strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
  1165. add r3,r3,r11
  1166. ldr r11,[r12,#-4]
  1167. # ifdef __ARMEB__
  1168. rev r0,r0
  1169. rev r1,r1
  1170. rev r2,r2
  1171. rev r3,r3
  1172. # endif
  1173. eor r0,r0,r8
  1174. add r8,sp,#4*(12)
  1175. eor r1,r1,r9
  1176. str r0,[r14],#16 @ store output
  1177. eor r2,r2,r10
  1178. str r1,[r14,#-12]
  1179. eor r3,r3,r11
  1180. ldmia r8,{r8-r11} @ load key material
  1181. str r2,[r14,#-8]
  1182. str r3,[r14,#-4]
  1183. add r4,r4,r8 @ accumulate key material
  1184. add r8,r8,#4 @ next counter value
  1185. add r5,r5,r9
  1186. str r8,[sp,#4*(12)] @ save next counter value
  1187. ldr r8,[r12],#16 @ load input
  1188. add r6,r6,r10
  1189. add r4,r4,#3 @ counter+3
  1190. ldr r9,[r12,#-12]
  1191. add r7,r7,r11
  1192. ldr r10,[r12,#-8]
  1193. ldr r11,[r12,#-4]
  1194. # ifdef __ARMEB__
  1195. rev r4,r4
  1196. rev r5,r5
  1197. rev r6,r6
  1198. rev r7,r7
  1199. # endif
  1200. eor r4,r4,r8
  1201. # ifdef __thumb2__
  1202. it hi
  1203. # endif
  1204. ldrhi r8,[sp,#4*(32+2)] @ re-load len
  1205. eor r5,r5,r9
  1206. eor r6,r6,r10
  1207. str r4,[r14],#16 @ store output
  1208. eor r7,r7,r11
  1209. str r5,[r14,#-12]
  1210. sub r11,r8,#64*4 @ len-=64*4
  1211. str r6,[r14,#-8]
  1212. str r7,[r14,#-4]
  1213. bhi .Loop_neon_outer   @ outer-loop entry is above this excerpt
  1214. b .Ldone_neon
  1215. .align 4
@ Bail-out from the NEON path into the integer-only implementation
@ (taken from above this excerpt).  Copies live state from the larger
@ NEON stack frame into the smaller integer-only frame layout, restores
@ the callee-saved d8-d15 (AAPCS requirement), then re-enters the
@ scalar round loop with r11 = 10 doublerounds.
  1216. .Lbreak_neon:
  1217. @ harmonize NEON and integer-only stack frames: load data
  1218. @ from NEON frame, but save to integer-only one; distance
  1219. @ between the two is 4*(32+4+16-32)=4*(20).
  1220. str r11, [sp,#4*(20+32+2)] @ save len
  1221. add r11,sp,#4*(32+4)
  1222. str r12, [sp,#4*(20+32+1)] @ save inp
  1223. str r14, [sp,#4*(20+32+0)] @ save out
  1224. ldr r12,[sp,#4*(16+10)]
  1225. ldr r14,[sp,#4*(16+11)]
  1226. vldmia r11,{d8-d15} @ fulfill ABI requirement
  1227. str r12,[sp,#4*(20+16+10)] @ copy "rx"
  1228. str r14,[sp,#4*(20+16+11)] @ copy "rx"
  1229. ldr r11, [sp,#4*(15)]
  1230. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  1231. ldr r10, [sp,#4*(13)]
  1232. ldr r14,[sp,#4*(14)]
  1233. str r11, [sp,#4*(20+16+15)]
  1234. add r11,sp,#4*(20)
  1235. vst1.32 {q0-q1},[r11]! @ copy key
  1236. add sp,sp,#4*(20) @ switch frame
  1237. vst1.32 {q2-q3},[r11]
  1238. mov r11,#10   @ 10 doublerounds = 20 ChaCha rounds
  1239. b .Loop @ go integer-only (label above this excerpt)
  1240. .align 4
@ Fewer than 64*4 bytes remain (r11 = len).  Dispatch on how many whole
@ 64-byte blocks are left; the fall-through case (< 64 bytes) parks the
@ first keystream block (q0-q3) on the stack and byte-xors it below.
  1241. .Ltail_neon:
  1242. cmp r11,#64*3
  1243. bhs .L192_or_more_neon
  1244. cmp r11,#64*2
  1245. bhs .L128_or_more_neon
  1246. cmp r11,#64*1
  1247. bhs .L64_or_more_neon
  1248. add r8,sp,#4*(8)
  1249. vst1.8 {q0-q1},[sp]   @ stash keystream block 0 at sp...
  1250. add r10,sp,#4*(0)   @ ...r10 = cursor for the byte loop
  1251. vst1.8 {q2-q3},[r8]
  1252. b .Loop_tail_neon
  1253. .align 4
@ 64 <= len < 128: xor one full 64-byte block (q0-q3) with input and
@ store it; if bytes remain, stash the next keystream block (q4-q7) on
@ the stack for the byte-granular tail loop.
  1254. .L64_or_more_neon:
  1255. vld1.8 {q12-q13},[r12]!
  1256. vld1.8 {q14-q15},[r12]!
  1257. veor q0,q0,q12
  1258. veor q1,q1,q13
  1259. veor q2,q2,q14
  1260. veor q3,q3,q15
  1261. vst1.8 {q0-q1},[r14]!
  1262. vst1.8 {q2-q3},[r14]!
  1263. beq .Ldone_neon   @ len was exactly 64
  1264. add r8,sp,#4*(8)
  1265. vst1.8 {q4-q5},[sp]   @ park keystream block 1 for the byte loop
  1266. add r10,sp,#4*(0)
  1267. vst1.8 {q6-q7},[r8]
  1268. sub r11,r11,#64*1 @ len-=64*1
  1269. b .Loop_tail_neon
  1270. .align 4
@ 128 <= len < 192: xor/store two full blocks (q0-q3, q4-q7); if bytes
@ remain, stash the third keystream block (q8-q11) for the byte loop.
  1271. .L128_or_more_neon:
  1272. vld1.8 {q12-q13},[r12]!
  1273. vld1.8 {q14-q15},[r12]!
  1274. veor q0,q0,q12
  1275. veor q1,q1,q13
  1276. vld1.8 {q12-q13},[r12]!
  1277. veor q2,q2,q14
  1278. veor q3,q3,q15
  1279. vld1.8 {q14-q15},[r12]!
  1280. veor q4,q4,q12
  1281. veor q5,q5,q13
  1282. vst1.8 {q0-q1},[r14]!
  1283. veor q6,q6,q14
  1284. vst1.8 {q2-q3},[r14]!
  1285. veor q7,q7,q15
  1286. vst1.8 {q4-q5},[r14]!
  1287. vst1.8 {q6-q7},[r14]!
  1288. beq .Ldone_neon   @ len was exactly 128
  1289. add r8,sp,#4*(8)
  1290. vst1.8 {q8-q9},[sp]   @ park keystream block 2 for the byte loop
  1291. add r10,sp,#4*(0)
  1292. vst1.8 {q10-q11},[r8]
  1293. sub r11,r11,#64*2 @ len-=64*2
  1294. b .Loop_tail_neon
  1295. .align 4
@ 192 <= len < 256: xor/store all three NEON blocks; if bytes remain,
@ finish the scalar 4th block (add key material, byte-swap on BE) and
@ write its 64 bytes of keystream to the stack for the byte loop.
  1296. .L192_or_more_neon:
  1297. vld1.8 {q12-q13},[r12]!
  1298. vld1.8 {q14-q15},[r12]!
  1299. veor q0,q0,q12
  1300. veor q1,q1,q13
  1301. vld1.8 {q12-q13},[r12]!
  1302. veor q2,q2,q14
  1303. veor q3,q3,q15
  1304. vld1.8 {q14-q15},[r12]!
  1305. veor q4,q4,q12
  1306. veor q5,q5,q13
  1307. vld1.8 {q12-q13},[r12]!
  1308. veor q6,q6,q14
  1309. vst1.8 {q0-q1},[r14]!
  1310. veor q7,q7,q15
  1311. vld1.8 {q14-q15},[r12]!
  1312. veor q8,q8,q12
  1313. vst1.8 {q2-q3},[r14]!
  1314. veor q9,q9,q13
  1315. vst1.8 {q4-q5},[r14]!
  1316. veor q10,q10,q14
  1317. vst1.8 {q6-q7},[r14]!
  1318. veor q11,q11,q15
  1319. vst1.8 {q8-q9},[r14]!
  1320. vst1.8 {q10-q11},[r14]!
  1321. beq .Ldone_neon   @ len was exactly 192
@ Scalar block words 0-7: accumulate key material from sp+0/+16/+32.
  1322. ldmia sp,{r8-r11} @ load key material
  1323. add r0,r0,r8 @ accumulate key material
  1324. add r8,sp,#4*(4)
  1325. add r1,r1,r9
  1326. add r2,r2,r10
  1327. add r3,r3,r11
  1328. ldmia r8,{r8-r11} @ load key material
  1329. add r4,r4,r8 @ accumulate key material
  1330. add r8,sp,#4*(8)
  1331. add r5,r5,r9
  1332. add r6,r6,r10
  1333. add r7,r7,r11
  1334. ldmia r8,{r8-r11} @ load key material
  1335. # ifdef __ARMEB__
  1336. rev r0,r0
  1337. rev r1,r1
  1338. rev r2,r2
  1339. rev r3,r3
  1340. rev r4,r4
  1341. rev r5,r5
  1342. rev r6,r6
  1343. rev r7,r7
  1344. # endif
  1345. stmia sp,{r0-r7}   @ keystream words 0-7 -> stack buffer
@ Scalar block words 8-15 (spilled half at sp+4*(16+8)).
  1346. add r0,sp,#4*(16+8)
  1347. ldmia r0,{r0-r7} @ load second half
  1348. add r0,r0,r8 @ accumulate key material
  1349. add r8,sp,#4*(12)
  1350. add r1,r1,r9
  1351. add r2,r2,r10
  1352. add r3,r3,r11
  1353. ldmia r8,{r8-r11} @ load key material
  1354. add r4,r4,r8 @ accumulate key material
  1355. add r8,sp,#4*(8)   @ destination for keystream words 8-15
  1356. add r5,r5,r9
  1357. add r4,r4,#3 @ counter+3
  1358. add r6,r6,r10
  1359. add r7,r7,r11
  1360. ldr r11,[sp,#4*(32+2)] @ re-load len
  1361. # ifdef __ARMEB__
  1362. rev r0,r0
  1363. rev r1,r1
  1364. rev r2,r2
  1365. rev r3,r3
  1366. rev r4,r4
  1367. rev r5,r5
  1368. rev r6,r6
  1369. rev r7,r7
  1370. # endif
  1371. stmia r8,{r0-r7}   @ keystream words 8-15 -> stack buffer
  1372. add r10,sp,#4*(0)   @ r10 = cursor for the byte loop
  1373. sub r11,r11,#64*3 @ len-=64*3
@ Final sub-64-byte tail: xor input byte-by-byte against the keystream
@ block previously parked on the stack (r10 = keystream cursor,
@ r12 = input, r14 = output, r11 = bytes remaining).
  1374. .Loop_tail_neon:
  1375. ldrb r8,[r10],#1 @ read buffer on stack
  1376. ldrb r9,[r12],#1 @ read input
  1377. subs r11,r11,#1
  1378. eor r8,r8,r9
  1379. strb r8,[r14],#1 @ store output
  1380. bne .Loop_tail_neon
@ Epilogue: skip the 32+4-word state area, restore callee-saved d8-d15
@ (saved there by the prologue, outside this excerpt -- TODO confirm),
@ drop the remaining 16+3 words, then pop r4-r11 and return via pc.
  1381. .Ldone_neon:
  1382. add sp,sp,#4*(32+4)
  1383. vldmia sp,{d8-d15}
  1384. add sp,sp,#4*(16+3)
  1385. ldmia sp!,{r4-r11,pc}
  1386. .size ChaCha20_neon,.-ChaCha20_neon
  1387. .comm OPENSSL_armcap_P,4,4
  1388. #endif