12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977 |
- #include "arm_arch.h"
- .text
- .private_extern _OPENSSL_armcap_P
- .align 5
- Lsigma: // constant words of the cipher state, fetched through adr x5,Lsigma in each routine
- .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral: each quad packs two 32-bit words (low word first) which read as little-endian bytes spell the ASCII text expand 32-byte k
- Lone: // sits 16 bytes after Lsigma, the NEON code reaches it through the post-incremented Lsigma pointer
- .long 1,0,0,0 // vector {1,0,0,0}, added to the counter block to step its first 32-bit word
- LOPENSSL_armcap_P: // distance from this location to _OPENSSL_armcap_P, so the entry code can find that word relative to its own address
- #ifdef __ILP32__
- .long _OPENSSL_armcap_P-. // 32-bit offset, read with ldrsw
- #else
- .quad _OPENSSL_armcap_P-. // 64-bit offset, read with ldr
- #endif
- .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ASCII ident string starting with ChaCha20 for ARMv8, CRYPTOGAMS
- .align 2
- .globl _ChaCha20_ctr32 // exported entry point: XORs the input with a ChaCha20 keystream, 64 bytes per block
- .align 5
- _ChaCha20_ctr32: // register roles as used below: x0 = output pointer, x1 = input pointer, x2 = byte count, x3 = key pointer, x4 = counter block pointer
- cbz x2,Labort // zero length: return without touching the stack
- adr x5,LOPENSSL_armcap_P // x5 = address of the stored offset to _OPENSSL_armcap_P
- cmp x2,#192
- b.lo Lshort // fewer than 192 bytes: always use the scalar code
- #ifdef __ILP32__
- ldrsw x6,[x5]
- #else
- ldr x6,[x5]
- #endif
- ldr w17,[x6,x5] // x5 + offset = _OPENSSL_armcap_P, load the capability word
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon // NEON bit set: hand over to the vector routine (it builds its own frame)
- Lshort: // scalar path, one 64-byte block per outer iteration
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]! // push a 96-byte frame holding x29, x30 and x19-x28
- add x29,sp,#0 // x29 = frame base, the epilogue reloads registers relative to it
- adr x5,Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64 // 64-byte scratch area, used by the tail code to hold one keystream block
- ldp x22,x23,[x5] // load sigma
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ldp x28,x30,[x4] // load counter (x30 can hold data here because the link register was saved in the frame)
- #ifdef __ARMEB__
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
- #endif
- Loop_outer: // x22-x28 and x30 keep the input state as eight 64-bit pairs, the working copy is rebuilt from them for each block
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov w7,w23
- lsr x8,x23,#32
- mov w9,w24
- lsr x10,x24,#32
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
- mov x4,#10 // ten passes through Loop, each pass is one column round plus one diagonal round
- subs x2,x2,#64 // the flags set here are consumed after the rounds by b.lo Ltail and b.hi Loop_outer, nothing in between modifies them
- Loop:
- sub x4,x4,#1
- add w5,w5,w9 // column round on state words (0,4,8,12),(1,5,9,13),(2,6,10,14),(3,7,11,15): a += b
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5 // d ^= a
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#16 // d rotated by 16
- ror w19,w19,#16
- ror w20,w20,#16
- ror w21,w21,#16
- add w13,w13,w17 // c += d
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13 // b ^= c
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#20 // rotate right 20 = rotate left 12
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- add w5,w5,w9 // a += b
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5 // d ^= a
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#24 // rotate right 24 = rotate left 8
- ror w19,w19,#24
- ror w20,w20,#24
- ror w21,w21,#24
- add w13,w13,w17 // c += d
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13 // b ^= c
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#25 // rotate right 25 = rotate left 7
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- add w5,w5,w10 // diagonal round on state words (0,5,10,15),(1,6,11,12),(2,7,8,13),(3,4,9,14): same steps with the operands shifted
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#16
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- ror w9,w9,#20
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#24
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- ror w9,w9,#25
- cbnz x4,Loop // cbnz leaves the flags from the earlier subs intact
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32 // odd-numbered words are added as 64-bit values, any carry out of bit 31 is dropped by the lsl#32 when packing
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
- b.lo Ltail // the subs borrowed: fewer than 64 bytes remain, finish byte by byte
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64 // advance the input pointer past this block
- #ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- eor x5,x5,x6 // keystream XOR input
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64 // advance the output pointer past this block
- b.hi Loop_outer // bytes still remain after this block
- ldp x19,x20,[x29,#16] // epilogue: reload saved registers relative to the frame base
- add sp,sp,#64 // release the scratch area
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- .long 0xd50323bf // autiasp
- Labort:
- ret
- .align 4
- Ltail: // scalar partial block: undo the subtraction so x2 is the remaining byte count again
- add x2,x2,#64
- Less_than_64: // also entered from Ltail_neon with x2 below 64 and the scalar block still unpacked in w5-w21
- sub x0,x0,#1 // output base biased by -1 because x2 is incremented before the store in Loop_tail
- add x1,x1,x2 // x1 = end of input
- add x0,x0,x2 // x0 = end of output minus one
- add x4,sp,x2 // x4 = end of the used part of the keystream scratch
- neg x2,x2 // x2 = negative index that counts up to zero
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- #ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- stp x5,x7,[sp,#0] // spill the whole keystream block to the stack scratch
- stp x9,x11,[sp,#16]
- stp x13,x15,[sp,#32]
- stp x17,x20,[sp,#48]
- Loop_tail: // one byte per pass: output byte = input byte XOR keystream byte
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,Loop_tail
- stp xzr,xzr,[sp,#0] // overwrite the spilled keystream with zeros before releasing the stack
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
- ldp x19,x20,[x29,#16] // epilogue, same sequence as the full-block exit
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- .long 0xd50323bf // autiasp
- ret
- .align 5
- ChaCha20_neon: // reached by branch from _ChaCha20_ctr32 with x0-x4 unchanged, produces 256 bytes per outer iteration: one block in scalar registers plus three blocks in NEON registers
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]! // same 96-byte frame layout as the scalar path
- add x29,sp,#0
- adr x5,Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- cmp x2,#512
- b.hs L512_or_more_neon // 512 bytes or more: continue inside ChaCha20_512_neon, reusing the frame built above
- sub sp,sp,#64 // 64-byte scratch area for a partial final block
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16 // v24 = sigma, x5 now points at Lone
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3] // v25,v26 = key
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4] // v27 = counter block
- ld1 {v31.4s},[x5] // v31 = {1,0,0,0} from Lone
- #ifdef __ARMEB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
- #endif
- add v27.4s,v27.4s,v31.4s // += 1
- add v28.4s,v27.4s,v31.4s // v28 = counter + 2
- add v29.4s,v28.4s,v31.4s // v29 = counter + 3, the scalar registers keep the unmodified counter
- shl v31.4s,v31.4s,#2 // 1 -> 4
- Loop_outer_neon: // working copies: scalar block in w5-w21, NEON blocks in v0-v3, v4-v7 and v16-v19
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov v0.16b,v24.16b
- mov w7,w23
- lsr x8,x23,#32
- mov v4.16b,v24.16b
- mov w9,w24
- lsr x10,x24,#32
- mov v16.16b,v24.16b
- mov w11,w25
- mov v1.16b,v25.16b
- lsr x12,x25,#32
- mov v5.16b,v25.16b
- mov w13,w26
- mov v17.16b,v25.16b
- lsr x14,x26,#32
- mov v3.16b,v27.16b
- mov w15,w27
- mov v7.16b,v28.16b
- lsr x16,x27,#32
- mov v19.16b,v29.16b
- mov w17,w28
- mov v2.16b,v26.16b
- lsr x19,x28,#32
- mov v6.16b,v26.16b
- mov w20,w30
- mov v18.16b,v26.16b
- lsr x21,x30,#32
- mov x4,#10 // ten passes through Loop_neon, each pass is one column round plus one diagonal round
- subs x2,x2,#256 // the flags set here are consumed after the rounds by b.lo Ltail_neon and b.hi Loop_outer_neon
- Loop_neon: // scalar and NEON instructions alternate line by line, both streams perform the same round steps on their own blocks
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s // NEON column round: a += b for the three vector blocks
- add w5,w5,w9 // scalar column round: a += b
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w11
- eor v3.16b,v3.16b,v0.16b // NEON: d ^= a
- add w8,w8,w12
- eor v7.16b,v7.16b,v4.16b
- eor w17,w17,w5
- eor v19.16b,v19.16b,v16.16b
- eor w19,w19,w6
- rev32 v3.8h,v3.8h // swapping the 16-bit halves of each 32-bit lane is a rotate by 16
- eor w20,w20,w7
- rev32 v7.8h,v7.8h
- eor w21,w21,w8
- rev32 v19.8h,v19.8h
- ror w17,w17,#16
- add v2.4s,v2.4s,v3.4s // NEON: c += d
- ror w19,w19,#16
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#16
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#16
- eor v20.16b,v1.16b,v2.16b // NEON: b ^ c into temporaries v20-v22
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#20 // ushr by 20 then sli by 12 forms a rotate left by 12
- add w16,w16,w21
- ushr v5.4s,v21.4s,#20
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#20
- eor w10,w10,w14
- sli v1.4s,v20.4s,#12
- eor w11,w11,w15
- sli v5.4s,v21.4s,#12
- eor w12,w12,w16
- sli v17.4s,v22.4s,#12
- ror w9,w9,#20
- add v0.4s,v0.4s,v1.4s // NEON: a += b
- ror w10,w10,#20
- add v4.4s,v4.4s,v5.4s
- ror w11,w11,#20
- add v16.4s,v16.4s,v17.4s
- ror w12,w12,#20
- eor v20.16b,v3.16b,v0.16b // NEON: d ^ a into temporaries
- add w5,w5,w9
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w10
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w11
- ushr v3.4s,v20.4s,#24 // ushr by 24 then sli by 8 forms a rotate left by 8
- add w8,w8,w12
- ushr v7.4s,v21.4s,#24
- eor w17,w17,w5
- ushr v19.4s,v22.4s,#24
- eor w19,w19,w6
- sli v3.4s,v20.4s,#8
- eor w20,w20,w7
- sli v7.4s,v21.4s,#8
- eor w21,w21,w8
- sli v19.4s,v22.4s,#8
- ror w17,w17,#24
- add v2.4s,v2.4s,v3.4s // NEON: c += d
- ror w19,w19,#24
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#24
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#24
- eor v20.16b,v1.16b,v2.16b // NEON: b ^ c into temporaries
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#25 // ushr by 25 then sli by 7 forms a rotate left by 7
- add w16,w16,w21
- ushr v5.4s,v21.4s,#25
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#25
- eor w10,w10,w14
- sli v1.4s,v20.4s,#7
- eor w11,w11,w15
- sli v5.4s,v21.4s,#7
- eor w12,w12,w16
- sli v17.4s,v22.4s,#7
- ror w9,w9,#25
- ext v2.16b,v2.16b,v2.16b,#8 // rotate the c rows by two lanes so the next half works on diagonals
- ror w10,w10,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v3.16b,v3.16b,v3.16b,#12 // rotate the d rows by 12 bytes
- ext v7.16b,v7.16b,v7.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4 // rotate the b rows by 4 bytes
- ext v5.16b,v5.16b,v5.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- add v0.4s,v0.4s,v1.4s // NEON diagonal round: same steps on the rotated rows
- add w5,w5,w10 // scalar diagonal round: same steps with the operands shifted
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w11
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w12
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w9
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w5
- eor v19.16b,v19.16b,v16.16b
- eor w17,w17,w6
- rev32 v3.8h,v3.8h
- eor w19,w19,w7
- rev32 v7.8h,v7.8h
- eor w20,w20,w8
- rev32 v19.8h,v19.8h
- ror w21,w21,#16
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#16
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#16
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#16
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#20
- add w14,w14,w20
- ushr v5.4s,v21.4s,#20
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#20
- eor w11,w11,w16
- sli v1.4s,v20.4s,#12
- eor w12,w12,w13
- sli v5.4s,v21.4s,#12
- eor w9,w9,w14
- sli v17.4s,v22.4s,#12
- ror w10,w10,#20
- add v0.4s,v0.4s,v1.4s
- ror w11,w11,#20
- add v4.4s,v4.4s,v5.4s
- ror w12,w12,#20
- add v16.4s,v16.4s,v17.4s
- ror w9,w9,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w12
- ushr v3.4s,v20.4s,#24
- add w8,w8,w9
- ushr v7.4s,v21.4s,#24
- eor w21,w21,w5
- ushr v19.4s,v22.4s,#24
- eor w17,w17,w6
- sli v3.4s,v20.4s,#8
- eor w19,w19,w7
- sli v7.4s,v21.4s,#8
- eor w20,w20,w8
- sli v19.4s,v22.4s,#8
- ror w21,w21,#24
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#24
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#24
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#24
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#25
- add w14,w14,w20
- ushr v5.4s,v21.4s,#25
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#25
- eor w11,w11,w16
- sli v1.4s,v20.4s,#7
- eor w12,w12,w13
- sli v5.4s,v21.4s,#7
- eor w9,w9,w14
- sli v17.4s,v22.4s,#7
- ror w10,w10,#25
- ext v2.16b,v2.16b,v2.16b,#8 // rotate the rows back to column order for the next pass
- ror w11,w11,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w12,w12,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- cbnz x4,Loop_neon // cbnz leaves the flags from the earlier subs intact
- add w5,w5,w22 // accumulate key block
- add v0.4s,v0.4s,v24.4s // NEON blocks: add the input state back, each block with its own counter row (v27, v28, v29)
- add x6,x6,x22,lsr#32
- add v4.4s,v4.4s,v24.4s
- add w7,w7,w23
- add v16.4s,v16.4s,v24.4s
- add x8,x8,x23,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w9,w9,w24
- add v6.4s,v6.4s,v26.4s
- add x10,x10,x24,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w11,w11,w25
- add v3.4s,v3.4s,v27.4s
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add v7.4s,v7.4s,v28.4s
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add v19.4s,v19.4s,v29.4s
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add v1.4s,v1.4s,v25.4s
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add v5.4s,v5.4s,v25.4s
- add x21,x21,x30,lsr#32
- add v17.4s,v17.4s,v25.4s
- b.lo Ltail_neon // the subs borrowed: fewer than 256 bytes remain
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
- #ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the second 64 bytes (first NEON block)
- eor x5,x5,x6 // scalar block: keystream XOR input
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v20.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v21.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v22.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v23.16b
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 // input for the third 64 bytes
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- add v27.4s,v27.4s,v31.4s // += 4
- stp x13,x15,[x0,#32]
- add v28.4s,v28.4s,v31.4s
- stp x17,x20,[x0,#48]
- add v29.4s,v29.4s,v31.4s
- add x0,x0,#64
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // input for the fourth 64 bytes, v0-v3 are free after the store above
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- eor v16.16b,v16.16b,v0.16b
- eor v17.16b,v17.16b,v1.16b
- eor v18.16b,v18.16b,v2.16b
- eor v19.16b,v19.16b,v3.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
- b.hi Loop_outer_neon // bytes still remain after these 256
- ldp x19,x20,[x29,#16] // epilogue: reload saved registers relative to the frame base
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- .long 0xd50323bf // autiasp
- ret
- Ltail_neon: // fewer than 256 bytes left: emit the already computed blocks one at a time
- add x2,x2,#256 // undo the subtraction so x2 is the remaining byte count
- cmp x2,#64
- b.lo Less_than_64 // partial first block: reuse the scalar byte-wise tail in _ChaCha20_ctr32
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
- #ifdef __ARMEB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- b.eq Ldone_neon // flags still come from the cmp above: exactly 64 bytes were left
- sub x2,x2,#64
- cmp x2,#64
- b.lo Less_than_128 // partial second block: its keystream is in v0-v3
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v0.16b,v0.16b,v20.16b
- eor v1.16b,v1.16b,v21.16b
- eor v2.16b,v2.16b,v22.16b
- eor v3.16b,v3.16b,v23.16b
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- b.eq Ldone_neon // exactly 128 bytes were left
- sub x2,x2,#64
- cmp x2,#64
- b.lo Less_than_192 // partial third block: its keystream is in v4-v7
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- b.eq Ldone_neon // exactly 192 bytes were left
- sub x2,x2,#64
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // partial fourth block: spill its keystream to the stack scratch
- b Last_neon
- Less_than_128:
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] // spill the keystream of the partial block to the stack scratch
- b Last_neon
- Less_than_192:
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
- b Last_neon
- .align 4
- Last_neon: // byte-wise finish, x2 = remaining bytes and the matching keystream is at sp
- sub x0,x0,#1 // output base biased by -1 because x2 is incremented before the store below
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2 // x2 = negative index that counts up to zero
- Loop_tail_neon:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,Loop_tail_neon
- stp xzr,xzr,[sp,#0] // overwrite the spilled keystream with zeros before releasing the stack
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
- Ldone_neon: // common exit for the tail cases
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- .long 0xd50323bf // autiasp
- ret
- .align 5
- ChaCha20_512_neon:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
- adr x5,Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- L512_or_more_neon:
- sub sp,sp,#128+64
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
- #ifdef __ARMEB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
- #endif
- add v27.4s,v27.4s,v31.4s // += 1
- stp q24,q25,[sp,#0] // off-load key block, invariant part
- add v27.4s,v27.4s,v31.4s // not typo
- str q26,[sp,#32]
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- add v30.4s,v29.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
- sub x2,x2,#512 // not typo
- Loop_outer_512_neon:
- mov v0.16b,v24.16b
- mov v4.16b,v24.16b
- mov v8.16b,v24.16b
- mov v12.16b,v24.16b
- mov v16.16b,v24.16b
- mov v20.16b,v24.16b
- mov v1.16b,v25.16b
- mov w5,w22 // unpack key block
- mov v5.16b,v25.16b
- lsr x6,x22,#32
- mov v9.16b,v25.16b
- mov w7,w23
- mov v13.16b,v25.16b
- lsr x8,x23,#32
- mov v17.16b,v25.16b
- mov w9,w24
- mov v21.16b,v25.16b
- lsr x10,x24,#32
- mov v3.16b,v27.16b
- mov w11,w25
- mov v7.16b,v28.16b
- lsr x12,x25,#32
- mov v11.16b,v29.16b
- mov w13,w26
- mov v15.16b,v30.16b
- lsr x14,x26,#32
- mov v2.16b,v26.16b
- mov w15,w27
- mov v6.16b,v26.16b
- lsr x16,x27,#32
- add v19.4s,v3.4s,v31.4s // +4
- mov w17,w28
- add v23.4s,v7.4s,v31.4s // +4
- lsr x19,x28,#32
- mov v10.16b,v26.16b
- mov w20,w30
- mov v14.16b,v26.16b
- lsr x21,x30,#32
- mov v18.16b,v26.16b
- stp q27,q28,[sp,#48] // off-load key block, variable part
- mov v22.16b,v26.16b
- str q29,[sp,#80]
- mov x4,#5
- subs x2,x2,#512
- Loop_upper_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,Loop_upper_neon
- add w5,w5,w22 // accumulate key block: low halves of x22-x28,x30 go to the even words, high halves (lsr#32) to the odd ones
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
- add x5,x5,x6,lsl#32 // pack: merge each pair of 32-bit words into one 64-bit register
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input (the odd registers are free once packed, so they receive the 64 input bytes)
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64 // advance the input pointer past this 64-byte block
- #ifdef __ARMEB__
- rev x5,x5 // big-endian builds only: byte-reverse each packed 64-bit value before the XOR
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- eor x5,x5,x6 // XOR the packed block with the input just loaded
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- mov w5,w22 // unpack key block: re-seed the 16 scalar working words for the next block (stores are interleaved)
- lsr x6,x22,#32
- stp x9,x11,[x0,#16]
- mov w7,w23
- lsr x8,x23,#32
- stp x13,x15,[x0,#32]
- mov w9,w24
- lsr x10,x24,#32
- stp x17,x20,[x0,#48]
- add x0,x0,#64 // advance the output pointer past the block just written
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28 // picks up the counter value incremented above
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
- mov x4,#5 // iteration count for Loop_lower_neon
- Loop_lower_neon: // each pass: two scalar double-rounds on w5-w17,w19-w21, interleaved with one NEON double-round on six 4-register states (v0-v23); add/xor/rotate by 16,12,8,7 matches the ChaCha quarter-round
- sub x4,x4,#1 // count down; tested by the cbnz at the bottom
- add v0.4s,v0.4s,v1.4s // NEON: a += b, for each of the six states
- add w5,w5,w9 // scalar: a += b (w5-w8 with w9-w12, i.e. column pairing)
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5 // scalar: d ^= a
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b // NEON: d ^= a
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16 // scalar: d = rotate-left 16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h // NEON: swap the 16-bit halves of every 32-bit lane, i.e. rotate by 16
- add w13,w13,w17 // scalar: c += d
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13 // scalar: b ^= c
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s // NEON: c += d
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20 // scalar: b = rotate-left 12 (ror by 32-12)
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b // NEON: t = b ^ c, into scratch v24-v29
- add w5,w5,w9 // scalar: a += b
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5 // scalar: d ^= a
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20 // NEON: b = t >> 20 ...
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24 // scalar: d = rotate-left 8
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12 // NEON: ... with t << 12 inserted, completing rotate-left 12
- add w13,w13,w17 // scalar: c += d
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13 // scalar: b ^= c
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s // NEON: a += b
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25 // scalar: b = rotate-left 7
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b // NEON: t = d ^ a
- add w5,w5,w10 // scalar: second half of the double-round pairs each a with the next b (diagonal pairing)
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24 // NEON: d = rotate-left 8 of t (ushr 24 + sli 8)
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s // NEON: c += d
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b // NEON: t = b ^ c
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25 // NEON: b = rotate-left 7 of t (ushr 25 + sli 7)
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8 // NEON: rotate the c vectors by two 32-bit lanes
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12 // NEON: rotate the d vectors by three lanes
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4 // NEON: rotate the b vectors by one lane
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s // NEON: same quarter-round sequence again, now on the lane-rotated vectors
- add w5,w5,w9 // scalar: start of the second double-round of this pass (column pairing again)
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10 // scalar: diagonal pairing again
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8 // NEON: undo the earlier lane rotation of c (8 + 8 bytes = full turn)
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4 // NEON: undo the rotation of d (12 + 4 bytes)
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12 // NEON: undo the rotation of b (4 + 12 bytes)
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,Loop_lower_neon // repeat until the counter in x4 reaches zero
- add w5,w5,w22 // accumulate key block (scalar block; NEON accumulation is interleaved below)
- ldp q24,q25,[sp,#0] // reload v24-v29 from the stack area; the rounds above used them as scratch
- add x6,x6,x22,lsr#32
- ldp q26,q27,[sp,#32]
- add w7,w7,w23
- ldp q28,q29,[sp,#64]
- add x8,x8,x23,lsr#32
- add v0.4s,v0.4s,v24.4s // v24 is added to the first vector of each of the six NEON states
- add w9,w9,w24
- add v4.4s,v4.4s,v24.4s
- add x10,x10,x24,lsr#32
- add v8.4s,v8.4s,v24.4s
- add w11,w11,w25
- add v12.4s,v12.4s,v24.4s
- add x12,x12,x25,lsr#32
- add v16.4s,v16.4s,v24.4s
- add w13,w13,w26
- add v20.4s,v20.4s,v24.4s
- add x14,x14,x26,lsr#32
- add v2.4s,v2.4s,v26.4s // v26 is added to the third vector of each state
- add w15,w15,w27
- add v6.4s,v6.4s,v26.4s
- add x16,x16,x27,lsr#32
- add v10.4s,v10.4s,v26.4s
- add w17,w17,w28
- add v14.4s,v14.4s,v26.4s
- add x19,x19,x28,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w20,w20,w30
- add v22.4s,v22.4s,v26.4s
- add x21,x21,x30,lsr#32
- add v19.4s,v19.4s,v31.4s // +4 (fifth state reuses v27 below, offset by v31)
- add x5,x5,x6,lsl#32 // pack
- add v23.4s,v23.4s,v31.4s // +4 (sixth state reuses v28 below, offset by v31)
- add x7,x7,x8,lsl#32
- add v3.4s,v3.4s,v27.4s // fourth vectors: states one to four take v27-v30, states five and six take v27/v28
- ldp x6,x8,[x1,#0] // load input
- add v7.4s,v7.4s,v28.4s
- add x9,x9,x10,lsl#32
- add v11.4s,v11.4s,v29.4s
- add x11,x11,x12,lsl#32
- add v15.4s,v15.4s,v30.4s
- ldp x10,x12,[x1,#16]
- add v19.4s,v19.4s,v27.4s
- add x13,x13,x14,lsl#32
- add v23.4s,v23.4s,v28.4s
- add x15,x15,x16,lsl#32
- add v1.4s,v1.4s,v25.4s // v25 is added to the second vector of each state
- ldp x14,x16,[x1,#32]
- add v5.4s,v5.4s,v25.4s
- add x17,x17,x19,lsl#32
- add v9.4s,v9.4s,v25.4s
- add x20,x20,x21,lsl#32
- add v13.4s,v13.4s,v25.4s
- ldp x19,x21,[x1,#48]
- add v17.4s,v17.4s,v25.4s
- add x1,x1,#64 // advance the input pointer past the scalar block's 64 bytes
- add v21.4s,v21.4s,v25.4s
- #ifdef __ARMEB__
- rev x5,x5 // big-endian builds only: byte-reverse each packed 64-bit value before the XOR
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
- #endif
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // input for the first NEON block (x1 post-incremented); v24-v27 double as input buffers here
- eor x5,x5,x6 // XOR the scalar block with its input
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v24.16b // XOR the first NEON block with its input
- eor x15,x15,x16
- eor v1.16b,v1.16b,v25.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v26.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v27.16b
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // input for the second NEON block
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#7 // increment counter (the earlier scalar block already added 1, so 8 in total)
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64 // advance the output pointer past the scalar block
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 // store the first NEON block (x0 post-incremented)
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are free now: reuse them as the input buffer for the third block
- eor v4.16b,v4.16b,v24.16b
- eor v5.16b,v5.16b,v25.16b
- eor v6.16b,v6.16b,v26.16b
- eor v7.16b,v7.16b,v27.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 // input for the fourth block, in the just-stored v4-v7
- eor v8.16b,v8.16b,v0.16b
- ldp q24,q25,[sp,#0] // restore v24-v27 from the stack now that their use as input buffers is over
- eor v9.16b,v9.16b,v1.16b
- ldp q26,q27,[sp,#32]
- eor v10.16b,v10.16b,v2.16b
- eor v11.16b,v11.16b,v3.16b
- st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
- ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 // input for the fifth block
- eor v12.16b,v12.16b,v4.16b
- eor v13.16b,v13.16b,v5.16b
- eor v14.16b,v14.16b,v6.16b
- eor v15.16b,v15.16b,v7.16b
- st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
- ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 // input for the sixth block
- eor v16.16b,v16.16b,v8.16b
- eor v17.16b,v17.16b,v9.16b
- eor v18.16b,v18.16b,v10.16b
- eor v19.16b,v19.16b,v11.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
- shl v0.4s,v31.4s,#1 // 4 -> 8
- eor v20.16b,v20.16b,v12.16b
- eor v21.16b,v21.16b,v13.16b
- eor v22.16b,v22.16b,v14.16b
- eor v23.16b,v23.16b,v15.16b
- st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
- add v27.4s,v27.4s,v0.4s // += 8
- add v28.4s,v28.4s,v0.4s
- add v29.4s,v29.4s,v0.4s
- add v30.4s,v30.4s,v0.4s
- b.hs Loop_outer_512_neon // NOTE(review): nothing in this stretch sets the flags; presumably they come from a subs on x2 at the loop head — confirm
- adds x2,x2,#512 // add 512 back to x2; Z is set for the b.eq below when the result is zero
- ushr v0.4s,v31.4s,#2 // 4 -> 1
- ldp d8,d9,[sp,#128+0] // meet ABI requirements: restore d8-d15 from the stack
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
- stp q24,q31,[sp,#0] // wipe off-load area: overwrite the 96 bytes at [sp] that held v24-v29
- stp q24,q31,[sp,#32]
- stp q24,q31,[sp,#64]
- b.eq Ldone_512_neon // x2 == 0 after the adds (the loads/stores in between leave the flags alone)
- cmp x2,#192 // pick the follow-up path from the remaining x2
- sub v27.4s,v27.4s,v0.4s // -= 1
- sub v28.4s,v28.4s,v0.4s
- sub v29.4s,v29.4s,v0.4s
- add sp,sp,#128 // release 128 bytes of stack before leaving this path
- b.hs Loop_outer_neon // x2 >= 192 (unsigned)
- eor v25.16b,v25.16b,v25.16b // otherwise zero v25-v30 before continuing at Loop_outer
- eor v26.16b,v26.16b,v26.16b
- eor v27.16b,v27.16b,v27.16b
- eor v28.16b,v28.16b,v28.16b
- eor v29.16b,v29.16b,v29.16b
- eor v30.16b,v30.16b,v30.16b
- b Loop_outer
- Ldone_512_neon: // epilogue: restore x19-x28 from the frame at x29, unwind the stack and return
- ldp x19,x20,[x29,#16] // register pairs are reloaded from offsets 16..95 of the frame x29 points to
- add sp,sp,#128+64 // release 192 bytes of stack (interleaved with the restores)
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96 // restore x29/x30 and pop the 96-byte frame
- .long 0xd50323bf // autiasp, emitted as a raw instruction word
- ret
|