12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095 |
- .text
- .align 8 // strategic alignment and padding that allows to use
- // address value as loop termination condition...
- .quad 0,0,0,0,0,0,0,0
// iotas: table of 24 64-bit constants, consumed one per round by the Iota
// step of the permutation (KeccakF1600_int loads them with
// `ldr x30,[x27],#8 // Iota[i++]`; KeccakF1600_ce loads them through x10).
//
// Layout matters: the 8 zero quads above (64 bytes) plus the 24 constants
// below (192 bytes) add up to exactly 256 bytes. KeccakF1600_int detects the
// end of the table with `tst x27,#255`, i.e. it stops once the post-incremented
// table pointer has its low 8 bits clear. That only works if the padding starts
// on a 256-byte boundary -- assumes `.align 8` is interpreted as 2^8 bytes by
// the assembler used for this target (TODO confirm when porting).
// Do not add, remove or reorder entries without revisiting that test.
- iotas:
- .quad 0x0000000000000001
- .quad 0x0000000000008082
- .quad 0x800000000000808a
- .quad 0x8000000080008000
- .quad 0x000000000000808b
- .quad 0x0000000080000001
- .quad 0x8000000080008081
- .quad 0x8000000000008009
- .quad 0x000000000000008a
- .quad 0x0000000000000088
- .quad 0x0000000080008009
- .quad 0x000000008000000a
- .quad 0x000000008000808b
- .quad 0x800000000000008b
- .quad 0x8000000000008089
- .quad 0x8000000000008003
- .quad 0x8000000000008002
- .quad 0x8000000000000080
- .quad 0x000000000000800a
- .quad 0x800000008000000a
- .quad 0x8000000080008081
- .quad 0x8000000000008080
- .quad 0x0000000080000001
- .quad 0x8000000080008008
- .align 5
// KeccakF1600_int: internal routine that runs the whole permutation (one
// round per pass of Loop, one iotas constant per round) on a state that lives
// entirely in general-purpose registers.
//
// In/Out: the 25 lanes, in the order the callers load them from memory:
//           x0-x4, x5-x9, x10-x14, x15,x16,x17,x25,x19, x20-x24
//         (x18 is never touched; x25 takes the slot between x17 and x19).
// Scratch: x26, x27, x28, x30 and the condition flags.
// Stack:   uses 32 bytes at [sp,#0..31] that the CALLER must have reserved:
//           [sp,#0]  temporary spill of x4/x9 during Theta
//           [sp,#16] current position in the iotas table
//           [sp,#24] return address (x30 is reused as scratch in the loop)
// This is not an ABI-conforming function: it saves no callee-saved registers
// itself; KeccakF1600 and _SHA3_absorb do that around the call.
- KeccakF1600_int:
- adr x28,iotas
- .long 0xd503233f // paciasp
- stp x28,x30,[sp,#16] // 32 bytes on top are mine
- b Loop
- .align 4
- Loop:
- ////////////////////////////////////////// Theta
- eor x26,x0,x5 // x26 accumulates column parity of x0,x5,x10,x15,x20
- stp x4,x9,[sp,#0] // offload pair...
- eor x27,x1,x6 // x27: parity of x1,x6,x11,x16,x21
- eor x28,x2,x7 // x28: parity of x2,x7,x12,x17,x22
- eor x30,x3,x8 // x30: parity of x3,x8,x13,x25,x23
- eor x4,x4,x9 // x4:  parity of x4,x9,x14,x19,x24 (originals spilled above)
- eor x26,x26,x10
- eor x27,x27,x11
- eor x28,x28,x12
- eor x30,x30,x13
- eor x4,x4,x14
- eor x26,x26,x15
- eor x27,x27,x16
- eor x28,x28,x17
- eor x30,x30,x25
- eor x4,x4,x19
- eor x26,x26,x20
- eor x28,x28,x22
- eor x27,x27,x21
- eor x30,x30,x23
- eor x4,x4,x24
- eor x9,x26,x28,ror#63 // ror#63 == rotate left by 1; mask for column x1,x6,x11,x16,x21
- eor x1,x1,x9
- eor x6,x6,x9
- eor x11,x11,x9
- eor x16,x16,x9
- eor x21,x21,x9
- eor x9,x27,x30,ror#63 // mask for column x2,x7,x12,x17,x22
- eor x28,x28,x4,ror#63 // mask for column x3,x8,x13,x25,x23
- eor x30,x30,x26,ror#63 // mask for column x4,x9,x14,x19,x24
- eor x4,x4,x27,ror#63 // mask for column x0,x5,x10,x15,x20
- eor x27, x2,x9 // mov x27,x2
- eor x7,x7,x9
- eor x12,x12,x9
- eor x17,x17,x9
- eor x22,x22,x9
- eor x0,x0,x4
- eor x5,x5,x4
- eor x10,x10,x4
- eor x15,x15,x4
- eor x20,x20,x4
- ldp x4,x9,[sp,#0] // re-load offloaded data
- eor x26, x3,x28 // mov x26,x3
- eor x8,x8,x28
- eor x13,x13,x28
- eor x25,x25,x28
- eor x23,x23,x28
- eor x28, x4,x30 // mov x28,x4
- eor x9,x9,x30
- eor x14,x14,x30
- eor x19,x19,x30
- eor x24,x24,x30
- ////////////////////////////////////////// Rho+Pi
// Each `ror dst,src,#64-n` is a rotate-left by n combined with the move of
// the lane to its new position. The order is a dependency chain: a register
// is only overwritten after its old value has been consumed, with x30, x27,
// x26 and x28 holding the lanes that were moved aside (old x1, x2, x3, x4).
- mov x30,x1
- ror x1,x6,#64-44
- //mov x27,x2
- ror x2,x12,#64-43
- //mov x26,x3
- ror x3,x25,#64-21
- //mov x28,x4
- ror x4,x24,#64-14
- ror x6,x9,#64-20
- ror x12,x13,#64-25
- ror x25,x17,#64-15
- ror x24,x21,#64-2
- ror x9,x22,#64-61
- ror x13,x19,#64-8
- ror x17,x11,#64-10
- ror x21,x8,#64-55
- ror x22,x14,#64-39
- ror x19,x23,#64-56
- ror x11,x7,#64-6
- ror x8,x16,#64-45
- ror x14,x20,#64-18
- ror x23,x15,#64-41
- ror x7,x10,#64-3
- ror x16,x5,#64-36
- ror x5,x26,#64-28
- ror x10,x30,#64-1
- ror x15,x28,#64-27
- ror x20,x27,#64-62
- ////////////////////////////////////////// Chi+Iota
// Per row of five lanes: lane[i] ^= lane[i+2] & ~lane[i+1] (bic = and-not),
// with all five and-not terms computed from the pre-update values.
- bic x26,x2,x1
- bic x27,x3,x2
- bic x28,x0,x4
- bic x30,x1,x0
- eor x0,x0,x26
- bic x26,x4,x3
- eor x1,x1,x27
- ldr x27,[sp,#16] // current iotas pointer
- eor x3,x3,x28
- eor x4,x4,x30
- eor x2,x2,x26
- ldr x30,[x27],#8 // Iota[i++]
- bic x26,x7,x6
- tst x27,#255 // are we done?
- str x27,[sp,#16]
- bic x27,x8,x7
- bic x28,x5,x9
- eor x0,x0,x30 // A[0][0] ^= Iota
- bic x30,x6,x5
- eor x5,x5,x26
- bic x26,x9,x8
- eor x6,x6,x27
- eor x8,x8,x28
- eor x9,x9,x30
- eor x7,x7,x26
- bic x26,x12,x11
- bic x27,x13,x12
- bic x28,x10,x14
- bic x30,x11,x10
- eor x10,x10,x26
- bic x26,x14,x13
- eor x11,x11,x27
- eor x13,x13,x28
- eor x14,x14,x30
- eor x12,x12,x26
- bic x26,x17,x16
- bic x27,x25,x17
- bic x28,x15,x19
- bic x30,x16,x15
- eor x15,x15,x26
- bic x26,x19,x25
- eor x16,x16,x27
- eor x25,x25,x28
- eor x19,x19,x30
- eor x17,x17,x26
- bic x26,x22,x21
- bic x27,x23,x22
- bic x28,x20,x24
- bic x30,x21,x20
- eor x20,x20,x26
- bic x26,x24,x23
- eor x21,x21,x27
- eor x23,x23,x28
- eor x24,x24,x30
- eor x22,x22,x26
- bne Loop // flags are still those of the tst above: eor/bic/str do not set flags
- ldr x30,[sp,#24] // recover the return address saved on entry
- .long 0xd50323bf // autiasp
- ret
- .align 5
// KeccakF1600: apply the permutation to a 25-lane state held in memory.
//
// In:  x0 = pointer to 25 consecutive 64-bit lanes (200 bytes).
// Out: the state is updated in place. x0-x17 are left holding state lanes,
//      so x0 is NOT the state pointer on return (callers in this file
//      reload it themselves).
// Preserves x19-x28, x29, x30 via a 128-byte frame (only the first 96 bytes
// of it are written here).
// Stack below the frame (48 bytes):
//      [sp,#0..31]  scratch area required by KeccakF1600_int
//      [sp,#32]     saved state pointer
- KeccakF1600:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#48
- str x0,[sp,#32] // offload argument
- mov x26,x0
- ldp x0,x1,[x0,#16*0]
- ldp x2,x3,[x26,#16*1]
- ldp x4,x5,[x26,#16*2]
- ldp x6,x7,[x26,#16*3]
- ldp x8,x9,[x26,#16*4]
- ldp x10,x11,[x26,#16*5]
- ldp x12,x13,[x26,#16*6]
- ldp x14,x15,[x26,#16*7]
- ldp x16,x17,[x26,#16*8]
- ldp x25,x19,[x26,#16*9] // x18 is skipped: lane 18 lives in x25
- ldp x20,x21,[x26,#16*10]
- ldp x22,x23,[x26,#16*11]
- ldr x24,[x26,#16*12]
- bl KeccakF1600_int
- ldr x26,[sp,#32] // x26 was scratch inside KeccakF1600_int; fetch the pointer again
- stp x0,x1,[x26,#16*0]
- stp x2,x3,[x26,#16*1]
- stp x4,x5,[x26,#16*2]
- stp x6,x7,[x26,#16*3]
- stp x8,x9,[x26,#16*4]
- stp x10,x11,[x26,#16*5]
- stp x12,x13,[x26,#16*6]
- stp x14,x15,[x26,#16*7]
- stp x16,x17,[x26,#16*8]
- stp x25,x19,[x26,#16*9]
- stp x20,x21,[x26,#16*10]
- stp x22,x23,[x26,#16*11]
- str x24,[x26,#16*12]
- ldp x19,x20,[x29,#16] // restores are x29-relative, so they do not depend on sp
- add sp,sp,#48
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- .long 0xd50323bf // autiasp
- ret
- .globl _SHA3_absorb
- .align 5
// _SHA3_absorb: XOR input into the state one block (bsz bytes) at a time,
// running the permutation after each block.
//
// In:  x0 = uint64_t A[5][5] (state), x1 = inp, x2 = len, x3 = bsz
// Out: x0 = number of input bytes left unprocessed (the final len, which is
//      smaller than bsz); the state is written back to A.
// Blocks are consumed only while len >= bsz. Input is read as 64-bit words
// and byte-swapped when built for big-endian (__AARCH64EB__).
//
// The unrolled ladder absorbs one lane per step and leaves as soon as bsz
// bytes have been taken. Each `cmp x30,#8*(k+2)` serves two exits: `blo`
// right after it (bsz == 8*(k+1)) and the `beq` after the following lane
// (bsz == 8*(k+2)); the ldr/rev/eor in between do not alter the flags.
// Presumably bsz is a non-zero multiple of 8 no larger than 200 -- nothing
// here checks it (TODO confirm against callers).
//
// Preserves x19-x28, x29, x30 (128-byte frame). 64 bytes below the frame:
//      [sp,#0..31]  scratch area required by KeccakF1600_int
//      [sp,#32]     A          [sp,#40]  inp
//      [sp,#48]     len        [sp,#56]  bsz
- _SHA3_absorb:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
- stp x0,x1,[sp,#32] // offload arguments
- stp x2,x3,[sp,#48]
- mov x26,x0 // uint64_t A[5][5]
- mov x27,x1 // const void *inp
- mov x28,x2 // size_t len
- mov x30,x3 // size_t bsz
- ldp x0,x1,[x26,#16*0]
- ldp x2,x3,[x26,#16*1]
- ldp x4,x5,[x26,#16*2]
- ldp x6,x7,[x26,#16*3]
- ldp x8,x9,[x26,#16*4]
- ldp x10,x11,[x26,#16*5]
- ldp x12,x13,[x26,#16*6]
- ldp x14,x15,[x26,#16*7]
- ldp x16,x17,[x26,#16*8]
- ldp x25,x19,[x26,#16*9]
- ldp x20,x21,[x26,#16*10]
- ldp x22,x23,[x26,#16*11]
- ldr x24,[x26,#16*12]
- b Loop_absorb
- .align 4
- Loop_absorb:
- subs x26,x28,x30 // len - bsz
- blo Labsorbed
- str x26,[sp,#48] // save len - bsz
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x0,x0,x26
- cmp x30,#8*(0+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x1,x1,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x2,x2,x26
- cmp x30,#8*(2+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x3,x3,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x4,x4,x26
- cmp x30,#8*(4+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x5,x5,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x6,x6,x26
- cmp x30,#8*(6+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x7,x7,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x8,x8,x26
- cmp x30,#8*(8+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x9,x9,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x10,x10,x26
- cmp x30,#8*(10+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x11,x11,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x12,x12,x26
- cmp x30,#8*(12+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x13,x13,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x14,x14,x26
- cmp x30,#8*(14+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x15,x15,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x16,x16,x26
- cmp x30,#8*(16+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x17,x17,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x25,x25,x26
- cmp x30,#8*(18+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x19,x19,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x20,x20,x26
- cmp x30,#8*(20+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x21,x21,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x22,x22,x26
- cmp x30,#8*(22+2)
- blo Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x23,x23,x26
- beq Lprocess_block
- ldr x26,[x27],#8 // *inp++
- #ifdef __AARCH64EB__
- rev x26,x26
- #endif
- eor x24,x24,x26
- Lprocess_block:
- str x27,[sp,#40] // save inp
- bl KeccakF1600_int
- ldr x27,[sp,#40] // restore arguments
- ldp x28,x30,[sp,#48] // x28 = updated len (stored at loop top), x30 = bsz
- b Loop_absorb
- .align 4
- Labsorbed:
- ldr x27,[sp,#32] // x27 = A; inp is no longer needed
- stp x0,x1,[x27,#16*0]
- stp x2,x3,[x27,#16*1]
- stp x4,x5,[x27,#16*2]
- stp x6,x7,[x27,#16*3]
- stp x8,x9,[x27,#16*4]
- stp x10,x11,[x27,#16*5]
- stp x12,x13,[x27,#16*6]
- stp x14,x15,[x27,#16*7]
- stp x16,x17,[x27,#16*8]
- stp x25,x19,[x27,#16*9]
- stp x20,x21,[x27,#16*10]
- stp x22,x23,[x27,#16*11]
- str x24,[x27,#16*12]
- mov x0,x28 // return value
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- .long 0xd50323bf // autiasp
- ret
- .globl _SHA3_squeeze
- .align 5
// _SHA3_squeeze: copy len bytes of output out of the state, re-running the
// permutation (KeccakF1600) every time bsz bytes have been produced and more
// output is still wanted.
//
// In:  x0 = pointer to the 25-lane state, x1 = out, x2 = len, x3 = bsz
// Out: len bytes written at out; the state is updated in place whenever the
//      permutation had to be re-run. No return value is set.
// Lanes are emitted least-significant byte first: whole lanes are
// byte-swapped before the store on big-endian builds (__AARCH64EB__), and the
// 1..7 byte tail is peeled off the low end of the lane with lsr #8.
//
// Register roles (x19-x22 are saved/restored here):
//      x19 = state base (x0 is the moving read cursor)
//      x20 = out cursor, x21 = bytes still to write
//      x22 = bsz (x3 is the countdown within the current block)
//
// len == 0 returns immediately without touching out. Without that guard the
// first `cmp x21,#8` sends control to the byte tail, which stores before it
// tests the count; the count then wraps to -1 and 7 bytes are written past a
// zero-length buffer.
- _SHA3_squeeze:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-48]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- mov x19,x0 // put aside arguments
- mov x20,x1
- mov x21,x2
- mov x22,x3
- cbz x21,Lsqueeze_done // len == 0: nothing to produce, write nothing
- Loop_squeeze:
- ldr x4,[x0],#8 // next state lane
- cmp x21,#8
- blo Lsqueeze_tail // fewer than 8 bytes wanted: emit them one by one
- #ifdef __AARCH64EB__
- rev x4,x4
- #endif
- str x4,[x20],#8
- subs x21,x21,#8
- beq Lsqueeze_done
- subs x3,x3,#8 // bytes left in the current block
- bhi Loop_squeeze
- mov x0,x19
- bl KeccakF1600 // block exhausted: permute, then restart at lane 0
- mov x0,x19 // KeccakF1600 leaves state lanes in x0-x17
- mov x3,x22
- b Loop_squeeze
- .align 4
// Tail: 1..7 bytes remain (x21 != 0 here), so at most 7 stores are needed.
- Lsqueeze_tail:
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- lsr x4,x4,#8
- subs x21,x21,#1
- beq Lsqueeze_done
- strb w4,[x20],#1
- Lsqueeze_done:
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x29,x30,[sp],#48
- .long 0xd50323bf // autiasp
- ret
- .align 5
// KeccakF1600_ce: the permutation implemented with the SHA3 extension
// instructions EOR3, RAX1, XAR and BCAX. They are emitted as raw `.long`
// encodings; the intended mnemonic is given in the comment on each line.
//
// In/Out: the 25 lanes in v0-v24 (lane i in v<i>). The instructions operate
//         on the full .16b registers, so both 64-bit halves are processed.
// Scratch: v25-v31, x9 (loop counter), x10 (iotas cursor), x11 (current
//          constant) and the condition flags.
// The loop body holds two consecutive rounds and runs 12 times, loading one
// iotas constant per round. The two rounds are the same computation with a
// different register assignment: lines marked `*` park a result in a
// temporary register instead of its home register, and the second round
// reads those temporaries.
// Leaf routine: no stack use, x30 untouched. v8-v15 are modified, so callers
// that must preserve d8-d15 save them around the call.
- KeccakF1600_ce:
- mov x9,#12
- adr x10,iotas
- b Loop_ce
- .align 4
- Loop_ce:
- ////////////////////////////////////////////////// Theta
- .long 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b
- .long 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b
- .long 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b
- .long 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b
- .long 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b
- .long 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b
- .long 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b
- .long 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b
- .long 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b
- .long 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b
- .long 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1]
- .long 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2]
- .long 0xce7d8f7b //rax1 v27.16b,v27.16b,v29.16b // D[3]
- .long 0xce798f9c //rax1 v28.16b,v28.16b,v25.16b // D[4]
- .long 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0]
- ////////////////////////////////////////////////// Theta+Rho+Pi
- .long 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
- .long 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20
- .long 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61
- .long 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39
- .long 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18
- .long 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62
- .long 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43
- .long 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25
- .long 0xce9ce26d //xar v13.16b,v19.16b,v28.16b,#64-8
- .long 0xce9b22f3 //xar v19.16b,v23.16b,v27.16b,#64-56
- .long 0xce9d5df7 //xar v23.16b,v15.16b,v29.16b,#64-41
- .long 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27
- eor v0.16b,v0.16b,v29.16b
- ldr x11,[x10],#8 // constant for this round; advance the iotas cursor
- .long 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
- .long 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15
- .long 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10
- .long 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6
- .long 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3
- .long 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // *
- .long 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14
- .long 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2
- .long 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55
- .long 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45
- .long 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36
- .long 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]
- ////////////////////////////////////////////////// Chi+Iota
- dup v31.2d,x11 // borrow C[6]
- .long 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // *
- .long 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // *
- .long 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b
- .long 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b
- .long 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b
- .long 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // *
- .long 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // *
- .long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
- .long 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b
- .long 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b
- eor v0.16b,v28.16b,v31.16b // Iota
- .long 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // *
- .long 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // *
- .long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
- .long 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b
- .long 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b
- .long 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // *
- .long 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // *
- .long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
- .long 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b
- .long 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b
- .long 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // *
- .long 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // *
- .long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
- .long 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b
- .long 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b
- ////////////////////////////////////////////////// Theta
- .long 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b
- .long 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b
- .long 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b
- .long 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b
- .long 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b
- .long 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b
- .long 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b
- .long 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b
- .long 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b
- .long 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b
- .long 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1]
- .long 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2]
- .long 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3]
- .long 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4]
- .long 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0]
- ////////////////////////////////////////////////// Theta+Rho+Pi
- .long 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
- .long 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20
- .long 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61
- .long 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39
- .long 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18
- .long 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62
- .long 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43
- .long 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25
- .long 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8
- .long 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56
- .long 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41
- .long 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27
- eor v0.16b,v0.16b,v16.16b
- ldr x11,[x10],#8 // constant for this round; advance the iotas cursor
- .long 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
- .long 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15
- .long 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10
- .long 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6
- .long 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3
- .long 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // *
- .long 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14
- .long 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2
- .long 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55
- .long 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45
- .long 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36
- .long 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]
- ////////////////////////////////////////////////// Chi+Iota
- dup v21.2d,x11 // borrow C[6]
- .long 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // *
- .long 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // *
- .long 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b
- .long 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b
- .long 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b
- .long 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // *
- .long 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // *
- .long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
- .long 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b
- .long 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b
- eor v0.16b,v15.16b,v21.16b // Iota
- .long 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // *
- .long 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // *
- .long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
- .long 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b
- .long 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b
- .long 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // *
- .long 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // *
- .long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
- .long 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b
- .long 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b
- .long 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // *
- .long 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // *
- .long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
- .long 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b
- .long 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b
- subs x9,x9,#1 // one double-round done
- bne Loop_ce
- ret
- .align 5
// KeccakF1600_cext: apply KeccakF1600_ce to a 25-lane state held in memory.
//
// In:  x0 = pointer to 25 consecutive 64-bit lanes.
// Out: state updated in place. x0 still holds the pointer on return
//      (KeccakF1600_ce only uses x9-x11 among the general registers).
// Lane i is loaded into d<i>, i.e. the low 64 bits of v<i>, and stored back
// from there.
// Preserves d8-d15 and x29/x30 through an 80-byte frame. Clobbers x9-x11,
// v0-v7, v16-v31 and the flags.
- KeccakF1600_cext:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
- stp d8,d9,[sp,#16] // per ABI requirement
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
- ldp d0,d1,[x0,#8*0]
- ldp d2,d3,[x0,#8*2]
- ldp d4,d5,[x0,#8*4]
- ldp d6,d7,[x0,#8*6]
- ldp d8,d9,[x0,#8*8]
- ldp d10,d11,[x0,#8*10]
- ldp d12,d13,[x0,#8*12]
- ldp d14,d15,[x0,#8*14]
- ldp d16,d17,[x0,#8*16]
- ldp d18,d19,[x0,#8*18]
- ldp d20,d21,[x0,#8*20]
- ldp d22,d23,[x0,#8*22]
- ldr d24,[x0,#8*24]
- bl KeccakF1600_ce
- ldr x30,[sp,#8] // bl overwrote x30; reload the saved return address
- stp d0,d1,[x0,#8*0]
- stp d2,d3,[x0,#8*2]
- stp d4,d5,[x0,#8*4]
- stp d6,d7,[x0,#8*6]
- stp d8,d9,[x0,#8*8]
- stp d10,d11,[x0,#8*10]
- stp d12,d13,[x0,#8*12]
- stp d14,d15,[x0,#8*14]
- stp d16,d17,[x0,#8*16]
- stp d18,d19,[x0,#8*18]
- stp d20,d21,[x0,#8*20]
- stp d22,d23,[x0,#8*22]
- str d24,[x0,#8*24]
- ldp d8,d9,[sp,#16]
- ldp d10,d11,[sp,#32]
- ldp d12,d13,[sp,#48]
- ldp d14,d15,[sp,#64]
- ldr x29,[sp],#80 // x30 was already reloaded above, so only x29 is popped
- .long 0xd50323bf // autiasp
- ret
- .globl _SHA3_absorb_cext
- .align 5
// _SHA3_absorb_cext: XOR input into the state one block (x3 bytes) at a time
// and run KeccakF1600_ce after each block; SHA3-extension counterpart of
// _SHA3_absorb.
//
// In:  x0 = pointer to the 25-lane state, x1 = inp, x2 = len, x3 = bsz
// Out: x0 = number of input bytes left unprocessed (smaller than bsz); the
//      state is written back through the original x0.
// x2 is decremented by bsz at the top of every iteration, before the check.
// When that subtraction borrows, the loop ends and `add x0,x2,x3` undoes the
// last subtraction to recover the remainder.
//
// The state lives in the low 64 bits of v0-v24; v31 carries each input word
// (byte-swapped with rev64 on big-endian builds). As in _SHA3_absorb, every
// `cmp x3,#8*(k+2)` feeds both the `blo` directly after it and the `beq`
// after the next lane; the ldr/rev64/eor in between leave the flags alone.
// Presumably bsz is a non-zero multiple of 8 no larger than 200 -- nothing
// here checks it (TODO confirm against callers).
//
// Preserves d8-d15 and x29/x30 through an 80-byte frame. x0-x3 stay live
// across the calls because KeccakF1600_ce only uses x9-x11.
- _SHA3_absorb_cext:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
- stp d8,d9,[sp,#16] // per ABI requirement
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
- ldp d0,d1,[x0,#8*0]
- ldp d2,d3,[x0,#8*2]
- ldp d4,d5,[x0,#8*4]
- ldp d6,d7,[x0,#8*6]
- ldp d8,d9,[x0,#8*8]
- ldp d10,d11,[x0,#8*10]
- ldp d12,d13,[x0,#8*12]
- ldp d14,d15,[x0,#8*14]
- ldp d16,d17,[x0,#8*16]
- ldp d18,d19,[x0,#8*18]
- ldp d20,d21,[x0,#8*20]
- ldp d22,d23,[x0,#8*22]
- ldr d24,[x0,#8*24]
- b Loop_absorb_ce
- .align 4
- Loop_absorb_ce:
- subs x2,x2,x3 // len - bsz
- blo Labsorbed_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v0.16b,v0.16b,v31.16b
- cmp x3,#8*(0+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v1.16b,v1.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v2.16b,v2.16b,v31.16b
- cmp x3,#8*(2+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v3.16b,v3.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v4.16b,v4.16b,v31.16b
- cmp x3,#8*(4+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v5.16b,v5.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v6.16b,v6.16b,v31.16b
- cmp x3,#8*(6+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v7.16b,v7.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v8.16b,v8.16b,v31.16b
- cmp x3,#8*(8+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v9.16b,v9.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v10.16b,v10.16b,v31.16b
- cmp x3,#8*(10+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v11.16b,v11.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v12.16b,v12.16b,v31.16b
- cmp x3,#8*(12+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v13.16b,v13.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v14.16b,v14.16b,v31.16b
- cmp x3,#8*(14+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v15.16b,v15.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v16.16b,v16.16b,v31.16b
- cmp x3,#8*(16+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v17.16b,v17.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v18.16b,v18.16b,v31.16b
- cmp x3,#8*(18+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v19.16b,v19.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v20.16b,v20.16b,v31.16b
- cmp x3,#8*(20+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v21.16b,v21.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v22.16b,v22.16b,v31.16b
- cmp x3,#8*(22+2)
- blo Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v23.16b,v23.16b,v31.16b
- beq Lprocess_block_ce
- ldr d31,[x1],#8 // *inp++
- #ifdef __AARCH64EB__
- rev64 v31.16b,v31.16b
- #endif
- eor v24.16b,v24.16b,v31.16b
- Lprocess_block_ce:
- bl KeccakF1600_ce // clobbers x30; the saved copy is restored in the epilogue
- b Loop_absorb_ce
- .align 4
- Labsorbed_ce:
- stp d0,d1,[x0,#8*0]
- stp d2,d3,[x0,#8*2]
- stp d4,d5,[x0,#8*4]
- stp d6,d7,[x0,#8*6]
- stp d8,d9,[x0,#8*8]
- stp d10,d11,[x0,#8*10]
- stp d12,d13,[x0,#8*12]
- stp d14,d15,[x0,#8*14]
- stp d16,d17,[x0,#8*16]
- stp d18,d19,[x0,#8*18]
- stp d20,d21,[x0,#8*20]
- stp d22,d23,[x0,#8*22]
- str d24,[x0,#8*24]
- add x0,x2,x3 // return value
- ldp d8,d9,[sp,#16]
- ldp d10,d11,[sp,#32]
- ldp d12,d13,[sp,#48]
- ldp d14,d15,[sp,#64]
- ldp x29,x30,[sp],#80
- .long 0xd50323bf // autiasp
- ret
- .globl _SHA3_squeeze_cext
- .align 5
// _SHA3_squeeze_cext: copy len bytes of output out of the state, re-running
// the permutation (KeccakF1600_cext) every time bsz bytes have been produced
// and more output is still wanted; SHA3-extension counterpart of
// _SHA3_squeeze.
//
// In:  x0 = pointer to the 25-lane state, x1 = out, x2 = len, x3 = bsz
// Out: len bytes written at out; the state is updated in place whenever the
//      permutation had to be re-run. No return value is set.
// Lanes are emitted least-significant byte first: whole lanes are
// byte-swapped before the store on big-endian builds (__AARCH64EB__), and the
// 1..7 byte tail is peeled off the low end of the lane with lsr #8.
//
// Register roles: x9 = read cursor into the state, x10 = bytes left in the
// current block. x0-x3 stay live across the call because KeccakF1600_cext
// (and KeccakF1600_ce beneath it) only use x9-x11 among the general
// registers; x9/x10 are re-initialised after the call.
//
// len == 0 returns immediately without touching out. Without that guard the
// first `cmp x2,#8` sends control to the byte tail, which stores before it
// tests the count; the count then wraps to -1 and 7 bytes are written past a
// zero-length buffer.
- _SHA3_squeeze_cext:
- .long 0xd503233f // paciasp
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- mov x9,x0
- mov x10,x3
- cbz x2,Lsqueeze_done_ce // len == 0: nothing to produce, write nothing
- Loop_squeeze_ce:
- ldr x4,[x9],#8 // next state lane
- cmp x2,#8
- blo Lsqueeze_tail_ce // fewer than 8 bytes wanted: emit them one by one
- #ifdef __AARCH64EB__
- rev x4,x4
- #endif
- str x4,[x1],#8
- beq Lsqueeze_done_ce // flags still from `cmp x2,#8`: exactly 8 bytes were left
- sub x2,x2,#8
- subs x10,x10,#8
- bhi Loop_squeeze_ce
- bl KeccakF1600_cext // block exhausted: permute, then restart at lane 0
- ldr x30,[sp,#8] // bl overwrote x30; reload the saved return address
- mov x9,x0
- mov x10,x3
- b Loop_squeeze_ce
- .align 4
// Tail: 1..7 bytes remain (x2 != 0 here), so at most 7 stores are needed.
- Lsqueeze_tail_ce:
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- lsr x4,x4,#8
- subs x2,x2,#1
- beq Lsqueeze_done_ce
- strb w4,[x1],#1
- Lsqueeze_done_ce:
- ldr x29,[sp],#16 // x30 is already current, so only x29 is popped
- .long 0xd50323bf // autiasp
- ret
// Identification string embedded in the object, NUL-terminated ASCII:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
- .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .align 2
|