123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576 |
- /*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under both the BSD-style license (found in the
- * LICENSE file in the root directory of this source tree) and the GPLv2 (found
- * in the COPYING file in the root directory of this source tree).
- * You may select, at your option, one of the above-listed licenses.
- */
- #include "../common/portability_macros.h"
- /* Stack marking
- * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
- */
- #if defined(__ELF__) && defined(__GNUC__)
- .section .note.GNU-stack,"",%progbits
- #endif
- #if ZSTD_ENABLE_ASM_X86_64_BMI2
- /* Calling convention:
- *
- * %rdi contains the first argument: HUF_DecompressAsmArgs*.
- * %rbp isn't maintained (no frame pointer).
- * %rsp contains the stack pointer that grows down.
- * No red-zone is assumed, only addresses >= %rsp are used.
- * All register contents are preserved.
- *
- * TODO: Support Windows calling convention.
- */
- ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
- ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
- ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
- ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
- .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
- .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
- .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
- .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
- .text
- /* Sets up register mappings for clarity.
- * op[], bits[], dtable & ip[0] each get their own register.
- * ip[1,2,3] & olimit alias var[].
- * %rax is a scratch register.
- */
- #define op0 rsi
- #define op1 rbx
- #define op2 rcx
- #define op3 rdi
- #define ip0 r8
- #define ip1 r9
- #define ip2 r10
- #define ip3 r11
- #define bits0 rbp
- #define bits1 rdx
- #define bits2 r12
- #define bits3 r13
- #define dtable r14
- #define olimit r15
- /* var[] aliases ip[1,2,3] & olimit
- * ip[1,2,3] are saved every iteration.
- * olimit is only used in compute_olimit.
- */
- #define var0 r15
- #define var1 r9
- #define var2 r10
- #define var3 r11
- /* 32-bit var registers */
- #define vard0 r15d
- #define vard1 r9d
- #define vard2 r10d
- #define vard3 r11d
- /* Calls X(N) for each stream 0, 1, 2, 3. */
- #define FOR_EACH_STREAM(X) \
- X(0); \
- X(1); \
- X(2); \
- X(3)
- /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
- #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
- X(0, idx); \
- X(1, idx); \
- X(2, idx); \
- X(3, idx)
- /* Define both _HUF_* & HUF_* symbols because MacOS
- * C symbols are prefixed with '_' & Linux symbols aren't.
- */
- _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
- HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
- ZSTD_CET_ENDBRANCH
- /* Save all registers - even if they are callee saved for simplicity. */
- push %rax
- push %rbx
- push %rcx
- push %rdx
- push %rbp
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- push %r12
- push %r13
- push %r14
- push %r15
- /* Read HUF_DecompressAsmArgs* args from %rax */
- movq %rdi, %rax
- movq 0(%rax), %ip0
- movq 8(%rax), %ip1
- movq 16(%rax), %ip2
- movq 24(%rax), %ip3
- movq 32(%rax), %op0
- movq 40(%rax), %op1
- movq 48(%rax), %op2
- movq 56(%rax), %op3
- movq 64(%rax), %bits0
- movq 72(%rax), %bits1
- movq 80(%rax), %bits2
- movq 88(%rax), %bits3
- movq 96(%rax), %dtable
- push %rax /* argument */
- push 104(%rax) /* ilimit */
- push 112(%rax) /* oend */
- push %olimit /* olimit space */
- subq $24, %rsp
- .L_4X1_compute_olimit:
- /* Computes how many iterations we can do safely
- * %r15, %rax may be clobbered
- * rbx, rdx must be saved
- * op3 & ip0 mustn't be clobbered
- */
- movq %rbx, 0(%rsp)
- movq %rdx, 8(%rsp)
- movq 32(%rsp), %rax /* rax = oend */
- subq %op3, %rax /* rax = oend - op3 */
- /* r15 = (oend - op3) / 5 */
- movabsq $-3689348814741910323, %rdx
- mulq %rdx
- movq %rdx, %r15
- shrq $2, %r15
- movq %ip0, %rax /* rax = ip0 */
- movq 40(%rsp), %rdx /* rdx = ilimit */
- subq %rdx, %rax /* rax = ip0 - ilimit */
- movq %rax, %rbx /* rbx = ip0 - ilimit */
- /* rdx = (ip0 - ilimit) / 7 */
- movabsq $2635249153387078803, %rdx
- mulq %rdx
- subq %rdx, %rbx
- shrq %rbx
- addq %rbx, %rdx
- shrq $2, %rdx
- /* r15 = min(%rdx, %r15) */
- cmpq %rdx, %r15
- cmova %rdx, %r15
- /* r15 = r15 * 5 */
- leaq (%r15, %r15, 4), %r15
- /* olimit = op3 + r15 */
- addq %op3, %olimit
- movq 8(%rsp), %rdx
- movq 0(%rsp), %rbx
- /* If (op3 + 20 > olimit) */
- movq %op3, %rax /* rax = op3 */
- addq $20, %rax /* rax = op3 + 20 */
- cmpq %rax, %olimit /* op3 + 20 > olimit */
- jb .L_4X1_exit
- /* If (ip1 < ip0) go to exit */
- cmpq %ip0, %ip1
- jb .L_4X1_exit
- /* If (ip2 < ip1) go to exit */
- cmpq %ip1, %ip2
- jb .L_4X1_exit
- /* If (ip3 < ip2) go to exit */
- cmpq %ip2, %ip3
- jb .L_4X1_exit
- /* Reads top 11 bits from bits[n]
- * Loads dt[bits[n]] into var[n]
- */
- #define GET_NEXT_DELT(n) \
- movq $53, %var##n; \
- shrxq %var##n, %bits##n, %var##n; \
- movzwl (%dtable,%var##n,2),%vard##n
- /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
- * Moves var[n] to %rax
- * bits[n] <<= var[n] & 63
- * op[n][idx] = %rax >> 8
- * %ah is a way to access bits [8, 16) of %rax
- */
- #define DECODE_FROM_DELT(n, idx) \
- movq %var##n, %rax; \
- shlxq %var##n, %bits##n, %bits##n; \
- movb %ah, idx(%op##n)
- /* Assumes GET_NEXT_DELT has been called.
- * Calls DECODE_FROM_DELT then GET_NEXT_DELT
- */
- #define DECODE_AND_GET_NEXT(n, idx) \
- DECODE_FROM_DELT(n, idx); \
- GET_NEXT_DELT(n) \
- /* // ctz & nbBytes is stored in bits[n]
- * // nbBits is stored in %rax
- * ctz = CTZ[bits[n]]
- * nbBits = ctz & 7
- * nbBytes = ctz >> 3
- * op[n] += 5
- * ip[n] -= nbBytes
- * // Note: x86-64 is little-endian ==> no bswap
- * bits[n] = MEM_readST(ip[n]) | 1
- * bits[n] <<= nbBits
- */
- #define RELOAD_BITS(n) \
- bsfq %bits##n, %bits##n; \
- movq %bits##n, %rax; \
- andq $7, %rax; \
- shrq $3, %bits##n; \
- leaq 5(%op##n), %op##n; \
- subq %bits##n, %ip##n; \
- movq (%ip##n), %bits##n; \
- orq $1, %bits##n; \
- shlx %rax, %bits##n, %bits##n
- /* Store clobbered variables on the stack */
- movq %olimit, 24(%rsp)
- movq %ip1, 0(%rsp)
- movq %ip2, 8(%rsp)
- movq %ip3, 16(%rsp)
- /* Call GET_NEXT_DELT for each stream */
- FOR_EACH_STREAM(GET_NEXT_DELT)
- .p2align 6
- .L_4X1_loop_body:
- /* Decode 5 symbols in each of the 4 streams (20 total)
- * Must have called GET_NEXT_DELT for each stream
- */
- FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
- FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
- FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
- FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
- FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
- /* Load ip[1,2,3] from stack (var[] aliases them)
- * ip[] is needed for RELOAD_BITS
- * Each will be stored back to the stack after RELOAD
- */
- movq 0(%rsp), %ip1
- movq 8(%rsp), %ip2
- movq 16(%rsp), %ip3
- /* Reload each stream & fetch the next table entry
- * to prepare for the next iteration
- */
- RELOAD_BITS(0)
- GET_NEXT_DELT(0)
- RELOAD_BITS(1)
- movq %ip1, 0(%rsp)
- GET_NEXT_DELT(1)
- RELOAD_BITS(2)
- movq %ip2, 8(%rsp)
- GET_NEXT_DELT(2)
- RELOAD_BITS(3)
- movq %ip3, 16(%rsp)
- GET_NEXT_DELT(3)
- /* If op3 < olimit: continue the loop */
- cmp %op3, 24(%rsp)
- ja .L_4X1_loop_body
- /* Reload ip[1,2,3] from stack */
- movq 0(%rsp), %ip1
- movq 8(%rsp), %ip2
- movq 16(%rsp), %ip3
- /* Re-compute olimit */
- jmp .L_4X1_compute_olimit
- #undef GET_NEXT_DELT
- #undef DECODE_FROM_DELT
- #undef DECODE
- #undef RELOAD_BITS
- .L_4X1_exit:
- addq $24, %rsp
- /* Restore stack (oend & olimit) */
- pop %rax /* olimit */
- pop %rax /* oend */
- pop %rax /* ilimit */
- pop %rax /* arg */
- /* Save ip / op / bits */
- movq %ip0, 0(%rax)
- movq %ip1, 8(%rax)
- movq %ip2, 16(%rax)
- movq %ip3, 24(%rax)
- movq %op0, 32(%rax)
- movq %op1, 40(%rax)
- movq %op2, 48(%rax)
- movq %op3, 56(%rax)
- movq %bits0, 64(%rax)
- movq %bits1, 72(%rax)
- movq %bits2, 80(%rax)
- movq %bits3, 88(%rax)
- /* Restore registers */
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rbp
- pop %rdx
- pop %rcx
- pop %rbx
- pop %rax
- ret
- _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
- HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
- ZSTD_CET_ENDBRANCH
- /* Save all registers - even if they are callee saved for simplicity. */
- push %rax
- push %rbx
- push %rcx
- push %rdx
- push %rbp
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- push %r12
- push %r13
- push %r14
- push %r15
- movq %rdi, %rax
- movq 0(%rax), %ip0
- movq 8(%rax), %ip1
- movq 16(%rax), %ip2
- movq 24(%rax), %ip3
- movq 32(%rax), %op0
- movq 40(%rax), %op1
- movq 48(%rax), %op2
- movq 56(%rax), %op3
- movq 64(%rax), %bits0
- movq 72(%rax), %bits1
- movq 80(%rax), %bits2
- movq 88(%rax), %bits3
- movq 96(%rax), %dtable
- push %rax /* argument */
- push %rax /* olimit */
- push 104(%rax) /* ilimit */
- movq 112(%rax), %rax
- push %rax /* oend3 */
- movq %op3, %rax
- push %rax /* oend2 */
- movq %op2, %rax
- push %rax /* oend1 */
- movq %op1, %rax
- push %rax /* oend0 */
- /* Scratch space */
- subq $8, %rsp
- .L_4X2_compute_olimit:
- /* Computes how many iterations we can do safely
- * %r15, %rax may be clobbered
- * rdx must be saved
- * op[1,2,3,4] & ip0 mustn't be clobbered
- */
- movq %rdx, 0(%rsp)
- /* We can consume up to 7 input bytes each iteration. */
- movq %ip0, %rax /* rax = ip0 */
- movq 40(%rsp), %rdx /* rdx = ilimit */
- subq %rdx, %rax /* rax = ip0 - ilimit */
- movq %rax, %r15 /* r15 = ip0 - ilimit */
- /* rdx = rax / 7 */
- movabsq $2635249153387078803, %rdx
- mulq %rdx
- subq %rdx, %r15
- shrq %r15
- addq %r15, %rdx
- shrq $2, %rdx
- /* r15 = (ip0 - ilimit) / 7 */
- movq %rdx, %r15
- /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
- movq 8(%rsp), %rax /* rax = oend0 */
- subq %op0, %rax /* rax = oend0 - op0 */
- movq 16(%rsp), %rdx /* rdx = oend1 */
- subq %op1, %rdx /* rdx = oend1 - op1 */
- cmpq %rax, %rdx
- cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
- movq 24(%rsp), %rax /* rax = oend2 */
- subq %op2, %rax /* rax = oend2 - op2 */
- cmpq %rax, %rdx
- cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
- movq 32(%rsp), %rax /* rax = oend3 */
- subq %op3, %rax /* rax = oend3 - op3 */
- cmpq %rax, %rdx
- cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
- movabsq $-3689348814741910323, %rax
- mulq %rdx
- shrq $3, %rdx /* rdx = rdx / 10 */
- /* r15 = min(%rdx, %r15) */
- cmpq %rdx, %r15
- cmova %rdx, %r15
- /* olimit = op3 + 5 * r15 */
- movq %r15, %rax
- leaq (%op3, %rax, 4), %olimit
- addq %rax, %olimit
- movq 0(%rsp), %rdx
- /* If (op3 + 10 > olimit) */
- movq %op3, %rax /* rax = op3 */
- addq $10, %rax /* rax = op3 + 10 */
- cmpq %rax, %olimit /* op3 + 10 > olimit */
- jb .L_4X2_exit
- /* If (ip1 < ip0) go to exit */
- cmpq %ip0, %ip1
- jb .L_4X2_exit
- /* If (ip2 < ip1) go to exit */
- cmpq %ip1, %ip2
- jb .L_4X2_exit
- /* If (ip3 < ip2) go to exit */
- cmpq %ip2, %ip3
- jb .L_4X2_exit
- #define DECODE(n, idx) \
- movq %bits##n, %rax; \
- shrq $53, %rax; \
- movzwl 0(%dtable,%rax,4),%r8d; \
- movzbl 2(%dtable,%rax,4),%r15d; \
- movzbl 3(%dtable,%rax,4),%eax; \
- movw %r8w, (%op##n); \
- shlxq %r15, %bits##n, %bits##n; \
- addq %rax, %op##n
- #define RELOAD_BITS(n) \
- bsfq %bits##n, %bits##n; \
- movq %bits##n, %rax; \
- shrq $3, %bits##n; \
- andq $7, %rax; \
- subq %bits##n, %ip##n; \
- movq (%ip##n), %bits##n; \
- orq $1, %bits##n; \
- shlxq %rax, %bits##n, %bits##n
- movq %olimit, 48(%rsp)
- .p2align 6
- .L_4X2_loop_body:
- /* We clobber r8, so store it on the stack */
- movq %r8, 0(%rsp)
- /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
- FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
- /* Reload r8 */
- movq 0(%rsp), %r8
- FOR_EACH_STREAM(RELOAD_BITS)
- cmp %op3, 48(%rsp)
- ja .L_4X2_loop_body
- jmp .L_4X2_compute_olimit
- #undef DECODE
- #undef RELOAD_BITS
- .L_4X2_exit:
- addq $8, %rsp
- /* Restore stack (oend & olimit) */
- pop %rax /* oend0 */
- pop %rax /* oend1 */
- pop %rax /* oend2 */
- pop %rax /* oend3 */
- pop %rax /* ilimit */
- pop %rax /* olimit */
- pop %rax /* arg */
- /* Save ip / op / bits */
- movq %ip0, 0(%rax)
- movq %ip1, 8(%rax)
- movq %ip2, 16(%rax)
- movq %ip3, 24(%rax)
- movq %op0, 32(%rax)
- movq %op1, 40(%rax)
- movq %op2, 48(%rax)
- movq %op3, 56(%rax)
- movq %bits0, 64(%rax)
- movq %bits1, 72(%rax)
- movq %bits2, 80(%rax)
- movq %bits3, 88(%rax)
- /* Restore registers */
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rbp
- pop %rdx
- pop %rcx
- pop %rbx
- pop %rax
- ret
- #endif
|