123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568 |
- // Copyright 2016 The Go Authors. All rights reserved.
- // Copyright (c) 2019 Klaus Post. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // +build !appengine
- // +build gc
- // +build !noasm
- #include "textflag.h"
- #define R_TMP0 AX
- #define R_TMP1 BX
- #define R_LEN CX
- #define R_OFF DX
- #define R_SRC SI
- #define R_DST DI
- #define R_DBASE R8
- #define R_DLEN R9
- #define R_DEND R10
- #define R_SBASE R11
- #define R_SLEN R12
- #define R_SEND R13
- #define R_TMP2 R14
- #define R_TMP3 R15
- // The asm code generally follows the pure Go code in decode_other.go, except
- // where marked with a "!!!".
- // func decode(dst, src []byte) int
- //
- // All local variables fit into registers. The non-zero stack size is only to
- // spill registers and push args when issuing a CALL. The register allocation:
- // - R_TMP0 scratch
- // - R_TMP1 scratch
- // - R_LEN length or x (shared)
- // - R_OFF offset
- // - R_SRC &src[s]
- // - R_DST &dst[d]
- // + R_DBASE dst_base
- // + R_DLEN dst_len
- // + R_DEND dst_base + dst_len
- // + R_SBASE src_base
- // + R_SLEN src_len
- // + R_SEND src_base + src_len
- // - R_TMP2 used by doCopy
- // - R_TMP3 used by doCopy
- //
- // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
- // function, and after a CALL returns, and are not otherwise modified.
- //
- // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
- // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
- TEXT ·s2Decode(SB), NOSPLIT, $48-56
- // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
- MOVQ dst_base+0(FP), R_DBASE
- MOVQ dst_len+8(FP), R_DLEN
- MOVQ R_DBASE, R_DST
- MOVQ R_DBASE, R_DEND
- ADDQ R_DLEN, R_DEND
- MOVQ src_base+24(FP), R_SBASE
- MOVQ src_len+32(FP), R_SLEN
- MOVQ R_SBASE, R_SRC
- MOVQ R_SBASE, R_SEND
- ADDQ R_SLEN, R_SEND
- XORQ R_OFF, R_OFF
- loop:
- // for s < len(src)
- CMPQ R_SRC, R_SEND
- JEQ end
- // R_LEN = uint32(src[s])
- //
- // switch src[s] & 0x03
- MOVBLZX (R_SRC), R_LEN
- MOVL R_LEN, R_TMP1
- ANDL $3, R_TMP1
- CMPL R_TMP1, $1
- JAE tagCopy
- // ----------------------------------------
- // The code below handles literal tags.
- // case tagLiteral:
- // x := uint32(src[s] >> 2)
- // switch
- SHRL $2, R_LEN
- CMPL R_LEN, $60
- JAE tagLit60Plus
- // case x < 60:
- // s++
- INCQ R_SRC
- doLit:
- // This is the end of the inner "switch", when we have a literal tag.
- //
- // We assume that R_LEN == x and x fits in a uint32, where x is the variable
- // used in the pure Go decode_other.go code.
- // length = int(x) + 1
- //
- // Unlike the pure Go code, we don't need to check if length <= 0 because
- // R_LEN can hold 64 bits, so the increment cannot overflow.
- INCQ R_LEN
- // Prepare to check if copying length bytes will run past the end of dst or
- // src.
- //
- // R_TMP0 = len(dst) - d
- // R_TMP1 = len(src) - s
- MOVQ R_DEND, R_TMP0
- SUBQ R_DST, R_TMP0
- MOVQ R_SEND, R_TMP1
- SUBQ R_SRC, R_TMP1
- // !!! Try a faster technique for short (16 or fewer bytes) copies.
- //
- // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
- // goto callMemmove // Fall back on calling runtime·memmove.
- // }
- //
- // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
- // against 21 instead of 16, because it cannot assume that all of its input
- // is contiguous in memory and so it needs to leave enough source bytes to
- // read the next tag without refilling buffers, but Go's Decode assumes
- // contiguousness (the src argument is a []byte).
- CMPQ R_LEN, $16
- JGT callMemmove
- CMPQ R_TMP0, $16
- JLT callMemmove
- CMPQ R_TMP1, $16
- JLT callMemmove
- // !!! Implement the copy from src to dst as a 16-byte load and store.
- // (Decode's documentation says that dst and src must not overlap.)
- //
- // This always copies 16 bytes, instead of only length bytes, but that's
- // OK. If the input is a valid Snappy encoding then subsequent iterations
- // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
- // non-nil error), so the overrun will be ignored.
- //
- // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
- // 16-byte loads and stores. This technique probably wouldn't be as
- // effective on architectures that are fussier about alignment.
- MOVOU 0(R_SRC), X0
- MOVOU X0, 0(R_DST)
- // d += length
- // s += length
- ADDQ R_LEN, R_DST
- ADDQ R_LEN, R_SRC
- JMP loop
- callMemmove:
- // if length > len(dst)-d || length > len(src)-s { etc }
- CMPQ R_LEN, R_TMP0
- JGT errCorrupt
- CMPQ R_LEN, R_TMP1
- JGT errCorrupt
- // copy(dst[d:], src[s:s+length])
- //
- // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
- // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
- // three registers to the stack, to save local variables across the CALL.
- MOVQ R_DST, 0(SP)
- MOVQ R_SRC, 8(SP)
- MOVQ R_LEN, 16(SP)
- MOVQ R_DST, 24(SP)
- MOVQ R_SRC, 32(SP)
- MOVQ R_LEN, 40(SP)
- MOVQ R_OFF, 48(SP)
- CALL runtime·memmove(SB)
- // Restore local variables: unspill registers from the stack and
- // re-calculate R_DBASE-R_SEND.
- MOVQ 24(SP), R_DST
- MOVQ 32(SP), R_SRC
- MOVQ 40(SP), R_LEN
- MOVQ 48(SP), R_OFF
- MOVQ dst_base+0(FP), R_DBASE
- MOVQ dst_len+8(FP), R_DLEN
- MOVQ R_DBASE, R_DEND
- ADDQ R_DLEN, R_DEND
- MOVQ src_base+24(FP), R_SBASE
- MOVQ src_len+32(FP), R_SLEN
- MOVQ R_SBASE, R_SEND
- ADDQ R_SLEN, R_SEND
- // d += length
- // s += length
- ADDQ R_LEN, R_DST
- ADDQ R_LEN, R_SRC
- JMP loop
- tagLit60Plus:
- // !!! This fragment does the
- //
- // s += x - 58; if uint(s) > uint(len(src)) { etc }
- //
- // checks. In the asm version, we code it once instead of once per switch case.
- ADDQ R_LEN, R_SRC
- SUBQ $58, R_SRC
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // case x == 60:
- CMPL R_LEN, $61
- JEQ tagLit61
- JA tagLit62Plus
- // x = uint32(src[s-1])
- MOVBLZX -1(R_SRC), R_LEN
- JMP doLit
- tagLit61:
- // case x == 61:
- // x = uint32(src[s-2]) | uint32(src[s-1])<<8
- MOVWLZX -2(R_SRC), R_LEN
- JMP doLit
- tagLit62Plus:
- CMPL R_LEN, $62
- JA tagLit63
- // case x == 62:
- // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
- // We read one byte, safe to read one back, since we are just reading tag.
- // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
- MOVL -4(R_SRC), R_LEN
- SHRL $8, R_LEN
- JMP doLit
- tagLit63:
- // case x == 63:
- // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
- MOVL -4(R_SRC), R_LEN
- JMP doLit
- // The code above handles literal tags.
- // ----------------------------------------
- // The code below handles copy tags.
- tagCopy4:
- // case tagCopy4:
- // s += 5
- ADDQ $5, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // length = 1 + int(src[s-5])>>2
- SHRQ $2, R_LEN
- INCQ R_LEN
- // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
- MOVLQZX -4(R_SRC), R_OFF
- JMP doCopy
- tagCopy2:
- // case tagCopy2:
- // s += 3
- ADDQ $3, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // length = 1 + int(src[s-3])>>2
- SHRQ $2, R_LEN
- INCQ R_LEN
- // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
- MOVWQZX -2(R_SRC), R_OFF
- JMP doCopy
- tagCopy:
- // We have a copy tag. We assume that:
- // - R_TMP1 == src[s] & 0x03
- // - R_LEN == src[s]
- CMPQ R_TMP1, $2
- JEQ tagCopy2
- JA tagCopy4
- // case tagCopy1:
- // s += 2
- ADDQ $2, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
- // length = 4 + int(src[s-2])>>2&0x7
- MOVBQZX -1(R_SRC), R_TMP1
- MOVQ R_LEN, R_TMP0
- SHRQ $2, R_LEN
- ANDQ $0xe0, R_TMP0
- ANDQ $7, R_LEN
- SHLQ $3, R_TMP0
- ADDQ $4, R_LEN
- ORQ R_TMP1, R_TMP0
- // check if repeat code, ZF set by ORQ.
- JZ repeatCode
- // This is a regular copy, transfer our temporary value to R_OFF (length)
- MOVQ R_TMP0, R_OFF
- JMP doCopy
- // This is a repeat code.
- repeatCode:
- // If length < 9, reuse last offset, with the length already calculated.
- CMPQ R_LEN, $9
- JL doCopyRepeat
- // Read additional bytes for length.
- JE repeatLen1
- // Rare, so the extra branch shouldn't hurt too much.
- CMPQ R_LEN, $10
- JE repeatLen2
- JMP repeatLen3
- // Read repeat lengths.
- repeatLen1:
- // s ++
- ADDQ $1, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // length = src[s-1] + 8
- MOVBQZX -1(R_SRC), R_LEN
- ADDL $8, R_LEN
- JMP doCopyRepeat
- repeatLen2:
- // s +=2
- ADDQ $2, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
- MOVWQZX -2(R_SRC), R_LEN
- ADDL $260, R_LEN
- JMP doCopyRepeat
- repeatLen3:
- // s +=3
- ADDQ $3, R_SRC
- // if uint(s) > uint(len(src)) { etc }
- CMPQ R_SRC, R_SEND
- JA errCorrupt
- // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
- // Read one byte further back (just part of the tag, shifted out)
- MOVL -4(R_SRC), R_LEN
- SHRL $8, R_LEN
- ADDL $65540, R_LEN
- JMP doCopyRepeat
- doCopy:
- // This is the end of the outer "switch", when we have a copy tag.
- //
- // We assume that:
- // - R_LEN == length && R_LEN > 0
- // - R_OFF == offset
- // if d < offset { etc }
- MOVQ R_DST, R_TMP1
- SUBQ R_DBASE, R_TMP1
- CMPQ R_TMP1, R_OFF
- JLT errCorrupt
- // Repeat values can skip the test above, since any offset > 0 will be in dst.
- doCopyRepeat:
- // if offset <= 0 { etc }
- CMPQ R_OFF, $0
- JLE errCorrupt
- // if length > len(dst)-d { etc }
- MOVQ R_DEND, R_TMP1
- SUBQ R_DST, R_TMP1
- CMPQ R_LEN, R_TMP1
- JGT errCorrupt
- // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
- //
- // Set:
- // - R_TMP2 = len(dst)-d
- // - R_TMP3 = &dst[d-offset]
- MOVQ R_DEND, R_TMP2
- SUBQ R_DST, R_TMP2
- MOVQ R_DST, R_TMP3
- SUBQ R_OFF, R_TMP3
- // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
- //
- // First, try using two 8-byte load/stores, similar to the doLit technique
- // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
- // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
- // and not one 16-byte load/store, and the first store has to be before the
- // second load, due to the overlap if offset is in the range [8, 16).
- //
- // if length > 16 || offset < 8 || len(dst)-d < 16 {
- // goto slowForwardCopy
- // }
- // copy 16 bytes
- // d += length
- CMPQ R_LEN, $16
- JGT slowForwardCopy
- CMPQ R_OFF, $8
- JLT slowForwardCopy
- CMPQ R_TMP2, $16
- JLT slowForwardCopy
- MOVQ 0(R_TMP3), R_TMP0
- MOVQ R_TMP0, 0(R_DST)
- MOVQ 8(R_TMP3), R_TMP1
- MOVQ R_TMP1, 8(R_DST)
- ADDQ R_LEN, R_DST
- JMP loop
- slowForwardCopy:
- // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
- // can still try 8-byte load stores, provided we can overrun up to 10 extra
- // bytes. As above, the overrun will be fixed up by subsequent iterations
- // of the outermost loop.
- //
- // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
- // commentary says:
- //
- // ----
- //
- // The main part of this loop is a simple copy of eight bytes at a time
- // until we've copied (at least) the requested amount of bytes. However,
- // if d and d-offset are less than eight bytes apart (indicating a
- // repeating pattern of length < 8), we first need to expand the pattern in
- // order to get the correct results. For instance, if the buffer looks like
- // this, with the eight-byte <d-offset> and <d> patterns marked as
- // intervals:
- //
- // abxxxxxxxxxxxx
- // [------] d-offset
- // [------] d
- //
- // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
- // once, after which we can move <d> two bytes without moving <d-offset>:
- //
- // ababxxxxxxxxxx
- // [------] d-offset
- // [------] d
- //
- // and repeat the exercise until the two no longer overlap.
- //
- // This allows us to do very well in the special case of one single byte
- // repeated many times, without taking a big hit for more general cases.
- //
- // The worst case of extra writing past the end of the match occurs when
- // offset == 1 and length == 1; the last copy will read from byte positions
- // [0..7] and write to [4..11], whereas it was only supposed to write to
- // position 1. Thus, ten excess bytes.
- //
- // ----
- //
- // That "10 byte overrun" worst case is confirmed by Go's
- // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
- // and finishSlowForwardCopy algorithm.
- //
- // if length > len(dst)-d-10 {
- // goto verySlowForwardCopy
- // }
- SUBQ $10, R_TMP2
- CMPQ R_LEN, R_TMP2
- JGT verySlowForwardCopy
- // We want to keep the offset, so we use R_TMP2 from here.
- MOVQ R_OFF, R_TMP2
- makeOffsetAtLeast8:
- // !!! As above, expand the pattern so that offset >= 8 and we can use
- // 8-byte load/stores.
- //
- // for offset < 8 {
- // copy 8 bytes from dst[d-offset:] to dst[d:]
- // length -= offset
- // d += offset
- // offset += offset
- // // The two previous lines together means that d-offset, and therefore
- // // R_TMP3, is unchanged.
- // }
- CMPQ R_TMP2, $8
- JGE fixUpSlowForwardCopy
- MOVQ (R_TMP3), R_TMP1
- MOVQ R_TMP1, (R_DST)
- SUBQ R_TMP2, R_LEN
- ADDQ R_TMP2, R_DST
- ADDQ R_TMP2, R_TMP2
- JMP makeOffsetAtLeast8
- fixUpSlowForwardCopy:
- // !!! Add length (which might be negative now) to d (implied by R_DST being
- // &dst[d]) so that d ends up at the right place when we jump back to the
- // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
- // length is positive, copying the remaining length bytes will write to the
- // right place.
- MOVQ R_DST, R_TMP0
- ADDQ R_LEN, R_DST
- finishSlowForwardCopy:
- // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
- // length means that we overrun, but as above, that will be fixed up by
- // subsequent iterations of the outermost loop.
- CMPQ R_LEN, $0
- JLE loop
- MOVQ (R_TMP3), R_TMP1
- MOVQ R_TMP1, (R_TMP0)
- ADDQ $8, R_TMP3
- ADDQ $8, R_TMP0
- SUBQ $8, R_LEN
- JMP finishSlowForwardCopy
- verySlowForwardCopy:
- // verySlowForwardCopy is a simple implementation of forward copy. In C
- // parlance, this is a do/while loop instead of a while loop, since we know
- // that length > 0. In Go syntax:
- //
- // for {
- // dst[d] = dst[d - offset]
- // d++
- // length--
- // if length == 0 {
- // break
- // }
- // }
- MOVB (R_TMP3), R_TMP1
- MOVB R_TMP1, (R_DST)
- INCQ R_TMP3
- INCQ R_DST
- DECQ R_LEN
- JNZ verySlowForwardCopy
- JMP loop
- // The code above handles copy tags.
- // ----------------------------------------
- end:
- // This is the end of the "for s < len(src)".
- //
- // if d != len(dst) { etc }
- CMPQ R_DST, R_DEND
- JNE errCorrupt
- // return 0
- MOVQ $0, ret+48(FP)
- RET
- errCorrupt:
- // return decodeErrCodeCorrupt
- MOVQ $1, ret+48(FP)
- RET
|