123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640 |
- // Copyright (c) 2019+ Klaus Post. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package s2
- import (
- "bytes"
- "fmt"
- "io"
- "math/rand"
- "os"
- "runtime"
- "strings"
- "testing"
- "github.com/klauspost/compress/internal/snapref"
- "github.com/klauspost/compress/zip"
- )
- func testOptions(_ testing.TB) map[string][]WriterOption {
- var testOptions = map[string][]WriterOption{
- "default": {WriterAddIndex()},
- "better": {WriterBetterCompression()},
- "best": {WriterBestCompression()},
- "none": {WriterUncompressed()},
- }
- x := make(map[string][]WriterOption)
- cloneAdd := func(org []WriterOption, add ...WriterOption) []WriterOption {
- y := make([]WriterOption, len(org)+len(add))
- copy(y, org)
- copy(y[len(org):], add)
- return y
- }
- for name, opt := range testOptions {
- x[name] = opt
- x[name+"-c1"] = cloneAdd(opt, WriterConcurrency(1))
- }
- testOptions = x
- x = make(map[string][]WriterOption)
- for name, opt := range testOptions {
- x[name] = opt
- if !testing.Short() {
- x[name+"-4k-win"] = cloneAdd(opt, WriterBlockSize(4<<10))
- x[name+"-4M-win"] = cloneAdd(opt, WriterBlockSize(4<<20))
- }
- }
- testOptions = x
- x = make(map[string][]WriterOption)
- for name, opt := range testOptions {
- x[name] = opt
- x[name+"-pad-min"] = cloneAdd(opt, WriterPadding(2), WriterPaddingSrc(zeroReader{}))
- if !testing.Short() {
- x[name+"-pad-8000"] = cloneAdd(opt, WriterPadding(8000), WriterPaddingSrc(zeroReader{}))
- x[name+"-pad-max"] = cloneAdd(opt, WriterPadding(4<<20), WriterPaddingSrc(zeroReader{}))
- }
- }
- for name, opt := range testOptions {
- x[name] = opt
- x[name+"-snappy"] = cloneAdd(opt, WriterSnappyCompat())
- x[name+"-custom"] = cloneAdd(opt, WriterCustomEncoder(snapref.EncodeBlockInto))
- }
- testOptions = x
- return testOptions
- }
// zeroReader is an io.Reader that supplies an endless stream of zero
// bytes. The tests use it as a padding source for WriterPaddingSrc.
type zeroReader struct{}

// Read fills p entirely with zeros and never fails.
func (zeroReader) Read(p []byte) (int, error) {
	for i := 0; i < len(p); i++ {
		p[i] = 0
	}
	return len(p), nil
}
// TestEncoderRegression round-trips every sample in
// testdata/enc_regressions.zip through the block encoders
// (Encode/EncodeBetter/EncodeBest) and through the stream writer with
// every option set from testOptions, verifying the decoded output
// matches the original input and that block output never exceeds
// MaxEncodedLen.
func TestEncoderRegression(t *testing.T) {
	data, err := os.ReadFile("testdata/enc_regressions.zip")
	if err != nil {
		t.Fatal(err)
	}
	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		t.Fatal(err)
	}
	// Same as fuzz test...
	test := func(t *testing.T, data []byte) {
		if testing.Short() && len(data) > 10000 {
			t.SkipNow()
		}
		// The block encoders do not depend on the writer options, so
		// they are exercised only once per input (first subtest only).
		var blocksTested bool
		for name, opts := range testOptions(t) {
			t.Run(name, func(t *testing.T) {
				var buf bytes.Buffer
				dec := NewReader(nil)
				enc := NewWriter(&buf, opts...)
				if !blocksTested {
					// Default-level block: encode, decode, compare.
					comp := Encode(make([]byte, MaxEncodedLen(len(data))), data)
					decoded, err := Decode(nil, comp)
					if err != nil {
						t.Error(err)
						return
					}
					if !bytes.Equal(data, decoded) {
						t.Error("block decoder mismatch")
						return
					}
					// Output must stay within the documented bound.
					if mel := MaxEncodedLen(len(data)); len(comp) > mel {
						t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
						return
					}
					// "Better" level block round-trip.
					comp = EncodeBetter(make([]byte, MaxEncodedLen(len(data))), data)
					decoded, err = Decode(nil, comp)
					if err != nil {
						t.Error(err)
						return
					}
					if !bytes.Equal(data, decoded) {
						t.Error("block decoder mismatch")
						return
					}
					if mel := MaxEncodedLen(len(data)); len(comp) > mel {
						t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
						return
					}
					// "Best" level block round-trip.
					comp = EncodeBest(make([]byte, MaxEncodedLen(len(data))), data)
					decoded, err = Decode(nil, comp)
					if err != nil {
						t.Error(err)
						return
					}
					if !bytes.Equal(data, decoded) {
						t.Error("block decoder mismatch")
						return
					}
					if mel := MaxEncodedLen(len(data)); len(comp) > mel {
						t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
						return
					}
					blocksTested = true
				}

				// Test writer.
				n, err := enc.Write(data)
				if err != nil {
					t.Error(err)
					return
				}
				if n != len(data) {
					t.Error(fmt.Errorf("Write: Short write, want %d, got %d", len(data), n))
					return
				}
				err = enc.Close()
				if err != nil {
					t.Error(err)
					return
				}
				// Calling close twice should not affect anything.
				err = enc.Close()
				if err != nil {
					t.Error(err)
					return
				}
				comp := buf.Bytes()
				// If padding was requested, the output size must be an
				// exact multiple of the pad value (unexported field).
				// NOTE: "mutiple" typo preserved in the message below.
				if enc.pad > 0 && len(comp)%enc.pad != 0 {
					t.Error(fmt.Errorf("wanted size to be mutiple of %d, got size %d with remainder %d", enc.pad, len(comp), len(comp)%enc.pad))
					return
				}
				var got []byte
				// Snappy-compat output must be decodable by the snappy
				// reference decoder; everything else uses the s2 reader.
				if !strings.Contains(name, "-snappy") {
					dec.Reset(&buf)
					got, err = io.ReadAll(dec)
				} else {
					sdec := snapref.NewReader(&buf)
					got, err = io.ReadAll(sdec)
				}
				if err != nil {
					t.Error(err)
					return
				}
				if !bytes.Equal(data, got) {
					t.Error("block (reset) decoder mismatch")
					return
				}

				// Test Reset on both and use ReadFrom instead.
				buf.Reset()
				enc.Reset(&buf)
				n2, err := enc.ReadFrom(bytes.NewBuffer(data))
				if err != nil {
					t.Error(err)
					return
				}
				if n2 != int64(len(data)) {
					t.Error(fmt.Errorf("ReadFrom: Short read, want %d, got %d", len(data), n2))
					return
				}
				err = enc.Close()
				if err != nil {
					t.Error(err)
					return
				}
				if enc.pad > 0 && buf.Len()%enc.pad != 0 {
					t.Error(fmt.Errorf("wanted size to be mutiple of %d, got size %d with remainder %d", enc.pad, buf.Len(), buf.Len()%enc.pad))
					return
				}
				if !strings.Contains(name, "-snappy") {
					dec.Reset(&buf)
					got, err = io.ReadAll(dec)
				} else {
					sdec := snapref.NewReader(&buf)
					got, err = io.ReadAll(sdec)
				}
				if err != nil {
					t.Error(err)
					return
				}
				if !bytes.Equal(data, got) {
					t.Error("frame (reset) decoder mismatch")
					return
				}
			})
		}
	}
	for _, tt := range zr.File {
		// NOTE(review): HasSuffix with "" is always true, so this never
		// skips — looks like a leftover debug filter for single files.
		if !strings.HasSuffix(t.Name(), "") {
			continue
		}
		t.Run(tt.Name, func(t *testing.T) {
			r, err := tt.Open()
			if err != nil {
				t.Error(err)
				return
			}
			b, err := io.ReadAll(r)
			if err != nil {
				t.Error(err)
				return
			}
			// Full-capacity slice so appends cannot touch shared memory.
			test(t, b[:len(b):len(b)])
		})
	}
}
- func TestIndex(t *testing.T) {
- fatalErr := func(t testing.TB, err error) {
- if err != nil {
- t.Fatal(err)
- }
- }
- // Create a test corpus
- var input []byte
- if !testing.Short() {
- input = make([]byte, 10<<20)
- } else {
- input = make([]byte, 500<<10)
- }
- rng := rand.New(rand.NewSource(0xabeefcafe))
- rng.Read(input)
- // Make it compressible...
- for i, v := range input {
- input[i] = '0' + v&3
- }
- // Compress it...
- var buf bytes.Buffer
- // We use smaller blocks just for the example...
- enc := NewWriter(&buf, WriterBlockSize(100<<10), WriterAddIndex(), WriterBetterCompression(), WriterConcurrency(runtime.GOMAXPROCS(0)))
- todo := input
- for len(todo) > 0 {
- // Write random sized inputs..
- x := todo[:rng.Intn(1+len(todo)&65535)]
- if len(x) == 0 {
- x = todo[:1]
- }
- _, err := enc.Write(x)
- fatalErr(t, err)
- // Flush once in a while
- if rng.Intn(8) == 0 {
- err = enc.Flush()
- fatalErr(t, err)
- }
- todo = todo[len(x):]
- }
- // Close and also get index...
- idxBytes, err := enc.CloseIndex()
- fatalErr(t, err)
- if false {
- // Load the index.
- var index Index
- _, err = index.Load(idxBytes)
- fatalErr(t, err)
- t.Log(string(index.JSON()))
- }
- // This is our compressed stream...
- compressed := buf.Bytes()
- for wantOffset := int64(0); wantOffset < int64(len(input)); wantOffset += 65531 {
- t.Run(fmt.Sprintf("offset-%d", wantOffset), func(t *testing.T) {
- // Let's assume we want to read from uncompressed offset 'i'
- // and we cannot seek in input, but we have the index.
- want := input[wantOffset:]
- // Load the index.
- var index Index
- _, err = index.Load(idxBytes)
- fatalErr(t, err)
- // Find offset in file:
- compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
- fatalErr(t, err)
- // Offset the input to the compressed offset.
- // Notice how we do not provide any bytes before the offset.
- in := io.Reader(bytes.NewBuffer(compressed[compressedOffset:]))
- // When creating the decoder we must specify that it should not
- // expect a stream identifier at the beginning og the frame.
- dec := NewReader(in, ReaderIgnoreStreamIdentifier())
- // We now have a reader, but it will start outputting at uncompressedOffset,
- // and not the actual offset we want, so skip forward to that.
- toSkip := wantOffset - uncompressedOffset
- err = dec.Skip(toSkip)
- fatalErr(t, err)
- // Read the rest of the stream...
- got, err := io.ReadAll(dec)
- fatalErr(t, err)
- if !bytes.Equal(got, want) {
- t.Error("Result mismatch", wantOffset)
- }
- // Test with stream index...
- for i := io.SeekStart; i <= io.SeekEnd; i++ {
- t.Run(fmt.Sprintf("seek-%d", i), func(t *testing.T) {
- // Read it from a seekable stream
- dec = NewReader(bytes.NewReader(compressed))
- rs, err := dec.ReadSeeker(true, nil)
- fatalErr(t, err)
- // Read a little...
- var tmp = make([]byte, len(input)/2)
- _, err = io.ReadFull(rs, tmp[:])
- fatalErr(t, err)
- toSkip := wantOffset
- switch i {
- case io.SeekStart:
- case io.SeekCurrent:
- toSkip = wantOffset - int64(len(input)/2)
- case io.SeekEnd:
- toSkip = -(int64(len(input)) - wantOffset)
- }
- gotOffset, err := rs.Seek(toSkip, i)
- if gotOffset != wantOffset {
- t.Errorf("got offset %d, want %d", gotOffset, wantOffset)
- }
- // Read the rest of the stream...
- got, err := io.ReadAll(dec)
- fatalErr(t, err)
- if !bytes.Equal(got, want) {
- t.Error("Result mismatch", wantOffset)
- }
- })
- }
- t.Run(fmt.Sprintf("ReadAt"), func(t *testing.T) {
- // Read it from a seekable stream
- dec = NewReader(bytes.NewReader(compressed))
- rs, err := dec.ReadSeeker(true, nil)
- fatalErr(t, err)
- // Read a little...
- var tmp = make([]byte, len(input)/2)
- _, err = io.ReadFull(rs, tmp[:])
- fatalErr(t, err)
- wantLen := len(tmp)
- if wantLen+int(wantOffset) > len(input) {
- wantLen = len(input) - int(wantOffset)
- }
- // Read from wantOffset
- n, err := rs.ReadAt(tmp, wantOffset)
- if n != wantLen {
- t.Errorf("got length %d, want %d", n, wantLen)
- }
- if err != io.EOF {
- fatalErr(t, err)
- }
- want := want[:n]
- got := tmp[:n]
- // Read the rest of the stream...
- if !bytes.Equal(got, want) {
- t.Error("Result mismatch", wantOffset)
- }
- })
- })
- }
- }
- func TestWriterPadding(t *testing.T) {
- n := 100
- if testing.Short() {
- n = 5
- }
- rng := rand.New(rand.NewSource(0x1337))
- d := NewReader(nil)
- for i := 0; i < n; i++ {
- padding := (rng.Int() & 0xffff) + 1
- src := make([]byte, (rng.Int()&0xfffff)+1)
- for i := range src {
- src[i] = uint8(rng.Uint32()) & 3
- }
- var dst bytes.Buffer
- e := NewWriter(&dst, WriterPadding(padding))
- // Test the added padding is invisible.
- _, err := io.Copy(e, bytes.NewBuffer(src))
- if err != nil {
- t.Fatal(err)
- }
- err = e.Close()
- if err != nil {
- t.Fatal(err)
- }
- err = e.Close()
- if err != nil {
- t.Fatal(err)
- }
- if dst.Len()%padding != 0 {
- t.Fatalf("wanted size to be mutiple of %d, got size %d with remainder %d", padding, dst.Len(), dst.Len()%padding)
- }
- var got bytes.Buffer
- d.Reset(&dst)
- _, err = io.Copy(&got, d)
- if err != nil {
- t.Fatal(err)
- }
- if !bytes.Equal(src, got.Bytes()) {
- t.Fatal("output mismatch")
- }
- // Try after reset
- dst.Reset()
- e.Reset(&dst)
- _, err = io.Copy(e, bytes.NewBuffer(src))
- if err != nil {
- t.Fatal(err)
- }
- err = e.Close()
- if err != nil {
- t.Fatal(err)
- }
- if dst.Len()%padding != 0 {
- t.Fatalf("wanted size to be mutiple of %d, got size %d with remainder %d", padding, dst.Len(), dst.Len()%padding)
- }
- got.Reset()
- d.Reset(&dst)
- _, err = io.Copy(&got, d)
- if err != nil {
- t.Fatal(err)
- }
- if !bytes.Equal(src, got.Bytes()) {
- t.Fatal("output mismatch after reset")
- }
- }
- }
- func TestBigRegularWrites(t *testing.T) {
- var buf [maxBlockSize * 2]byte
- dst := bytes.NewBuffer(nil)
- enc := NewWriter(dst, WriterBestCompression())
- max := uint8(10)
- if testing.Short() {
- max = 4
- }
- for n := uint8(0); n < max; n++ {
- for i := range buf[:] {
- buf[i] = n
- }
- // Writes may not keep a reference to the data beyond the Write call.
- _, err := enc.Write(buf[:])
- if err != nil {
- t.Fatal(err)
- }
- }
- err := enc.Close()
- if err != nil {
- t.Fatal(err)
- }
- dec := NewReader(dst)
- _, err = io.Copy(io.Discard, dec)
- if err != nil {
- t.Fatal(err)
- }
- }
- func TestBigEncodeBuffer(t *testing.T) {
- const blockSize = 1 << 20
- var buf [blockSize * 2]byte
- dst := bytes.NewBuffer(nil)
- enc := NewWriter(dst, WriterBlockSize(blockSize), WriterBestCompression())
- max := uint8(10)
- if testing.Short() {
- max = 4
- }
- for n := uint8(0); n < max; n++ {
- // Change the buffer to a new value.
- for i := range buf[:] {
- buf[i] = n
- }
- err := enc.EncodeBuffer(buf[:])
- if err != nil {
- t.Fatal(err)
- }
- // We can write it again since we aren't changing it.
- err = enc.EncodeBuffer(buf[:])
- if err != nil {
- t.Fatal(err)
- }
- err = enc.Flush()
- if err != nil {
- t.Fatal(err)
- }
- }
- err := enc.Close()
- if err != nil {
- t.Fatal(err)
- }
- dec := NewReader(dst)
- n, err := io.Copy(io.Discard, dec)
- if err != nil {
- t.Fatal(err)
- }
- t.Log(n)
- }
- func TestBigEncodeBufferSync(t *testing.T) {
- const blockSize = 1 << 20
- var buf [blockSize * 2]byte
- dst := bytes.NewBuffer(nil)
- enc := NewWriter(dst, WriterBlockSize(blockSize), WriterConcurrency(1), WriterBestCompression())
- max := uint8(10)
- if testing.Short() {
- max = 2
- }
- for n := uint8(0); n < max; n++ {
- // Change the buffer to a new value.
- for i := range buf[:] {
- buf[i] = n
- }
- // When WriterConcurrency == 1 we can encode and reuse the buffer.
- err := enc.EncodeBuffer(buf[:])
- if err != nil {
- t.Fatal(err)
- }
- }
- err := enc.Close()
- if err != nil {
- t.Fatal(err)
- }
- dec := NewReader(dst)
- n, err := io.Copy(io.Discard, dec)
- if err != nil {
- t.Fatal(err)
- }
- t.Log(n)
- }
- func BenchmarkWriterRandom(b *testing.B) {
- rng := rand.New(rand.NewSource(1))
- // Make max window so we never get matches.
- data := make([]byte, 4<<20)
- for i := range data {
- data[i] = uint8(rng.Intn(256))
- }
- for name, opts := range testOptions(b) {
- w := NewWriter(io.Discard, opts...)
- b.Run(name, func(b *testing.B) {
- b.ResetTimer()
- b.ReportAllocs()
- b.SetBytes(int64(len(data)))
- for i := 0; i < b.N; i++ {
- err := w.EncodeBuffer(data)
- if err != nil {
- b.Fatal(err)
- }
- }
- // Flush output
- w.Flush()
- })
- w.Close()
- }
- }
- func BenchmarkIndexFind(b *testing.B) {
- fatalErr := func(t testing.TB, err error) {
- if err != nil {
- t.Fatal(err)
- }
- }
- for blocks := 1; blocks <= 65536; blocks *= 2 {
- if blocks == 65536 {
- blocks = 65535
- }
- var index Index
- index.reset(100)
- index.TotalUncompressed = int64(blocks) * 100
- index.TotalCompressed = int64(blocks) * 100
- for i := 0; i < blocks; i++ {
- err := index.add(int64(i*100), int64(i*100))
- fatalErr(b, err)
- }
- rng := rand.New(rand.NewSource(0xabeefcafe))
- b.Run(fmt.Sprintf("blocks-%d", len(index.info)), func(b *testing.B) {
- b.ResetTimer()
- b.ReportAllocs()
- const prime4bytes = 2654435761
- rng2 := rng.Int63()
- for i := 0; i < b.N; i++ {
- rng2 = ((rng2 + prime4bytes) * prime4bytes) >> 32
- // Find offset:
- _, _, err := index.Find(rng2 % (int64(blocks) * 100))
- fatalErr(b, err)
- }
- })
- }
- }
|