// Copyright (c) 2023+ Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

- package s2
- import (
- "archive/tar"
- "bytes"
- "compress/gzip"
- "fmt"
- "io"
- "math/rand"
- "os"
- "testing"
- "github.com/klauspost/compress/internal/fuzz"
- "github.com/klauspost/compress/zstd"
- )
- func TestDict(t *testing.T) {
- rng := rand.New(rand.NewSource(1))
- data := make([]byte, 128<<10)
- for i := range data {
- data[i] = uint8(rng.Intn(256))
- }
- // Should match the first 64K
- d := NewDict(append([]byte{0}, data[:65536]...))
- encoded := make([]byte, MaxEncodedLen(len(data)))
- res := encodeBlockDictGo(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // Add dict that will produce a full match 5000 chars into the input.
- d = NewDict(append([]byte{0}, data[5000:65536+5000]...))
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockDictGo(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // generate copies
- for i := 1; i < len(data); {
- n := rng.Intn(32) + 4
- off := rng.Intn(len(data) - n)
- copy(data[i:], data[off:off+n])
- i += n
- }
- dict := make([]byte, 65536)
- for i := 1; i < len(dict); {
- n := rng.Intn(32) + 4
- off := rng.Intn(65536 - n)
- copy(dict[i:], data[off:off+n])
- i += n
- }
- d = NewDict(dict)
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockDictGo(encoded, data, d)
- if res == 0 || res > len(data)-20000 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- os.WriteFile("decoded.bin", decoded, os.ModePerm)
- os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- }
- func TestDictBetter(t *testing.T) {
- rng := rand.New(rand.NewSource(1))
- data := make([]byte, 128<<10)
- for i := range data {
- data[i] = uint8(rng.Intn(256))
- }
- // Should match the first 64K
- d := NewDict(append([]byte{0}, data[:65536]...))
- encoded := make([]byte, MaxEncodedLen(len(data)))
- res := encodeBlockBetterDict(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // Add dict that will produce a full match 5000 chars into the input.
- d = NewDict(append([]byte{0}, data[5000:65536+5000]...))
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockBetterDict(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // generate copies
- for i := 1; i < len(data); {
- n := rng.Intn(32) + 4
- off := rng.Intn(len(data) - n)
- copy(data[i:], data[off:off+n])
- i += n
- }
- dict := make([]byte, 65536)
- for i := 1; i < len(dict); {
- n := rng.Intn(32) + 4
- off := rng.Intn(65536 - n)
- copy(dict[i:], data[off:off+n])
- i += n
- }
- d = NewDict(dict)
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockBetterDict(encoded, data, d)
- if res == 0 || res > len(data)-20000 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- os.WriteFile("decoded.bin", decoded, os.ModePerm)
- os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- }
- func TestDictBest(t *testing.T) {
- rng := rand.New(rand.NewSource(1))
- data := make([]byte, 128<<10)
- for i := range data {
- data[i] = uint8(rng.Intn(256))
- }
- // Should match the first 64K
- d := NewDict(append([]byte{0}, data[:65536]...))
- encoded := make([]byte, MaxEncodedLen(len(data)))
- res := encodeBlockBest(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // Add dict that will produce a full match 5000 chars into the input.
- d = NewDict(append([]byte{0}, data[5000:65536+5000]...))
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockBest(encoded, data, d)
- if res == 0 || res > len(data)-65500 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- // generate copies
- for i := 1; i < len(data); {
- n := rng.Intn(32) + 4
- off := rng.Intn(len(data) - n)
- copy(data[i:], data[off:off+n])
- i += n
- }
- dict := make([]byte, 65536)
- for i := 1; i < len(dict); {
- n := rng.Intn(32) + 4
- off := rng.Intn(65536 - n)
- copy(dict[i:], data[off:off+n])
- i += n
- }
- d = NewDict(dict)
- encoded = make([]byte, MaxEncodedLen(len(data)))
- res = encodeBlockBest(encoded, data, d)
- if res == 0 || res > len(data)-20000 {
- t.Errorf("did no get expected dict saving. Saved %d bytes", len(data)-res)
- }
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- decoded = make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- os.WriteFile("decoded.bin", decoded, os.ModePerm)
- os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- }
- func TestDictBetter2(t *testing.T) {
- // Should match the first 64K
- data := []byte("10 bananas which were brown were added")
- d := NewDict(append([]byte{6}, []byte("Yesterday 25 bananas were added to Benjamins brown bag")...))
- encoded := make([]byte, MaxEncodedLen(len(data)))
- res := encodeBlockBetterDict(encoded, data, d)
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- t.Log(string(encoded))
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- }
- func TestDictBest2(t *testing.T) {
- // Should match the first 64K
- data := []byte("10 bananas which were brown were added")
- d := NewDict(append([]byte{6}, []byte("Yesterday 25 bananas were added to Benjamins brown bag")...))
- encoded := make([]byte, MaxEncodedLen(len(data)))
- res := encodeBlockBest(encoded, data, d)
- encoded = encoded[:res]
- t.Log("saved", len(data)-res, "bytes")
- t.Log(string(encoded))
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- //os.WriteFile("decoded.bin", decoded, os.ModePerm)
- //os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- }
- func TestDictSize(t *testing.T) {
- //f, err := os.Open("testdata/xlmeta.tar.s2")
- //f, err := os.Open("testdata/broken.tar.s2")
- f, err := os.Open("testdata/github_users_sample_set.tar.s2")
- //f, err := os.Open("testdata/gofiles2.tar.s2")
- //f, err := os.Open("testdata/gosrc.tar.s2")
- if err != nil {
- t.Skip(err)
- }
- stream := NewReader(f)
- in := tar.NewReader(stream)
- //rawDict, err := os.ReadFile("testdata/godict.dictator")
- rawDict, err := os.ReadFile("testdata/gofiles.dict")
- //rawDict, err := os.ReadFile("testdata/gosrc2.dict")
- //rawDict, err := os.ReadFile("testdata/td.dict")
- //rawDict, err := os.ReadFile("testdata/users.dict")
- //rawDict, err := os.ReadFile("testdata/xlmeta.dict")
- if err != nil {
- t.Fatal(err)
- }
- lidx := -1
- if di, err := zstd.InspectDictionary(rawDict); err == nil {
- rawDict = di.Content()
- lidx = len(rawDict) - di.Offsets()[0]
- } else {
- t.Errorf("Loading dict: %v", err)
- return
- }
- searchFor := ""
- if false {
- searchFor = "// Copyright 2022"
- }
- d := MakeDict(rawDict, []byte(searchFor))
- if d == nil {
- t.Fatal("no dict", lidx)
- }
- var totalIn int
- var totalOut int
- var totalCount int
- for {
- h, err := in.Next()
- if err != nil {
- break
- }
- if h.Size == 0 {
- continue
- }
- data := make([]byte, 65536)
- t.Run(h.Name, func(t *testing.T) {
- if int(h.Size) < 65536 {
- data = data[:h.Size]
- } else {
- data = data[:65536]
- }
- _, err := io.ReadFull(in, data)
- if err != nil {
- t.Skip()
- }
- if d == nil {
- // Use first file as dict
- d = MakeDict(data, nil)
- }
- // encode
- encoded := make([]byte, MaxEncodedLen(len(data)))
- totalIn += len(data)
- totalCount++
- //res := encodeBlockBest(encoded, data, nil)
- res := encodeBlockBest(encoded, data, d)
- //res := encodeBlockBetterDict(encoded, data, d)
- //res := encodeBlockBetterGo(encoded, data)
- //res := encodeBlockDictGo(encoded, data, d)
- // res := encodeBlockGo(encoded, data)
- if res == 0 {
- totalOut += len(data)
- return
- }
- totalOut += res
- encoded = encoded[:res]
- //t.Log("encoded", len(data), "->", res, "saved", len(data)-res, "bytes")
- decoded := make([]byte, len(data))
- res = s2DecodeDict(decoded, encoded, d)
- if res != 0 {
- t.Fatalf("got result: %d", res)
- }
- if !bytes.Equal(decoded, data) {
- os.WriteFile("decoded.bin", decoded, os.ModePerm)
- os.WriteFile("original.bin", data, os.ModePerm)
- t.Fatal("decoded mismatch")
- }
- })
- }
- fmt.Printf("%d files, %d -> %d (%.2f%%) - %.02f bytes saved/file\n", totalCount, totalIn, totalOut, float64(totalOut*100)/float64(totalIn), float64(totalIn-totalOut)/float64(totalCount))
- }
- func FuzzDictBlocks(f *testing.F) {
- fuzz.AddFromZip(f, "testdata/enc_regressions.zip", fuzz.TypeRaw, false)
- fuzz.AddFromZip(f, "testdata/fuzz/block-corpus-raw.zip", fuzz.TypeRaw, testing.Short())
- fuzz.AddFromZip(f, "testdata/fuzz/block-corpus-enc.zip", fuzz.TypeGoFuzz, testing.Short())
- // Fuzzing tweaks:
- const (
- // Max input size:
- maxSize = 8 << 20
- )
- file, err := os.Open("testdata/s2-dict.bin.gz")
- if err != nil {
- f.Fatal(err)
- }
- gzr, err := gzip.NewReader(file)
- if err != nil {
- f.Fatal(err)
- }
- dictBytes, err := io.ReadAll(gzr)
- if err != nil {
- f.Fatal(err)
- }
- dict := NewDict(dictBytes)
- if dict == nil {
- f.Fatal("invalid dict")
- }
- f.Fuzz(func(t *testing.T, data []byte) {
- if len(data) > maxSize {
- return
- }
- writeDst := make([]byte, MaxEncodedLen(len(data)), MaxEncodedLen(len(data))+4)
- writeDst = append(writeDst, 1, 2, 3, 4)
- defer func() {
- got := writeDst[MaxEncodedLen(len(data)):]
- want := []byte{1, 2, 3, 4}
- if !bytes.Equal(got, want) {
- t.Fatalf("want %v, got %v - dest modified outside cap", want, got)
- }
- }()
- compDst := writeDst[:MaxEncodedLen(len(data)):MaxEncodedLen(len(data))] // Hard cap
- decDst := make([]byte, len(data))
- comp := dict.Encode(compDst, data)
- decoded, err := dict.Decode(decDst, comp)
- if err != nil {
- t.Error(err)
- return
- }
- if !bytes.Equal(data, decoded) {
- t.Error("block decoder mismatch")
- return
- }
- if mel := MaxEncodedLen(len(data)); len(comp) > mel {
- t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
- return
- }
- comp = dict.EncodeBetter(compDst, data)
- decoded, err = dict.Decode(decDst, comp)
- if err != nil {
- t.Error(err)
- return
- }
- if !bytes.Equal(data, decoded) {
- t.Error("block decoder mismatch")
- return
- }
- if mel := MaxEncodedLen(len(data)); len(comp) > mel {
- t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
- return
- }
- comp = dict.EncodeBest(compDst, data)
- decoded, err = dict.Decode(decDst, comp)
- if err != nil {
- t.Error(err)
- return
- }
- if !bytes.Equal(data, decoded) {
- t.Error("block decoder mismatch")
- return
- }
- if mel := MaxEncodedLen(len(data)); len(comp) > mel {
- t.Error(fmt.Errorf("MaxEncodedLen Exceed: input: %d, mel: %d, got %d", len(data), mel, len(comp)))
- return
- }
- })
- }
|