Browse Source

Do not use minilzo and quicklz in open source. Export it to github.
d4d08d59dfff0c48a950a3faa36be4ac7e060912

iddqd 9 months ago
parent
commit
0f8b43a279

+ 76 - 0
library/cpp/streams/factory/open_by_file_extension/factory.cpp

@@ -0,0 +1,76 @@
+#include "factory.h"
+
+#include <library/cpp/streams/bzip2/bzip2.h>
+#include <library/cpp/streams/factory/open_common/factory.h>
+#include <util/stream/holder.h>
+#include <util/stream/file.h>
+#include <util/stream/output.h>
+#include <util/stream/zlib.h>
+#include <util/system/file.h>
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/generic/store_policy.h>
+
+namespace {
+    /**
+     * Glues a stream of type `T` (kept in the `TEmbedPolicy<T>` base) to a codec
+     * `TDecoder` that is constructed on top of it.
+     *
+     * `TEmbedPolicy<T>` is listed first among the bases, so it is initialized
+     * before `TDecoder`, whose constructor receives the policy's `Ptr()`.
+     */
+    template <class T, class TDecoder>
+    class TCompressed: public TEmbedPolicy<T>, public TDecoder {
+        using TStreamStorage = TEmbedPolicy<T>;
+
+    public:
+        /// Builds the underlying stream from `c` and hands its pointer to the codec.
+        template <class C>
+        TCompressed(const C& c)
+            : TStreamStorage(c)
+            , TDecoder(TStreamStorage::Ptr())
+        {
+        }
+
+        /// Same as above, additionally forwarding `compressionLevel` and `buflen` to the codec.
+        template <class C>
+        TCompressed(const C& c, size_t compressionLevel, size_t buflen)
+            : TStreamStorage(c)
+            , TDecoder(TStreamStorage::Ptr(), compressionLevel, buflen)
+        {
+        }
+
+        ~TCompressed() override = default;
+    };
+
+    /**
+     * `TZLibCompress` with the stream type fixed to `ZLib::GZip`.
+     *
+     * Offers the one-argument and the (stream, level, buffer) constructor shapes,
+     * always inserting `ZLib::GZip` as the second argument of the base constructor.
+     */
+    class TGZipCompress: public TZLibCompress {
+        using TBase = TZLibCompress;
+
+    public:
+        TGZipCompress(IOutputStream* slave)
+            : TBase(slave, ZLib::GZip)
+        {
+        }
+
+        TGZipCompress(IOutputStream* slave, size_t level, size_t bufferSize)
+            : TBase(slave, ZLib::GZip, level, bufferSize)
+        {
+        }
+    };
+}
+
+// Picks the input stream by the look of `url`:
+//   empty or "-"   -> OpenStdin()
+//   "*.gz"         -> TBufferedZLibDecompress on top of a TFileInput
+//   "*.bz2"        -> TBZipDecompress on top of a TFileInput
+//   anything else  -> plain TFileInput
+THolder<IInputStream> OpenInput(const TString& url) {
+    const bool wantsStdin = url.empty() || url == TStringBuf("-");
+
+    if (wantsStdin) {
+        return OpenStdin();
+    } else if (url.EndsWith(TStringBuf(".gz"))) {
+        using TGzFileInput = TCompressed<TFileInput, TBufferedZLibDecompress>;
+        return MakeHolder<TGzFileInput>(url);
+    } else if (url.EndsWith(TStringBuf(".bz2"))) {
+        using TBz2FileInput = TCompressed<TFileInput, TBZipDecompress>;
+        return MakeHolder<TBz2FileInput>(url);
+    }
+
+    return MakeHolder<TFileInput>(url);
+}
+
+// Picks the output stream by the look of `url`:
+//   empty or "-"   -> TFileOutput over Duplicate(1)
+//   "*.gz"         -> TGZipCompress on top of a TFileOutput
+//   "*.bz2"        -> TBZipCompress on top of a TFileOutput
+//   anything else  -> plain TFileOutput
+// `compressionLevel` and `buflen` are only consumed by the two compressing branches.
+THolder<IOutputStream> OpenOutput(const TString& url, ECompression compressionLevel, size_t buflen) {
+    if (url.empty() || url == TStringBuf("-")) {
+        // NOTE(review): Duplicate(1) presumably dups the stdout descriptor so that
+        // destroying the returned stream leaves fd 1 itself open - confirm in util/system/file.h.
+        return MakeHolder<TFileOutput>(Duplicate(1));
+    }
+
+    // The codecs take the level as a plain number.
+    const size_t level = static_cast<size_t>(compressionLevel);
+
+    if (url.EndsWith(TStringBuf(".gz"))) {
+        using TGzFileOutput = TCompressed<TFileOutput, TGZipCompress>;
+        return MakeHolder<TGzFileOutput>(url, level, buflen);
+    }
+
+    if (url.EndsWith(TStringBuf(".bz2"))) {
+        using TBz2FileOutput = TCompressed<TFileOutput, TBZipCompress>;
+        return MakeHolder<TBz2FileOutput>(url, level, buflen);
+    }
+
+    return MakeHolder<TFileOutput>(url);
+}

+ 50 - 0
library/cpp/streams/factory/open_by_file_extension/factory.h

@@ -0,0 +1,50 @@
+#pragma once
+
+#include <util/generic/fwd.h>
+#include <util/generic/ptr.h>
+#include <util/stream/fwd.h>
+#include <util/stream/output.h>  // IOutputStream type must be complete to destroy in THolder below
+
+/**
+ * Convenience function for opening an input file passed as one of program
+ * arguments. Handles `-` as standard input, and creates a decompressing stream
+ * for `gz` and `bz2` files.
+ *
+ * @param url                           File to open.
+ */
+THolder<IInputStream> OpenInput(const TString& url);
+
+/**
+ * Compression level for `OpenOutput`. Enumerators `L1`..`L9` carry the numeric
+ * values 1..9; `FAST`, `DEFAULT` and `BEST` are aliases of `L1`, `L6` and `L9`.
+ */
+enum class ECompression {
+    L1 = 1,
+    L2 = 2,
+    L3 = 3,
+    L4 = 4,
+    L5 = 5,
+    L6 = 6,
+    L7 = 7,
+    L8 = 8,
+    L9 = 9,
+
+    // Named aliases of the numeric levels above.
+    FAST = L1,
+    DEFAULT = L6,
+    BEST = L9,
+};
+
+/**
+ * Convenience function for opening an output file passed as one of program
+ * arguments. Handles `-` as standard output, and creates a compressing stream
+ * for `gz` and `bz2` files with given compression level and buffer size.
+ *
+ * @param url                           File to open.
+ * @param compressionLevel              Compression level.
+ * @param buflen                        Compression buffer length in bytes.
+ */
+THolder<IOutputStream> OpenOutput(const TString& url, ECompression compressionLevel, size_t buflen);
+
+// Shorthand for the three-argument OpenOutput() with an 8 * 1024 byte compression buffer.
+inline THolder<IOutputStream> OpenOutput(const TString& url, ECompression compressionLevel) {
+    constexpr size_t defaultBufLen = 8 * 1024;
+    return ::OpenOutput(url, compressionLevel, defaultBufLen);
+}
+
+// Shorthand for OpenOutput(url, ECompression::DEFAULT).
+inline THolder<IOutputStream> OpenOutput(const TString& url) {
+    constexpr ECompression level = ECompression::DEFAULT;
+    return ::OpenOutput(url, level);
+}
+

+ 83 - 0
library/cpp/streams/factory/open_by_file_extension/factory_ut.cpp

@@ -0,0 +1,83 @@
+#include "factory.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/buffer.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/buffer.h>
+#include <util/stream/file.h>
+#include <util/stream/mem.h>
+#include <util/stream/zlib.h>
+#include <util/system/env.h>
+
+static const TString plain = "aaaaaaaaaaabbbbbbbbbbbdddddd22222222000000aldkfa9s3jsfkjlkja909090909090q3lkjalkjf3aldjl";
+
+static const ui8 gz[] = {31, 139, 8, 8, 126, 193, 203, 80, 0, 3, 97, 46, 116, 120, 116, 0, 75, 76, 132, 131, 36, 4, 72, 1, 3, 35, 40, 48, 0, 131, 196, 156, 148, 236, 180, 68, 203, 98, 227, 172, 226, 180, 236, 172, 156, 236, 172, 68, 75, 3, 4, 44, 52, 6, 137, 0, 113, 154, 49, 80, 97, 86, 14, 0, 5, 203, 67, 131, 88, 0, 0, 0};
+static const auto gzLength = Y_ARRAY_SIZE(gz);
+
+static const ui8 gzLvl6[] = {31, 139, 8, 0, 0, 0, 0, 0, 0, 3, 75, 76, 132, 131, 36, 4, 72, 1, 3, 35, 40, 48, 0, 131, 196, 156, 148, 236, 180, 68, 203, 98, 227, 172, 226, 180, 236, 172, 156, 236, 172, 68, 75, 3, 4, 44, 52, 6, 137, 0, 113, 154, 49, 80, 97, 86, 14, 0, 5, 203, 67, 131, 88, 0, 0, 0};
+static const auto gzLvl6Length = Y_ARRAY_SIZE(gzLvl6);
+
+static const ui8 bz2[] = {66, 90, 104, 57, 49, 65, 89, 38, 83, 89, 140, 92, 215, 106, 0, 0, 17, 73, 128, 20, 128, 88, 32, 53, 28, 40, 0, 32, 0, 84, 66, 52, 211, 0, 6, 72, 122, 140, 131, 36, 97, 60, 92, 230, 1, 71, 91, 170, 135, 33, 135, 149, 133, 75, 174, 153, 146, 217, 24, 174, 177, 76, 246, 69, 254, 225, 195, 236, 95, 180, 93, 201, 20, 225, 66, 66, 49, 115, 93, 168};
+static const auto bz2Length = Y_ARRAY_SIZE(bz2);
+
+static const ui8 bz2Lvl6[] = {66, 90, 104, 54, 49, 65, 89, 38, 83, 89, 140, 92, 215, 106, 0, 0, 17, 73, 128, 20, 128, 88, 32, 53, 28, 40, 0, 32, 0, 84, 66, 52, 211, 0, 6, 72, 122, 140, 131, 36, 97, 60, 92, 230, 1, 71, 91, 170, 135, 33, 135, 149, 133, 75, 174, 153, 146, 217, 24, 174, 177, 76, 246, 69, 254, 225, 195, 236, 95, 180, 93, 201, 20, 225, 66, 66, 49, 115, 93, 168};
+static const auto bz2Lvl6Length = Y_ARRAY_SIZE(bz2Lvl6);
+
+Y_UNIT_TEST_SUITE(TRecognizeCompressorTest) {
+    // Reads the whole file through the factory under test.
+    static TString ReadViaFactory(const TString& path) {
+        return OpenInput(path)->ReadAll();
+    }
+
+    // Reads the file bytes exactly as stored, bypassing the factory.
+    static TString ReadRaw(const TString& path) {
+        return TFileInput{path}.ReadAll();
+    }
+
+    // Views one of the ui8 fixtures as a character buffer for comparison.
+    static TStringBuf AsChars(const ui8* bytes, size_t size) {
+        return TStringBuf(reinterpret_cast<const char*>(bytes), size);
+    }
+
+    // Writes `data` with OpenOutput and expects OpenInput to give it back unchanged.
+    static void TestReadWrite(const TString& path, const TString& data) {
+        OpenOutput(path)->Write(data.data(), data.size());
+        UNIT_ASSERT_VALUES_EQUAL(ReadViaFactory(path), data);
+    }
+
+    // --- OpenInput on files prepared by hand ---
+
+    Y_UNIT_TEST(TestOpenInput) {
+        const TString path = "test_open_input.file";
+        TFileOutput{path}.Write(plain);
+        UNIT_ASSERT_VALUES_EQUAL(ReadViaFactory(path), plain);
+    }
+
+    Y_UNIT_TEST(TestOpenInputZlib) {
+        const TString path = "test_open_input_zlib.file.gz";
+        TFileOutput{path}.Write(gz, gzLength);
+        UNIT_ASSERT_VALUES_EQUAL(ReadViaFactory(path), plain);
+    }
+
+    Y_UNIT_TEST(TestOpenInputBZ2) {
+        const TString path = "test_open_input_bz2.file.bz2";
+        TFileOutput{path}.Write(bz2, bz2Length);
+        UNIT_ASSERT_VALUES_EQUAL(ReadViaFactory(path), plain);
+    }
+
+    // --- OpenOutput checked against the expected on-disk bytes ---
+
+    Y_UNIT_TEST(TestOpenOutput) {
+        const TString path = "test_open_output.file";
+        OpenOutput(path)->Write(plain);
+        UNIT_ASSERT_VALUES_EQUAL(ReadRaw(path), plain);
+    }
+
+    Y_UNIT_TEST(TestOpenOutputZlib) {
+        const TString path = "test_open_output_zlib.file.gz";
+        OpenOutput(path)->Write(plain);
+        UNIT_ASSERT_VALUES_EQUAL(ReadRaw(path), AsChars(gzLvl6, gzLvl6Length));
+    }
+
+    Y_UNIT_TEST(TestOpenOutputBZ2) {
+        const TString path = "test_open_output_bz2.file.bz2";
+        OpenOutput(path)->Write(plain);
+        UNIT_ASSERT_VALUES_EQUAL(ReadRaw(path), AsChars(bz2Lvl6, bz2Lvl6Length));
+    }
+
+    // --- OpenOutput followed by OpenInput ---
+
+    Y_UNIT_TEST(TestOpenInputOpenOutputSimple) {
+        TestReadWrite(TString("test_open_input_open_output.file"), plain);
+    }
+
+    Y_UNIT_TEST(TestOpenInputOpenOutputZLib) {
+        TestReadWrite(TString("test_open_input_open_output_zlib.file.gz"), plain);
+    }
+
+    Y_UNIT_TEST(TestOpenInputOpenOutputBZ2) {
+        TestReadWrite(TString("test_open_input_open_output_bz2.file.bz2"), plain);
+    }
+}

+ 7 - 0
library/cpp/streams/factory/open_by_file_extension/ut/ya.make

@@ -0,0 +1,7 @@
+UNITTEST_FOR(library/cpp/streams/factory/open_by_file_extension)
+
+SRCS(
+    factory_ut.cpp
+)
+
+END()

+ 16 - 0
library/cpp/streams/factory/open_by_file_extension/ya.make

@@ -0,0 +1,16 @@
+LIBRARY()
+
+PEERDIR(
+    library/cpp/streams/bzip2
+    library/cpp/streams/factory/open_common
+)
+
+SRCS(
+    factory.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+    ut
+)

+ 99 - 0
library/cpp/streams/factory/open_by_signature/factory.cpp

@@ -0,0 +1,99 @@
+#include "factory.h"
+
+#include <library/cpp/streams/bzip2/bzip2.h>
+#include <library/cpp/streams/factory/open_common/factory.h>
+#include <util/stream/holder.h>
+#include <util/stream/file.h>
+#include <library/cpp/streams/lz/lz.h>
+#include <util/stream/str.h>
+#include <util/stream/zlib.h>
+#include <util/stream/multi.h>
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+
+namespace {
+    /**
+     * Base used by TStringMultiInput to obtain a raw pointer to the tail stream.
+     *
+     * Primary template: `T` is a pointer-like class providing `Get()` and `Swap()`.
+     * `Set()` swaps the passed value into this base object and returns what
+     * `Get()` yields afterwards.
+     */
+    template <class T>
+    struct TInputHolderX: public T {
+        using TRawPtr = decltype(T().Get());
+
+        TRawPtr Set(T t) noexcept {
+            t.Swap(*this);
+
+            return this->Get();
+        }
+    };
+
+    /// Raw-pointer case: nothing is stored, the pointer is handed back as is.
+    template <class T>
+    struct TInputHolderX<T*> {
+        static T* Set(T* raw) noexcept {
+            return raw;
+        }
+    };
+
+    /**
+     * TMultiInput built from a TStringInput over a private copy of `head` and the
+     * stream `tail`.
+     *
+     * The order of the bases is load-bearing: the tail holder, the string copy and
+     * the holder of the TStringInput are all initialized before TMultiInput, which
+     * is given raw pointers into them.
+     */
+    template <class TInput>
+    struct TStringMultiInput: private TInputHolderX<TInput>, private TString, private THolder<IInputStream>, public TMultiInput {
+    private:
+        using TTailHolder = TInputHolderX<TInput>;
+        using THeadStream = THolder<IInputStream>;
+
+    public:
+        TStringMultiInput(const TString& head, TInput tail)
+            : TString(head)
+            , THeadStream(new TStringInput(*this))
+            , TMultiInput(THeadStream::Get(), TTailHolder::Set(tail))
+        {
+        }
+
+        ~TStringMultiInput() override = default;
+    };
+}
+
+// Loads up to four leading bytes of `input`, then decides by them how to wrap the stream:
+//   1. with a full four-byte header, TryOpenOwnedLzDecompressor() gets the first try;
+//   2. a gzip ("\x1F\x8B") or zlib ("\x78\x9C") prefix selects TBufferedZLibDecompress;
+//   3. a "BZ" prefix selects TBZipDecompress;
+//   4. otherwise the data is returned undecoded.
+// In every case the consumer reads from a TStringMultiInput built from the loaded
+// header and `input`.
+template <class TInput>
+THolder<IInputStream> OpenMaybeCompressedInputX(TInput input) {
+    constexpr size_t signatureSize = 4;
+    constexpr TStringBuf gzipMagic = "\x1F\x8B";
+    constexpr TStringBuf zlibMagic = "\x78\x9C";
+    constexpr TStringBuf bzip2Magic = "BZ";
+
+    char signature[signatureSize];
+    const size_t loaded = input->Load(signature, signatureSize);
+    const TString header(signature, loaded);
+
+    if (header.size() == signatureSize) {
+        // The LZ probe gets its own wrapper; a fresh one is built below when the probe
+        // returns nothing.
+        // NOTE(review): this relies on a failed probe consuming nothing past the
+        // replayed header - confirm against TryOpenOwnedLzDecompressor.
+        THolder<IInputStream> lzStream = TryOpenOwnedLzDecompressor(new TStringMultiInput<TInput>(header, input));
+
+        if (lzStream.Get() != nullptr) {
+            return lzStream;
+        }
+    }
+
+    THolder<IInputStream> replayed(new TStringMultiInput<TInput>(header, input));
+
+    const bool looksLikeZlib = header.StartsWith(gzipMagic) || header.StartsWith(zlibMagic);
+    if (looksLikeZlib) {
+        return MakeHolder<THoldingStream<TBufferedZLibDecompress>>(std::move(replayed));
+    }
+
+    const bool looksLikeBzip2 = header.StartsWith(bzip2Magic);
+    if (looksLikeBzip2) {
+        return MakeHolder<THoldingStream<TBZipDecompress>>(std::move(replayed));
+    }
+
+    return replayed;
+}
+
+// Raw-pointer flavour: `input` is passed to the detection routine as a plain pointer.
+THolder<IInputStream> OpenMaybeCompressedInput(IInputStream* input) {
+    return OpenMaybeCompressedInputX<IInputStream*>(input);
+}
+
+// Owning flavour: `input` is converted to a TAtomicSharedPtr before detection,
+// because the detection routine passes its argument by value to more than one wrapper.
+THolder<IInputStream> OpenOwnedMaybeCompressedInput(THolder<IInputStream> input) {
+    using TSharedInput = TAtomicSharedPtr<IInputStream>;
+    return OpenMaybeCompressedInputX(TSharedInput(input));
+}
+
+// Path flavour: an empty path or "-" reads from OpenStdin(), anything else from a
+// TFileInput on `path`; the opened stream is owned by the returned one.
+THolder<IInputStream> OpenMaybeCompressedInput(const TString& path) {
+    const bool useStdin = path.empty() || path == TStringBuf("-");
+
+    if (!useStdin) {
+        return OpenOwnedMaybeCompressedInput(MakeHolder<TFileInput>(path));
+    }
+
+    return OpenOwnedMaybeCompressedInput(OpenStdin());
+}
+
+// Path flavour with `bufSize` forwarded to OpenStdin() / TFileInput: an empty path or
+// "-" reads from standard input, anything else from the file at `path`.
+THolder<IInputStream> OpenMaybeCompressedInput(const TString& path, ui32 bufSize) {
+    const bool useStdin = path.empty() || path == TStringBuf("-");
+
+    if (!useStdin) {
+        return OpenOwnedMaybeCompressedInput(MakeHolder<TFileInput>(path, bufSize));
+    }
+
+    return OpenOwnedMaybeCompressedInput(OpenStdin(bufSize));
+}

+ 37 - 0
library/cpp/streams/factory/open_by_signature/factory.h

@@ -0,0 +1,37 @@
+#pragma once
+
+#include <util/generic/fwd.h>
+#include <util/generic/ptr.h>
+#include <util/stream/fwd.h>
+
+/**
+ * Peeks into the provided input stream to determine its compression format,
+ * if any, and returns a corresponding decompressing stream. If the stream is
+ * not compressed, then returns a simple pass-through proxy stream.
+ *
+ * Note that returned stream doesn't own the provided input stream, thus it's
+ * up to the user to free them both.
+ *
+ * @param input                         Input stream.
+ * @returns                             Newly constructed stream.
+ */
+THolder<IInputStream> OpenMaybeCompressedInput(IInputStream* input);
+
+/**
+ * Same as `OpenMaybeCompressedInput`, but returned stream owns the one passed
+ * into this function.
+ *
+ * @param input                         Input stream.
+ * @returns                             Newly constructed stream.
+ * @see OpenMaybeCompressedInput(IInputStream*)
+ */
+THolder<IInputStream> OpenOwnedMaybeCompressedInput(THolder<IInputStream> input);
+
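+/**
+ * Opens the file at `path` and wraps it the same way as
+ * `OpenOwnedMaybeCompressedInput`; an empty path or `-` selects standard input
+ * instead of a file.
+ *
+ * @param path                          File to open, or `-` / empty for stdin.
+ * @returns                             Newly constructed stream.
+ * @see OpenMaybeCompressedInput(IInputStream*)
+ */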
+THolder<IInputStream> OpenMaybeCompressedInput(const TString& path);
+
+THolder<IInputStream> OpenMaybeCompressedInput(const TString& path, ui32 bufSize);

+ 86 - 0
library/cpp/streams/factory/open_by_signature/factory_ut.cpp

@@ -0,0 +1,86 @@
+#include "factory.h"
+
+#include <library/cpp/streams/lz/lz.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/buffer.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/buffer.h>
+#include <util/stream/file.h>
+#include <util/stream/mem.h>
+#include <util/stream/zlib.h>
+#include <util/system/env.h>
+
+static const TString plain = "aaaaaaaaaaabbbbbbbbbbbdddddd22222222000000aldkfa9s3jsfkjlkja909090909090q3lkjalkjf3aldjl";
+
+static const ui8 gz[] = {31, 139, 8, 8, 126, 193, 203, 80, 0, 3, 97, 46, 116, 120, 116, 0, 75, 76, 132, 131, 36, 4, 72, 1, 3, 35, 40, 48, 0, 131, 196, 156, 148, 236, 180, 68, 203, 98, 227, 172, 226, 180, 236, 172, 156, 236, 172, 68, 75, 3, 4, 44, 52, 6, 137, 0, 113, 154, 49, 80, 97, 86, 14, 0, 5, 203, 67, 131, 88, 0, 0, 0};
+static const auto gzLength = Y_ARRAY_SIZE(gz);
+
+static const ui8 bz2[] = {66, 90, 104, 57, 49, 65, 89, 38, 83, 89, 140, 92, 215, 106, 0, 0, 17, 73, 128, 20, 128, 88, 32, 53, 28, 40, 0, 32, 0, 84, 66, 52, 211, 0, 6, 72, 122, 140, 131, 36, 97, 60, 92, 230, 1, 71, 91, 170, 135, 33, 135, 149, 133, 75, 174, 153, 146, 217, 24, 174, 177, 76, 246, 69, 254, 225, 195, 236, 95, 180, 93, 201, 20, 225, 66, 66, 49, 115, 93, 168};
+static const auto bz2Length = Y_ARRAY_SIZE(bz2);
+
+Y_UNIT_TEST_SUITE(TRecognizeCompressorTest) {
+    // Common checks: a stream was returned and reading it to the end yields `expected`.
+    static void CheckDecodesTo(const THolder<IInputStream>& stream, const TString& expected) {
+        UNIT_ASSERT_VALUES_UNEQUAL(stream.Get(), nullptr);
+        UNIT_ASSERT_VALUES_EQUAL(stream->ReadAll(), expected);
+    }
+
+    // Non-owning entry point: `source` lives on the stack for the duration of the check.
+    static void TestRawData(const void* data, size_t len, const TString& orig) {
+        TMemoryInput source(data, len);
+
+        CheckDecodesTo(OpenMaybeCompressedInput(&source), orig);
+    }
+
+    // Owning entry point: the memory stream is handed over to the factory.
+    static void TestRawDataOwned(const void* data, size_t len, const TString& orig) {
+        CheckDecodesTo(OpenOwnedMaybeCompressedInput(MakeHolder<TMemoryInput>(data, len)), orig);
+    }
+
+    // Data without a recognised signature must come back unchanged from both entry points.
+    static inline void TestSame(const TString& text) {
+        TestRawData(text.data(), text.size(), text);
+        TestRawDataOwned(text.data(), text.size(), text);
+    }
+
+    // Compresses `plain` with TCompress into memory and checks that it is recognised.
+    template <typename TCompress>
+    static void TestCompress() {
+        TBufferStream packed;
+        {
+            // Scoped so that the compressor is destroyed before the buffer is read.
+            TCompress compressor(&packed);
+            compressor.Write(plain.data(), plain.size());
+        }
+        TestRawData(packed.Buffer().Data(), packed.Buffer().Size(), plain);
+    }
+
+    Y_UNIT_TEST(TestPlain) {
+        TestSame(plain);
+
+        // Inputs of zero to four bytes, i.e. up to the length of the loaded signature.
+        for (const char* text : {"", "a", "ab", "abc", "abcd"}) {
+            TestSame(text);
+        }
+    }
+
+    Y_UNIT_TEST(TestGzip) {
+        TestRawData(gz, gzLength, plain);
+        TestRawDataOwned(gz, gzLength, plain);
+    }
+
+    Y_UNIT_TEST(TestBzip2) {
+        TestRawData(bz2, bz2Length, plain);
+        TestRawDataOwned(bz2, bz2Length, plain);
+    }
+
+    Y_UNIT_TEST(TestLz) {
+        TestCompress<TLz4Compress>();
+        TestCompress<TSnappyCompress>();
+#ifndef OPENSOURCE
+        // These two codecs are exercised only when OPENSOURCE is not defined.
+        TestCompress<TLzoCompress>();
+        TestCompress<TLzqCompress>();
+#endif
+        TestCompress<TLzfCompress>();
+    }
+
+    Y_UNIT_TEST(TestZlib) {
+        TestCompress<TZLibCompress>();
+    }
+}

+ 11 - 0
library/cpp/streams/factory/open_by_signature/ut/ya.make

@@ -0,0 +1,11 @@
+UNITTEST_FOR(library/cpp/streams/factory/open_by_signature)
+
+IF(OPENSOURCE)
+    CFLAGS(-DOPENSOURCE)
+ENDIF()
+
+SRCS(
+    factory_ut.cpp
+)
+
+END()

+ 17 - 0
library/cpp/streams/factory/open_by_signature/ya.make

@@ -0,0 +1,17 @@
+LIBRARY()
+
+PEERDIR(
+    library/cpp/streams/bzip2
+    library/cpp/streams/factory/open_common
+    library/cpp/streams/lz
+)
+
+SRCS(
+    factory.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+    ut
+)

Some files were not shown because too many files changed in this diff