Browse Source

Ability for ydb-cli to write request results as arrow parquet (#673)

Возможность записи parquet через ydb-cli
Олег 1 year ago
parent
commit
f0d82138f9

+ 224 - 0
ydb/library/arrow_parquet/result_set_parquet_printer.cpp

@@ -0,0 +1,224 @@
+#include "result_set_parquet_printer.h"
+
+#include <ydb/public/sdk/cpp/client/ydb_value/value.h>
+#include <ydb/public/sdk/cpp/client/ydb_result/result.h>
+
+#include <contrib/libs/apache/arrow/cpp/src/arrow/io/file.h>
+#include <contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h>
+#include <contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h>
+#include <contrib/libs/apache/arrow/cpp/src/parquet/schema.h>
+
+#include <util/folder/path.h>
+
+namespace NYdb {
+
+    class TResultSetParquetPrinter::TImpl {
+    public:
+        explicit TImpl(const std::string& outputPath, ui64 rowGroupSize);
+        void Reset();
+        void Print(const TResultSet& resultSet);
+
+    private:
+        void InitStream(const TResultSet& resultSet);
+        static parquet::schema::NodePtr ToParquetType(const char* name, const TTypeParser& type, bool nullable);
+
+    private:
+        std::unique_ptr<parquet::StreamWriter> Stream;
+        const std::string OutputPath;
+        const ui64 RowGroupSize;
+    };
+
+    void TResultSetParquetPrinter::TImpl::InitStream(const TResultSet& resultSet) {
+        parquet::schema::NodeVector fields;
+        for (const auto& field : resultSet.GetColumnsMeta()) {
+            TTypeParser type(field.Type);
+            bool nullable = false;
+            if (type.GetKind() == TTypeParser::ETypeKind::Optional) {
+                nullable = true;
+                type.OpenOptional();
+            }
+            fields.emplace_back(ToParquetType(field.Name.c_str(), type, nullable));
+        }
+        auto schema = std::static_pointer_cast<parquet::schema::GroupNode>(
+            parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
+        parquet::WriterProperties::Builder builder;
+        builder.compression(parquet::Compression::ZSTD);
+        builder.disable_dictionary();
+        std::shared_ptr<arrow::io::OutputStream> outstream;
+        if (OutputPath.empty()) {
+            outstream = std::make_shared<arrow::io::StdoutStream>();
+        } else {
+            if (auto parent = TFsPath(OutputPath.c_str()).Parent()) {
+                parent.MkDirs();
+            }
+            outstream = *arrow::io::FileOutputStream::Open(OutputPath);
+        }
+        Stream = std::make_unique<parquet::StreamWriter>(parquet::ParquetFileWriter::Open(outstream, schema, builder.build()));
+        Stream->SetMaxRowGroupSize(RowGroupSize);
+    }
+
+    parquet::schema::NodePtr TResultSetParquetPrinter::TImpl::ToParquetType(const char* name, const TTypeParser& type, bool nullable) {
+        if (type.GetKind() != TTypeParser::ETypeKind::Primitive) {
+            ythrow yexception() << "Cannot save not primitive type to parquet: " << type.GetKind();
+        }
+        const auto repType = nullable ? parquet::Repetition::OPTIONAL : parquet::Repetition::REQUIRED;
+        switch (type.GetPrimitive()) {
+        case EPrimitiveType::Bool:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BOOLEAN);
+        case EPrimitiveType::Int8:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::INT_8);
+        case EPrimitiveType::Uint8:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::UINT_8);
+        case EPrimitiveType::Int16:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::INT_16);
+        case EPrimitiveType::Uint16:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::UINT_16);
+        case EPrimitiveType::Int32:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::INT_32);
+        case EPrimitiveType::Uint32:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::UINT_32);
+        case EPrimitiveType::Int64:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT64, parquet::ConvertedType::INT_64);
+        case EPrimitiveType::Uint64:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT64, parquet::ConvertedType::UINT_64);
+        case EPrimitiveType::Float:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::FLOAT);
+        case EPrimitiveType::Double:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::DOUBLE);
+        case EPrimitiveType::Date:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT32, parquet::ConvertedType::UINT_32);
+        case EPrimitiveType::Timestamp:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT64, parquet::ConvertedType::INT_64);
+        case EPrimitiveType::Interval:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::INT64, parquet::ConvertedType::INT_64);
+        case EPrimitiveType::String:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        case EPrimitiveType::Utf8:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        case EPrimitiveType::Yson:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        case EPrimitiveType::Json:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        case EPrimitiveType::JsonDocument:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        case EPrimitiveType::DyNumber:
+            return parquet::schema::PrimitiveNode::Make(name, repType, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::UTF8);
+        default:
+            ythrow yexception() << "Cannot save type to parquet: " << type.GetPrimitive();
+        }
+    }
+
+    TResultSetParquetPrinter::TImpl::TImpl(const std::string& outputPath, ui64 rowGroupSize)
+        : OutputPath(outputPath)
+        , RowGroupSize(rowGroupSize)
+    {}
+
+    void TResultSetParquetPrinter::TImpl::Reset() {
+        Stream.reset();
+    }
+
+    void TResultSetParquetPrinter::TImpl::Print(const TResultSet& resultSet) {
+        if (!Stream) {
+            InitStream(resultSet);
+        }
+        auto& os = *Stream;
+        TResultSetParser parser(resultSet);
+        while (parser.TryNextRow()) {
+            for (ui32 i = 0; i < resultSet.GetColumnsMeta().size(); ++i) {
+                TValueParser value(parser.GetValue(i));
+                bool nullable = value.GetKind() == TTypeParser::ETypeKind::Optional;
+                if (nullable) {
+                    value.OpenOptional();
+                    if (value.IsNull()) {
+                        os.SkipColumns(1);
+                        continue;
+                    }
+                }
+                if (value.GetKind() != TTypeParser::ETypeKind::Primitive) {
+                    ythrow yexception() << "Cannot save not primitive type to parquet: " << value.GetKind();
+                }
+                switch (value.GetPrimitiveType()) {
+                case EPrimitiveType::Bool:
+                    os << value.GetBool();
+                    break;
+                case EPrimitiveType::Int8:
+                    os << value.GetInt8();
+                    break;
+                case EPrimitiveType::Uint8:
+                    os << value.GetUint8();
+                    break;
+                case EPrimitiveType::Int16:
+                    os << value.GetInt16();
+                    break;
+                case EPrimitiveType::Uint16:
+                    os << value.GetUint16();
+                    break;
+                case EPrimitiveType::Int32:
+                    os << value.GetInt32();
+                    break;
+                case EPrimitiveType::Uint32:
+                    os << value.GetUint32();
+                    break;
+                case EPrimitiveType::Int64:
+                    os << (std::int64_t)value.GetInt64();
+                    break;
+                case EPrimitiveType::Uint64:
+                    os << (std::uint64_t)value.GetUint64();
+                    break;
+                case EPrimitiveType::Float:
+                    os << value.GetFloat();
+                    break;
+                case EPrimitiveType::Double:
+                    os << value.GetDouble();
+                    break;
+                case EPrimitiveType::Date:
+                    os << (ui32)value.GetDate().Seconds();
+                    break;
+                case EPrimitiveType::Timestamp:
+                    os << (std::int64_t)value.GetTimestamp().MicroSeconds();
+                    break;
+                case EPrimitiveType::Interval:
+                    os << (std::int64_t)value.GetInterval();
+                    break;
+                case EPrimitiveType::String:
+                    os << arrow::util::string_view(value.GetString().c_str(), value.GetString().length());
+                    break;
+                case EPrimitiveType::Utf8:
+                    os << arrow::util::string_view(value.GetUtf8().c_str(), value.GetUtf8().length());
+                    break;
+                case EPrimitiveType::Yson:
+                    os << arrow::util::string_view(value.GetYson().c_str(), value.GetYson().length());
+                    break;
+                case EPrimitiveType::Json:
+                    os << arrow::util::string_view(value.GetJson().c_str(), value.GetJson().length());
+                    break;
+                case EPrimitiveType::JsonDocument:
+                    os << arrow::util::string_view(value.GetJsonDocument().c_str(), value.GetJsonDocument().length());
+                    break;
+                case EPrimitiveType::DyNumber:
+                    os << arrow::util::string_view(value.GetDyNumber().c_str(), value.GetDyNumber().length());
+                    break;
+                default:
+                    ythrow yexception() << "Cannot save type to parquet: " << value.GetPrimitiveType();
+                }
+            }
+            os.EndRow();
+        }
+    }
+
+    TResultSetParquetPrinter::TResultSetParquetPrinter(const std::string& outputPath, ui64 rowGroupSize /*= 100000*/)
+        : Impl(std::make_unique<TImpl>(outputPath, rowGroupSize))
+    {}
+
+    TResultSetParquetPrinter::~TResultSetParquetPrinter() {
+    }
+
+    void TResultSetParquetPrinter::Reset() {
+        Impl->Reset();
+    }
+
+    void TResultSetParquetPrinter::Print(const TResultSet& resultSet) {
+        Impl->Print(resultSet);
+    }
+
+}

+ 23 - 0
ydb/library/arrow_parquet/result_set_parquet_printer.h

@@ -0,0 +1,23 @@
+#pragma once
+
+#include <util/system/types.h>
+
+#include <string>
+
+namespace NYdb {
+
+    class TResultSet;
+
+    class TResultSetParquetPrinter {
+    public:
+        explicit TResultSetParquetPrinter(const std::string& outputPath, ui64 rowGroupSize = 100000);
+        ~TResultSetParquetPrinter();
+        void Reset();
+        void Print(const TResultSet& resultSet);
+
+    private:
+        class TImpl;
+        std::unique_ptr<TImpl> Impl;
+    };
+
+}

+ 14 - 0
ydb/library/arrow_parquet/ya.make

@@ -0,0 +1,14 @@
+LIBRARY()
+
+OWNER(g:kikimr)
+
+SRCS(
+    result_set_parquet_printer.cpp
+)
+
+PEERDIR(
+    ydb/public/sdk/cpp/client/ydb_value
+    contrib/libs/apache/arrow
+)
+
+END()

+ 1 - 0
ydb/library/ya.make

@@ -4,6 +4,7 @@ RECURSE(
     aclib
     arrow_clickhouse
     arrow_kernels
+    arrow_parquet
     backup
     binary_json
     chunks_limiter

+ 2 - 1
ydb/public/lib/ydb_cli/commands/ydb_service_scripting.cpp

@@ -38,7 +38,8 @@ void TCommandExecuteYqlScript::Config(TConfig& config) {
         EOutputFormat::JsonUnicode,
         EOutputFormat::JsonUnicodeArray,
         EOutputFormat::JsonBase64,
-        EOutputFormat::JsonBase64Array
+        EOutputFormat::JsonBase64Array,
+        EOutputFormat::Parquet,
     });
 
     AddParametersOption(config);

+ 4 - 2
ydb/public/lib/ydb_cli/commands/ydb_service_table.cpp

@@ -368,7 +368,8 @@ void TCommandExecuteQuery::Config(TConfig& config) {
         EOutputFormat::JsonBase64,
         EOutputFormat::JsonBase64Array,
         EOutputFormat::Csv,
-        EOutputFormat::Tsv
+        EOutputFormat::Tsv,
+        EOutputFormat::Parquet,
     });
 
     AddParametersOption(config, "(for data & scan queries)");
@@ -1023,7 +1024,8 @@ void TCommandReadTable::Config(TConfig& config) {
         EOutputFormat::JsonBase64,
         EOutputFormat::JsonBase64Array,
         EOutputFormat::Csv,
-        EOutputFormat::Tsv
+        EOutputFormat::Tsv,
+        EOutputFormat::Parquet,
     });
 
     config.SetFreeArgsNum(1);

+ 2 - 1
ydb/public/lib/ydb_cli/commands/ydb_yql.cpp

@@ -39,7 +39,8 @@ void TCommandYql::Config(TConfig& config) {
         EOutputFormat::JsonBase64,
         EOutputFormat::JsonBase64Array,
         EOutputFormat::Csv,
-        EOutputFormat::Tsv
+        EOutputFormat::Tsv,
+        EOutputFormat::Parquet,
     });
 
     AddParametersOption(config);

+ 9 - 0
ydb/public/lib/ydb_cli/common/format.cpp

@@ -4,6 +4,7 @@
 #include <library/cpp/json/json_prettifier.h>
 
 #include <ydb/public/lib/json_value/ydb_json_value.h>
+#include <ydb/library/arrow_parquet/result_set_parquet_printer.h>
 
 namespace NYdb {
 namespace NConsoleClient {
@@ -47,6 +48,7 @@ namespace {
         { EOutputFormat::ProtoJsonBase64, "Output result protobuf in json format, binary strings are encoded with base64" },
         { EOutputFormat::Csv, "Output in csv format" },
         { EOutputFormat::Tsv, "Output in tsv format" },
+        { EOutputFormat::Parquet, "Output in parquet format" },
     };
 
     THashMap<EMessagingFormat, TString> MessagingFormatDescriptions = {
@@ -713,6 +715,7 @@ TString TQueryPlanPrinter::JsonToString(const NJson::TJsonValue& jsonValue) {
 TResultSetPrinter::TResultSetPrinter(EOutputFormat format, std::function<bool()> isInterrupted)
     : Format(format)
     , IsInterrupted(isInterrupted)
+    , ParquetPrinter(std::make_unique<TResultSetParquetPrinter>(""))
 {}
 
 TResultSetPrinter::~TResultSetPrinter() {
@@ -750,6 +753,9 @@ void TResultSetPrinter::Print(const TResultSet& resultSet) {
     case EOutputFormat::Tsv:
         PrintCsv(resultSet, "\t");
         break;
+    case EOutputFormat::Parquet:
+        ParquetPrinter->Print(resultSet);
+        break;
     default:
         throw TMisuseException() << "This command doesn't support " << Format << " output format";
     }
@@ -783,6 +789,9 @@ void TResultSetPrinter::EndResultSet() {
     case EOutputFormat::JsonBase64Array:
         Cout << ']' << Endl;
         break;
+    case EOutputFormat::Parquet:
+        ParquetPrinter->Reset();
+        break;
     default:
         break;
     }

+ 7 - 0
ydb/public/lib/ydb_cli/common/format.h

@@ -8,6 +8,12 @@
 #include <ydb/public/sdk/cpp/client/ydb_result/result.h>
 #include <ydb/public/sdk/cpp/client/ydb_types/status/status.h>
 
+namespace NYdb {
+
+    class TResultSetParquetPrinter;
+
+}
+
 namespace NYdb {
 namespace NConsoleClient {
 
@@ -81,6 +87,7 @@ private:
     bool PrintedSomething = false;
     EOutputFormat Format;
     std::function<bool()> IsInterrupted;
+    std::unique_ptr<TResultSetParquetPrinter> ParquetPrinter;
 };
 
 class TQueryPlanPrinter {

+ 1 - 0
ydb/public/lib/ydb_cli/common/ya.make

@@ -46,6 +46,7 @@ PEERDIR(
     ydb/public/sdk/cpp/client/ydb_topic
     ydb/public/sdk/cpp/client/ydb_types
     ydb/public/sdk/cpp/client/ydb_types/credentials
+    ydb/library/arrow_parquet
 )
 
 GENERATE_ENUM_SERIALIZATION(formats.h)

Some files were not shown because too many files changed in this diff