pr34461-read-row-groups-split.patch 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. From 308af41748bc15ff794126226cb7bc4be14bf084 Mon Sep 17 00:00:00 2001
  2. From: Aleksandr Khoroshilov <hor911@ydb.tech>
  3. Date: Mon, 6 Mar 2023 02:11:54 +0300
  4. Subject: [PATCH 1/2] Split arrow::FileReader::ReadRowGroups() for flexible
  5. async IO
  6. ---
  7. cpp/src/parquet/arrow/reader.cc | 30 +++++++++++++++++++++++++++---
  8. cpp/src/parquet/arrow/reader.h | 7 +++++++
  9. 2 files changed, 34 insertions(+), 3 deletions(-)
  10. diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
  11. index 5b39de93d9ccf..11e543f935b61 100644
  12. --- a/cpp/src/parquet/arrow/reader.cc
  13. +++ b/cpp/src/parquet/arrow/reader.cc
  14. @@ -311,6 +311,13 @@ class FileReaderImpl : public FileReader {
  15. return ReadTable(Iota(reader_->metadata()->num_columns()), table);
  16. }
  17. + Status WillNeedRowGroups(const std::vector<int>& row_groups,
  18. + const std::vector<int>& column_indices) override;
  19. +
  20. + Status DecodeRowGroups(const std::vector<int>& row_groups,
  21. + const std::vector<int>& column_indices,
  22. + std::shared_ptr<::arrow::Table>* out) override;
  23. +
  24. Status ReadRowGroups(const std::vector<int>& row_groups,
  25. const std::vector<int>& indices,
  26. std::shared_ptr<Table>* table) override;
  27. @@ -1216,9 +1223,8 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto
  28. return Status::OK();
  29. }
  30. -Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
  31. - const std::vector<int>& column_indices,
  32. - std::shared_ptr<Table>* out) {
  33. +Status FileReaderImpl::WillNeedRowGroups(const std::vector<int>& row_groups,
  34. + const std::vector<int>& column_indices) {
  35. RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
  36. // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
  37. @@ -1229,6 +1235,24 @@ Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
  38. reader_properties_.cache_options());
  39. END_PARQUET_CATCH_EXCEPTIONS
  40. }
  41. + return Status::OK();
  42. +}
  43. +
  44. +Status FileReaderImpl::DecodeRowGroups(const std::vector<int>& row_groups,
  45. + const std::vector<int>& column_indices,
  46. + std::shared_ptr<::arrow::Table>* out) {
  47. + RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
  48. +
  49. + auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
  50. + /*cpu_executor=*/nullptr);
  51. + ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
  52. + return Status::OK();
  53. +}
  54. +
  55. +Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
  56. + const std::vector<int>& column_indices,
  57. + std::shared_ptr<Table>* out) {
  58. + RETURN_NOT_OK(WillNeedRowGroups(row_groups, column_indices));
  59. auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
  60. /*cpu_executor=*/nullptr);
  61. diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h
  62. index 5dff35e887ef0..fbabeba7c764f 100644
  63. --- a/cpp/src/parquet/arrow/reader.h
  64. +++ b/cpp/src/parquet/arrow/reader.h
  65. @@ -249,6 +249,13 @@ class PARQUET_EXPORT FileReader {
  66. virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
  67. + virtual ::arrow::Status WillNeedRowGroups(const std::vector<int>& row_groups,
  68. + const std::vector<int>& column_indices) = 0;
  69. +
  70. + virtual ::arrow::Status DecodeRowGroups(const std::vector<int>& row_groups,
  71. + const std::vector<int>& column_indices,
  72. + std::shared_ptr<::arrow::Table>* out) = 0;
  73. +
  74. virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
  75. const std::vector<int>& column_indices,
  76. std::shared_ptr<::arrow::Table>* out) = 0;
  77. From a82d7512faa11b01ff29fb724dd115f62e223aed Mon Sep 17 00:00:00 2001
  78. From: Aleksandr Khoroshilov <hor911@ydb.tech>
  79. Date: Mon, 6 Mar 2023 03:16:53 +0300
  80. Subject: [PATCH 2/2] Clang formatting
  81. ---
  82. cpp/src/parquet/arrow/reader.cc | 6 +++---
  83. cpp/src/parquet/arrow/reader.h | 2 +-
  84. 2 files changed, 4 insertions(+), 4 deletions(-)
  85. diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
  86. index 11e543f935b61..d361319f3c96a 100644
  87. --- a/cpp/src/parquet/arrow/reader.cc
  88. +++ b/cpp/src/parquet/arrow/reader.cc
  89. @@ -312,7 +312,7 @@ class FileReaderImpl : public FileReader {
  90. }
  91. Status WillNeedRowGroups(const std::vector<int>& row_groups,
  92. - const std::vector<int>& column_indices) override;
  93. + const std::vector<int>& column_indices) override;
  94. Status DecodeRowGroups(const std::vector<int>& row_groups,
  95. const std::vector<int>& column_indices,
  96. @@ -1239,8 +1239,8 @@ Status FileReaderImpl::WillNeedRowGroups(const std::vector<int>& row_groups,
  97. }
  98. Status FileReaderImpl::DecodeRowGroups(const std::vector<int>& row_groups,
  99. - const std::vector<int>& column_indices,
  100. - std::shared_ptr<::arrow::Table>* out) {
  101. + const std::vector<int>& column_indices,
  102. + std::shared_ptr<::arrow::Table>* out) {
  103. RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
  104. auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
  105. diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h
  106. index fbabeba7c764f..33e8677ef7c15 100644
  107. --- a/cpp/src/parquet/arrow/reader.h
  108. +++ b/cpp/src/parquet/arrow/reader.h
  109. @@ -250,7 +250,7 @@ class PARQUET_EXPORT FileReader {
  110. virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
  111. virtual ::arrow::Status WillNeedRowGroups(const std::vector<int>& row_groups,
  112. - const std::vector<int>& column_indices) = 0;
  113. + const std::vector<int>& column_indices) = 0;
  114. virtual ::arrow::Status DecodeRowGroups(const std::vector<int>& row_groups,
  115. const std::vector<int>& column_indices,