// util.cpp (5.4 KB)
  1. #include "util.h"
  2. #include "bit_util.h"
  3. #include "defs.h"
  4. #include <arrow/array/array_base.h>
  5. #include <arrow/array/util.h>
  6. #include <arrow/chunked_array.h>
  7. #include <arrow/record_batch.h>
  8. namespace NYql {
  9. namespace NUdf {
  10. namespace {
  11. ui64 GetSizeOfArrayDataInBytes(const arrow::ArrayData& data) {
  12. ui64 size = sizeof(data);
  13. size += data.buffers.size() * sizeof(void*);
  14. size += data.child_data.size() * sizeof(void*);
  15. for (const auto& b : data.buffers) {
  16. if (b) {
  17. size += b->size();
  18. }
  19. }
  20. for (const auto& c : data.child_data) {
  21. if (c) {
  22. size += GetSizeOfArrayDataInBytes(*c);
  23. }
  24. }
  25. return size;
  26. }
  27. ui64 GetSizeOfDatumInBytes(const arrow::Datum& datum) {
  28. ui64 size = sizeof(datum);
  29. if (datum.is_scalar()) {
  30. const auto& scarray = ARROW_RESULT(arrow::MakeArrayFromScalar(*datum.scalar(), 1));
  31. return size + GetSizeOfArrayDataInBytes(*scarray->data());
  32. }
  33. if (datum.is_arraylike()) {
  34. ForEachArrayData(datum, [&size](const auto& arrayData) {
  35. size += GetSizeOfArrayDataInBytes(*arrayData);
  36. });
  37. return size;
  38. }
  39. Y_ABORT("Not yet implemented");
  40. }
  41. } // namespace
  42. std::shared_ptr<arrow::Buffer> AllocateBitmapWithReserve(size_t bitCount, arrow::MemoryPool* pool) {
  43. // align up to 64 bit
  44. bitCount = (bitCount + 63u) & ~size_t(63u);
  45. // this simplifies code compression code - we can write single 64 bit word after array boundaries
  46. bitCount += 64;
  47. return ARROW_RESULT(arrow::AllocateBitmap(bitCount, pool));
  48. }
  49. std::shared_ptr<arrow::Buffer> MakeDenseBitmap(const ui8* srcSparse, size_t len, arrow::MemoryPool* pool) {
  50. auto bitmap = AllocateBitmapWithReserve(len, pool);
  51. CompressSparseBitmap(bitmap->mutable_data(), srcSparse, len);
  52. return bitmap;
  53. }
  54. std::shared_ptr<arrow::Buffer> MakeDenseBitmapNegate(const ui8* srcSparse, size_t len, arrow::MemoryPool* pool) {
  55. auto bitmap = AllocateBitmapWithReserve(len, pool);
  56. CompressSparseBitmapNegate(bitmap->mutable_data(), srcSparse, len);
  57. return bitmap;
  58. }
  59. std::shared_ptr<arrow::Buffer> MakeDenseBitmapCopy(const ui8* src, size_t len, size_t offset, arrow::MemoryPool* pool) {
  60. auto bitmap = AllocateBitmapWithReserve(len, pool);
  61. CopyDenseBitmap(bitmap->mutable_data(), src, offset, len);
  62. return bitmap;
  63. }
  64. std::shared_ptr<arrow::Buffer> MakeDenseFalseBitmap(int64_t len, arrow::MemoryPool* pool) {
  65. auto bitmap = AllocateBitmapWithReserve(len, pool);
  66. std::memset(bitmap->mutable_data(), 0, bitmap->size());
  67. return bitmap;
  68. }
  69. std::shared_ptr<arrow::ArrayData> DeepSlice(const std::shared_ptr<arrow::ArrayData>& data, size_t offset, size_t len) {
  70. Y_ENSURE(data->length >= 0);
  71. Y_ENSURE(offset + len <= (size_t)data->length);
  72. if (offset == 0 && len == (size_t)data->length) {
  73. return data;
  74. }
  75. std::shared_ptr<arrow::ArrayData> result = data->Copy();
  76. result->offset = data->offset + offset;
  77. result->length = len;
  78. if (data->null_count == data->length) {
  79. result->null_count = len;
  80. } else if (len == 0) {
  81. result->null_count = 0;
  82. } else {
  83. result->null_count = data->null_count != 0 ? arrow::kUnknownNullCount : 0;
  84. }
  85. for (size_t i = 0; i < data->child_data.size(); ++i) {
  86. result->child_data[i] = DeepSlice(data->child_data[i], offset, len);
  87. }
  88. return result;
  89. }
  90. std::shared_ptr<arrow::ArrayData> Chop(std::shared_ptr<arrow::ArrayData>& data, size_t len) {
  91. auto first = DeepSlice(data, 0, len);
  92. data = DeepSlice(data, len, data->length - len);
  93. return first;
  94. }
  95. std::shared_ptr<arrow::ArrayData> Unwrap(const arrow::ArrayData& data, bool isNestedOptional) {
  96. Y_ENSURE(data.GetNullCount() == 0);
  97. if (isNestedOptional) {
  98. Y_ENSURE(data.buffers.size() == 1);
  99. Y_ENSURE(data.child_data.size() == 1);
  100. return data.child_data.front();
  101. }
  102. auto result = data.Copy();
  103. result->buffers.front().reset();
  104. return result;
  105. }
  106. void ForEachArrayData(const arrow::Datum& datum, const std::function<void(const std::shared_ptr<arrow::ArrayData>&)>& func) {
  107. Y_ENSURE(datum.is_arraylike(), "Expected array");
  108. if (datum.is_array()) {
  109. func(datum.array());
  110. } else {
  111. for (auto& chunk : datum.chunks()) {
  112. func(chunk->data());
  113. }
  114. }
  115. }
  116. arrow::Datum MakeArray(const TVector<std::shared_ptr<arrow::ArrayData>>& chunks) {
  117. Y_ENSURE(!chunks.empty(), "Expected non empty chunks");
  118. arrow::ArrayVector resultChunks;
  119. for (auto& chunk : chunks) {
  120. resultChunks.push_back(arrow::Datum(chunk).make_array());
  121. }
  122. if (resultChunks.size() > 1) {
  123. auto type = resultChunks.front()->type();
  124. auto chunked = ARROW_RESULT(arrow::ChunkedArray::Make(std::move(resultChunks), type));
  125. return arrow::Datum(chunked);
  126. }
  127. return arrow::Datum(resultChunks.front());
  128. }
  129. ui64 GetSizeOfArrowBatchInBytes(const arrow::RecordBatch& batch) {
  130. ui64 size = sizeof(batch);
  131. size += batch.num_columns() * sizeof(void*);
  132. for (int i = 0; i < batch.num_columns(); ++i) {
  133. size += GetSizeOfArrayDataInBytes(*batch.column_data(i));
  134. }
  135. return size;
  136. }
  137. ui64 GetSizeOfArrowExecBatchInBytes(const arrow::compute::ExecBatch& batch) {
  138. ui64 size = sizeof(batch);
  139. size += batch.num_values() * sizeof(void*);
  140. for (const auto& datum : batch.values) {
  141. size += GetSizeOfDatumInBytes(datum);
  142. }
  143. return size;
  144. }
  145. }
  146. }