yql_statistics.h 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #pragma once
  2. #include "yql_cost_function.h"
  3. #include <yql/essentials/core/minsketch/count_min_sketch.h>
  4. #include <library/cpp/json/json_reader.h>
  5. #include <util/generic/vector.h>
  6. #include <util/generic/hash.h>
  7. #include <util/generic/string.h>
  8. #include <optional>
  9. #include <iostream>
  10. namespace NYql {
  11. enum EStatisticsType : ui32 {
  12. BaseTable,
  13. FilteredFactTable,
  14. ManyManyJoin
  15. };
  16. enum EStorageType : ui32 {
  17. NA,
  18. RowStorage,
  19. ColumnStorage
  20. };
  21. // Providers may subclass this struct to associate specific statistics, useful to
  22. // derive stats for higher-level operators in the plan.
  23. struct IProviderStatistics {
  24. virtual ~IProviderStatistics() {}
  25. };
  26. struct TColumnStatistics {
  27. std::optional<double> NumUniqueVals;
  28. std::optional<double> HyperLogLog;
  29. std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch;
  30. TString Type;
  31. TColumnStatistics() {}
  32. };
  33. /**
  34. * Optimizer Statistics struct records per-table and per-column statistics
  35. * for the current operator in the plan. Currently, only Nrows and Ncols are
  36. * recorded.
  37. * Cost is also included in statistics, as its updated concurrently with statistics
  38. * all of the time.
  39. */
  40. struct TOptimizerStatistics {
  41. struct TKeyColumns : public TSimpleRefCount<TKeyColumns> {
  42. TVector<TString> Data;
  43. TKeyColumns(TVector<TString> data) : Data(std::move(data)) {}
  44. };
  45. struct TSortColumns : public TSimpleRefCount<TSortColumns> {
  46. TVector<TString> Columns;
  47. TVector<TString> Aliases;
  48. TSortColumns(const TVector<TString>& cols, const TVector<TString>& aliases)
  49. : Columns(cols)
  50. , Aliases(aliases)
  51. {}
  52. };
  53. struct TColumnStatMap : public TSimpleRefCount<TColumnStatMap> {
  54. THashMap<TString,TColumnStatistics> Data;
  55. TColumnStatMap() {}
  56. TColumnStatMap(THashMap<TString,TColumnStatistics> data) : Data(std::move(data)) {}
  57. };
  58. struct TShuffledByColumns : public TSimpleRefCount<TShuffledByColumns> {
  59. TVector<NDq::TJoinColumn> Data;
  60. TShuffledByColumns(TVector<NDq::TJoinColumn> data) : Data(std::move(data)) {}
  61. };
  62. EStatisticsType Type = BaseTable;
  63. double Nrows = 0;
  64. int Ncols = 0;
  65. double ByteSize = 0;
  66. double Cost = 0;
  67. double Selectivity = 1.0;
  68. TIntrusivePtr<TKeyColumns> KeyColumns;
  69. TIntrusivePtr<TColumnStatMap> ColumnStatistics;
  70. TIntrusivePtr<TShuffledByColumns> ShuffledByColumns;
  71. EStorageType StorageType = EStorageType::NA;
  72. TIntrusivePtr<TSortColumns> SortColumns;
  73. std::shared_ptr<IProviderStatistics> Specific;
  74. std::shared_ptr<TVector<TString>> Labels = {};
  75. TOptimizerStatistics(TOptimizerStatistics&&) = default;
  76. TOptimizerStatistics& operator=(TOptimizerStatistics&&) = default;
  77. TOptimizerStatistics(const TOptimizerStatistics&) = default;
  78. TOptimizerStatistics& operator=(const TOptimizerStatistics&) = default;
  79. TOptimizerStatistics() = default;
  80. TOptimizerStatistics(
  81. EStatisticsType type,
  82. double nrows = 0.0,
  83. int ncols = 0,
  84. double byteSize = 0.0,
  85. double cost = 0.0,
  86. TIntrusivePtr<TKeyColumns> keyColumns = {},
  87. TIntrusivePtr<TColumnStatMap> columnMap = {},
  88. EStorageType storageType = EStorageType::NA,
  89. std::shared_ptr<IProviderStatistics> specific = nullptr);
  90. TOptimizerStatistics& operator+=(const TOptimizerStatistics& other);
  91. bool Empty() const;
  92. friend std::ostream& operator<<(std::ostream& os, const TOptimizerStatistics& s);
  93. TString ToString() const;
  94. };
  95. std::shared_ptr<TOptimizerStatistics> OverrideStatistics(const TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats);
  96. }