yql_statistics.h 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #pragma once
  2. #include <yql/essentials/core/minsketch/count_min_sketch.h>
  3. #include <library/cpp/json/json_reader.h>
  4. #include <util/generic/vector.h>
  5. #include <util/generic/hash.h>
  6. #include <util/generic/string.h>
  7. #include <optional>
  8. #include <iostream>
  9. namespace NYql {
  10. enum EStatisticsType : ui32 {
  11. BaseTable,
  12. FilteredFactTable,
  13. ManyManyJoin
  14. };
  15. enum EStorageType : ui32 {
  16. NA,
  17. RowStorage,
  18. ColumnStorage
  19. };
  // Polymorphic base for provider-specific statistics. A provider may derive
  // from this struct to attach its own data, which is useful when deriving
  // statistics for higher-level operators in the plan.
  struct IProviderStatistics {
      // Virtual so that derived objects are destroyed correctly through a
      // pointer to this base.
      virtual ~IProviderStatistics() = default;
  };
  25. struct TColumnStatistics {
  26. std::optional<double> NumUniqueVals;
  27. std::optional<double> HyperLogLog;
  28. std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch;
  29. TString Type;
  30. TColumnStatistics() {}
  31. };
  32. /**
  33. * Optimizer Statistics struct records per-table and per-column statistics
  34. * for the current operator in the plan. Currently, only Nrows and Ncols are
  35. * recorded.
  36. * Cost is also included in statistics, as its updated concurrently with statistics
  37. * all of the time.
  38. */
  39. struct TOptimizerStatistics {
  40. struct TKeyColumns : public TSimpleRefCount<TKeyColumns> {
  41. TVector<TString> Data;
  42. TKeyColumns(const TVector<TString>& vec) : Data(vec) {}
  43. };
  44. struct TSortColumns : public TSimpleRefCount<TSortColumns> {
  45. TVector<TString> Columns;
  46. TVector<TString> Aliases;
  47. TSortColumns(const TVector<TString>& cols, const TVector<TString>& aliases)
  48. : Columns(cols)
  49. , Aliases(aliases)
  50. {}
  51. };
  52. struct TColumnStatMap : public TSimpleRefCount<TColumnStatMap> {
  53. THashMap<TString,TColumnStatistics> Data;
  54. TColumnStatMap() {}
  55. TColumnStatMap(const THashMap<TString,TColumnStatistics>& map) : Data(map) {}
  56. };
  57. EStatisticsType Type = BaseTable;
  58. double Nrows = 0;
  59. int Ncols = 0;
  60. double ByteSize = 0;
  61. double Cost = 0;
  62. double Selectivity = 1.0;
  63. TIntrusivePtr<TKeyColumns> KeyColumns;
  64. TIntrusivePtr<TColumnStatMap> ColumnStatistics;
  65. EStorageType StorageType = EStorageType::NA;
  66. TIntrusivePtr<TSortColumns> SortColumns;
  67. std::shared_ptr<IProviderStatistics> Specific;
  68. std::shared_ptr<TVector<TString>> Labels = {};
  69. TOptimizerStatistics(TOptimizerStatistics&&) = default;
  70. TOptimizerStatistics& operator=(TOptimizerStatistics&&) = default;
  71. TOptimizerStatistics(const TOptimizerStatistics&) = default;
  72. TOptimizerStatistics& operator=(const TOptimizerStatistics&) = default;
  73. TOptimizerStatistics() = default;
  74. TOptimizerStatistics(
  75. EStatisticsType type,
  76. double nrows = 0.0,
  77. int ncols = 0,
  78. double byteSize = 0.0,
  79. double cost = 0.0,
  80. TIntrusivePtr<TKeyColumns> keyColumns = {},
  81. TIntrusivePtr<TColumnStatMap> columnMap = {},
  82. EStorageType storageType = EStorageType::NA,
  83. std::shared_ptr<IProviderStatistics> specific = nullptr);
  84. TOptimizerStatistics& operator+=(const TOptimizerStatistics& other);
  85. bool Empty() const;
  86. friend std::ostream& operator<<(std::ostream& os, const TOptimizerStatistics& s);
  87. TString ToString() const;
  88. };
  89. std::shared_ptr<TOptimizerStatistics> OverrideStatistics(const TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats);
  90. }