yql_statistics.h 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. #pragma once
  2. #include <yql/essentials/core/minsketch/count_min_sketch.h>
  3. #include <library/cpp/json/json_reader.h>
  4. #include <util/generic/vector.h>
  5. #include <util/generic/hash.h>
  6. #include <util/generic/string.h>
  7. #include <optional>
  8. #include <iostream>
  9. namespace NYql {
  10. enum EStatisticsType : ui32 {
  11. BaseTable,
  12. FilteredFactTable,
  13. ManyManyJoin
  14. };
  15. enum EStorageType : ui32 {
  16. NA,
  17. RowStorage,
  18. ColumnStorage
  19. };
  // Extension point for provider-specific statistics. A provider derives from
  // this struct to attach its own data, which is useful when deriving stats for
  // higher-level operators in the plan. The destructor is virtual so that a
  // derived object can be destroyed through a pointer to this base.
  struct IProviderStatistics {
      virtual ~IProviderStatistics() = default;
  };
  25. struct TColumnStatistics {
  26. std::optional<double> NumUniqueVals;
  27. std::optional<double> HyperLogLog;
  28. std::shared_ptr<NKikimr::TCountMinSketch> CountMinSketch;
  29. TString Type;
  30. TColumnStatistics() {}
  31. };
  32. /**
  33. * Optimizer Statistics struct records per-table and per-column statistics
  34. * for the current operator in the plan. Currently, only Nrows and Ncols are
  35. * recorded.
  36. * Cost is also included in statistics, as its updated concurrently with statistics
  37. * all of the time.
  38. */
  39. struct TOptimizerStatistics {
  40. struct TKeyColumns : public TSimpleRefCount<TKeyColumns> {
  41. TVector<TString> Data;
  42. TKeyColumns(const TVector<TString>& vec) : Data(vec) {}
  43. };
  44. struct TColumnStatMap : public TSimpleRefCount<TColumnStatMap> {
  45. THashMap<TString,TColumnStatistics> Data;
  46. TColumnStatMap() {}
  47. TColumnStatMap(const THashMap<TString,TColumnStatistics>& map) : Data(map) {}
  48. };
  49. EStatisticsType Type = BaseTable;
  50. double Nrows = 0;
  51. int Ncols = 0;
  52. double ByteSize = 0;
  53. double Cost = 0;
  54. double Selectivity = 1.0;
  55. TIntrusivePtr<TKeyColumns> KeyColumns;
  56. TIntrusivePtr<TColumnStatMap> ColumnStatistics;
  57. EStorageType StorageType = EStorageType::NA;
  58. std::shared_ptr<IProviderStatistics> Specific;
  59. std::shared_ptr<TVector<TString>> Labels = {};
  60. TOptimizerStatistics(TOptimizerStatistics&&) = default;
  61. TOptimizerStatistics& operator=(TOptimizerStatistics&&) = default;
  62. TOptimizerStatistics(const TOptimizerStatistics&) = default;
  63. TOptimizerStatistics& operator=(const TOptimizerStatistics&) = default;
  64. TOptimizerStatistics() = default;
  65. TOptimizerStatistics(
  66. EStatisticsType type,
  67. double nrows = 0.0,
  68. int ncols = 0,
  69. double byteSize = 0.0,
  70. double cost = 0.0,
  71. TIntrusivePtr<TKeyColumns> keyColumns = {},
  72. TIntrusivePtr<TColumnStatMap> columnMap = {},
  73. EStorageType storageType = EStorageType::NA,
  74. std::shared_ptr<IProviderStatistics> specific = nullptr);
  75. TOptimizerStatistics& operator+=(const TOptimizerStatistics& other);
  76. bool Empty() const;
  77. friend std::ostream& operator<<(std::ostream& os, const TOptimizerStatistics& s);
  78. TString ToString() const;
  79. };
  80. std::shared_ptr<TOptimizerStatistics> OverrideStatistics(const TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats);
  81. }