yql_statistics.cpp 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. #include <util/generic/yexception.h>
  2. #include "yql_statistics.h"
  3. #include <library/cpp/json/json_reader.h>
  4. #include <library/cpp/string_utils/base64/base64.h>
  5. #include <sstream>
  6. using namespace NYql;
  7. static TString ConvertToStatisticsTypeString(EStatisticsType type) {
  8. switch (type) {
  9. case EStatisticsType::BaseTable:
  10. return "BaseTable";
  11. case EStatisticsType::FilteredFactTable:
  12. return "FilteredFactTable";
  13. case EStatisticsType::ManyManyJoin:
  14. return "ManyManyJoin";
  15. default:
  16. Y_ENSURE(false,"Unknown EStatisticsType");
  17. }
  18. return "";
  19. }
  20. static TString ConvertToStatisticsTypeString(EStorageType storageType) {
  21. switch (storageType) {
  22. case EStorageType::NA:
  23. return "NA";
  24. case EStorageType::RowStorage:
  25. return "RowStorage";
  26. case EStorageType::ColumnStorage:
  27. return "ColumnStorage";
  28. default:
  29. Y_ENSURE(false,"Unknown Storage type");
  30. }
  31. return "";
  32. }
  33. TString TOptimizerStatistics::ToString() const {
  34. std::stringstream ss;
  35. ss << *this;
  36. return ss.str();
  37. }
  38. std::ostream& NYql::operator<<(std::ostream& os, const TOptimizerStatistics& s) {
  39. os << "Type: " << ConvertToStatisticsTypeString(s.Type) << ", Nrows: " << s.Nrows
  40. << ", Ncols: " << s.Ncols << ", ByteSize: " << s.ByteSize << ", Cost: " << s.Cost;
  41. if (s.KeyColumns) {
  42. for (const auto& c : s.KeyColumns->Data) {
  43. os << ", " << c;
  44. }
  45. }
  46. os << ", Sel: " << s.Selectivity;
  47. os << ", Storage: " << ConvertToStatisticsTypeString(s.StorageType);
  48. return os;
  49. }
  50. bool TOptimizerStatistics::Empty() const {
  51. return ! (Nrows || Ncols || Cost);
  52. }
  53. TOptimizerStatistics::TOptimizerStatistics(
  54. EStatisticsType type,
  55. double nrows,
  56. int ncols,
  57. double byteSize,
  58. double cost,
  59. TIntrusivePtr<TKeyColumns> keyColumns,
  60. TIntrusivePtr<TColumnStatMap> columnMap,
  61. EStorageType storageType,
  62. std::shared_ptr<IProviderStatistics> specific)
  63. : Type(type)
  64. , Nrows(nrows)
  65. , Ncols(ncols)
  66. , ByteSize(byteSize)
  67. , Cost(cost)
  68. , KeyColumns(keyColumns)
  69. , ColumnStatistics(columnMap)
  70. , StorageType(storageType)
  71. , Specific(std::move(specific))
  72. {
  73. }
  74. TOptimizerStatistics& TOptimizerStatistics::operator+=(const TOptimizerStatistics& other) {
  75. Nrows += other.Nrows;
  76. Ncols += other.Ncols;
  77. ByteSize += other.ByteSize;
  78. Cost += other.Cost;
  79. return *this;
  80. }
  81. std::shared_ptr<TOptimizerStatistics> NYql::OverrideStatistics(const NYql::TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats) {
  82. auto res = std::make_shared<TOptimizerStatistics>(s.Type, s.Nrows, s.Ncols, s.ByteSize, s.Cost, s.KeyColumns, s.ColumnStatistics, s.StorageType, s.Specific);
  83. auto dbStats = stats->GetMapSafe();
  84. if (!dbStats.contains(tablePath)){
  85. return res;
  86. }
  87. auto tableStats = dbStats.at(tablePath).GetMapSafe();
  88. if (auto keyCols = tableStats.find("key_columns"); keyCols != tableStats.end()) {
  89. TVector<TString> cols;
  90. for (auto c : keyCols->second.GetArraySafe()) {
  91. cols.push_back(c.GetStringSafe());
  92. }
  93. res->KeyColumns = TIntrusivePtr<TOptimizerStatistics::TKeyColumns>(new TOptimizerStatistics::TKeyColumns(cols));
  94. }
  95. if (auto nrows = tableStats.find("n_rows"); nrows != tableStats.end()) {
  96. res->Nrows = nrows->second.GetDoubleSafe();
  97. }
  98. if (auto byteSize = tableStats.find("byte_size"); byteSize != tableStats.end()) {
  99. res->ByteSize = byteSize->second.GetDoubleSafe();
  100. }
  101. if (auto nattrs = tableStats.find("n_attrs"); nattrs != tableStats.end()) {
  102. res->Ncols = nattrs->second.GetIntegerSafe();
  103. }
  104. if (auto columns = tableStats.find("columns"); columns != tableStats.end()) {
  105. if (!res->ColumnStatistics) {
  106. res->ColumnStatistics = TIntrusivePtr<TOptimizerStatistics::TColumnStatMap>(new TOptimizerStatistics::TColumnStatMap());
  107. }
  108. for (auto col : columns->second.GetArraySafe()) {
  109. auto colMap = col.GetMapSafe();
  110. TColumnStatistics cStat;
  111. auto columnName = colMap.at("name").GetStringSafe();
  112. if (auto numUniqueVals = colMap.find("n_unique_vals"); numUniqueVals != colMap.end()) {
  113. cStat.NumUniqueVals = numUniqueVals->second.IsNull()? 0.0: numUniqueVals->second.GetDoubleSafe();
  114. }
  115. if (auto hll = colMap.find("hyperloglog"); hll != colMap.end()) {
  116. cStat.HyperLogLog = hll->second.IsNull()? 0.0: hll->second.GetDoubleSafe();
  117. }
  118. if (auto countMinSketch = colMap.find("count-min"); countMinSketch != colMap.end()) {
  119. TString countMinBase64 = countMinSketch->second.GetStringSafe();
  120. TString countMinRaw{};
  121. Base64StrictDecode(countMinBase64, countMinRaw);
  122. cStat.CountMinSketch.reset(NKikimr::TCountMinSketch::FromString(countMinRaw.data(), countMinRaw.size()));
  123. }
  124. res->ColumnStatistics->Data[columnName] = cStat;
  125. }
  126. }
  127. return res;
  128. }