yql_statistics.cpp 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #include <util/generic/yexception.h>
  2. #include "yql_statistics.h"
  3. #include <library/cpp/json/json_reader.h>
  4. #include <library/cpp/string_utils/base64/base64.h>
  5. #include <sstream>
  6. using namespace NYql;
  7. static TString ConvertToStatisticsTypeString(EStatisticsType type) {
  8. switch (type) {
  9. case EStatisticsType::BaseTable:
  10. return "BaseTable";
  11. case EStatisticsType::FilteredFactTable:
  12. return "FilteredFactTable";
  13. case EStatisticsType::ManyManyJoin:
  14. return "ManyManyJoin";
  15. default:
  16. Y_ENSURE(false,"Unknown EStatisticsType");
  17. }
  18. return "";
  19. }
  20. static TString ConvertToStatisticsTypeString(EStorageType storageType) {
  21. switch (storageType) {
  22. case EStorageType::NA:
  23. return "NA";
  24. case EStorageType::RowStorage:
  25. return "RowStorage";
  26. case EStorageType::ColumnStorage:
  27. return "ColumnStorage";
  28. default:
  29. Y_ENSURE(false,"Unknown Storage type");
  30. }
  31. return "";
  32. }
  33. TString TOptimizerStatistics::ToString() const {
  34. std::stringstream ss;
  35. ss << *this;
  36. return ss.str();
  37. }
  38. std::ostream& NYql::operator<<(std::ostream& os, const TOptimizerStatistics& s) {
  39. os << "Type: " << ConvertToStatisticsTypeString(s.Type) << ", Nrows: " << s.Nrows
  40. << ", Ncols: " << s.Ncols << ", ByteSize: " << s.ByteSize << ", Cost: " << s.Cost;
  41. if (s.KeyColumns) {
  42. os << ", keys: ";
  43. std::string tmp;
  44. for (const auto& c: s.KeyColumns->Data) {
  45. tmp.append(c).append(", ");
  46. }
  47. if (!tmp.empty()) {
  48. tmp.pop_back();
  49. tmp.pop_back();
  50. }
  51. os << tmp;
  52. }
  53. os << ", Sel: " << s.Selectivity;
  54. os << ", Storage: " << ConvertToStatisticsTypeString(s.StorageType);
  55. if (s.SortColumns) {
  56. os << ", sorted: ";
  57. std::string tmp;
  58. for (size_t i = 0; i < s.SortColumns->Columns.size() && i < s.SortColumns->Aliases.size(); i++) {
  59. auto c = s.SortColumns->Columns[i];
  60. auto a = s.SortColumns->Aliases[i];
  61. if (a.empty()) {
  62. tmp.append(a).append(".");
  63. }
  64. tmp.append(c).append(", ");
  65. }
  66. if (!tmp.empty()) {
  67. tmp.pop_back();
  68. tmp.pop_back();
  69. }
  70. os << tmp;
  71. }
  72. return os;
  73. }
  74. bool TOptimizerStatistics::Empty() const {
  75. return ! (Nrows || Ncols || Cost);
  76. }
  77. TOptimizerStatistics::TOptimizerStatistics(
  78. EStatisticsType type,
  79. double nrows,
  80. int ncols,
  81. double byteSize,
  82. double cost,
  83. TIntrusivePtr<TKeyColumns> keyColumns,
  84. TIntrusivePtr<TColumnStatMap> columnMap,
  85. EStorageType storageType,
  86. std::shared_ptr<IProviderStatistics> specific)
  87. : Type(type)
  88. , Nrows(nrows)
  89. , Ncols(ncols)
  90. , ByteSize(byteSize)
  91. , Cost(cost)
  92. , KeyColumns(keyColumns)
  93. , ColumnStatistics(columnMap)
  94. , StorageType(storageType)
  95. , Specific(std::move(specific))
  96. {
  97. }
  98. TOptimizerStatistics& TOptimizerStatistics::operator+=(const TOptimizerStatistics& other) {
  99. Nrows += other.Nrows;
  100. Ncols += other.Ncols;
  101. ByteSize += other.ByteSize;
  102. Cost += other.Cost;
  103. return *this;
  104. }
  105. std::shared_ptr<TOptimizerStatistics> NYql::OverrideStatistics(const NYql::TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats) {
  106. auto res = std::make_shared<TOptimizerStatistics>(s.Type, s.Nrows, s.Ncols, s.ByteSize, s.Cost, s.KeyColumns, s.ColumnStatistics, s.StorageType, s.Specific);
  107. res->SortColumns = s.SortColumns;
  108. auto dbStats = stats->GetMapSafe();
  109. if (!dbStats.contains(tablePath)){
  110. return res;
  111. }
  112. auto tableStats = dbStats.at(tablePath).GetMapSafe();
  113. if (auto keyCols = tableStats.find("key_columns"); keyCols != tableStats.end()) {
  114. TVector<TString> cols;
  115. for (auto c : keyCols->second.GetArraySafe()) {
  116. cols.push_back(c.GetStringSafe());
  117. }
  118. res->KeyColumns = TIntrusivePtr<TOptimizerStatistics::TKeyColumns>(new TOptimizerStatistics::TKeyColumns(cols));
  119. }
  120. if (auto nrows = tableStats.find("n_rows"); nrows != tableStats.end()) {
  121. res->Nrows = nrows->second.GetDoubleSafe();
  122. }
  123. if (auto byteSize = tableStats.find("byte_size"); byteSize != tableStats.end()) {
  124. res->ByteSize = byteSize->second.GetDoubleSafe();
  125. }
  126. if (auto nattrs = tableStats.find("n_attrs"); nattrs != tableStats.end()) {
  127. res->Ncols = nattrs->second.GetIntegerSafe();
  128. }
  129. if (auto columns = tableStats.find("columns"); columns != tableStats.end()) {
  130. if (!res->ColumnStatistics) {
  131. res->ColumnStatistics = TIntrusivePtr<TOptimizerStatistics::TColumnStatMap>(new TOptimizerStatistics::TColumnStatMap());
  132. }
  133. for (auto col : columns->second.GetArraySafe()) {
  134. auto colMap = col.GetMapSafe();
  135. TColumnStatistics cStat;
  136. auto columnName = colMap.at("name").GetStringSafe();
  137. if (auto numUniqueVals = colMap.find("n_unique_vals"); numUniqueVals != colMap.end()) {
  138. cStat.NumUniqueVals = numUniqueVals->second.IsNull()? 0.0: numUniqueVals->second.GetDoubleSafe();
  139. }
  140. if (auto hll = colMap.find("hyperloglog"); hll != colMap.end()) {
  141. cStat.HyperLogLog = hll->second.IsNull()? 0.0: hll->second.GetDoubleSafe();
  142. }
  143. if (auto countMinSketch = colMap.find("count-min"); countMinSketch != colMap.end()) {
  144. TString countMinBase64 = countMinSketch->second.GetStringSafe();
  145. TString countMinRaw{};
  146. Base64StrictDecode(countMinBase64, countMinRaw);
  147. cStat.CountMinSketch.reset(NKikimr::TCountMinSketch::FromString(countMinRaw.data(), countMinRaw.size()));
  148. }
  149. res->ColumnStatistics->Data[columnName] = cStat;
  150. }
  151. }
  152. return res;
  153. }