yql_statistics.cpp 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #include <util/generic/yexception.h>
  2. #include "yql_statistics.h"
  3. #include <library/cpp/json/json_reader.h>
  4. #include <library/cpp/string_utils/base64/base64.h>
  5. #include <sstream>
  6. using namespace NYql;
  7. static TString ConvertToStatisticsTypeString(EStatisticsType type) {
  8. switch (type) {
  9. case EStatisticsType::BaseTable:
  10. return "BaseTable";
  11. case EStatisticsType::FilteredFactTable:
  12. return "FilteredFactTable";
  13. case EStatisticsType::ManyManyJoin:
  14. return "ManyManyJoin";
  15. default:
  16. Y_ENSURE(false,"Unknown EStatisticsType");
  17. }
  18. return "";
  19. }
  20. static TString ConvertToStatisticsTypeString(EStorageType storageType) {
  21. switch (storageType) {
  22. case EStorageType::NA:
  23. return "NA";
  24. case EStorageType::RowStorage:
  25. return "RowStorage";
  26. case EStorageType::ColumnStorage:
  27. return "ColumnStorage";
  28. default:
  29. Y_ENSURE(false,"Unknown Storage type");
  30. }
  31. return "";
  32. }
  33. TString TOptimizerStatistics::ToString() const {
  34. std::stringstream ss;
  35. ss << *this;
  36. return ss.str();
  37. }
  38. std::ostream& NYql::operator<<(std::ostream& os, const TOptimizerStatistics& s) {
  39. os << "Type: " << ConvertToStatisticsTypeString(s.Type) << ", Nrows: " << s.Nrows
  40. << ", Ncols: " << s.Ncols << ", ByteSize: " << s.ByteSize << ", Cost: " << s.Cost;
  41. if (s.KeyColumns) {
  42. os << ", keys: ";
  43. std::string tmp;
  44. for (const auto& c: s.KeyColumns->Data) {
  45. tmp.append(c).append(", ");
  46. }
  47. if (!tmp.empty()) {
  48. tmp.pop_back();
  49. tmp.pop_back();
  50. }
  51. os << "[" << tmp << "]";
  52. }
  53. if (s.ShuffledByColumns) {
  54. os << ", shuffled by: ";
  55. std::string tmp;
  56. for (const auto& c: s.ShuffledByColumns->Data) {
  57. tmp.append(c.RelName).append(".").append(c.AttributeName).append(", ");
  58. }
  59. if (!tmp.empty()) {
  60. tmp.pop_back();
  61. tmp.pop_back();
  62. }
  63. os << "[" << tmp << "]";
  64. }
  65. os << ", Sel: " << s.Selectivity;
  66. os << ", Storage: " << ConvertToStatisticsTypeString(s.StorageType);
  67. if (s.SortColumns) {
  68. os << ", sorted: ";
  69. std::string tmp;
  70. for (size_t i = 0; i < s.SortColumns->Columns.size() && i < s.SortColumns->Aliases.size(); i++) {
  71. auto c = s.SortColumns->Columns[i];
  72. auto a = s.SortColumns->Aliases[i];
  73. if (a.empty()) {
  74. tmp.append(a).append(".");
  75. }
  76. tmp.append(c).append(", ");
  77. }
  78. if (!tmp.empty()) {
  79. tmp.pop_back();
  80. tmp.pop_back();
  81. }
  82. os << tmp;
  83. }
  84. return os;
  85. }
  86. bool TOptimizerStatistics::Empty() const {
  87. return ! (Nrows || Ncols || Cost);
  88. }
  89. TOptimizerStatistics::TOptimizerStatistics(
  90. EStatisticsType type,
  91. double nrows,
  92. int ncols,
  93. double byteSize,
  94. double cost,
  95. TIntrusivePtr<TKeyColumns> keyColumns,
  96. TIntrusivePtr<TColumnStatMap> columnMap,
  97. EStorageType storageType,
  98. std::shared_ptr<IProviderStatistics> specific)
  99. : Type(type)
  100. , Nrows(nrows)
  101. , Ncols(ncols)
  102. , ByteSize(byteSize)
  103. , Cost(cost)
  104. , KeyColumns(keyColumns)
  105. , ColumnStatistics(columnMap)
  106. , StorageType(storageType)
  107. , Specific(std::move(specific))
  108. {
  109. }
  110. TOptimizerStatistics& TOptimizerStatistics::operator+=(const TOptimizerStatistics& other) {
  111. Nrows += other.Nrows;
  112. Ncols += other.Ncols;
  113. ByteSize += other.ByteSize;
  114. Cost += other.Cost;
  115. return *this;
  116. }
  117. std::shared_ptr<TOptimizerStatistics> NYql::OverrideStatistics(const NYql::TOptimizerStatistics& s, const TStringBuf& tablePath, const std::shared_ptr<NJson::TJsonValue>& stats) {
  118. auto res = std::make_shared<TOptimizerStatistics>(s.Type, s.Nrows, s.Ncols, s.ByteSize, s.Cost, s.KeyColumns, s.ColumnStatistics, s.StorageType, s.Specific);
  119. res->SortColumns = s.SortColumns;
  120. auto dbStats = stats->GetMapSafe();
  121. if (!dbStats.contains(tablePath)){
  122. return res;
  123. }
  124. auto tableStats = dbStats.at(tablePath).GetMapSafe();
  125. if (auto keyCols = tableStats.find("key_columns"); keyCols != tableStats.end()) {
  126. TVector<TString> cols;
  127. for (auto c : keyCols->second.GetArraySafe()) {
  128. cols.push_back(c.GetStringSafe());
  129. }
  130. res->KeyColumns = TIntrusivePtr<TOptimizerStatistics::TKeyColumns>(new TOptimizerStatistics::TKeyColumns(cols));
  131. }
  132. if (auto nrows = tableStats.find("n_rows"); nrows != tableStats.end()) {
  133. res->Nrows = nrows->second.GetDoubleSafe();
  134. }
  135. if (auto byteSize = tableStats.find("byte_size"); byteSize != tableStats.end()) {
  136. res->ByteSize = byteSize->second.GetDoubleSafe();
  137. }
  138. if (auto nattrs = tableStats.find("n_attrs"); nattrs != tableStats.end()) {
  139. res->Ncols = nattrs->second.GetIntegerSafe();
  140. }
  141. if (auto columns = tableStats.find("columns"); columns != tableStats.end()) {
  142. if (!res->ColumnStatistics) {
  143. res->ColumnStatistics = TIntrusivePtr<TOptimizerStatistics::TColumnStatMap>(new TOptimizerStatistics::TColumnStatMap());
  144. }
  145. for (auto col : columns->second.GetArraySafe()) {
  146. auto colMap = col.GetMapSafe();
  147. TColumnStatistics cStat;
  148. auto columnName = colMap.at("name").GetStringSafe();
  149. if (auto numUniqueVals = colMap.find("n_unique_vals"); numUniqueVals != colMap.end()) {
  150. cStat.NumUniqueVals = numUniqueVals->second.IsNull()? 0.0: numUniqueVals->second.GetDoubleSafe();
  151. }
  152. if (auto hll = colMap.find("hyperloglog"); hll != colMap.end()) {
  153. cStat.HyperLogLog = hll->second.IsNull()? 0.0: hll->second.GetDoubleSafe();
  154. }
  155. if (auto countMinSketch = colMap.find("count-min"); countMinSketch != colMap.end()) {
  156. TString countMinBase64 = countMinSketch->second.GetStringSafe();
  157. TString countMinRaw{};
  158. Base64StrictDecode(countMinBase64, countMinRaw);
  159. cStat.CountMinSketch.reset(NKikimr::TCountMinSketch::FromString(countMinRaw.data(), countMinRaw.size()));
  160. }
  161. res->ColumnStatistics->Data[columnName] = cStat;
  162. }
  163. }
  164. return res;
  165. }