transform.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. #pragma once
  2. #include "data.h"
  3. namespace NAnalytics {
  4. template <class TSkip, class TX, class TY>
  5. inline TTable Histogram(const TTable& in, TSkip skip,
  6. const TString& xn_out, TX x_in,
  7. const TString& yn_out, TY y_in,
  8. double x1, double x2, double dx)
  9. {
  10. long buckets = (x2 - x1) / dx;
  11. TTable out;
  12. TString yn_sum = yn_out + "_sum";
  13. TString yn_share = yn_out + "_share";
  14. double ysum = 0.0;
  15. out.resize(buckets);
  16. for (size_t i = 0; i < out.size(); i++) {
  17. double lb = x1 + dx*i;
  18. double ub = lb + dx;
  19. out[i].Name = "[" + ToString(lb) + ";" + ToString(ub) + (ub==x2? "]": ")");
  20. out[i][xn_out] = (lb + ub) / 2;
  21. out[i][yn_sum] = 0.0;
  22. }
  23. for (const auto& row : in) {
  24. if (skip(row)) {
  25. continue;
  26. }
  27. double x = x_in(row);
  28. long i = (x - x1) / dx;
  29. if (x == x2) { // Special hack to include right edge
  30. i--;
  31. }
  32. double y = y_in(row);
  33. ysum += y;
  34. if (i >= 0 && i < buckets) {
  35. out[i][yn_sum] = y + out[i].GetOrDefault(yn_sum, 0.0);
  36. }
  37. }
  38. for (TRow& row : out) {
  39. if (ysum != 0.0) {
  40. row[yn_share] = row.GetOrDefault(yn_sum, 0.0) / ysum;
  41. }
  42. }
  43. return out;
  44. }
  45. inline TTable HistogramAll(const TTable& in, const TString& xn, double x1, double x2, double dx)
  46. {
  47. long buckets = (dx == 0.0? 1: (x2 - x1) / dx);
  48. TTable out;
  49. THashMap<TString, double> colSum;
  50. out.resize(buckets);
  51. TSet<TString> cols;
  52. for (auto& row : in) {
  53. for (auto& kv : row) {
  54. cols.insert(kv.first);
  55. }
  56. }
  57. cols.insert("_count");
  58. cols.erase(xn);
  59. for (const TString& col : cols) {
  60. colSum[col] = 0.0;
  61. }
  62. for (size_t i = 0; i < out.size(); i++) {
  63. double lb = x1 + dx*i;
  64. double ub = lb + dx;
  65. TRow& row = out[i];
  66. row.Name = "[" + ToString(lb) + ";" + ToString(ub) + (ub==x2? "]": ")");
  67. row[xn] = (lb + ub) / 2;
  68. for (const TString& col : cols) {
  69. row[col + "_sum"] = 0.0;
  70. }
  71. }
  72. for (const TRow& row_in : in) {
  73. double x;
  74. if (!row_in.Get(xn, x)) {
  75. continue;
  76. }
  77. long i = (dx == 0.0? 0: (x - x1) / dx);
  78. if (x == x2 && dx > 0.0) { // Special hack to include right edge
  79. i--;
  80. }
  81. for (const auto& kv : row_in) {
  82. const TString& yn = kv.first;
  83. if (yn == xn) {
  84. continue;
  85. }
  86. double y;
  87. if (!row_in.Get(yn, y)) {
  88. continue;
  89. }
  90. colSum[yn] += y;
  91. if (i >= 0 && i < buckets) {
  92. out[i][yn + "_cnt"] = out[i].GetOrDefault(yn + "_cnt") + 1;
  93. out[i][yn + "_sum"] = out[i].GetOrDefault(yn + "_sum") + y;
  94. if (out[i].contains(yn + "_min")) {
  95. out[i][yn + "_min"] = Min(y, out[i].GetOrDefault(yn + "_min"));
  96. } else {
  97. out[i][yn + "_min"] = y;
  98. }
  99. if (out[i].contains(yn + "_max")) {
  100. out[i][yn + "_max"] = Max(y, out[i].GetOrDefault(yn + "_max"));
  101. } else {
  102. out[i][yn + "_max"] = y;
  103. }
  104. }
  105. }
  106. colSum["_count"]++;
  107. if (i >= 0 && i < buckets) {
  108. out[i]["_count_sum"] = out[i].GetOrDefault("_count_sum") + 1;
  109. }
  110. }
  111. for (TRow& row : out) {
  112. for (const TString& col : cols) {
  113. double ysum = colSum[col];
  114. if (col != "_count") {
  115. if (row.GetOrDefault(col + "_cnt") != 0.0) {
  116. row[col + "_avg"] = row.GetOrDefault(col + "_sum") / row.GetOrDefault(col + "_cnt");
  117. }
  118. }
  119. if (ysum != 0.0) {
  120. row[col + "_share"] = row.GetOrDefault(col + "_sum") / ysum;
  121. }
  122. }
  123. }
  124. return out;
  125. }
  126. inline TMatrix CovarianceMatrix(const TTable& in)
  127. {
  128. TSet<TString> cols;
  129. for (auto& row : in) {
  130. for (auto& kv : row) {
  131. cols.insert(kv.first);
  132. }
  133. }
  134. struct TAggregate {
  135. size_t Idx = 0;
  136. double Sum = 0;
  137. size_t Count = 0;
  138. double Mean = 0;
  139. };
  140. THashMap<TString, TAggregate> colAggr;
  141. size_t colCount = 0;
  142. for (const TString& col : cols) {
  143. TAggregate& aggr = colAggr[col];
  144. aggr.Idx = colCount++;
  145. }
  146. for (const TRow& row : in) {
  147. for (const auto& kv : row) {
  148. const TString& xn = kv.first;
  149. double x;
  150. if (!row.Get(xn, x)) {
  151. continue;
  152. }
  153. TAggregate& aggr = colAggr[xn];
  154. aggr.Sum += x;
  155. aggr.Count++;
  156. }
  157. }
  158. for (auto& kv : colAggr) {
  159. TAggregate& aggr = kv.second;
  160. aggr.Mean = aggr.Sum / aggr.Count;
  161. }
  162. TMatrix covCount(cols.size(), cols.size());
  163. TMatrix cov(cols.size(), cols.size());
  164. for (const TRow& row : in) {
  165. for (const auto& kv1 : row) {
  166. double x;
  167. if (!row.Get(kv1.first, x)) {
  168. continue;
  169. }
  170. TAggregate& xaggr = colAggr[kv1.first];
  171. for (const auto& kv2 : row) {
  172. double y;
  173. if (!row.Get(kv2.first, y)) {
  174. continue;
  175. }
  176. TAggregate& yaggr = colAggr[kv2.first];
  177. covCount.Cell(xaggr.Idx, yaggr.Idx)++;
  178. cov.Cell(xaggr.Idx, yaggr.Idx) += (x - xaggr.Mean) * (y - yaggr.Mean);
  179. }
  180. }
  181. }
  182. for (size_t idx = 0; idx < cov.size(); idx++) {
  183. cov[idx] /= covCount[idx];
  184. }
  185. return cov;
  186. }
  187. }