/* statistical.c */
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "../libnetdata.h"
  3. NETDATA_DOUBLE default_single_exponential_smoothing_alpha = 0.1;
  4. void log_series_to_stderr(NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE result, const char *msg) {
  5. const NETDATA_DOUBLE *value, *end = &series[entries];
  6. fprintf(stderr, "%s of %zu entries [ ", msg, entries);
  7. for(value = series; value < end ;value++) {
  8. if(value != series) fprintf(stderr, ", ");
  9. fprintf(stderr, "%" NETDATA_DOUBLE_MODIFIER, *value);
  10. }
  11. fprintf(stderr, " ] results in " NETDATA_DOUBLE_FORMAT "\n", result);
  12. }
  13. // --------------------------------------------------------------------------------------------------------------------
  14. inline NETDATA_DOUBLE sum_and_count(const NETDATA_DOUBLE *series, size_t entries, size_t *count) {
  15. const NETDATA_DOUBLE *value, *end = &series[entries];
  16. NETDATA_DOUBLE sum = 0;
  17. size_t c = 0;
  18. for(value = series; value < end ; value++) {
  19. if(netdata_double_isnumber(*value)) {
  20. sum += *value;
  21. c++;
  22. }
  23. }
  24. if(unlikely(!c)) sum = NAN;
  25. if(likely(count)) *count = c;
  26. return sum;
  27. }
  28. inline NETDATA_DOUBLE sum(const NETDATA_DOUBLE *series, size_t entries) {
  29. return sum_and_count(series, entries, NULL);
  30. }
  31. inline NETDATA_DOUBLE average(const NETDATA_DOUBLE *series, size_t entries) {
  32. size_t count = 0;
  33. NETDATA_DOUBLE sum = sum_and_count(series, entries, &count);
  34. if(unlikely(!count)) return NAN;
  35. return sum / (NETDATA_DOUBLE)count;
  36. }
  37. // --------------------------------------------------------------------------------------------------------------------
  38. NETDATA_DOUBLE moving_average(const NETDATA_DOUBLE *series, size_t entries, size_t period) {
  39. if(unlikely(period <= 0))
  40. return 0.0;
  41. size_t i, count;
  42. NETDATA_DOUBLE sum = 0, avg = 0;
  43. NETDATA_DOUBLE p[period];
  44. for(count = 0; count < period ; count++)
  45. p[count] = 0.0;
  46. for(i = 0, count = 0; i < entries; i++) {
  47. NETDATA_DOUBLE value = series[i];
  48. if(unlikely(!netdata_double_isnumber(value))) continue;
  49. if(unlikely(count < period)) {
  50. sum += value;
  51. avg = (count == period - 1) ? sum / (NETDATA_DOUBLE)period : 0;
  52. }
  53. else {
  54. sum = sum - p[count % period] + value;
  55. avg = sum / (NETDATA_DOUBLE)period;
  56. }
  57. p[count % period] = value;
  58. count++;
  59. }
  60. return avg;
  61. }
  62. // --------------------------------------------------------------------------------------------------------------------
  63. static int qsort_compare(const void *a, const void *b) {
  64. NETDATA_DOUBLE *p1 = (NETDATA_DOUBLE *)a, *p2 = (NETDATA_DOUBLE *)b;
  65. NETDATA_DOUBLE n1 = *p1, n2 = *p2;
  66. if(unlikely(isnan(n1) || isnan(n2))) {
  67. if(isnan(n1) && !isnan(n2)) return -1;
  68. if(!isnan(n1) && isnan(n2)) return 1;
  69. return 0;
  70. }
  71. if(unlikely(isinf(n1) || isinf(n2))) {
  72. if(!isinf(n1) && isinf(n2)) return -1;
  73. if(isinf(n1) && !isinf(n2)) return 1;
  74. return 0;
  75. }
  76. if(unlikely(n1 < n2)) return -1;
  77. if(unlikely(n1 > n2)) return 1;
  78. return 0;
  79. }
  80. inline void sort_series(NETDATA_DOUBLE *series, size_t entries) {
  81. qsort(series, entries, sizeof(NETDATA_DOUBLE), qsort_compare);
  82. }
  83. inline NETDATA_DOUBLE *copy_series(const NETDATA_DOUBLE *series, size_t entries) {
  84. NETDATA_DOUBLE *copy = mallocz(sizeof(NETDATA_DOUBLE) * entries);
  85. memcpy(copy, series, sizeof(NETDATA_DOUBLE) * entries);
  86. return copy;
  87. }
  88. NETDATA_DOUBLE median_on_sorted_series(const NETDATA_DOUBLE *series, size_t entries) {
  89. if(unlikely(entries == 0)) return NAN;
  90. if(unlikely(entries == 1)) return series[0];
  91. if(unlikely(entries == 2)) return (series[0] + series[1]) / 2;
  92. NETDATA_DOUBLE average;
  93. if(entries % 2 == 0) {
  94. size_t m = entries / 2;
  95. average = (series[m] + series[m + 1]) / 2;
  96. }
  97. else {
  98. average = series[entries / 2];
  99. }
  100. return average;
  101. }
  102. NETDATA_DOUBLE median(const NETDATA_DOUBLE *series, size_t entries) {
  103. if(unlikely(entries == 0)) return NAN;
  104. if(unlikely(entries == 1)) return series[0];
  105. if(unlikely(entries == 2))
  106. return (series[0] + series[1]) / 2;
  107. NETDATA_DOUBLE *copy = copy_series(series, entries);
  108. sort_series(copy, entries);
  109. NETDATA_DOUBLE avg = median_on_sorted_series(copy, entries);
  110. freez(copy);
  111. return avg;
  112. }
  113. // --------------------------------------------------------------------------------------------------------------------
  114. NETDATA_DOUBLE moving_median(const NETDATA_DOUBLE *series, size_t entries, size_t period) {
  115. if(entries <= period)
  116. return median(series, entries);
  117. NETDATA_DOUBLE *data = copy_series(series, entries);
  118. size_t i;
  119. for(i = period; i < entries; i++) {
  120. data[i - period] = median(&series[i - period], period);
  121. }
  122. NETDATA_DOUBLE avg = median(data, entries - period);
  123. freez(data);
  124. return avg;
  125. }
  126. // --------------------------------------------------------------------------------------------------------------------
  127. // http://stackoverflow.com/a/15150143/4525767
  128. NETDATA_DOUBLE running_median_estimate(const NETDATA_DOUBLE *series, size_t entries) {
  129. NETDATA_DOUBLE median = 0.0f;
  130. NETDATA_DOUBLE average = 0.0f;
  131. size_t i;
  132. for(i = 0; i < entries ; i++) {
  133. NETDATA_DOUBLE value = series[i];
  134. if(unlikely(!netdata_double_isnumber(value))) continue;
  135. average += ( value - average ) * 0.1f; // rough running average.
  136. median += copysignndd( average * 0.01, value - median );
  137. }
  138. return median;
  139. }
  140. // --------------------------------------------------------------------------------------------------------------------
  141. NETDATA_DOUBLE standard_deviation(const NETDATA_DOUBLE *series, size_t entries) {
  142. if(unlikely(entries == 0)) return NAN;
  143. if(unlikely(entries == 1)) return series[0];
  144. const NETDATA_DOUBLE *value, *end = &series[entries];
  145. size_t count;
  146. NETDATA_DOUBLE sum;
  147. for(count = 0, sum = 0, value = series ; value < end ;value++) {
  148. if(likely(netdata_double_isnumber(*value))) {
  149. count++;
  150. sum += *value;
  151. }
  152. }
  153. if(unlikely(count == 0)) return NAN;
  154. if(unlikely(count == 1)) return sum;
  155. NETDATA_DOUBLE average = sum / (NETDATA_DOUBLE)count;
  156. for(count = 0, sum = 0, value = series ; value < end ;value++) {
  157. if(netdata_double_isnumber(*value)) {
  158. count++;
  159. sum += powndd(*value - average, 2);
  160. }
  161. }
  162. if(unlikely(count == 0)) return NAN;
  163. if(unlikely(count == 1)) return average;
  164. NETDATA_DOUBLE variance = sum / (NETDATA_DOUBLE)(count); // remove -1 from count to have a population stddev
  165. NETDATA_DOUBLE stddev = sqrtndd(variance);
  166. return stddev;
  167. }
  168. // --------------------------------------------------------------------------------------------------------------------
  169. NETDATA_DOUBLE single_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha) {
  170. if(unlikely(entries == 0))
  171. return NAN;
  172. if(unlikely(isnan(alpha)))
  173. alpha = default_single_exponential_smoothing_alpha;
  174. const NETDATA_DOUBLE *value = series, *end = &series[entries];
  175. NETDATA_DOUBLE level = (1.0 - alpha) * (*value);
  176. for(value++ ; value < end; value++) {
  177. if(likely(netdata_double_isnumber(*value)))
  178. level = alpha * (*value) + (1.0 - alpha) * level;
  179. }
  180. return level;
  181. }
  182. NETDATA_DOUBLE single_exponential_smoothing_reverse(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha) {
  183. if(unlikely(entries == 0))
  184. return NAN;
  185. if(unlikely(isnan(alpha)))
  186. alpha = default_single_exponential_smoothing_alpha;
  187. const NETDATA_DOUBLE *value = &series[entries -1];
  188. NETDATA_DOUBLE level = (1.0 - alpha) * (*value);
  189. for(value++ ; value >= series; value--) {
  190. if(likely(netdata_double_isnumber(*value)))
  191. level = alpha * (*value) + (1.0 - alpha) * level;
  192. }
  193. return level;
  194. }
  195. // --------------------------------------------------------------------------------------------------------------------
  196. // http://grisha.org/blog/2016/02/16/triple-exponential-smoothing-forecasting-part-ii/
  197. NETDATA_DOUBLE double_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries,
  198. NETDATA_DOUBLE alpha,
  199. NETDATA_DOUBLE beta,
  200. NETDATA_DOUBLE *forecast) {
  201. if(unlikely(entries == 0))
  202. return NAN;
  203. NETDATA_DOUBLE level, trend;
  204. if(unlikely(isnan(alpha)))
  205. alpha = 0.3;
  206. if(unlikely(isnan(beta)))
  207. beta = 0.05;
  208. level = series[0];
  209. if(likely(entries > 1))
  210. trend = series[1] - series[0];
  211. else
  212. trend = 0;
  213. const NETDATA_DOUBLE *value = series;
  214. for(value++ ; value >= series; value--) {
  215. if(likely(netdata_double_isnumber(*value))) {
  216. NETDATA_DOUBLE last_level = level;
  217. level = alpha * *value + (1.0 - alpha) * (level + trend);
  218. trend = beta * (level - last_level) + (1.0 - beta) * trend;
  219. }
  220. }
  221. if(forecast)
  222. *forecast = level + trend;
  223. return level;
  224. }
  225. // --------------------------------------------------------------------------------------------------------------------
  226. /*
  227. * Based on th R implementation
  228. *
  229. * a: level component
  230. * b: trend component
  231. * s: seasonal component
  232. *
  233. * Additive:
  234. *
  235. * Yhat[t+h] = a[t] + h * b[t] + s[t + 1 + (h - 1) mod p],
  236. * a[t] = α (Y[t] - s[t-p]) + (1-α) (a[t-1] + b[t-1])
  237. * b[t] = β (a[t] - a[t-1]) + (1-β) b[t-1]
  238. * s[t] = γ (Y[t] - a[t]) + (1-γ) s[t-p]
  239. *
  240. * Multiplicative:
  241. *
  242. * Yhat[t+h] = (a[t] + h * b[t]) * s[t + 1 + (h - 1) mod p],
  243. * a[t] = α (Y[t] / s[t-p]) + (1-α) (a[t-1] + b[t-1])
  244. * b[t] = β (a[t] - a[t-1]) + (1-β) b[t-1]
  245. * s[t] = γ (Y[t] / a[t]) + (1-γ) s[t-p]
  246. */
  247. static int __HoltWinters(
  248. const NETDATA_DOUBLE *series,
  249. int entries, // start_time + h
  250. NETDATA_DOUBLE alpha, // alpha parameter of Holt-Winters Filter.
  251. NETDATA_DOUBLE
  252. beta, // beta parameter of Holt-Winters Filter. If set to 0, the function will do exponential smoothing.
  253. NETDATA_DOUBLE
  254. gamma, // gamma parameter used for the seasonal component. If set to 0, an non-seasonal model is fitted.
  255. const int *seasonal,
  256. const int *period,
  257. const NETDATA_DOUBLE *a, // Start value for level (a[0]).
  258. const NETDATA_DOUBLE *b, // Start value for trend (b[0]).
  259. NETDATA_DOUBLE *s, // Vector of start values for the seasonal component (s_1[0] ... s_p[0])
  260. /* return values */
  261. NETDATA_DOUBLE *SSE, // The final sum of squared errors achieved in optimizing
  262. NETDATA_DOUBLE *level, // Estimated values for the level component (size entries - t + 2)
  263. NETDATA_DOUBLE *trend, // Estimated values for the trend component (size entries - t + 2)
  264. NETDATA_DOUBLE *season // Estimated values for the seasonal component (size entries - t + 2)
  265. )
  266. {
  267. if(unlikely(entries < 4))
  268. return 0;
  269. int start_time = 2;
  270. NETDATA_DOUBLE res = 0, xhat = 0, stmp = 0;
  271. int i, i0, s0;
  272. /* copy start values to the beginning of the vectors */
  273. level[0] = *a;
  274. if(beta > 0) trend[0] = *b;
  275. if(gamma > 0) memcpy(season, s, *period * sizeof(NETDATA_DOUBLE));
  276. for(i = start_time - 1; i < entries; i++) {
  277. /* indices for period i */
  278. i0 = i - start_time + 2;
  279. s0 = i0 + *period - 1;
  280. /* forecast *for* period i */
  281. xhat = level[i0 - 1] + (beta > 0 ? trend[i0 - 1] : 0);
  282. stmp = gamma > 0 ? season[s0 - *period] : (*seasonal != 1);
  283. if (*seasonal == 1)
  284. xhat += stmp;
  285. else
  286. xhat *= stmp;
  287. /* Sum of Squared Errors */
  288. res = series[i] - xhat;
  289. *SSE += res * res;
  290. /* estimate of level *in* period i */
  291. if (*seasonal == 1)
  292. level[i0] = alpha * (series[i] - stmp)
  293. + (1 - alpha) * (level[i0 - 1] + trend[i0 - 1]);
  294. else
  295. level[i0] = alpha * (series[i] / stmp)
  296. + (1 - alpha) * (level[i0 - 1] + trend[i0 - 1]);
  297. /* estimate of trend *in* period i */
  298. if (beta > 0)
  299. trend[i0] = beta * (level[i0] - level[i0 - 1])
  300. + (1 - beta) * trend[i0 - 1];
  301. /* estimate of seasonal component *in* period i */
  302. if (gamma > 0) {
  303. if (*seasonal == 1)
  304. season[s0] = gamma * (series[i] - level[i0])
  305. + (1 - gamma) * stmp;
  306. else
  307. season[s0] = gamma * (series[i] / level[i0])
  308. + (1 - gamma) * stmp;
  309. }
  310. }
  311. return 1;
  312. }
  313. NETDATA_DOUBLE holtwinters(const NETDATA_DOUBLE *series, size_t entries,
  314. NETDATA_DOUBLE alpha,
  315. NETDATA_DOUBLE beta,
  316. NETDATA_DOUBLE gamma,
  317. NETDATA_DOUBLE *forecast) {
  318. if(unlikely(isnan(alpha)))
  319. alpha = 0.3;
  320. if(unlikely(isnan(beta)))
  321. beta = 0.05;
  322. if(unlikely(isnan(gamma)))
  323. gamma = 0;
  324. int seasonal = 0;
  325. int period = 0;
  326. NETDATA_DOUBLE a0 = series[0];
  327. NETDATA_DOUBLE b0 = 0;
  328. NETDATA_DOUBLE s[] = {};
  329. NETDATA_DOUBLE errors = 0.0;
  330. size_t nb_computations = entries;
  331. NETDATA_DOUBLE *estimated_level = callocz(nb_computations, sizeof(NETDATA_DOUBLE));
  332. NETDATA_DOUBLE *estimated_trend = callocz(nb_computations, sizeof(NETDATA_DOUBLE));
  333. NETDATA_DOUBLE *estimated_season = callocz(nb_computations, sizeof(NETDATA_DOUBLE));
  334. int ret = __HoltWinters(
  335. series,
  336. (int)entries,
  337. alpha,
  338. beta,
  339. gamma,
  340. &seasonal,
  341. &period,
  342. &a0,
  343. &b0,
  344. s,
  345. &errors,
  346. estimated_level,
  347. estimated_trend,
  348. estimated_season
  349. );
  350. NETDATA_DOUBLE value = estimated_level[nb_computations - 1];
  351. if(forecast)
  352. *forecast = 0.0;
  353. freez(estimated_level);
  354. freez(estimated_trend);
  355. freez(estimated_season);
  356. if(!ret)
  357. return 0.0;
  358. return value;
  359. }