weights.c 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "daemon/common.h"
  3. #include "database/KolmogorovSmirnovDist.h"
// Hard cap on the number of points a correlations query may use.
#define MAX_POINTS 10000

// Runtime switch for the metric correlations feature (netdata config boolean).
int enable_metric_correlations = CONFIG_BOOLEAN_YES;

// Version of the metric correlations implementation, reported in responses.
int metric_correlations_version = 1;

// Method used when the caller does not specify one
// (two-sample Kolmogorov-Smirnov).
WEIGHTS_METHOD default_metric_correlations_method = WEIGHTS_METHOD_MC_KS2;
// Bookkeeping accumulated across all charts/dimensions of a single
// weights/correlations run; rendered into the "statistics" JSON object.
typedef struct weights_stats {
    NETDATA_DOUBLE max_base_high_ratio;           // max absolute score seen among RESULT_IS_BASE_HIGH_RATIO results
    size_t db_points;                             // total points read from the database
    size_t result_points;                         // total points generated by the query engine
    size_t db_queries;                            // number of db queries executed
    size_t db_points_per_tier[RRD_STORAGE_TIERS]; // db points read, per storage tier
    size_t binary_searches;                       // binary searches performed by the KS2 method
} WEIGHTS_STATS;
// ----------------------------------------------------------------------------
// parse and render metric correlations methods

// NULL-terminated lookup table mapping user-facing method names
// to their WEIGHTS_METHOD values.
static struct {
    const char *name;
    WEIGHTS_METHOD value;
} weights_methods[] = {
      { "ks2"          , WEIGHTS_METHOD_MC_KS2 }
    , { "volume"       , WEIGHTS_METHOD_MC_VOLUME }
    , { "anomaly-rate" , WEIGHTS_METHOD_ANOMALY_RATE }
    , { NULL           , 0 }
};
  27. WEIGHTS_METHOD weights_string_to_method(const char *method) {
  28. for(int i = 0; weights_methods[i].name ;i++)
  29. if(strcmp(method, weights_methods[i].name) == 0)
  30. return weights_methods[i].value;
  31. return default_metric_correlations_method;
  32. }
  33. const char *weights_method_to_string(WEIGHTS_METHOD method) {
  34. for(int i = 0; weights_methods[i].name ;i++)
  35. if(weights_methods[i].value == method)
  36. return weights_methods[i].name;
  37. return "unknown";
  38. }
// ----------------------------------------------------------------------------
// The results per dimension are aggregated into a dictionary

// Flags qualifying how a result's value should be interpreted.
typedef enum {
    RESULT_IS_BASE_HIGH_RATIO    = (1 << 0), // value relates baseline to highlight (ks2 / volume ratio path)
    RESULT_IS_PERCENTAGE_OF_TIME = (1 << 1), // value is a fraction of time (volume countif path)
} RESULT_FLAGS;

// One scored dimension; stored as a dictionary value keyed by "chart_id:dim".
struct register_result {
    RESULT_FLAGS flags;
    RRDSET *st;           // the chart; the JSON generators use it only for pointer comparison
    const char *chart_id; // owned copy (strdup'd by the insert callback)
    const char *context;  // owned copy (strdup'd by the insert callback)
    const char *dim_name; // owned copy (strdup'd by the insert callback)
    NETDATA_DOUBLE value; // the (non-negative) score
    struct register_result *next; // used to link contexts together
};
  54. static void register_result_insert_callback(const char *name, void *value, void *data) {
  55. (void)name;
  56. (void)data;
  57. struct register_result *t = (struct register_result *)value;
  58. if(t->chart_id) t->chart_id = strdupz(t->chart_id);
  59. if(t->context) t->context = strdupz(t->context);
  60. if(t->dim_name) t->dim_name = strdupz(t->dim_name);
  61. }
  62. static void register_result_delete_callback(const char *name, void *value, void *data) {
  63. (void)name;
  64. (void)data;
  65. struct register_result *t = (struct register_result *)value;
  66. freez((void *)t->chart_id);
  67. freez((void *)t->context);
  68. freez((void *)t->dim_name);
  69. }
// Create the results dictionary, wiring the callbacks that manage
// the lifetime of the strings inside each entry.
static DICTIONARY *register_result_init() {
    DICTIONARY *results = dictionary_create(DICTIONARY_FLAG_SINGLE_THREADED);
    dictionary_register_insert_callback(results, register_result_insert_callback, results);
    dictionary_register_delete_callback(results, register_result_delete_callback, results);
    return results;
}
// Destroy the results dictionary; the delete callback frees the
// strings of every entry.
static void register_result_destroy(DICTIONARY *results) {
    dictionary_destroy(results);
}
// Store one scored dimension into the results dictionary.
// The value is stored as its absolute value; non-numbers are dropped,
// and zero scores are dropped too unless register_zero is set.
// Also updates stats->max_base_high_ratio for ratio-flagged results.
static void register_result(DICTIONARY *results,
                            RRDSET *st,
                            RRDDIM *d,
                            NETDATA_DOUBLE value,
                            RESULT_FLAGS flags,
                            WEIGHTS_STATS *stats,
                            bool register_zero) {

    if(!netdata_double_isnumber(value)) return;

    // make it positive
    NETDATA_DOUBLE v = fabsndd(value);

    // no need to store zero scored values
    if(unlikely(fpclassify(v) == FP_ZERO && !register_zero))
        return;

    // keep track of the max of the baseline / highlight ratio
    if(flags & RESULT_IS_BASE_HIGH_RATIO && v > stats->max_base_high_ratio)
        stats->max_base_high_ratio = v;

    struct register_result t = {
        .flags = flags,
        .st = st,
        .chart_id = st->id,
        .context = st->context,
        .dim_name = d->name,
        .value = v
    };

    // the dictionary key makes each chart:dimension pair unique
    char buf[5000 + 1];
    snprintfz(buf, 5000, "%s:%s", st->id, d->name);
    dictionary_set(results, buf, &t, sizeof(struct register_result));
}
// ----------------------------------------------------------------------------
// Generation of JSON output for the results

// Emit the JSON header shared by all output formats: the query window,
// the baseline window (ks2/volume only), execution statistics, and the
// group/method/options used.
// NOTE: this leaves the "options" JSON string value open - the caller
// must close the quote and continue the object.
static void results_header_to_json(DICTIONARY *results __maybe_unused, BUFFER *wb,
                                   long long after, long long before,
                                   long long baseline_after, long long baseline_before,
                                   long points, WEIGHTS_METHOD method,
                                   RRDR_GROUPING group, RRDR_OPTIONS options, uint32_t shifts,
                                   size_t examined_dimensions __maybe_unused, usec_t duration,
                                   WEIGHTS_STATS *stats) {

    buffer_sprintf(wb, "{\n"
                       "\t\"after\": %lld,\n"
                       "\t\"before\": %lld,\n"
                       "\t\"duration\": %lld,\n"
                       "\t\"points\": %ld,\n",
                   after,
                   before,
                   before - after,
                   points
    );

    // the baseline window exists only for the comparative methods
    if(method == WEIGHTS_METHOD_MC_KS2 || method == WEIGHTS_METHOD_MC_VOLUME)
        buffer_sprintf(wb, ""
                           "\t\"baseline_after\": %lld,\n"
                           "\t\"baseline_before\": %lld,\n"
                           "\t\"baseline_duration\": %lld,\n"
                           "\t\"baseline_points\": %ld,\n",
                       baseline_after,
                       baseline_before,
                       baseline_before - baseline_after,
                       points << shifts // the baseline uses 2^shifts times the highlight points
        );

    buffer_sprintf(wb, ""
                       "\t\"statistics\": {\n"
                       "\t\t\"query_time_ms\": %f,\n"
                       "\t\t\"db_queries\": %zu,\n"
                       "\t\t\"query_result_points\": %zu,\n"
                       "\t\t\"binary_searches\": %zu,\n"
                       "\t\t\"db_points_read\": %zu,\n"
                       "\t\t\"db_points_per_tier\": [ ",
                   (double)duration / (double)USEC_PER_MS,
                   stats->db_queries,
                   stats->result_points,
                   stats->binary_searches,
                   stats->db_points
    );

    // per-tier points as a JSON array
    for(int tier = 0; tier < storage_tiers ;tier++)
        buffer_sprintf(wb, "%s%zu", tier?", ":"", stats->db_points_per_tier[tier]);

    buffer_sprintf(wb, " ]\n"
                       "\t},\n"
                       "\t\"group\": \"%s\",\n"
                       "\t\"method\": \"%s\",\n"
                       "\t\"options\": \"",
                   web_client_api_request_v1_data_group_to_string(group),
                   weights_method_to_string(method)
    );

    // appends the options text; the closing quote is left to the caller
    web_client_api_request_v1_data_options_to_string(wb, options);
}
// Render the results grouped by chart:
// "correlated_charts": { "<chart>": { "context": ..., "dimensions": { "<dim>": <weight>, ... } }, ... }
// Assumes entries of the same chart are iterated consecutively
// (keys are "chart_id:dimension", so they sort together).
// Returns the number of dimensions written.
static size_t registered_results_to_json_charts(DICTIONARY *results, BUFFER *wb,
                                                long long after, long long before,
                                                long long baseline_after, long long baseline_before,
                                                long points, WEIGHTS_METHOD method,
                                                RRDR_GROUPING group, RRDR_OPTIONS options, uint32_t shifts,
                                                size_t examined_dimensions, usec_t duration,
                                                WEIGHTS_STATS *stats) {

    results_header_to_json(results, wb, after, before, baseline_after, baseline_before,
                           points, method, group, options, shifts, examined_dimensions, duration, stats);

    // the header left the "options" string value open - close it here
    buffer_strcat(wb, "\",\n\t\"correlated_charts\": {\n");

    size_t charts = 0, chart_dims = 0, total_dimensions = 0;
    struct register_result *t;
    RRDSET *last_st = NULL; // never access this - we use it only for comparison
    dfe_start_read(results, t) {
        // a new chart starts when the chart pointer changes
        if(!last_st || t->st != last_st) {
            last_st = t->st;

            // close the previous chart object, if any
            if(charts) buffer_strcat(wb, "\n\t\t\t}\n\t\t},\n");

            buffer_strcat(wb, "\t\t\"");
            buffer_strcat(wb, t->chart_id);
            buffer_strcat(wb, "\": {\n");
            buffer_strcat(wb, "\t\t\t\"context\": \"");
            buffer_strcat(wb, t->context);
            buffer_strcat(wb, "\",\n\t\t\t\"dimensions\": {\n");
            charts++;
            chart_dims = 0;
        }
        if (chart_dims) buffer_sprintf(wb, ",\n");
        buffer_sprintf(wb, "\t\t\t\t\"%s\": " NETDATA_DOUBLE_FORMAT, t->dim_name, t->value);
        chart_dims++;
        total_dimensions++;
    }
    dfe_done(t);

    // close dimensions and chart
    if (total_dimensions)
        buffer_strcat(wb, "\n\t\t\t}\n\t\t}\n");

    // close correlated_charts
    buffer_sprintf(wb, "\t},\n"
                       "\t\"correlated_dimensions\": %zu,\n"
                       "\t\"total_dimensions_count\": %zu\n"
                       "}\n",
                   total_dimensions,
                   examined_dimensions
    );

    return total_dimensions;
}
// Render the results grouped by context, then chart, then dimension:
// "contexts": { "<ctx>": { "charts": { "<chart>": { "dimensions": {...}, "weight": ... }, ... }, "weight": ... }, ... }
// Per-chart and per-context "weight" values are the averages of the
// dimension weights they contain.
// Returns the number of dimensions written.
static size_t registered_results_to_json_contexts(DICTIONARY *results, BUFFER *wb,
                                                  long long after, long long before,
                                                  long long baseline_after, long long baseline_before,
                                                  long points, WEIGHTS_METHOD method,
                                                  RRDR_GROUPING group, RRDR_OPTIONS options, uint32_t shifts,
                                                  size_t examined_dimensions, usec_t duration,
                                                  WEIGHTS_STATS *stats) {

    results_header_to_json(results, wb, after, before, baseline_after, baseline_before,
                           points, method, group, options, shifts, examined_dimensions, duration, stats);

    // group the results per context; entries of the same context are
    // chained together through their 'next' pointer
    DICTIONARY *context_results = dictionary_create(
        DICTIONARY_FLAG_SINGLE_THREADED
        |DICTIONARY_FLAG_VALUE_LINK_DONT_CLONE
        |DICTIONARY_FLAG_NAME_LINK_DONT_CLONE
        |DICTIONARY_FLAG_DONT_OVERWRITE_VALUE
        );

    struct register_result *t;
    dfe_start_read(results, t) {
        // with DONT_OVERWRITE_VALUE, the first entry of a context stays
        // the dictionary value; later entries are linked after it
        struct register_result *tc = dictionary_set(context_results, t->context, t, sizeof(*t));
        if(tc == t)
            t->next = NULL;
        else {
            t->next = tc->next;
            tc->next = t;
        }
    }
    dfe_done(t);

    // the header left the "options" string value open - close it here
    buffer_strcat(wb, "\",\n\t\"contexts\": {\n");

    size_t contexts = 0, total_dimensions = 0, charts = 0, context_dims = 0, chart_dims = 0;
    NETDATA_DOUBLE contexts_total_weight = 0.0, charts_total_weight = 0.0;
    RRDSET *last_st = NULL; // never access this - we use it only for comparison
    dfe_start_read(context_results, t) {

        // close the previous context (and its last chart),
        // emitting their average weights
        if(contexts)
            buffer_sprintf(wb, "\n\t\t\t\t\t},\n\t\t\t\t\t\"weight\":" NETDATA_DOUBLE_FORMAT "\n\t\t\t\t}\n\t\t\t},\n\t\t\t\"weight\":" NETDATA_DOUBLE_FORMAT "\n\t\t},\n", charts_total_weight / chart_dims, contexts_total_weight / context_dims);

        contexts++;
        context_dims = 0;
        contexts_total_weight = 0.0;

        buffer_strcat(wb, "\t\t\"");
        buffer_strcat(wb, t->context);
        buffer_strcat(wb, "\": {\n\t\t\t\"charts\":{\n");
        charts = 0;
        chart_dims = 0;

        // walk the chained entries of this context
        struct register_result *tt;
        for(tt = t; tt ; tt = tt->next) {
            // a new chart starts when the chart pointer changes
            if(!last_st || tt->st != last_st) {
                last_st = tt->st;

                // close the previous chart, emitting its average weight
                if(charts)
                    buffer_sprintf(wb, "\n\t\t\t\t\t},\n\t\t\t\t\t\"weight\":" NETDATA_DOUBLE_FORMAT "\n\t\t\t\t},\n", charts_total_weight / chart_dims);

                buffer_strcat(wb, "\t\t\t\t\"");
                buffer_strcat(wb, tt->chart_id);
                buffer_strcat(wb, "\": {\n");
                buffer_strcat(wb, "\t\t\t\t\t\"dimensions\": {\n");
                charts++;
                chart_dims = 0;
                charts_total_weight = 0.0;
            }

            if (chart_dims) buffer_sprintf(wb, ",\n");
            buffer_sprintf(wb, "\t\t\t\t\t\t\"%s\": " NETDATA_DOUBLE_FORMAT, tt->dim_name, tt->value);
            charts_total_weight += tt->value;
            contexts_total_weight += tt->value;
            chart_dims++;
            context_dims++;
            total_dimensions++;
        }
    }
    dfe_done(t);

    dictionary_destroy(context_results);

    // close dimensions and chart
    if (total_dimensions)
        buffer_sprintf(wb, "\n\t\t\t\t\t},\n\t\t\t\t\t\"weight\":" NETDATA_DOUBLE_FORMAT "\n\t\t\t\t}\n\t\t\t},\n\t\t\t\"weight\":" NETDATA_DOUBLE_FORMAT "\n\t\t}\n", charts_total_weight / chart_dims, contexts_total_weight / context_dims);

    // close correlated_charts
    buffer_sprintf(wb, "\t},\n"
                       "\t\"weighted_dimensions\": %zu,\n"
                       "\t\"total_dimensions_count\": %zu\n"
                       "}\n",
                   total_dimensions,
                   examined_dimensions
    );

    return total_dimensions;
}
// ----------------------------------------------------------------------------
// KS2 algorithm functions

// Pairwise differences are converted to fixed-point integers
// (scaled by DOUBLE_TO_INT_MULTIPLIER) so they can be sorted and
// binary-searched with exact integer comparisons.
typedef long int DIFFS_NUMBERS;
#define DOUBLE_TO_INT_MULTIPLIER 100000
  291. static inline int binary_search_bigger_than(const DIFFS_NUMBERS arr[], int left, int size, DIFFS_NUMBERS K) {
  292. // binary search to find the index the smallest index
  293. // of the first value in the array that is greater than K
  294. int right = size;
  295. while(left < right) {
  296. int middle = (int)(((unsigned int)(left + right)) >> 1);
  297. if(arr[middle] > K)
  298. right = middle;
  299. else
  300. left = middle + 1;
  301. }
  302. return left;
  303. }
  304. int compare_diffs(const void *left, const void *right) {
  305. DIFFS_NUMBERS lt = *(DIFFS_NUMBERS *)left;
  306. DIFFS_NUMBERS rt = *(DIFFS_NUMBERS *)right;
  307. // https://stackoverflow.com/a/3886497/1114110
  308. return (lt > rt) - (lt < rt);
  309. }
  310. static size_t calculate_pairs_diff(DIFFS_NUMBERS *diffs, NETDATA_DOUBLE *arr, size_t size) {
  311. NETDATA_DOUBLE *last = &arr[size - 1];
  312. size_t added = 0;
  313. while(last > arr) {
  314. NETDATA_DOUBLE second = *last--;
  315. NETDATA_DOUBLE first = *last;
  316. *diffs++ = (DIFFS_NUMBERS)((first - second) * (NETDATA_DOUBLE)DOUBLE_TO_INT_MULTIPLIER);
  317. added++;
  318. }
  319. return added;
  320. }
// Two-sample Kolmogorov-Smirnov test over the (fixed-point) pairwise
// differences of the baseline and highlight windows.
// base_shifts is log2 of the size ratio between the two arrays, used to
// bring their indexes to a comparable scale without divisions.
// Returns the KS probability via KSfbar(), or NAN when it cannot be
// computed safely. NOTE: sorts both input arrays in place.
static double ks_2samp(DIFFS_NUMBERS baseline_diffs[], int base_size, DIFFS_NUMBERS highlight_diffs[], int high_size, uint32_t base_shifts) {

    qsort(baseline_diffs, base_size, sizeof(DIFFS_NUMBERS), compare_diffs);
    qsort(highlight_diffs, high_size, sizeof(DIFFS_NUMBERS), compare_diffs);

    // Now we should be calculating this:
    //
    // For each number in the diffs arrays, we should find the index of the
    // number bigger than them in both arrays and calculate the % of this index
    // vs the total array size. Once we have the 2 percentages, we should find
    // the min and max across the delta of all of them.
    //
    // It should look like this:
    //
    // base_pcent = binary_search_bigger_than(...) / base_size;
    // high_pcent = binary_search_bigger_than(...) / high_size;
    // delta = base_pcent - high_pcent;
    // if(delta < min) min = delta;
    // if(delta > max) max = delta;
    //
    // This would require a lot of multiplications and divisions.
    //
    // To speed it up, we do the binary search to find the index of each number
    // but then we divide the base index by the power of two number (shifts) it
    // is bigger than high index. So the 2 indexes are now comparable.
    // We also keep track of the original indexes with min and max, to properly
    // calculate their percentages once the loops finish.

    // initialize min and max using the first number of baseline_diffs
    DIFFS_NUMBERS K = baseline_diffs[0];
    int base_idx = binary_search_bigger_than(baseline_diffs, 1, base_size, K);
    int high_idx = binary_search_bigger_than(highlight_diffs, 0, high_size, K);
    int delta = base_idx - (high_idx << base_shifts);
    int min = delta, max = delta;
    int base_min_idx = base_idx;
    int base_max_idx = base_idx;
    int high_min_idx = high_idx;
    int high_max_idx = high_idx;

    // do the baseline_diffs starting from 1 (we did position 0 above)
    for(int i = 1; i < base_size; i++) {
        K = baseline_diffs[i];
        base_idx = binary_search_bigger_than(baseline_diffs, i + 1, base_size, K); // starting from i, since data1 is sorted
        high_idx = binary_search_bigger_than(highlight_diffs, 0, high_size, K);

        delta = base_idx - (high_idx << base_shifts);
        if(delta < min) {
            min = delta;
            base_min_idx = base_idx;
            high_min_idx = high_idx;
        }
        else if(delta > max) {
            max = delta;
            base_max_idx = base_idx;
            high_max_idx = high_idx;
        }
    }

    // do the highlight_diffs starting from 0
    for(int i = 0; i < high_size; i++) {
        K = highlight_diffs[i];
        base_idx = binary_search_bigger_than(baseline_diffs, 0, base_size, K);
        high_idx = binary_search_bigger_than(highlight_diffs, i + 1, high_size, K); // starting from i, since data2 is sorted

        delta = base_idx - (high_idx << base_shifts);
        if(delta < min) {
            min = delta;
            base_min_idx = base_idx;
            high_min_idx = high_idx;
        }
        else if(delta > max) {
            max = delta;
            base_max_idx = base_idx;
            high_max_idx = high_idx;
        }
    }

    // now we have the min, max and their indexes
    // properly calculate min and max as dmin and dmax
    double dbase_size = (double)base_size;
    double dhigh_size = (double)high_size;
    double dmin = ((double)base_min_idx / dbase_size) - ((double)high_min_idx / dhigh_size);
    double dmax = ((double)base_max_idx / dbase_size) - ((double)high_max_idx / dhigh_size);

    // clamp the (negated) minimum into [0.0, 1.0]
    dmin = -dmin;
    if(islessequal(dmin, 0.0)) dmin = 0.0;
    else if(isgreaterequal(dmin, 1.0)) dmin = 1.0;

    // the KS statistic is the larger of the two deviations
    double d;
    if(isgreaterequal(dmin, dmax)) d = dmin;
    else d = dmax;

    // effective sample size of the two-sample test
    double en = round(dbase_size * dhigh_size / (dbase_size + dhigh_size));

    // under these conditions, KSfbar() crashes
    if(unlikely(isnan(en) || isinf(en) || en == 0.0 || isnan(d) || isinf(d)))
        return NAN;

    return KSfbar((int)en, d);
}
  408. static double kstwo(
  409. NETDATA_DOUBLE baseline[], int baseline_points,
  410. NETDATA_DOUBLE highlight[], int highlight_points, uint32_t base_shifts) {
  411. // -1 in size, since the calculate_pairs_diffs() returns one less point
  412. DIFFS_NUMBERS baseline_diffs[baseline_points - 1];
  413. DIFFS_NUMBERS highlight_diffs[highlight_points - 1];
  414. int base_size = (int)calculate_pairs_diff(baseline_diffs, baseline, baseline_points);
  415. int high_size = (int)calculate_pairs_diff(highlight_diffs, highlight, highlight_points);
  416. if(unlikely(!base_size || !high_size))
  417. return NAN;
  418. if(unlikely(base_size != baseline_points - 1 || high_size != highlight_points - 1)) {
  419. error("Metric correlations: internal error - calculate_pairs_diff() returns the wrong number of entries");
  420. return NAN;
  421. }
  422. return ks_2samp(baseline_diffs, base_size, highlight_diffs, high_size, base_shifts);
  423. }
// Score every dimension of the chart with the KS2 method: query the
// highlighted and baseline windows, run kstwo() per dimension, and
// register 1.0 - probability as the weight (so 1.0 = most correlated).
// Both queries and the whole loop respect 'timeout' (milliseconds).
// Returns the number of dimensions examined.
static int rrdset_metric_correlations_ks2(RRDSET *st, DICTIONARY *results,
                                          long long baseline_after, long long baseline_before,
                                          long long after, long long before,
                                          long long points, RRDR_OPTIONS options,
                                          RRDR_GROUPING group, const char *group_options, int tier,
                                          uint32_t shifts, int timeout,
                                          WEIGHTS_STATS *stats, bool register_zero) {

    options |= RRDR_OPTION_NATURAL_POINTS;
    long group_time = 0;
    struct context_param *context_param_list = NULL;

    int examined_dimensions = 0;

    RRDR *high_rrdr = NULL;
    RRDR *base_rrdr = NULL;

    // get first the highlight to find the number of points available
    stats->db_queries++;
    usec_t started_usec = now_realtime_usec();
    ONEWAYALLOC *owa = onewayalloc_create(0);
    high_rrdr = rrd2rrdr(owa, st, points,
                         after, before, group,
                         group_time, options, NULL, context_param_list, group_options,
                         timeout, tier);
    if(!high_rrdr) {
        info("Metric correlations: rrd2rrdr() failed for the highlighted window on chart '%s'.", st->name);
        goto cleanup;
    }

    // accumulate query statistics
    for(int i = 0; i < storage_tiers ;i++)
        stats->db_points_per_tier[i] += high_rrdr->internal.tier_points_read[i];
    stats->db_points     += high_rrdr->internal.db_points_read;
    stats->result_points += high_rrdr->internal.result_points_generated;
    if(!high_rrdr->d) {
        info("Metric correlations: rrd2rrdr() did not return any dimensions on chart '%s'.", st->name);
        goto cleanup;
    }
    if(high_rrdr->result_options & RRDR_RESULT_OPTION_CANCEL) {
        info("Metric correlations: rrd2rrdr() on highlighted window timed out '%s'.", st->name);
        goto cleanup;
    }
    int high_points = rrdr_rows(high_rrdr);

    // check the remaining time budget before the second query
    usec_t now_usec = now_realtime_usec();
    if(now_usec - started_usec > timeout * USEC_PER_MS)
        goto cleanup;

    // get the baseline, requesting the same number of points as the highlight
    stats->db_queries++;
    base_rrdr = rrd2rrdr(owa, st,high_points << shifts,
                         baseline_after, baseline_before, group,
                         group_time, options, NULL, context_param_list, group_options,
                         (int)(timeout - ((now_usec - started_usec) / USEC_PER_MS)), tier);
    if(!base_rrdr) {
        info("Metric correlations: rrd2rrdr() failed for the baseline window on chart '%s'.", st->name);
        goto cleanup;
    }

    // accumulate query statistics
    for(int i = 0; i < storage_tiers ;i++)
        stats->db_points_per_tier[i] += base_rrdr->internal.tier_points_read[i];
    stats->db_points     += base_rrdr->internal.db_points_read;
    stats->result_points += base_rrdr->internal.result_points_generated;
    if(!base_rrdr->d) {
        info("Metric correlations: rrd2rrdr() did not return any dimensions on chart '%s'.", st->name);
        goto cleanup;
    }
    if (base_rrdr->d != high_rrdr->d) {
        info("Cannot generate metric correlations for chart '%s' when the baseline and the highlight have different number of dimensions.", st->name);
        goto cleanup;
    }
    if(base_rrdr->result_options & RRDR_RESULT_OPTION_CANCEL) {
        info("Metric correlations: rrd2rrdr() on baseline window timed out '%s'.", st->name);
        goto cleanup;
    }
    int base_points = rrdr_rows(base_rrdr);

    now_usec = now_realtime_usec();
    if(now_usec - started_usec > timeout * USEC_PER_MS)
        goto cleanup;

    // we need at least 2 points to do the job
    if(base_points < 2 || high_points < 2)
        goto cleanup;

    // for each dimension
    RRDDIM *d;
    int i;
    for(i = 0, d = base_rrdr->st->dimensions ; d && i < base_rrdr->d; i++, d = d->next) {

        // skip the not evaluated ones
        if(unlikely(base_rrdr->od[i] & RRDR_DIMENSION_HIDDEN) || (high_rrdr->od[i] & RRDR_DIMENSION_HIDDEN))
            continue;

        examined_dimensions++;

        // skip the dimensions that are just zero for both the baseline and the highlight
        if(unlikely(!(base_rrdr->od[i] & RRDR_DIMENSION_NONZERO) && !(high_rrdr->od[i] & RRDR_DIMENSION_NONZERO)))
            continue;

        // copy the baseline points of the dimension to a contiguous array
        // there is no need to check for empty values, since empty are already zero
        NETDATA_DOUBLE baseline[base_points];
        for(int c = 0; c < base_points; c++)
            baseline[c] = base_rrdr->v[ c * base_rrdr->d + i ];

        // copy the highlight points of the dimension to a contiguous array
        // there is no need to check for empty values, since empty values are already zero
        // https://github.com/netdata/netdata/blob/6e3144683a73a2024d51425b20ecfd569034c858/web/api/queries/average/average.c#L41-L43
        NETDATA_DOUBLE highlight[high_points];
        for(int c = 0; c < high_points; c++)
            highlight[c] = high_rrdr->v[ c * high_rrdr->d + i ];

        stats->binary_searches += 2 * (base_points - 1) + 2 * (high_points - 1);

        double prob = kstwo(baseline, base_points, highlight, high_points, shifts);
        if(!isnan(prob) && !isinf(prob)) {

            // these conditions should never happen, but still let's check
            if(unlikely(prob < 0.0)) {
                error("Metric correlations: kstwo() returned a negative number: %f", prob);
                prob = -prob;
            }
            if(unlikely(prob > 1.0)) {
                error("Metric correlations: kstwo() returned a number above 1.0: %f", prob);
                prob = 1.0;
            }

            // to spread the results evenly, 0.0 needs to be the less correlated and 1.0 the most correlated
            // so we flip the result of kstwo()
            register_result(results, base_rrdr->st, d, 1.0 - prob, RESULT_IS_BASE_HIGH_RATIO, stats, register_zero);
        }
    }

cleanup:
    rrdr_free(owa, high_rrdr);
    rrdr_free(owa, base_rrdr);
    onewayalloc_destroy(owa);
    return examined_dimensions;
}
// ----------------------------------------------------------------------------
// VOLUME algorithm functions

// Score every dimension of the chart with the volume method: compare
// the average of the baseline window against the average of the
// highlighted window, weighted by the fraction of time the highlighted
// window was above/below the baseline average (via a COUNTIF query).
// Returns the number of dimensions examined.
static int rrdset_metric_correlations_volume(RRDSET *st, DICTIONARY *results,
                                             long long baseline_after, long long baseline_before,
                                             long long after, long long before,
                                             RRDR_OPTIONS options, RRDR_GROUPING group, const char *group_options,
                                             int tier, int timeout,
                                             WEIGHTS_STATS *stats, bool register_zero) {

    options |= RRDR_OPTION_MATCH_IDS | RRDR_OPTION_ABSOLUTE | RRDR_OPTION_NATURAL_POINTS;
    long group_time = 0;

    int examined_dimensions = 0;
    int ret, value_is_null;
    usec_t started_usec = now_realtime_usec();

    RRDDIM *d;
    for(d = st->dimensions; d ; d = d->next) {
        usec_t now_usec = now_realtime_usec();
        if(now_usec - started_usec > timeout * USEC_PER_MS)
            return examined_dimensions;

        // we count how many metrics we evaluated
        examined_dimensions++;

        // there is no point to pass a timeout to these queries
        // since the query engine checks for a timeout between
        // dimensions, and we query a single dimension at a time.

        // query 1: average of the baseline window
        stats->db_queries++;
        NETDATA_DOUBLE baseline_average = NAN;
        NETDATA_DOUBLE base_anomaly_rate = 0;
        value_is_null = 1;
        ret = rrdset2value_api_v1(st, NULL, &baseline_average, d->id, 1,
                                  baseline_after, baseline_before,
                                  group, group_options, group_time, options,
                                  NULL, NULL,
                                  &stats->db_points, stats->db_points_per_tier,
                                  &stats->result_points,
                                  &value_is_null, &base_anomaly_rate, 0, tier);

        if(ret != HTTP_RESP_OK || value_is_null || !netdata_double_isnumber(baseline_average)) {
            // this means no data for the baseline window, but we may have data for the highlighted one - assume zero
            baseline_average = 0.0;
        }

        // query 2: average of the highlighted window
        stats->db_queries++;
        NETDATA_DOUBLE highlight_average = NAN;
        NETDATA_DOUBLE high_anomaly_rate = 0;
        value_is_null = 1;
        ret = rrdset2value_api_v1(st, NULL, &highlight_average, d->id, 1,
                                  after, before,
                                  group, group_options, group_time, options,
                                  NULL, NULL,
                                  &stats->db_points, stats->db_points_per_tier,
                                  &stats->result_points,
                                  &value_is_null, &high_anomaly_rate, 0, tier);

        if(ret != HTTP_RESP_OK || value_is_null || !netdata_double_isnumber(highlight_average)) {
            // this means no data for the highlighted duration - so skip it
            continue;
        }

        if(baseline_average == highlight_average) {
            // they are the same - let's move on
            continue;
        }

        // query 3: percentage of highlighted-window time spent
        // above (or below) the baseline average
        stats->db_queries++;
        NETDATA_DOUBLE highlight_countif = NAN;
        value_is_null = 1;

        char highlighted_countif_options[50 + 1];
        snprintfz(highlighted_countif_options, 50, "%s" NETDATA_DOUBLE_FORMAT, highlight_average < baseline_average ? "<":">", baseline_average);

        ret = rrdset2value_api_v1(st, NULL, &highlight_countif, d->id, 1,
                                  after, before,
                                  RRDR_GROUPING_COUNTIF,highlighted_countif_options,
                                  group_time, options,
                                  NULL, NULL,
                                  &stats->db_points, stats->db_points_per_tier,
                                  &stats->result_points,
                                  &value_is_null, NULL, 0, tier);

        if(ret != HTTP_RESP_OK || value_is_null || !netdata_double_isnumber(highlight_countif)) {
            info("MC: highlighted countif query failed, but highlighted average worked - strange...");
            continue;
        }

        // this represents the percentage of time
        // the highlighted window was above/below the baseline window
        // (above or below depending on their averages)
        highlight_countif = highlight_countif / 100.0; // countif returns 0 - 100.0

        RESULT_FLAGS flags;
        NETDATA_DOUBLE pcent = NAN;
        if(isgreater(baseline_average, 0.0) || isless(baseline_average, 0.0)) {
            // non-zero baseline: relative change, weighted by time
            flags = RESULT_IS_BASE_HIGH_RATIO;
            pcent = (highlight_average - baseline_average) / baseline_average * highlight_countif;
        }
        else {
            // zero baseline: a ratio is undefined, report time percentage
            flags = RESULT_IS_PERCENTAGE_OF_TIME;
            pcent = highlight_countif;
        }

        register_result(results, st, d, pcent, flags, stats, register_zero);
    }

    return examined_dimensions;
}
  635. // ----------------------------------------------------------------------------
  636. // ANOMALY RATE algorithm functions
  637. static int rrdset_weights_anomaly_rate(RRDSET *st, DICTIONARY *results,
  638. long long after, long long before,
  639. RRDR_OPTIONS options, RRDR_GROUPING group, const char *group_options,
  640. int tier, int timeout,
  641. WEIGHTS_STATS *stats, bool register_zero) {
  642. options |= RRDR_OPTION_MATCH_IDS | RRDR_OPTION_ANOMALY_BIT | RRDR_OPTION_NATURAL_POINTS;
  643. long group_time = 0;
  644. int examined_dimensions = 0;
  645. int ret, value_is_null;
  646. usec_t started_usec = now_realtime_usec();
  647. RRDDIM *d;
  648. for(d = st->dimensions; d ; d = d->next) {
  649. usec_t now_usec = now_realtime_usec();
  650. if(now_usec - started_usec > timeout * USEC_PER_MS)
  651. return examined_dimensions;
  652. // we count how many metrics we evaluated
  653. examined_dimensions++;
  654. // there is no point to pass a timeout to these queries
  655. // since the query engine checks for a timeout between
  656. // dimensions, and we query a single dimension at a time.
  657. stats->db_queries++;
  658. NETDATA_DOUBLE average = NAN;
  659. NETDATA_DOUBLE anomaly_rate = 0;
  660. value_is_null = 1;
  661. ret = rrdset2value_api_v1(st, NULL, &average, d->id, 1,
  662. after, before,
  663. group, group_options, group_time, options,
  664. NULL, NULL,
  665. &stats->db_points, stats->db_points_per_tier,
  666. &stats->result_points,
  667. &value_is_null, &anomaly_rate, 0, tier);
  668. if(ret == HTTP_RESP_OK || !value_is_null || netdata_double_isnumber(average))
  669. register_result(results, st, d, average, 0, stats, register_zero);
  670. }
  671. return examined_dimensions;
  672. }
  673. // ----------------------------------------------------------------------------
  674. int compare_netdata_doubles(const void *left, const void *right) {
  675. NETDATA_DOUBLE lt = *(NETDATA_DOUBLE *)left;
  676. NETDATA_DOUBLE rt = *(NETDATA_DOUBLE *)right;
  677. // https://stackoverflow.com/a/3886497/1114110
  678. return (lt > rt) - (lt < rt);
  679. }
  680. static inline int binary_search_bigger_than_netdata_double(const NETDATA_DOUBLE arr[], int left, int size, NETDATA_DOUBLE K) {
  681. // binary search to find the index the smallest index
  682. // of the first value in the array that is greater than K
  683. int right = size;
  684. while(left < right) {
  685. int middle = (int)(((unsigned int)(left + right)) >> 1);
  686. if(arr[middle] > K)
  687. right = middle;
  688. else
  689. left = middle + 1;
  690. }
  691. return left;
  692. }
  693. // ----------------------------------------------------------------------------
  694. // spread the results evenly according to their value
// Normalize all registered results into rank-based weights.
// Collects every result value, sorts and de-duplicates them, then maps each
// result to 1.0 - (rank / unique_values): smaller values end up close to 1.0,
// the largest value ends up at 0.0. This spreads the weights evenly across
// the observed value distribution instead of letting outliers compress it.
// Returns the number of dimensions processed (0 when 'results' is empty).
static size_t spread_results_evenly(DICTIONARY *results, WEIGHTS_STATS *stats) {
    struct register_result *t;

    // count the dimensions
    size_t dimensions = dictionary_stats_entries(results);
    if(!dimensions) return 0;

    // avoid multiplying by zero below when no base-high ratio was recorded
    if(stats->max_base_high_ratio == 0.0)
        stats->max_base_high_ratio = 1.0;

    // create an array of the right size and copy all the values in it
    // NOTE(review): VLA sized by the number of results - assumes the result
    // set is small enough for the stack
    NETDATA_DOUBLE slots[dimensions];
    dimensions = 0;
    dfe_start_read(results, t) {
        // scale percentage-of-time results by the maximum base-high ratio
        // seen, so both kinds of results share a comparable value range
        if(t->flags & (RESULT_IS_PERCENTAGE_OF_TIME))
            t->value = t->value * stats->max_base_high_ratio;

        slots[dimensions++] = t->value;
    }
    dfe_done(t);

    // sort the array with the values of all dimensions
    qsort(slots, dimensions, sizeof(NETDATA_DOUBLE), compare_netdata_doubles);

    // skip the duplicates in the sorted array
    // (compacts the unique values into the front of 'slots')
    NETDATA_DOUBLE last_value = NAN;
    size_t unique_values = 0;
    for(size_t i = 0; i < dimensions ;i++) {
        if(likely(slots[i] != last_value))
            slots[unique_values++] = last_value = slots[i];
    }

    // this cannot happen, but coverity thinks otherwise...
    if(!unique_values)
        unique_values = dimensions;

    // calculate the weight of each slot, using the number of unique values
    NETDATA_DOUBLE slot_weight = 1.0 / (NETDATA_DOUBLE)unique_values;

    dfe_start_read(results, t) {
        // 1-based rank of this value among the unique values (index of the
        // first strictly greater unique value)
        int slot = binary_search_bigger_than_netdata_double(slots, 0, (int)unique_values, t->value);
        NETDATA_DOUBLE v = slot * slot_weight;
        if(unlikely(v > 1.0)) v = 1.0;
        v = 1.0 - v;
        t->value = v;
    }
    dfe_done(t);

    return dimensions;
}
  735. // ----------------------------------------------------------------------------
  736. // The main function
  737. int web_api_v1_weights(RRDHOST *host, BUFFER *wb, WEIGHTS_METHOD method, WEIGHTS_FORMAT format,
  738. RRDR_GROUPING group, const char *group_options,
  739. long long baseline_after, long long baseline_before,
  740. long long after, long long before,
  741. long long points, RRDR_OPTIONS options, SIMPLE_PATTERN *contexts, int tier, int timeout) {
  742. WEIGHTS_STATS stats = {};
  743. DICTIONARY *results = register_result_init();
  744. DICTIONARY *charts = dictionary_create(DICTIONARY_FLAG_SINGLE_THREADED|DICTIONARY_FLAG_VALUE_LINK_DONT_CLONE);;
  745. char *error = NULL;
  746. int resp = HTTP_RESP_OK;
  747. // if the user didn't give a timeout
  748. // assume 60 seconds
  749. if(!timeout)
  750. timeout = 60 * MSEC_PER_SEC;
  751. // if the timeout is less than 1 second
  752. // make it at least 1 second
  753. if(timeout < (long)(1 * MSEC_PER_SEC))
  754. timeout = 1 * MSEC_PER_SEC;
  755. usec_t timeout_usec = timeout * USEC_PER_MS;
  756. usec_t started_usec = now_realtime_usec();
  757. if(!rrdr_relative_window_to_absolute(&after, &before))
  758. buffer_no_cacheable(wb);
  759. if (before <= after) {
  760. resp = HTTP_RESP_BAD_REQUEST;
  761. error = "Invalid selected time-range.";
  762. goto cleanup;
  763. }
  764. uint32_t shifts = 0;
  765. if(method == WEIGHTS_METHOD_MC_KS2 || method == WEIGHTS_METHOD_MC_VOLUME) {
  766. if(!points) points = 500;
  767. if(baseline_before <= API_RELATIVE_TIME_MAX)
  768. baseline_before += after;
  769. rrdr_relative_window_to_absolute(&baseline_after, &baseline_before);
  770. if (baseline_before <= baseline_after) {
  771. resp = HTTP_RESP_BAD_REQUEST;
  772. error = "Invalid baseline time-range.";
  773. goto cleanup;
  774. }
  775. // baseline should be a power of two multiple of highlight
  776. long long base_delta = baseline_before - baseline_after;
  777. long long high_delta = before - after;
  778. uint32_t multiplier = (uint32_t)round((double)base_delta / (double)high_delta);
  779. // check if the multiplier is a power of two
  780. // https://stackoverflow.com/a/600306/1114110
  781. if((multiplier & (multiplier - 1)) != 0) {
  782. // it is not power of two
  783. // let's find the closest power of two
  784. // https://stackoverflow.com/a/466242/1114110
  785. multiplier--;
  786. multiplier |= multiplier >> 1;
  787. multiplier |= multiplier >> 2;
  788. multiplier |= multiplier >> 4;
  789. multiplier |= multiplier >> 8;
  790. multiplier |= multiplier >> 16;
  791. multiplier++;
  792. }
  793. // convert the multiplier to the number of shifts
  794. // we need to do, to divide baseline numbers to match
  795. // the highlight ones
  796. while(multiplier > 1) {
  797. shifts++;
  798. multiplier = multiplier >> 1;
  799. }
  800. // if the baseline size will not comply to MAX_POINTS
  801. // lower the window of the baseline
  802. while(shifts && (points << shifts) > MAX_POINTS)
  803. shifts--;
  804. // if the baseline size still does not comply to MAX_POINTS
  805. // lower the resolution of the highlight and the baseline
  806. while((points << shifts) > MAX_POINTS)
  807. points = points >> 1;
  808. if(points < 15) {
  809. resp = HTTP_RESP_BAD_REQUEST;
  810. error = "Too few points available, at least 15 are needed.";
  811. goto cleanup;
  812. }
  813. // adjust the baseline to be multiplier times bigger than the highlight
  814. baseline_after = baseline_before - (high_delta << shifts);
  815. }
  816. // dont lock here and wait for results
  817. // get the charts and run mc after
  818. RRDSET *st;
  819. rrdhost_rdlock(host);
  820. rrdset_foreach_read(st, host) {
  821. if (rrdset_is_available_for_viewers(st)) {
  822. if(!contexts || simple_pattern_matches(contexts, st->context))
  823. dictionary_set(charts, st->name, NULL, 0);
  824. }
  825. }
  826. rrdhost_unlock(host);
  827. size_t examined_dimensions = 0;
  828. void *ptr;
  829. bool register_zero = true;
  830. if(options & RRDR_OPTION_NONZERO) {
  831. register_zero = false;
  832. options &= ~RRDR_OPTION_NONZERO;
  833. }
  834. // for every chart in the dictionary
  835. dfe_start_read(charts, ptr) {
  836. usec_t now_usec = now_realtime_usec();
  837. if(now_usec - started_usec > timeout_usec) {
  838. error = "timed out";
  839. resp = HTTP_RESP_GATEWAY_TIMEOUT;
  840. goto cleanup;
  841. }
  842. st = rrdset_find_byname(host, ptr_name);
  843. if(!st) continue;
  844. rrdset_rdlock(st);
  845. switch(method) {
  846. case WEIGHTS_METHOD_ANOMALY_RATE:
  847. options |= RRDR_OPTION_ANOMALY_BIT;
  848. points = 1;
  849. examined_dimensions += rrdset_weights_anomaly_rate(st, results,
  850. after, before,
  851. options, group, group_options, tier,
  852. (int)(timeout - ((now_usec - started_usec) / USEC_PER_MS)),
  853. &stats, register_zero);
  854. break;
  855. case WEIGHTS_METHOD_MC_VOLUME:
  856. points = 1;
  857. examined_dimensions += rrdset_metric_correlations_volume(st, results,
  858. baseline_after, baseline_before,
  859. after, before,
  860. options, group, group_options, tier,
  861. (int)(timeout - ((now_usec - started_usec) / USEC_PER_MS)),
  862. &stats, register_zero);
  863. break;
  864. default:
  865. case WEIGHTS_METHOD_MC_KS2:
  866. examined_dimensions += rrdset_metric_correlations_ks2(st, results,
  867. baseline_after, baseline_before,
  868. after, before,
  869. points, options, group, group_options, tier, shifts,
  870. (int)(timeout - ((now_usec - started_usec) / USEC_PER_MS)),
  871. &stats, register_zero);
  872. break;
  873. }
  874. rrdset_unlock(st);
  875. }
  876. dfe_done(ptr);
  877. if(!register_zero)
  878. options |= RRDR_OPTION_NONZERO;
  879. if(!(options & RRDR_OPTION_RETURN_RAW))
  880. spread_results_evenly(results, &stats);
  881. usec_t ended_usec = now_realtime_usec();
  882. // generate the json output we need
  883. buffer_flush(wb);
  884. size_t added_dimensions = 0;
  885. switch(format) {
  886. case WEIGHTS_FORMAT_CHARTS:
  887. added_dimensions = registered_results_to_json_charts(results, wb,
  888. after, before,
  889. baseline_after, baseline_before,
  890. points, method, group, options, shifts,
  891. examined_dimensions,
  892. ended_usec - started_usec, &stats);
  893. break;
  894. default:
  895. case WEIGHTS_FORMAT_CONTEXTS:
  896. added_dimensions = registered_results_to_json_contexts(results, wb,
  897. after, before,
  898. baseline_after, baseline_before,
  899. points, method, group, options, shifts,
  900. examined_dimensions,
  901. ended_usec - started_usec, &stats);
  902. break;
  903. }
  904. if(!added_dimensions) {
  905. error = "no results produced.";
  906. resp = HTTP_RESP_NOT_FOUND;
  907. }
  908. cleanup:
  909. if(charts) dictionary_destroy(charts);
  910. if(results) register_result_destroy(results);
  911. if(error) {
  912. buffer_flush(wb);
  913. buffer_sprintf(wb, "{\"error\": \"%s\" }", error);
  914. }
  915. return resp;
  916. }
  917. // ----------------------------------------------------------------------------
  918. // unittest
  919. /*
  920. Unit tests against the output of this:
  921. https://github.com/scipy/scipy/blob/4cf21e753cf937d1c6c2d2a0e372fbc1dbbeea81/scipy/stats/_stats_py.py#L7275-L7449
  922. import matplotlib.pyplot as plt
  923. import pandas as pd
  924. import numpy as np
  925. import scipy as sp
  926. from scipy import stats
  927. data1 = np.array([ 1111, -2222, 33, 100, 100, 15555, -1, 19999, 888, 755, -1, -730 ])
  928. data2 = np.array([365, -123, 0])
  929. data1 = np.sort(data1)
  930. data2 = np.sort(data2)
  931. n1 = data1.shape[0]
  932. n2 = data2.shape[0]
  933. data_all = np.concatenate([data1, data2])
  934. cdf1 = np.searchsorted(data1, data_all, side='right') / n1
  935. cdf2 = np.searchsorted(data2, data_all, side='right') / n2
  936. print(data_all)
  937. print("\ndata1", data1, cdf1)
  938. print("\ndata2", data2, cdf2)
  939. cddiffs = cdf1 - cdf2
  940. print("\ncddiffs", cddiffs)
  941. minS = np.clip(-np.min(cddiffs), 0, 1)
  942. maxS = np.max(cddiffs)
  943. print("\nmin", minS)
  944. print("max", maxS)
  945. m, n = sorted([float(n1), float(n2)], reverse=True)
  946. en = m * n / (m + n)
  947. d = max(minS, maxS)
  948. prob = stats.distributions.kstwo.sf(d, np.round(en))
  949. print("\nprob", prob)
  950. */
  951. static int double_expect(double v, const char *str, const char *descr) {
  952. char buf[100 + 1];
  953. snprintfz(buf, 100, "%0.6f", v);
  954. int ret = strcmp(buf, str) ? 1 : 0;
  955. fprintf(stderr, "%s %s, expected %s, got %s\n", ret?"FAILED":"OK", descr, str, buf);
  956. return ret;
  957. }
  958. static int mc_unittest1(void) {
  959. int bs = 3, hs = 3;
  960. DIFFS_NUMBERS base[3] = { 1, 2, 3 };
  961. DIFFS_NUMBERS high[3] = { 3, 4, 6 };
  962. double prob = ks_2samp(base, bs, high, hs, 0);
  963. return double_expect(prob, "0.222222", "3x3");
  964. }
  965. static int mc_unittest2(void) {
  966. int bs = 6, hs = 3;
  967. DIFFS_NUMBERS base[6] = { 1, 2, 3, 10, 10, 15 };
  968. DIFFS_NUMBERS high[3] = { 3, 4, 6 };
  969. double prob = ks_2samp(base, bs, high, hs, 1);
  970. return double_expect(prob, "0.500000", "6x3");
  971. }
  972. static int mc_unittest3(void) {
  973. int bs = 12, hs = 3;
  974. DIFFS_NUMBERS base[12] = { 1, 2, 3, 10, 10, 15, 111, 19999, 8, 55, -1, -73 };
  975. DIFFS_NUMBERS high[3] = { 3, 4, 6 };
  976. double prob = ks_2samp(base, bs, high, hs, 2);
  977. return double_expect(prob, "0.347222", "12x3");
  978. }
  979. static int mc_unittest4(void) {
  980. int bs = 12, hs = 3;
  981. DIFFS_NUMBERS base[12] = { 1111, -2222, 33, 100, 100, 15555, -1, 19999, 888, 755, -1, -730 };
  982. DIFFS_NUMBERS high[3] = { 365, -123, 0 };
  983. double prob = ks_2samp(base, bs, high, hs, 2);
  984. return double_expect(prob, "0.777778", "12x3");
  985. }
  986. int mc_unittest(void) {
  987. int errors = 0;
  988. errors += mc_unittest1();
  989. errors += mc_unittest2();
  990. errors += mc_unittest3();
  991. errors += mc_unittest4();
  992. return errors;
  993. }