ml-private.h 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #ifndef NETDATA_ML_PRIVATE_H
  3. #define NETDATA_ML_PRIVATE_H
  4. #include "dlib/matrix.h"
  5. #include "ml/ml.h"
  6. #include <vector>
  7. #include <queue>
  8. #include <unordered_map>
  9. typedef double calculated_number_t;
  10. typedef dlib::matrix<calculated_number_t, 6, 1> DSample;
  /*
   * Features
   */
  14. typedef struct {
  15. size_t diff_n;
  16. size_t smooth_n;
  17. size_t lag_n;
  18. calculated_number_t *dst;
  19. size_t dst_n;
  20. calculated_number_t *src;
  21. size_t src_n;
  22. std::vector<DSample> &preprocessed_features;
  23. } ml_features_t;
  /*
   * KMeans
   */
  27. typedef struct {
  28. std::vector<DSample> cluster_centers;
  29. calculated_number_t min_dist;
  30. calculated_number_t max_dist;
  31. uint32_t after;
  32. uint32_t before;
  33. } ml_kmeans_t;
  // Plain counters describing machine-learning state.
  //
  // The field groups mirror the enums declared in this header
  // (ml_machine_learning_status, ml_metric_type, ml_training_status), plus a
  // pair of anomalous/normal dimension counts.
  typedef struct machine_learning_stats_t {
      // Counts per machine-learning status.
      // NOTE(review): "_sp" presumably stands for "simple pattern" -- confirm.
      size_t num_machine_learning_status_enabled;
      size_t num_machine_learning_status_disabled_sp;

      // Counts per metric type.
      size_t num_metric_type_constant;
      size_t num_metric_type_variable;

      // Counts per training status.
      size_t num_training_status_untrained;
      size_t num_training_status_pending_without_model;
      size_t num_training_status_trained;
      size_t num_training_status_pending_with_model;
      size_t num_training_status_silenced;

      // Counts of anomalous and normal dimensions.
      size_t num_anomalous_dimensions;
      size_t num_normal_dimensions;
  } ml_machine_learning_stats_t;
  47. typedef struct training_stats_t {
  48. size_t queue_size;
  49. size_t num_popped_items;
  50. usec_t allotted_ut;
  51. usec_t consumed_ut;
  52. usec_t remaining_ut;
  53. size_t training_result_ok;
  54. size_t training_result_invalid_query_time_range;
  55. size_t training_result_not_enough_collected_values;
  56. size_t training_result_null_acquired_dimension;
  57. size_t training_result_chart_under_replication;
  58. } ml_training_stats_t;
  // Classification of a dimension by how its values behave.
  enum ml_metric_type {
      // The dimension has constant values, no need to train
      METRIC_TYPE_CONSTANT,

      // The dimension's values fluctuate, we need to generate a model
      METRIC_TYPE_VARIABLE,
  };
  // Whether machine learning (training/prediction) is enabled.
  enum ml_machine_learning_status {
      // Enable training/prediction
      MACHINE_LEARNING_STATUS_ENABLED,

      // Disable because configuration pattern matches the chart's id
      MACHINE_LEARNING_STATUS_DISABLED_DUE_TO_EXCLUDED_CHART,
  };
  // Training state of a dimension's model(s).
  enum ml_training_status {
      // We don't have a model for this dimension
      TRAINING_STATUS_UNTRAINED,

      // Request for training sent, but we don't have any models yet
      TRAINING_STATUS_PENDING_WITHOUT_MODEL,

      // Request to update existing models sent
      TRAINING_STATUS_PENDING_WITH_MODEL,

      // Have a valid, up-to-date model
      TRAINING_STATUS_TRAINED,

      // Have a valid, up-to-date model that is silenced because it's too noisy
      TRAINING_STATUS_SILENCED,
  };
  // Outcome of a single training attempt.
  enum ml_training_result {
      // We managed to create a KMeans model
      TRAINING_RESULT_OK,

      // Could not query DB with a correct time range
      TRAINING_RESULT_INVALID_QUERY_TIME_RANGE,

      // Did not gather enough data from DB to run KMeans
      TRAINING_RESULT_NOT_ENOUGH_COLLECTED_VALUES,

      // Acquired a null dimension
      TRAINING_RESULT_NULL_ACQUIRED_DIMENSION,

      // Chart is under replication
      TRAINING_RESULT_CHART_UNDER_REPLICATION,
  };
  95. typedef struct {
  96. // Chart/dimension we want to train
  97. char machine_guid[GUID_LEN + 1];
  98. STRING *chart_id;
  99. STRING *dimension_id;
  100. // Creation time of request
  101. time_t request_time;
  102. // First/last entry of this dimension in DB
  103. // at the point the request was made
  104. time_t first_entry_on_request;
  105. time_t last_entry_on_request;
  106. } ml_training_request_t;
  107. typedef struct {
  108. // Time when the request for this response was made
  109. time_t request_time;
  110. // First/last entry of the dimension in DB when generating the request
  111. time_t first_entry_on_request;
  112. time_t last_entry_on_request;
  113. // First/last entry of the dimension in DB when generating the response
  114. time_t first_entry_on_response;
  115. time_t last_entry_on_response;
  116. // After/Before timestamps of our DB query
  117. time_t query_after_t;
  118. time_t query_before_t;
  119. // Actual after/before returned by the DB query ops
  120. time_t db_after_t;
  121. time_t db_before_t;
  122. // Number of doubles returned by the DB query
  123. size_t collected_values;
  124. // Number of values we return to the caller
  125. size_t total_values;
  126. // Result of training response
  127. enum ml_training_result result;
  128. } ml_training_response_t;
  /*
   * Queue
   */
  132. typedef struct {
  133. std::queue<ml_training_request_t> internal;
  134. netdata_mutex_t mutex;
  135. pthread_cond_t cond_var;
  136. std::atomic<bool> exit;
  137. } ml_queue_t;
  138. typedef struct {
  139. RRDDIM *rd;
  140. enum ml_metric_type mt;
  141. enum ml_training_status ts;
  142. enum ml_machine_learning_status mls;
  143. ml_training_response_t tr;
  144. time_t last_training_time;
  145. std::vector<calculated_number_t> cns;
  146. std::vector<ml_kmeans_t> km_contexts;
  147. SPINLOCK slock;
  148. ml_kmeans_t kmeans;
  149. std::vector<DSample> feature;
  150. uint32_t suppression_window_counter;
  151. uint32_t suppression_anomaly_counter;
  152. } ml_dimension_t;
  153. typedef struct {
  154. RRDSET *rs;
  155. ml_machine_learning_stats_t mls;
  156. } ml_chart_t;
  157. void ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomalous);
  158. typedef struct {
  159. RRDDIM *rd;
  160. size_t normal_dimensions;
  161. size_t anomalous_dimensions;
  162. } ml_type_anomaly_rate_t;
  163. typedef struct {
  164. RRDHOST *rh;
  165. std::atomic<bool> ml_running;
  166. ml_machine_learning_stats_t mls;
  167. calculated_number_t host_anomaly_rate;
  168. netdata_mutex_t mutex;
  169. ml_queue_t *training_queue;
  170. /*
  171. * bookkeeping for anomaly detection charts
  172. */
  173. RRDSET *ml_running_rs;
  174. RRDDIM *ml_running_rd;
  175. RRDSET *machine_learning_status_rs;
  176. RRDDIM *machine_learning_status_enabled_rd;
  177. RRDDIM *machine_learning_status_disabled_sp_rd;
  178. RRDSET *metric_type_rs;
  179. RRDDIM *metric_type_constant_rd;
  180. RRDDIM *metric_type_variable_rd;
  181. RRDSET *training_status_rs;
  182. RRDDIM *training_status_untrained_rd;
  183. RRDDIM *training_status_pending_without_model_rd;
  184. RRDDIM *training_status_trained_rd;
  185. RRDDIM *training_status_pending_with_model_rd;
  186. RRDDIM *training_status_silenced_rd;
  187. RRDSET *dimensions_rs;
  188. RRDDIM *dimensions_anomalous_rd;
  189. RRDDIM *dimensions_normal_rd;
  190. RRDSET *anomaly_rate_rs;
  191. RRDDIM *anomaly_rate_rd;
  192. RRDSET *detector_events_rs;
  193. RRDDIM *detector_events_above_threshold_rd;
  194. RRDDIM *detector_events_new_anomaly_event_rd;
  195. RRDSET *type_anomaly_rate_rs;
  196. std::unordered_map<STRING *, ml_type_anomaly_rate_t> type_anomaly_rate;
  197. } ml_host_t;
  198. typedef struct {
  199. uuid_t metric_uuid;
  200. ml_kmeans_t kmeans;
  201. } ml_model_info_t;
  202. typedef struct {
  203. size_t id;
  204. netdata_thread_t nd_thread;
  205. netdata_mutex_t nd_mutex;
  206. ml_queue_t *training_queue;
  207. ml_training_stats_t training_stats;
  208. calculated_number_t *training_cns;
  209. calculated_number_t *scratch_training_cns;
  210. std::vector<DSample> training_samples;
  211. std::vector<ml_model_info_t> pending_model_info;
  212. RRDSET *queue_stats_rs;
  213. RRDDIM *queue_stats_queue_size_rd;
  214. RRDDIM *queue_stats_popped_items_rd;
  215. RRDSET *training_time_stats_rs;
  216. RRDDIM *training_time_stats_allotted_rd;
  217. RRDDIM *training_time_stats_consumed_rd;
  218. RRDDIM *training_time_stats_remaining_rd;
  219. RRDSET *training_results_rs;
  220. RRDDIM *training_results_ok_rd;
  221. RRDDIM *training_results_invalid_query_time_range_rd;
  222. RRDDIM *training_results_not_enough_collected_values_rd;
  223. RRDDIM *training_results_null_acquired_dimension_rd;
  224. RRDDIM *training_results_chart_under_replication_rd;
  225. size_t num_db_transactions;
  226. size_t num_models_to_prune;
  227. } ml_training_thread_t;
  228. typedef struct {
  229. bool enable_anomaly_detection;
  230. unsigned max_train_samples;
  231. unsigned min_train_samples;
  232. unsigned train_every;
  233. unsigned num_models_to_use;
  234. unsigned delete_models_older_than;
  235. unsigned db_engine_anomaly_rate_every;
  236. unsigned diff_n;
  237. unsigned smooth_n;
  238. unsigned lag_n;
  239. double random_sampling_ratio;
  240. unsigned max_kmeans_iters;
  241. double dimension_anomaly_score_threshold;
  242. double host_anomaly_rate_threshold;
  243. RRDR_TIME_GROUPING anomaly_detection_grouping_method;
  244. time_t anomaly_detection_query_duration;
  245. bool stream_anomaly_detection_charts;
  246. std::string hosts_to_skip;
  247. SIMPLE_PATTERN *sp_host_to_skip;
  248. std::string charts_to_skip;
  249. SIMPLE_PATTERN *sp_charts_to_skip;
  250. std::vector<uint32_t> random_nums;
  251. netdata_thread_t detection_thread;
  252. std::atomic<bool> detection_stop;
  253. size_t num_training_threads;
  254. size_t flush_models_batch_size;
  255. std::vector<ml_training_thread_t> training_threads;
  256. std::atomic<bool> training_stop;
  257. size_t suppression_window;
  258. size_t suppression_threshold;
  259. bool enable_statistics_charts;
  260. } ml_config_t;
  261. void ml_config_load(ml_config_t *cfg);
  262. extern ml_config_t Cfg;
  263. #endif /* NETDATA_ML_PRIVATE_H */