Browse Source

Refactor ML code and add support for multiple KMeans models. (#14065)

* Add profile.plugin

Creates the specified number of charts/dimensions, and supports
backfilling with pseudo-historical data.

* Bump

* Remove wrongly merged line.

* Use the number of models specified from the config section.

* Add option to consult all ML models.

* Remove profiling option consuming all models.

* Add underscore after chart name prefix.

* prediction -> dimensions chart

* reorder funcs

* Split charts across types with correct priority

* Ignore training request when chart is under replication.

* Track global number of models consulted.

* Cleanup config.

* initial readme updates

* fix readme

* readme

* Fix function definition when ML is disabled.

* Add dummy ml_chart_update_{begin,end}

* Remove profile_plugin

* Define chart priorities under collectors/all.h

* s/curr_t/current_time/

Co-authored-by: Andrew Maguire <andrewm4894@gmail.com>
vkalintiris 2 years ago
parent
commit
689dc6b7fb
10 changed files with 82 additions and 61 deletions
  1. 4 13
      Makefile.am
  2. 18 0
      collectors/all.h
  3. 1 22
      configure.ac
  4. 34 0
      daemon/global_statistics.c
  5. 1 0
      daemon/global_statistics.h
  6. 0 5
      daemon/main.c
  7. 12 6
      database/rrd.h
  8. 0 9
      database/rrdcontext.c
  9. 9 3
      database/rrddim.c
  10. 3 3
      database/rrdhost.c

+ 4 - 13
Makefile.am

@@ -236,10 +236,14 @@ ML_FILES += \
     ml/ADCharts.cc \
     ml/Config.h \
     ml/Config.cc \
+    ml/Chart.cc \
+    ml/Chart.h \
+    ml/Stats.h \
     ml/Dimension.cc \
     ml/Dimension.h \
     ml/Host.h \
     ml/Host.cc \
+    ml/Queue.h \
     ml/Query.h \
     ml/KMeans.h \
     ml/KMeans.cc \
@@ -262,13 +266,6 @@ ml/ml.$(OBJEXT) : CXXFLAGS += -Wno-psabi
 
 endif
 
-
-if ENABLE_ML_TESTS
-ML_TESTS_FILES = \
-    ml/SamplesBufferTests.cc \
-    $(NULL)
-endif
-
 IDLEJITTER_PLUGIN_FILES = \
     collectors/idlejitter.plugin/plugin_idlejitter.c \
     $(NULL)
@@ -920,7 +917,6 @@ NETDATA_FILES = \
     $(EXPORTING_ENGINE_FILES) \
     $(HEALTH_PLUGIN_FILES) \
     $(ML_FILES) \
-    $(ML_TESTS_FILES) \
     $(IDLEJITTER_PLUGIN_FILES) \
     $(PLUGINSD_PLUGIN_FILES) \
     $(REGISTRY_PLUGIN_FILES) \
@@ -1008,11 +1004,6 @@ if ENABLE_ACLK
     $(NULL)
 endif
 
-if ENABLE_ML_TESTS
-    netdata_LDADD += $(OPTIONAL_ML_TESTS_LIBS) \
-        $(NULL)
-endif
-
 netdata_LINK = $(CXXLD) $(CXXFLAGS) $(LDFLAGS) -o $@
 
 sbin_PROGRAMS += netdatacli

+ 18 - 0
collectors/all.h

@@ -363,5 +363,23 @@
 #define NETDATA_CHART_PRIO_NETDATA_TIMEX            132030
 #define NETDATA_CHART_PRIO_NETDATA_TC_TIME          1000100
 
+// NETDATA ML CHARTS
+
+// [ml] charts
+#define ML_CHART_PRIO_DIMENSIONS        39181
+#define ML_CHART_PRIO_ANOMALY_RATE      39182
+#define ML_CHART_PRIO_DETECTOR_EVENTS   39183
+
+// [netdata.ml] charts
+#define NETDATA_ML_CHART_PRIO_MACHINE_LEARNING_STATUS 890001
+#define NETDATA_ML_CHART_PRIO_METRIC_TYPES            890002
+#define NETDATA_ML_CHART_PRIO_TRAINING_STATUS         890003
+
+#define NETDATA_ML_CHART_PRIO_PREDICTION_USAGE        890004
+#define NETDATA_ML_CHART_PRIO_TRAINING_USAGE          890005
+
+#define NETDATA_ML_CHART_PRIO_QUEUE_STATS             890006
+#define NETDATA_ML_CHART_PRIO_TRAINING_TIME_STATS     890007
+#define NETDATA_ML_CHART_PRIO_TRAINING_RESULTS        890008
 
 #endif //NETDATA_ALL_H

+ 1 - 22
configure.ac

@@ -207,12 +207,6 @@ AC_ARG_ENABLE(
     ,
     [enable_ml="detect"]
 )
-AC_ARG_ENABLE(
-    [ml_tests],
-    [AS_HELP_STRING([--enable-ml-tests], [Enable anomaly detection tests @<:@no@:>@])],
-    [enable_ml_tests="yes"],
-    [enable_ml_tests="no"]
-)
 AC_ARG_ENABLE(
     [aclk_ssl_debug],
     [AS_HELP_STRING([--enable-aclk-ssl-debug], [Enables possibility for SSL key logging @<:@default no@:>@])],
@@ -1180,19 +1174,6 @@ if test "${build_ml}" = "yes"; then
     OPTIONAL_ML_LIBS=""
 fi
 
-# Decide if we should build ML tests.
-if test "${build_ml}" = "yes" -a "${enable_ml_tests}" = "yes" -a "${have_gtest}" = "yes"; then
-    build_ml_tests="yes"
-else
-    build_ml_tests="no"
-fi
-
-AM_CONDITIONAL([ENABLE_ML_TESTS], [test "${build_ml_tests}" = "yes"])
-if test "${build_ml_tests}" = "yes"; then
-    AC_DEFINE([ENABLE_ML_TESTS], [1], [anomaly detection tests])
-    OPTIONAL_ML_TESTS_CFLAGS="${OPTIONAL_GTEST_CFLAGS}"
-    OPTIONAL_ML_TESTS_LIBS="${OPTIONAL_GTEST_LIBS}"
-fi
 
 # -----------------------------------------------------------------------------
 # ebpf.plugin
@@ -1612,7 +1593,7 @@ CFLAGS="${originalCFLAGS} ${OPTIONAL_LTO_CFLAGS} ${OPTIONAL_PROTOBUF_CFLAGS} ${O
     ${OPTIONAL_LIBCAP_CFLAGS} ${OPTIONAL_IPMIMONITORING_CFLAGS} ${OPTIONAL_CUPS_CFLAGS} ${OPTIONAL_XENSTAT_FLAGS} \
     ${OPTIONAL_KINESIS_CFLAGS} ${OPTIONAL_PUBSUB_CFLAGS} ${OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS} \
     ${OPTIONAL_MONGOC_CFLAGS} ${LWS_CFLAGS} ${OPTIONAL_JSONC_STATIC_CFLAGS} ${OPTIONAL_BPF_CFLAGS} ${JUDY_CFLAGS} \
-    ${OPTIONAL_ACLK_CFLAGS} ${OPTIONAL_ML_CFLAGS} ${OPTIONAL_ML_TESTS_CFLAGS} ${OPTIONAL_OS_DEP_CFLAGS}"
+    ${OPTIONAL_ACLK_CFLAGS} ${OPTIONAL_ML_CFLAGS} ${OPTIONAL_OS_DEP_CFLAGS}"
 
 CXXFLAGS="${CFLAGS} ${CXX11FLAG}"
 
@@ -1666,8 +1647,6 @@ AC_SUBST([OPTIONAL_GTEST_CFLAGS])
 AC_SUBST([OPTIONAL_GTEST_LIBS])
 AC_SUBST([OPTIONAL_ML_CFLAGS])
 AC_SUBST([OPTIONAL_ML_LIBS])
-AC_SUBST([OPTIONAL_ML_TESTS_CFLAGS])
-AC_SUBST([OPTIONAL_ML_TESTS_LIBS])
 
 # -----------------------------------------------------------------------------
 # Check if cmocka is available - needed for unit testing

+ 34 - 0
daemon/global_statistics.c

@@ -52,6 +52,7 @@ static struct global_statistics {
     uint64_t ml_queries_made;
     uint64_t ml_db_points_read;
     uint64_t ml_result_points_generated;
+    uint64_t ml_models_consulted;
 
     uint64_t exporters_queries_made;
     uint64_t exporters_db_points_read;
@@ -88,6 +89,10 @@ void global_statistics_ml_query_completed(size_t points_read) {
     __atomic_fetch_add(&global_statistics.ml_db_points_read, points_read, __ATOMIC_RELAXED);
 }
 
+void global_statistics_ml_models_consulted(size_t models_consulted) {
+    __atomic_fetch_add(&global_statistics.ml_models_consulted, models_consulted, __ATOMIC_RELAXED);
+}
+
 void global_statistics_exporters_query_completed(size_t points_read) {
     __atomic_fetch_add(&global_statistics.exporters_queries_made, 1, __ATOMIC_RELAXED);
     __atomic_fetch_add(&global_statistics.exporters_db_points_read, points_read, __ATOMIC_RELAXED);
@@ -193,6 +198,7 @@ static inline void global_statistics_copy(struct global_statistics *gs, uint8_t
     gs->ml_queries_made              = __atomic_load_n(&global_statistics.ml_queries_made, __ATOMIC_RELAXED);
     gs->ml_db_points_read            = __atomic_load_n(&global_statistics.ml_db_points_read, __ATOMIC_RELAXED);
     gs->ml_result_points_generated   = __atomic_load_n(&global_statistics.ml_result_points_generated, __ATOMIC_RELAXED);
+    gs->ml_models_consulted          = __atomic_load_n(&global_statistics.ml_models_consulted, __ATOMIC_RELAXED);
 
     gs->exporters_queries_made       = __atomic_load_n(&global_statistics.exporters_queries_made, __ATOMIC_RELAXED);
     gs->exporters_db_points_read     = __atomic_load_n(&global_statistics.exporters_db_points_read, __ATOMIC_RELAXED);
@@ -653,6 +659,34 @@ static void global_statistics_charts(void) {
 
         rrdset_done(st_points_stored);
     }
+
+    {
+        static RRDSET *st = NULL;
+        static RRDDIM *rd = NULL;
+
+        if (unlikely(!st)) {
+            st = rrdset_create_localhost(
+                    "netdata" // type
+                    , "ml_models_consulted" // id
+                    , NULL // name
+                    , "ml" // family
+                    , NULL // context
+                    , "KMeans models used for prediction" // title
+                    , "models" // units
+                    , "netdata" // plugin
+                    , "ml" // module
+                    , 131004 // priority
+                    , localhost->rrd_update_every // update_every
+                    , RRDSET_TYPE_STACKED // chart_type
+            );
+
+            rd = rrddim_add(st, "num_models_consulted", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+        }
+
+        rrddim_set_by_pointer(st, rd, (collected_number) gs.ml_models_consulted);
+
+        rrdset_done(st);
+    }
 }
 
 // ----------------------------------------------------------------------------

+ 1 - 0
daemon/global_statistics.h

@@ -9,6 +9,7 @@
 // global statistics
 
 void global_statistics_ml_query_completed(size_t points_read);
+void global_statistics_ml_models_consulted(size_t models_consulted);
 void global_statistics_exporters_query_completed(size_t points_read);
 void global_statistics_backfill_query_completed(size_t points_read);
 void global_statistics_rrdr_query_completed(size_t queries, uint64_t db_points_read, uint64_t result_points_generated, QUERY_SOURCE query_source);

+ 0 - 5
daemon/main.c

@@ -1027,11 +1027,6 @@ int main(int argc, char **argv) {
                         else if(strcmp(optarg, "escapetest") == 0) {
                             return command_argument_sanitization_tests();
                         }
-#ifdef ENABLE_ML_TESTS
-                        else if(strcmp(optarg, "mltest") == 0) {
-                            return test_ml(argc, argv);
-                        }
-#endif
 #ifdef ENABLE_DBENGINE
                         else if(strcmp(optarg, "mctest") == 0) {
                             unittest_running = true;

+ 12 - 6
database/rrd.h

@@ -30,8 +30,9 @@ typedef struct rrdhost_acquired RRDHOST_ACQUIRED;
 typedef struct rrdset_acquired RRDSET_ACQUIRED;
 typedef struct rrddim_acquired RRDDIM_ACQUIRED;
 
-typedef void *ml_host_t;
-typedef void *ml_dimension_t;
+typedef struct ml_host ml_host_t;
+typedef struct ml_chart ml_chart_t;
+typedef struct ml_dimension ml_dimension_t;
 
 typedef enum {
     QUERY_SOURCE_UNKNOWN,
@@ -296,7 +297,7 @@ struct rrddim {
     // ------------------------------------------------------------------------
     // operational state members
 
-    ml_dimension_t ml_dimension;                    // machine learning data about this dimension
+    ml_dimension_t *ml_dimension;                   // machine learning data about this dimension
 
     // ------------------------------------------------------------------------
     // linking to siblings and parents
@@ -595,6 +596,8 @@ struct rrdset {
     DICTIONARY *rrddimvar_root_index;               // dimension variables
                                                     // we use this dictionary to manage their allocation
 
+    ml_chart_t *ml_chart;
+
     // ------------------------------------------------------------------------
     // operational state members
 
@@ -1028,7 +1031,7 @@ struct rrdhost {
 
     // ------------------------------------------------------------------------
     // ML handle
-    ml_host_t ml_host;
+    ml_host_t *ml_host;
 
     // ------------------------------------------------------------------------
     // Support for host-level labels
@@ -1301,9 +1304,12 @@ void rrdset_isnot_obsolete(RRDSET *st);
 time_t rrddim_first_entry_t(RRDDIM *rd);
 time_t rrddim_first_entry_t_of_tier(RRDDIM *rd, size_t tier);
 time_t rrddim_last_entry_t(RRDDIM *rd);
-time_t rrdset_last_entry_t(RRDSET *st);
-time_t rrdset_first_entry_t_of_tier(RRDSET *st, size_t tier);
+time_t rrddim_last_entry_t_of_tier(RRDDIM *rd, size_t tier);
+
 time_t rrdset_first_entry_t(RRDSET *st);
+time_t rrdset_first_entry_t_of_tier(RRDSET *st, size_t tier);
+time_t rrdset_last_entry_t(RRDSET *st);
+
 time_t rrdhost_last_entry_t(RRDHOST *h);
 
 // ----------------------------------------------------------------------------

+ 0 - 9
database/rrdcontext.c

@@ -750,11 +750,6 @@ static void rrdinstance_free(RRDINSTANCE *ri) {
 }
 
 static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext) {
-    static STRING *ml_anomaly_rates_id = NULL;
-
-    if(unlikely(!ml_anomaly_rates_id))
-        ml_anomaly_rates_id = string_strdupz(ML_ANOMALY_RATES_CHART_ID);
-
     RRDINSTANCE *ri = value;
 
     // link it to its parent
@@ -781,10 +776,6 @@ static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unus
             ri->flags &= ~RRD_FLAG_HIDDEN; // no need of atomics at the constructor
     }
 
-    // we need this when loading from SQL
-    if(unlikely(ri->id == ml_anomaly_rates_id))
-        ri->flags |= RRD_FLAG_HIDDEN; // no need of atomics at the constructor
-
     rrdmetrics_create_in_rrdinstance(ri);
 
     // signal the react callback to do the job

+ 9 - 3
database/rrddim.c

@@ -172,7 +172,7 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v
     rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK);
     rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED);
 
-    ml_new_dimension(rd);
+    ml_dimension_new(rd);
 
     ctr->react_action = RRDDIM_REACT_NEW;
 
@@ -191,7 +191,7 @@ static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v
 
     rrdcontext_removed_rrddim(rd);
 
-    ml_delete_dimension(rd);
+    ml_dimension_delete(rd);
 
     debug(D_RRD_CALLS, "rrddim_free() %s.%s", rrdset_name(st), rrddim_name(rd));
 
@@ -420,7 +420,13 @@ inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor)
 
 // ----------------------------------------------------------------------------
 
-// get the timestamp of the last entry in the round-robin database
+time_t rrddim_last_entry_t_of_tier(RRDDIM *rd, size_t tier) {
+    if(unlikely(tier > storage_tiers || !rd->tiers[tier]))
+        return 0;
+
+    return rd->tiers[tier]->query_ops->latest_time(rd->tiers[tier]->db_metric_handle);
+}
+
 time_t rrddim_last_entry_t(RRDDIM *rd) {
     time_t latest = rd->tiers[0]->query_ops->latest_time(rd->tiers[0]->db_metric_handle);
 

+ 3 - 3
database/rrdhost.c

@@ -518,7 +518,7 @@ int is_legacy = 1;
 
     rrdhost_load_rrdcontext_data(host);
     if (!archived)
-        ml_new_host(host);
+        ml_host_new(host);
     else
         rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED);
 
@@ -629,7 +629,7 @@ void rrdhost_update(RRDHOST *host
         host->rrdpush_replication_step = rrdpush_replication_step;
 
         rrd_hosts_available++;
-        ml_new_host(host);
+        ml_host_new(host);
         rrdhost_load_rrdcontext_data(host);
         info("Host %s is not in archived mode anymore", rrdhost_hostname(host));
     }
@@ -1089,7 +1089,7 @@ void rrdhost_free(RRDHOST *host, bool force) {
     rrd_check_wrlock();     // make sure the RRDs are write locked
 
     rrdhost_wrlock(host);
-    ml_delete_host(host);
+    ml_host_delete(host);
     rrdhost_unlock(host);
 
     // ------------------------------------------------------------------------

Some files were not shown because too many files changed in this diff