Просмотр исходного кода

Move cleanup of obsolete charts to a separate thread (#11222)

Vladimir Kobal 3 лет назад
Родитель
Сommit
96636b7c6f
10 измененных файлов с 130 добавлено и 12 удалено
  1. 2 0
      CMakeLists.txt
  2. 2 0
      Makefile.am
  3. 1 0
      daemon/common.h
  4. 1 0
      daemon/main.c
  5. 38 0
      daemon/service.c
  6. 19 0
      daemon/service.h
  7. 4 2
      database/rrd.h
  8. 33 0
      database/rrdhost.c
  9. 5 10
      database/rrdset.c
  10. 25 0
      libnetdata/config/appconfig.c

+ 2 - 0
CMakeLists.txt

@@ -896,6 +896,8 @@ set(DAEMON_FILES
         daemon/main.h
         daemon/signals.c
         daemon/signals.h
+        daemon/service.c
+        daemon/service.h
         daemon/commands.c
         daemon/commands.h
         daemon/unit_test.c

+ 2 - 0
Makefile.am

@@ -733,6 +733,8 @@ DAEMON_FILES = \
     daemon/main.h \
     daemon/signals.c \
     daemon/signals.h \
+    daemon/service.c \
+    daemon/service.h \
     daemon/commands.c \
     daemon/commands.h \
     daemon/unit_test.c \

+ 1 - 0
daemon/common.h

@@ -77,6 +77,7 @@
 #include "daemon.h"
 #include "main.h"
 #include "signals.h"
+#include "service.h"
 #include "commands.h"
 #include "analytics.h"
 

+ 1 - 0
daemon/main.c

@@ -104,6 +104,7 @@ struct netdata_static_thread static_threads[] = {
     NETDATA_PLUGIN_HOOK_PLUGINSD
     NETDATA_PLUGIN_HOOK_HEALTH
     NETDATA_PLUGIN_HOOK_ANALYTICS
+    NETDATA_PLUGIN_HOOK_SERVICE
 
     {NULL,                   NULL,                    NULL,         0, NULL, NULL, NULL}
 };

+ 38 - 0
daemon/service.c

@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "common.h"
+
+/* Run service jobs every X seconds */
+#define SERVICE_HEARTBEAT 10
+
+void service_main_cleanup(void *ptr)
+{
+    struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
+    static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
+
+    debug(D_SYSTEM, "Cleaning up...");
+
+    static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
+}
+
+/*
+ * The service thread.
+ */
+void *service_main(void *ptr)
+{
+    netdata_thread_cleanup_push(service_main_cleanup, ptr);
+    heartbeat_t hb;
+    heartbeat_init(&hb);
+    usec_t step = USEC_PER_SEC * SERVICE_HEARTBEAT;
+
+    debug(D_SYSTEM, "Service thread starts");
+
+    while (!netdata_exit) {
+        heartbeat_next(&hb, step);
+
+        rrd_cleanup_obsolete_charts();
+    }
+
+    netdata_thread_cleanup_pop(1);
+    return NULL;
+}

+ 19 - 0
daemon/service.h

@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_SERVICE_H
+#define NETDATA_SERVICE_H 1
+
+#define NETDATA_PLUGIN_HOOK_SERVICE \
+    { \
+        .name = "SERVICE", \
+        .config_section = NULL, \
+        .config_name = NULL, \
+        .enabled = 1, \
+        .thread = NULL, \
+        .init_routine = NULL, \
+        .start_routine = service_main \
+    },
+
+extern void *service_main(void *ptr);
+
+#endif //NETDATA_SERVICE_H

+ 4 - 2
database/rrd.h

@@ -762,7 +762,7 @@ struct rrdhost {
     const char *timezone;                           // the timezone of the host
 
 #ifdef ENABLE_ACLK
-    long    obsolete_count;
+    long    deleted_charts_count;
 #endif
 
     const char *abbrev_timezone;                    // the abbriviated timezone of the host
@@ -859,6 +859,8 @@ struct rrdhost {
 
     RRDSET *rrdset_root;                            // the host charts
 
+    unsigned int obsolete_charts_count;
+
 
     // ------------------------------------------------------------------------
     // locks
@@ -1038,6 +1040,7 @@ extern void rrdhost_system_info_free(struct rrdhost_system_info *system_info);
 extern void rrdhost_free(RRDHOST *host);
 extern void rrdhost_save_charts(RRDHOST *host);
 extern void rrdhost_delete_charts(RRDHOST *host);
+extern void rrd_cleanup_obsolete_charts();
 
 extern int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now);
 
@@ -1326,7 +1329,6 @@ extern void rrdset_save(RRDSET *st);
 extern void rrdset_delete_custom(RRDSET *st, int db_rotated);
 extern void rrdset_delete_obsolete_dimensions(RRDSET *st);
 
-extern void rrdhost_cleanup_obsolete_charts(RRDHOST *host);
 extern RRDHOST *rrdhost_create(
     const char *hostname, const char *registry_hostname, const char *guid, const char *os, const char *timezone,
     const char *abbrev_timezone, int32_t utc_offset,const char *tags, const char *program_name, const char *program_version,

+ 33 - 0
database/rrdhost.c

@@ -650,6 +650,13 @@ restart_after_removal:
 
 int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
     rrdset_free_obsolete_time = config_get_number(CONFIG_SECTION_GLOBAL, "cleanup obsolete charts after seconds", rrdset_free_obsolete_time);
+    // Current chart locking and invalidation scheme doesn't prevent Netdata from segmentaion faults if a short
+    // cleanup delay is set. Extensive stress tests showed that 10 seconds is quite a safe delay. Look at
+    // https://github.com/netdata/netdata/pull/11222#issuecomment-868367920 for more information.
+    if (rrdset_free_obsolete_time < 10) {
+        rrdset_free_obsolete_time = 10;
+        info("The \"cleanup obsolete charts after seconds\" option was set to 10 seconds. A lower delay can potentially cause a segmentaion fault.");
+    }
     gap_when_lost_iterations_above = (int)config_get_number(CONFIG_SECTION_GLOBAL, "gap when lost iterations above", gap_when_lost_iterations_above);
     if (gap_when_lost_iterations_above < 1)
         gap_when_lost_iterations_above = 1;
@@ -1392,6 +1399,7 @@ restart_after_removal:
                     && st->last_updated.tv_sec + rrdset_free_obsolete_time < now
                     && st->last_collected_time.tv_sec + rrdset_free_obsolete_time < now
         )) {
+            st->rrdhost->obsolete_charts_count--;
 #ifdef ENABLE_DBENGINE
             if(st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
                 RRDDIM *rd, *last;
@@ -1437,6 +1445,7 @@ restart_after_removal:
                 rrdvar_free_remaining_variables(host, &st->rrdvar_root_index);
 
                 rrdset_flag_clear(st, RRDSET_FLAG_OBSOLETE);
+                
                 if (st->dimensions) {
                     /* If the chart still has dimensions don't delete it from the metadata log */
                     continue;
@@ -1458,6 +1467,30 @@ restart_after_removal:
     }
 }
 
+void rrd_cleanup_obsolete_charts()
+{
+    rrd_rdlock();
+
+    RRDHOST *host;
+    rrdhost_foreach_read(host)
+    {
+        if (host->obsolete_charts_count) {
+            rrdhost_wrlock(host);
+#ifdef ENABLE_ACLK
+            host->deleted_charts_count = 0;
+#endif
+            rrdhost_cleanup_obsolete_charts(host);
+#ifdef ENABLE_ACLK
+            if (host->deleted_charts_count)
+                aclk_update_chart(host, "dummy-chart", 0);
+#endif
+            rrdhost_unlock(host);
+        }
+    }
+
+    rrd_unlock();
+}
+
 // ----------------------------------------------------------------------------
 // RRDHOST - set system info from environment variables
 // system_info fields must be heap allocated or NULL

+ 5 - 10
database/rrdset.c

@@ -194,6 +194,8 @@ inline void rrdset_is_obsolete(RRDSET *st) {
 
     if(unlikely(!(rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)))) {
         rrdset_flag_set(st, RRDSET_FLAG_OBSOLETE);
+        st->rrdhost->obsolete_charts_count++;
+
         rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED);
 
         // the chart will not get more updates (data collection)
@@ -205,6 +207,8 @@ inline void rrdset_is_obsolete(RRDSET *st) {
 inline void rrdset_isnot_obsolete(RRDSET *st) {
     if(unlikely((rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)))) {
         rrdset_flag_clear(st, RRDSET_FLAG_OBSOLETE);
+        st->rrdhost->obsolete_charts_count--;
+
         rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED);
 
         // the chart will be pushed upstream automatically
@@ -452,7 +456,7 @@ void rrdset_delete_custom(RRDSET *st, int db_rotated) {
 #ifdef ENABLE_ACLK
     if ((netdata_cloud_setting) && (db_rotated || RRD_MEMORY_MODE_DBENGINE != st->rrd_memory_mode)) {
         aclk_del_collector(st->rrdhost, st->plugin_name, st->module_name);
-        st->rrdhost->obsolete_count++;
+        st->rrdhost->deleted_charts_count++;
     }
 #endif
 
@@ -932,15 +936,6 @@ RRDSET *rrdset_create_custom(
 
     store_active_chart(st->chart_uuid);
 
-#ifdef ENABLE_ACLK
-    host->obsolete_count = 0;
-#endif
-    rrdhost_cleanup_obsolete_charts(host);
-#ifdef ENABLE_ACLK
-    if (host->obsolete_count)
-        aclk_update_chart(st->rrdhost, "dummy-chart", 0);
-#endif
-
     rrdhost_unlock(host);
 #ifdef ENABLE_ACLK
     if (netdata_cloud_setting)

+ 25 - 0
libnetdata/config/appconfig.c

@@ -225,6 +225,31 @@ void appconfig_section_destroy_non_loaded(struct config *root, const char *secti
         error("Cannot remove section '%s' from config.", section);
         return;
     }
+    
+    appconfig_wrlock(root);
+
+    if (root->first_section == co) {
+        root->first_section = co->next;
+
+        if (root->last_section == co)
+            root->last_section = root->first_section;
+    } else {
+        struct section *co_cur = root->first_section, *co_prev = NULL;
+
+        while(co_cur && co_cur != co) {
+            co_prev = co_cur;
+            co_cur = co_cur->next;
+        }
+
+        if (co_cur) {
+            co_prev->next = co_cur->next;
+
+            if (root->last_section == co_cur)
+                root->last_section = co_prev;
+        }
+    }
+
+    appconfig_unlock(root);
 
     avl_destroy_lock(&co->values_index);
     freez(co->name);