Browse Source

Improve agent shutdown time (#18434)

Modify datafile acquire for delete timeout if we are shutting down
Stelios Fragkakis 6 months ago
parent
commit
ceeba615af

+ 5 - 4
src/database/engine/datafile.c

@@ -66,7 +66,8 @@ void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS re
     spinlock_unlock(&df->users.spinlock);
 }
 
-bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) {
+bool datafile_acquire_for_deletion(struct rrdengine_datafile *df, bool is_shutdown)
+{
     bool can_be_deleted = false;
 
     spinlock_lock(&df->users.spinlock);
@@ -107,7 +108,7 @@ bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) {
 
                 if(!df->users.time_to_evict) {
                     // first time we did the above
-                    df->users.time_to_evict = now_s + 120;
+                    df->users.time_to_evict = now_s + (is_shutdown ? DATAFILE_DELETE_TIMEOUT_SHORT : DATAFILE_DELETE_TIMEOUT_LONG); // parentheses required: '+' binds tighter than '?:'
                     internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, "
                                          "but it has %u lockers (oc:%u, pd:%u), "
                                          "%zu clean and %zu hot open cache pages "
@@ -572,8 +573,8 @@ void finalize_data_files(struct rrdengine_instance *ctx)
         struct rrdengine_journalfile *journalfile = datafile->journalfile;
 
         logged = false;
-        size_t iterations = 100;
-        while(!datafile_acquire_for_deletion(datafile) && datafile != ctx->datafiles.first->prev && --iterations > 0) {
+        size_t iterations = 10;
+        while(!datafile_acquire_for_deletion(datafile, true) && datafile != ctx->datafiles.first->prev && --iterations > 0) {
             if(!logged) {
                 netdata_log_info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier);
                 logged = true;

+ 6 - 1
src/database/engine/datafile.h

@@ -24,6 +24,11 @@ struct rrdengine_instance;
 #define MAX_DATAFILES (65536 * 4) /* Supports up to 64TiB for now */
 #define TARGET_DATAFILES (50)
 
+// When acquiring a datafile for deletion, once an attempt to evict its pages has been made,
+// the acquire for deletion will return true after this timeout (added to now_s; presumably seconds)
+#define DATAFILE_DELETE_TIMEOUT_SHORT (1)
+#define DATAFILE_DELETE_TIMEOUT_LONG (120)
+
 typedef enum __attribute__ ((__packed__)) {
     DATAFILE_ACQUIRE_OPEN_CACHE = 0,
     DATAFILE_ACQUIRE_PAGE_DETAILS,
@@ -72,7 +77,7 @@ struct rrdengine_datafile {
 
 bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason);
 void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason);
-bool datafile_acquire_for_deletion(struct rrdengine_datafile *df);
+bool datafile_acquire_for_deletion(struct rrdengine_datafile *df, bool is_shutdown);
 
 void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock);
 void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile);

+ 2 - 2
src/database/engine/rrdengine.c

@@ -1218,7 +1218,7 @@ void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *
     if(worker)
         worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT);
 
-    bool datafile_got_for_deletion = datafile_acquire_for_deletion(datafile);
+    bool datafile_got_for_deletion = datafile_acquire_for_deletion(datafile, false);
 
     if (update_retention)
         update_metrics_first_time_s(ctx, datafile, datafile->next, worker);
@@ -1227,7 +1227,7 @@ void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *
         if(worker)
             worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT);
 
-        datafile_got_for_deletion = datafile_acquire_for_deletion(datafile);
+        datafile_got_for_deletion = datafile_acquire_for_deletion(datafile, false);
 
         if (!datafile_got_for_deletion) {
             netdata_log_info("DBENGINE: waiting for data file '%s/"