Browse Source

Misc alert fixes (#15274)

* rebase

* proper pointer
Emmanuel Vasilakis 1 year ago
parent
commit
f29145fe2b

+ 4 - 4
database/rrdcalc.c

@@ -61,13 +61,13 @@ inline const char *rrdcalc_status2string(RRDCALC_STATUS status) {
     }
 }
 
-uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id) {
+uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id) {
     netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
 
     // re-use old IDs, by looking them up in the alarm log
     ALARM_ENTRY *ae = NULL;
     for(ae = host->health_log.alarms; ae ;ae = ae->next) {
-        if(unlikely(name == ae->name && chart == ae->chart)) {
+        if(unlikely(name == ae->name && chart == ae->chart && !uuid_memcmp(&ae->config_hash_id, config_hash_id))) {
             if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
             break;
         }
@@ -79,7 +79,7 @@ uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint3
         alarm_id = ae->alarm_id;
 
     else {
-        alarm_id = sql_get_alarm_id(host, chart, name, next_event_id);
+        alarm_id = sql_get_alarm_id(host, chart, name, next_event_id, config_hash_id);
 
         if (!alarm_id) {
             if (unlikely(!host->health_log.next_alarm_id))
@@ -531,7 +531,7 @@ static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_
         ;
     }
 
-    rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
+    rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id, &rc->config_hash_id);
 
     if(rc->calculation) {
         rc->calculation->status = &rc->status;

+ 1 - 1
database/rrdcalc.h

@@ -240,7 +240,7 @@ const char *rrdcalc_status2string(RRDCALC_STATUS status);
 
 void rrdcalc_free_unused_rrdcalc_loaded_from_config(RRDCALC *rc);
 
-uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id);
+uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id);
 void rrdcalc_add_from_rrdcalctemplate(RRDHOST *host, RRDCALCTEMPLATE *rt, RRDSET *st, const char *overwrite_alert_name, const char *overwrite_dimensions);
 int rrdcalc_add_from_config(RRDHOST *host, RRDCALC *rc);
 

+ 3 - 7
database/sqlite/sqlite_aclk_alert.c

@@ -69,7 +69,7 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
         return 0;
     }
 
-    if (unlikely(uuid_is_null(ae->config_hash_id))) 
+    if (unlikely(uuid_is_null(ae->config_hash_id)))
         return 0;
 
     char sql[ACLK_SYNC_QUERY_SIZE];
@@ -101,7 +101,6 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
         if (sqlite3_column_type(res, 1) != SQLITE_NULL)
             uuid_copy(config_hash_id, *((uuid_t *) sqlite3_column_blob(res, 1)));
         unique_id = (uint32_t) sqlite3_column_int64(res, 2);
-        
     } else {
         send = 1;
         goto done;
@@ -118,11 +117,8 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
     }
 
     //same status, same config
-    if (ae->new_status == RRDCALC_STATUS_CLEAR || ae->new_status == RRDCALC_STATUS_UNDEFINED) {
-        send = 0;
-        update_filtered(ae, unique_id, uuid_str);
-        goto done;
-    }
+    send = 0;
+    update_filtered(ae, unique_id, uuid_str);
 
 done:
     rc = sqlite3_finalize(res);

+ 9 - 2
database/sqlite/sqlite_health.c

@@ -1612,9 +1612,9 @@ int health_migrate_old_health_log_table(char *table) {
     return 1;
 }
 
-#define SQL_GET_ALARM_ID "select alarm_id, health_log_id from health_log where host_id = @host_id and chart = @chart and name = @name"
+#define SQL_GET_ALARM_ID "select alarm_id, health_log_id from health_log where host_id = @host_id and chart = @chart and name = @name and config_hash_id = @config_hash_id"
 #define SQL_GET_EVENT_ID "select max(alarm_event_id) + 1 from health_log_detail where health_log_id = @health_log_id and alarm_id = @alarm_id"
-uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id)
+uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id)
 {
     int rc = 0;
     sqlite3_stmt *res = NULL;
@@ -1648,6 +1648,13 @@ uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *
         return alarm_id;
     }
 
+    rc = sqlite3_bind_blob(res, 4, config_hash_id, sizeof(*config_hash_id), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind config_hash_id parameter for SQL_GET_ALARM_ID.");
+        sqlite3_finalize(res);
+        return alarm_id;
+    }
+
     while (sqlite3_step_monitored(res) == SQLITE_ROW) {
         alarm_id = (uint32_t) sqlite3_column_int64(res, 0);
         health_log_id = (uint64_t) sqlite3_column_int64(res, 1);

+ 1 - 1
database/sqlite/sqlite_health.h

@@ -16,7 +16,7 @@ void sql_aclk_alert_clean_dead_entries(RRDHOST *host);
 int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status);
 void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
 int health_migrate_old_health_log_table(char *table);
-uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id);
+uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id);
 void sql_health_alarm_log2json_v3(BUFFER *wb, DICTIONARY *alert_instances, time_t after, time_t before, const char *transition, uint32_t max, bool debug);
 bool sql_find_alert_transition(const char *transition, void (*cb)(const char *machine_guid, const char *context, time_t alert_id, void *data), void *data);
 #endif //NETDATA_SQLITE_HEALTH_H

+ 14 - 12
health/health.c

@@ -448,7 +448,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
 
         if (likely(ret == 1)) {
             // we have executed this alarm notification in the past
-            if(last_executed_status == ae->new_status) {
+            if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
                 // don't send the notification for the same status again
                 debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
                       , rrdcalc_status2string(ae->new_status));
@@ -563,7 +563,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
                               ae->old_value,
                               ae->source?ae_source(ae):"UNKNOWN",
                               (uint32_t)ae->duration,
-                              (uint32_t)ae->non_clear_duration,
+                              (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
                               ae_units(ae),
                               ae_info(ae),
                               ae_new_value_string(ae),
@@ -636,17 +636,15 @@ static inline void health_alarm_log_process(RRDHOST *host) {
 
     ALARM_ENTRY *ae;
     for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
-        if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
-            if(unlikely(
+        if(unlikely(
                     !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
                     !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
-            )) {
-                if(unlikely(ae->unique_id < first_waiting))
-                    first_waiting = ae->unique_id;
+                    )) {
+            if(unlikely(ae->unique_id < first_waiting))
+                first_waiting = ae->unique_id;
 
-                if(likely(now >= ae->delay_up_to_timestamp))
-                    health_process_notifications(host, ae);
-            }
+            if(likely(now >= ae->delay_up_to_timestamp))
+                health_process_notifications(host, ae);
         }
     }
 
@@ -1431,6 +1429,12 @@ void *health_main(void *ptr) {
                         rc->old_status = rc->status;
                         rc->status = status;
                         rc->ae = ae;
+
+                        if(unlikely(rrdcalc_isrepeating(rc))) {
+                            rc->last_repeat = now;
+                            if (rc->status == RRDCALC_STATUS_CLEAR)
+                                rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
+                        }
                     }
 
                     rc->last_updated = now;
@@ -1471,7 +1475,6 @@ void *health_main(void *ptr) {
                         worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
                         rc->last_repeat = now;
                         if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
-
                         ALARM_ENTRY *ae = health_create_alarm_entry(
                                                                     host,
                                                                     rc->id,
@@ -1508,7 +1511,6 @@ void *health_main(void *ptr) {
                             ae->flags |= HEALTH_ENTRY_RUN_ONCE;
                         }
                         rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
-                        rc->ae = ae;
                         health_process_notifications(host, ae);
                         debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
                         health_alarm_wait_for_execution(ae);