Browse Source

Repeating alarm notifications (#6309)

* Alarm_repeat mergin the original!

* Alarm_repeat binary tree!

* Alarm_repeat binary tree finished!

* Alarm_repeat move function and format string

* Alarms bringing a new Binary tree

* Alarms fixing the last two

* Alarm_repeat useless var!

* Alarm fix format and repeat alarm!

* Alarm_backend steps!

* Alarm_repeat stopping to test cloud!

* Alarm_repeat stopping to test cloud 2!

* Alarm_repeat fixing when restart!
thiagoftsm 5 years ago
parent
commit
dd73f3e0cd
10 changed files with 205 additions and 20 deletions
  1. 1 0
      daemon/common.h
  2. 1 1
      daemon/main.c
  3. 1 1
      database/rrd.c
  4. 13 0
      database/rrd.h
  5. 91 11
      database/rrdcalc.c
  6. 23 2
      database/rrdcalc.h
  7. 1 1
      database/rrdcalctemplate.c
  8. 6 0
      database/rrdcalctemplate.h
  9. 47 3
      database/rrdhost.c
  10. 21 1
      health/README.md

+ 1 - 0
daemon/common.h

@@ -14,6 +14,7 @@
 #define config_get_float(section, name, value) appconfig_get_float(&netdata_config, section, name, value)
 #define config_get_boolean(section, name, value) appconfig_get_boolean(&netdata_config, section, name, value)
 #define config_get_boolean_ondemand(section, name, value) appconfig_get_boolean_ondemand(&netdata_config, section, name, value)
+#define config_get_duration(section, name, value) appconfig_get_duration(&netdata_config, section, name, value)
 
 #define config_set(section, name, default_value) appconfig_set(&netdata_config, section, name, default_value)
 #define config_set_default(section, name, value) appconfig_set_default(&netdata_config, section, name, value)

+ 1 - 1
daemon/main.c

@@ -1217,7 +1217,7 @@ int main(int argc, char **argv) {
     info("netdata initialization completed. Enjoy real-time performance monitoring!");
     netdata_ready = 1;
   
-    send_statistics("START","-", "-");
+    send_statistics("START", "-",  "-");
 
     // ------------------------------------------------------------------------
     // unblock signals

+ 1 - 1
database/rrd.c

@@ -132,7 +132,6 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) {
     }
 }
 
-
 // ----------------------------------------------------------------------------
 // RRD - cache directory
 
@@ -154,3 +153,4 @@ char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section
 
     return ret;
 }
+

+ 13 - 0
database/rrd.h

@@ -572,6 +572,8 @@ struct alarm_entry {
     uint32_t updated_by_id;
     uint32_t updates_id;
 
+    time_t last_repeat;
+
     struct alarm_entry *next;
 };
 
@@ -686,11 +688,16 @@ struct rrdhost {
     char *health_log_filename;                      // the alarms event log filename
     size_t health_log_entries_written;              // the number of alarm events writtern to the alarms event log
     FILE *health_log_fp;                            // the FILE pointer to the open alarms event log file
+    uint32_t health_default_warn_repeat_every;      // the default value for the interval between repeating warning notifications
+    uint32_t health_default_crit_repeat_every;      // the default value for the interval between repeating critical notifications
+
 
     // all RRDCALCs are primarily allocated and linked here
     // RRDCALCs may be linked to charts at any point
     // (charts may or may not exist when these are loaded)
     RRDCALC *alarms;
+    avl_tree_lock alarms_idx_health_log;
+    avl_tree_lock alarms_idx_name;
 
     ALARM_LOG health_log;                           // alarms historical events (event log)
     uint32_t health_last_processed_id;              // the last processed health id from the log
@@ -1021,6 +1028,12 @@ extern collected_number rrddim_set(RRDSET *st, const char *id, collected_number
 
 extern long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries);
 
+// ----------------------------------------------------------------------------
+// Miscellaneous functions
+
+extern int alarm_compare_id(void *a, void *b);
+extern int alarm_compare_name(void *a, void *b);
+
 // ----------------------------------------------------------------------------
 // RRD internal functions
 

+ 91 - 11
database/rrdcalc.c

@@ -81,9 +81,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
 
     if(!rc->units) rc->units = strdupz(st->units);
 
-    {
+    if(!rrdcalc_isrepeating(rc)) {
         time_t now = now_realtime_sec();
-        health_alarm_log(
+        ALARM_ENTRY *ae = health_create_alarm_entry(
                 host,
                 rc->id,
                 rc->next_event_id++,
@@ -104,6 +104,7 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
                 0,
                 0
         );
+        health_alarm_log(host, ae);
     }
 }
 
@@ -142,9 +143,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
 
     RRDHOST *host = st->rrdhost;
 
-    {
+    if(!rrdcalc_isrepeating(rc)) {
         time_t now = now_realtime_sec();
-        health_alarm_log(
+        ALARM_ENTRY *ae = health_create_alarm_entry(
                 host,
                 rc->id,
                 rc->next_event_id++,
@@ -165,6 +166,7 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
                 0,
                 0
         );
+        health_alarm_log(host, ae);
     }
 
     debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
@@ -253,7 +255,7 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch
     return host->health_log.next_alarm_id++;
 }
 
-inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
+inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) {
     rrdhost_check_rdlock(host);
 
     if(rc->calculation) {
@@ -301,8 +303,7 @@ inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
     }
 }
 
-inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
-
+inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
     debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
 
     if(rrdcalc_exists(host, chart, rt->name, 0, 0))
@@ -328,6 +329,10 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
     rc->delay_max_duration = rt->delay_max_duration;
     rc->delay_multiplier = rt->delay_multiplier;
 
+    rc->last_repeat = 0;
+    rc->warn_repeat_every = rt->warn_repeat_every;
+    rc->crit_repeat_every = rt->crit_repeat_every;
+
     rc->group = rt->group;
     rc->after = rt->after;
     rc->before = rt->before;
@@ -356,7 +361,7 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
             error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
     }
 
-    debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
+    debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
             (rc->chart)?rc->chart:"NOCHART",
             rc->name,
             (rc->exec)?rc->exec:"DEFAULT",
@@ -376,16 +381,24 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
             rc->delay_up_duration,
             rc->delay_down_duration,
             rc->delay_max_duration,
-            rc->delay_multiplier
+            rc->delay_multiplier,
+            rc->warn_repeat_every,
+            rc->crit_repeat_every
     );
 
-    rrdcalc_create_part2(host, rc);
+    rrdcalc_add_to_host(host, rc);
+    RRDCALC *rdcmp  = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc);
+    if (rdcmp != rc) {
+        error("Cannot insert the alarm index ID %s",rc->name);
+    }
+
     return rc;
 }
 
 void rrdcalc_free(RRDCALC *rc) {
     if(unlikely(!rc)) return;
 
+
     expression_free(rc->calculation);
     expression_free(rc->warning);
     expression_free(rc->critical);
@@ -413,7 +426,6 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
     // unlink it from RRDHOST
     if(unlikely(rc == host->alarms))
         host->alarms = rc->next;
-
     else {
         RRDCALC *t;
         for(t = host->alarms; t && t->next != rc; t = t->next) ;
@@ -425,5 +437,73 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
             error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
     }
 
+    if (rc) {
+        RRDCALC *rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc);
+        if (!rdcmp) {
+            error("Cannot remove the health alarm index");
+        }
+
+        rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc);
+        if (!rdcmp) {
+            error("Cannot remove the health alarm index");
+        }
+    }
+
     rrdcalc_free(rc);
 }
+
+// ----------------------------------------------------------------------------
+// Alarm
+
+
+/**
+ * Alarm is repeating
+ *
+ * Look up the alarm by id in host->alarms_idx_health_log and report whether it is repeating.
+ *
+ * @param host The structure that has the binary tree
+ * @param alarm_id the id of the alarm to search
+ *
+ * @return It returns 1 in case the alarm is found and rrdcalc_isrepeating() is true for it, 0 otherwise
+ */
+int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id) {
+    RRDCALC findme; // search key: only the id member is filled in, the rest stays uninitialized
+    findme.id = alarm_id;
+    RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl *)&findme);
+    if (!rc) { // nothing was returned for this id, so it is reported as not repeating
+        return 0;
+    }
+    return rrdcalc_isrepeating(rc);
+}
+
+/**
+ * Entry is repeating
+ *
+ * Check whether the alarm referenced by the alarm entry (ae->alarm_id) is a repeating alarm
+ *
+ * @param host The structure that has the binary tree
+ * @param ae the alarm entry
+ *
+ * @return It returns the result of alarm_isrepeating(): 1 in case it is repeating and 0 otherwise
+ */
+int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae) {
+    return alarm_isrepeating(host, ae->alarm_id);
+}
+
+/**
+ * Max last repeat
+ *
+ * Search host->alarms_idx_name for the alarm with this name and hash (no last_repeat value is compared here).
+ * @param host The structure that has the binary tree
+ * @param alarm_name the name of the alarm to search
+ * @param hash the hash used in the search key together with the name
+ * @return It returns the RRDCALC given back by avl_search_lock(). NOTE(review): presumably NULL when nothing matches - confirm
+ */
+RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name,uint32_t hash) {
+    RRDCALC findme; // search key: only name and hash are filled in
+    findme.name = alarm_name;
+    findme.hash = hash;
+    RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl *)&findme);
+
+    return rc;
+}

+ 23 - 2
database/rrdcalc.h

@@ -29,7 +29,9 @@
 #define RRDCALC_FLAG_SILENCED              0x00000100
 #define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
 
+
 struct rrdcalc {
+    avl avl;                        // the index, with key the id - this has to be first!
     uint32_t id;                    // the unique id of this alarm
     uint32_t next_event_id;         // the next event id that will be used for this alarm
 
@@ -77,9 +79,16 @@ struct rrdcalc {
     float delay_multiplier;        // multiplier for all delays when alarms switch status
     // while now < delay_up_to
 
+    // ------------------------------------------------------------------------
+    // notification repeat settings
+
+    uint32_t warn_repeat_every;     // interval between repeating warning notifications
+    uint32_t crit_repeat_every; // interval between repeating critical notifications
+
     // ------------------------------------------------------------------------
     // runtime information
 
+    RRDCALC_STATUS old_status; // the old status of the alarm
     RRDCALC_STATUS status;          // the current status of the alarm
 
     calculated_number value;        // the current value of the alarm
@@ -90,6 +99,7 @@ struct rrdcalc {
     time_t last_updated;            // the last update timestamp of the alarm
     time_t next_update;             // the next update timestamp of the alarm
     time_t last_status_change;      // the timestamp of the last time this alarm changed status
+    time_t last_repeat; // the last time the alarm got repeated
 
     time_t db_after;                // the first timestamp evaluated by the db lookup
     time_t db_before;               // the last timestamp evaluated by the db lookup
@@ -119,6 +129,10 @@ struct rrdcalc {
     struct rrdcalc *next;
 };
 
+extern int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id);
+extern int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae);
+extern RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name, uint32_t hash);
+
 #define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->after)
 
 extern void rrdsetcalc_link_matching(RRDSET *st);
@@ -132,7 +146,14 @@ extern void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc);
 
 extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name);
 extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id);
-extern RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
-extern void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc);
+extern RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
+extern void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc);
+
+static inline int rrdcalc_isrepeating(RRDCALC *rc) { // returns 1 when warn_repeat_every or crit_repeat_every is above zero, 0 otherwise
+    if (unlikely(rc->warn_repeat_every > 0 || rc->crit_repeat_every > 0)) {
+        return 1;
+    }
+    return 0;
+}
 
 #endif //NETDATA_RRDCALC_H

+ 1 - 1
database/rrdcalctemplate.c

@@ -13,7 +13,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) {
     for(rt = host->templates; rt ; rt = rt->next) {
         if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
            && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
-            RRDCALC *rc = rrdcalc_create(host, rt, st->id);
+            RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id);
             if(unlikely(!rc))
                 info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname);
 

+ 6 - 0
database/rrdcalctemplate.h

@@ -48,6 +48,12 @@ struct rrdcalctemplate {
     int delay_max_duration;        // the absolute max delay to apply to this alarm
     float delay_multiplier;        // multiplier for all delays when alarms switch status
 
+    // ------------------------------------------------------------------------
+    // notification repeat settings
+
+    uint32_t warn_repeat_every;    // interval between repeating warning notifications
+    uint32_t crit_repeat_every; // interval between repeating critical notifications
+
     // ------------------------------------------------------------------------
     // expressions related to the alarm
 

+ 47 - 3
database/rrdhost.c

@@ -179,6 +179,10 @@ RRDHOST *rrdhost_create(const char *hostname,
     if(config_get_boolean(CONFIG_SECTION_GLOBAL, "delete orphan hosts files", 1) && !is_localhost)
         rrdhost_flag_set(host, RRDHOST_FLAG_DELETE_ORPHAN_HOST);
 
+    host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
+    host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
+    avl_init_lock(&(host->alarms_idx_health_log), alarm_compare_id);
+    avl_init_lock(&(host->alarms_idx_name), alarm_compare_name);
 
     // ------------------------------------------------------------------------
     // initialize health variables
@@ -274,12 +278,12 @@ RRDHOST *rrdhost_create(const char *hostname,
     // load health configuration
 
     if(host->health_enabled) {
-        health_alarm_log_load(host);
-        health_alarm_log_open(host);
-
         rrdhost_wrlock(host);
         health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
         rrdhost_unlock(host);
+
+        health_alarm_log_load(host);
+        health_alarm_log_open(host);
     }
 
 
@@ -876,3 +880,43 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch
 
     return res;
 }
+
+/**
+ * Alarm Compare ID
+ *
+ * Callback function used with the binary trees to compare the id of RRDCALC
+ *
+ * @param a a pointer to the first RRDCALC item (only its id is read)
+ * @param b a pointer to the second RRDCALC item (only its id is read)
+ *
+ * @return It returns 0 in case the ids are equal, 1 in case a is bigger than b and -1 in case a is smaller than b.
+ */
+int alarm_compare_id(void *a, void *b) {
+    register uint32_t hash1 = ((RRDCALC *)a)->id; // despite the variable name, this is the alarm id
+    register uint32_t hash2 = ((RRDCALC *)b)->id;
+
+    if(hash1 < hash2) return -1;
+    else if(hash1 > hash2) return 1;
+
+    return 0;
+}
+
+/**
+ * Alarm Compare NAME
+ *
+ * Callback function used with the binary trees to compare the hash and the name of RRDCALC
+ *
+ * @param a a pointer to the first RRDCALC item (its hash and name are read)
+ * @param b a pointer to the second RRDCALC item (its hash and name are read)
+ *
+ * @return It returns -1 or 1 in case the hashes differ; otherwise the strcmp() result for the names (0 when equal).
+ */
+int alarm_compare_name(void *a, void *b) {
+    RRDCALC *in1 = (RRDCALC *)a;
+    RRDCALC *in2 = (RRDCALC *)b;
+
+    if(in1->hash < in2->hash) return -1; // the hash is compared first
+    else if(in1->hash > in2->hash) return 1;
+
+    return strcmp(in1->name,in2->name); // equal hashes: the names decide the order
+}

+ 21 - 1
health/README.md

@@ -11,7 +11,6 @@ packet dropped).
 
 Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.).  
 
-
 Each alarm can execute a single query to the database using statistical algorithms against past data,
 but alarms can be combined. So, if you need 2 queries in the database, you can combine
 2 alarms together (both will run a query to the database, and the results can be combined).
@@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X]
      their matching one) and a delay is in place.
   - All are reset to their defaults when the alarm switches state without a delay in place.
 
+---
+
+#### Alarm line `repeat`
+
+Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration.
+
+Format:
+
+```
+repeat: [off] [warning DURATION] [critical DURATION]
+```
+
+* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings have been enabled in health configuration.
+* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode.
+* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode.
+
+---
+
 #### Alarm line `option`
 
 The only possible value for the `option` line is 
@@ -567,12 +584,15 @@ template: disk_full_percent
    every: 1m
     warn: $this > 80
     crit: $this > 95
+  repeat: warning 120s critical 10s
 ```
 
 `$used` and `$avail`  are the `used` and `avail` chart dimensions as shown on the dashboard.
 
 So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage.
 
+This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode.
+
 ### Example 3
 
 Predict if any disk will run out of space in the near future.

Some files were not shown because too many files changed in this diff