12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212 |
- // SPDX-License-Identifier: GPL-3.0-or-later
- #include "health.h"
- unsigned int default_health_enabled = 1; // starts at 1; health_init() overwrites it with the "enabled" boolean of CONFIG_SECTION_HEALTH
- char *silencers_filename; // path that health_silencers_init() opens for reading; it is not assigned anywhere in this part of the file
- // Doubly linked queue of alarm entries whose notification command has been spawned but not yet waited for.
- static struct {
- ALARM_ENTRY *head; // oldest entry still in the queue
- ALARM_ENTRY *tail; // latest entry; enqueue_alarm_notify_in_progress() appends after it
- } alarm_notifications_in_progress = {NULL, NULL};
- static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
- {
- ae->prev_in_progress = NULL;
- ae->next_in_progress = NULL;
- if (NULL != alarm_notifications_in_progress.tail) {
- ae->prev_in_progress = alarm_notifications_in_progress.tail;
- alarm_notifications_in_progress.tail->next_in_progress = ae;
- }
- if (NULL == alarm_notifications_in_progress.head) {
- alarm_notifications_in_progress.head = ae;
- }
- alarm_notifications_in_progress.tail = ae;
- }
- static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
- {
- struct alarm_entry *prev = ae->prev_in_progress;
- struct alarm_entry *next = ae->next_in_progress;
- if (NULL != prev) {
- prev->next_in_progress = next;
- }
- if (NULL != next) {
- next->prev_in_progress = prev;
- }
- if (ae == alarm_notifications_in_progress.head) {
- alarm_notifications_in_progress.head = next;
- }
- if (ae == alarm_notifications_in_progress.tail) {
- alarm_notifications_in_progress.tail = prev;
- }
- }
- // ----------------------------------------------------------------------------
- // health initialization
- /**
- * User Config directory
- *
- * Get the config directory for health and return it.
- *
- * @return a pointer to the user config directory
- */
- inline char *health_user_config_dir(void) {
- char buffer[FILENAME_MAX + 1];
- snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
- return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
- }
- /**
- * Stock Config Directory
- *
- * Get the Stock config directory and return it.
- *
- * @return a pointer to the stock config directory.
- */
- inline char *health_stock_config_dir(void) {
- char buffer[FILENAME_MAX + 1];
- snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
- return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
- }
- /**
- * Silencers init
- *
- * Function used to initialize the silencer structure.
- */
- static void health_silencers_init(void) {
- FILE *fd = fopen(silencers_filename, "r");
- if (fd) {
- fseek(fd, 0 , SEEK_END);
- off_t length = (off_t) ftell(fd);
- fseek(fd, 0 , SEEK_SET);
- if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
- char *str = mallocz((length+1)* sizeof(char));
- if(str) {
- size_t copied;
- copied = fread(str, sizeof(char), length, fd);
- if (copied == (length* sizeof(char))) {
- str[length] = 0x00;
- json_parse(str, NULL, health_silencers_json_read_callback);
- info("Parsed health silencers file %s", silencers_filename);
- } else {
- error("Cannot read the data from health silencers file %s", silencers_filename);
- }
- freez(str);
- }
- } else {
- error(
- "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
- silencers_filename,
- (int64_t)length,
- HEALTH_SILENCERS_MAX_FILE_LEN);
- }
- fclose(fd);
- } else {
- info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
- }
- }
- /**
- * Health Init
- *
- * Initialize the health thread.
- */
- void health_init(void) {
- debug(D_HEALTH, "Health configuration initializing");
- if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
- debug(D_HEALTH, "Health is disabled.");
- return;
- }
- health_silencers_init();
- }
- // ----------------------------------------------------------------------------
- // re-load health configuration
- /**
- * Reload host
- *
- * Reload configuration for a specific host.
- *
- * @param host the structure of the host that the function will reload the configuration.
- */
- static void health_reload_host(RRDHOST *host) {
- if(unlikely(!host->health_enabled))
- return;
- char *user_path = health_user_config_dir();
- char *stock_path = health_stock_config_dir();
- // free all running alarms
- rrdhost_wrlock(host);
- while(host->templates)
- rrdcalctemplate_unlink_and_free(host, host->templates);
- RRDCALCTEMPLATE *rt,*next;
- for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
- next = rt->next;
- rrdcalctemplate_free(rt);
- }
- host->alarms_template_with_foreach = NULL;
- while(host->alarms)
- rrdcalc_unlink_and_free(host, host->alarms);
- RRDCALC *rc,*nc;
- for(rc = host->alarms_with_foreach; rc ; rc = nc) {
- nc = rc->next;
- rrdcalc_free(rc);
- }
- host->alarms_with_foreach = NULL;
- rrdhost_unlock(host);
- // invalidate all previous entries in the alarm log
- ALARM_ENTRY *t;
- for(t = host->health_log.alarms ; t ; t = t->next) {
- if(t->new_status != RRDCALC_STATUS_REMOVED)
- t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
- }
- rrdhost_rdlock(host);
- // reset all thresholds to all charts
- RRDSET *st;
- rrdset_foreach_read(st, host) {
- st->green = NAN;
- st->red = NAN;
- }
- rrdhost_unlock(host);
- // load the new alarms
- rrdhost_wrlock(host);
- health_readdir(host, user_path, stock_path, NULL);
- //Discard alarms with labels that do not apply to host
- rrdcalc_labels_unlink_alarm_from_host(host);
- // link the loaded alarms to their charts
- RRDDIM *rd;
- rrdset_foreach_write(st, host) {
- if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
- continue;
- rrdsetcalc_link_matching(st);
- rrdcalctemplate_link_matching(st);
- //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
- rrdset_rdlock(st);
- rrddim_foreach_read(rd, st) {
- rrdcalc_link_to_rrddim(rd, st, host);
- }
- rrdset_unlock(st);
- }
- rrdhost_unlock(host);
- }
- /**
- * Reload
- *
- * Reload the host configuration for all hosts.
- */
- void health_reload(void) {
- #ifdef ENABLE_ACLK
- if (netdata_cloud_setting)
- aclk_single_update_disable();
- #endif
- sql_refresh_hashes();
- rrd_rdlock();
- RRDHOST *host;
- rrdhost_foreach_read(host)
- health_reload_host(host);
- rrd_unlock();
- #ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
- aclk_single_update_enable();
- aclk_alarm_reload();
- #ifdef ENABLE_NEW_CLOUD_PROTOCOL
- aclk_alert_reloaded = 1;
- #endif
- }
- #endif
- }
- // ----------------------------------------------------------------------------
- // health main thread and friends
- static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
- if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
- if(n) return RRDCALC_STATUS_RAISED;
- return RRDCALC_STATUS_CLEAR;
- }
- #define ALARM_EXEC_COMMAND_LENGTH 8192
- static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
- ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
- if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
- // do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- goto done;
- }
- if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- // do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- // mark it as run, so that we will send the same alarm if it happens again
- goto done;
- }
- // find the previous notification for the same alarm
- // which we have run the exec script
- // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- uint32_t id = ae->alarm_id;
- ALARM_ENTRY *t;
- for(t = ae->next; t ; t = t->next) {
- if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
- break;
- }
- if(likely(t)) {
- // we have executed this alarm notification in the past
- if(t && t->new_status == ae->new_status) {
- // don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
- , rrdcalc_status2string(ae->new_status));
- goto done;
- }
- }
- else {
- // we have not executed this alarm notification in the past
- // so, don't send CLEAR notifications
- if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
- if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
- debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- goto done;
- }
- }
- }
- }
- // Check if alarm notifications are silenced
- if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- goto done;
- }
- static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
- const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
- const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
- int n_warn=0, n_crit=0;
- RRDCALC *rc;
- EVAL_EXPRESSION *expr=NULL;
- BUFFER *warn_alarms, *crit_alarms;
- warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc ; rc = rc->next) {
- if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
- continue;
- if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_warn)
- buffer_strcat(warn_alarms, ",");
- buffer_strcat(warn_alarms, rc->name);
- buffer_strcat(warn_alarms, "=");
- buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
- n_warn++;
- } else if (ae->alarm_id == rc->id)
- expr = rc->warning;
- } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_crit)
- buffer_strcat(crit_alarms, ",");
- buffer_strcat(crit_alarms, rc->name);
- buffer_strcat(crit_alarms, "=");
- buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
- n_crit++;
- } else if (ae->alarm_id == rc->id)
- expr = rc->critical;
- } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if (ae->alarm_id == rc->id)
- expr = rc->warning;
- }
- }
- char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
- exec,
- recipient,
- host->registry_hostname,
- ae->unique_id,
- ae->alarm_id,
- ae->alarm_event_id,
- (unsigned long)ae->when,
- ae->name,
- ae->chart?ae->chart:"NOCHART",
- ae->family?ae->family:"NOFAMILY",
- rrdcalc_status2string(ae->new_status),
- rrdcalc_status2string(ae->old_status),
- ae->new_value,
- ae->old_value,
- ae->source?ae->source:"UNKNOWN",
- (uint32_t)ae->duration,
- (uint32_t)ae->non_clear_duration,
- ae->units?ae->units:"",
- ae->info?ae->info:"",
- ae->new_value_string,
- ae->old_value_string,
- (expr && expr->source)?expr->source:"NOSOURCE",
- (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
- n_warn,
- n_crit,
- buffer_tostring(warn_alarms),
- buffer_tostring(crit_alarms),
- ae->classification?ae->classification:"Unknown",
- edit_command,
- host != localhost ? host->machine_guid:""
- );
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
- ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
- debug(D_HEALTH, "executing command '%s'", command_to_run);
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
- enqueue_alarm_notify_in_progress(ae);
- freez(edit_command);
- buffer_free(warn_alarms);
- buffer_free(crit_alarms);
- return; //health_alarm_wait_for_execution
- done:
- health_alarm_log_save(host, ae);
- }
- static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
- if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
- return;
- spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
- debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
- ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- if(ae->exec_code != 0)
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
- unlink_alarm_notify_in_progress(ae);
- }
- static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
- ae->chart?ae->chart:"NOCHART", ae->name,
- ae->new_value,
- rrdcalc_status2string(ae->old_status),
- rrdcalc_status2string(ae->new_status)
- );
- health_alarm_execute(host, ae);
- }
- static inline void health_alarm_log_process(RRDHOST *host) {
- uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
- time_t now = now_realtime_sec();
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
- ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!alarm_entry_isrepeating(host, ae))) {
- if(unlikely(
- !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
- )) {
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
- }
- }
- }
- // remember this for the next iteration
- host->health_last_processed_id = first_waiting;
- bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- if (!cleanup_excess_log_entries)
- return;
- // cleanup excess entries in the log
- netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
- ALARM_ENTRY *last = NULL;
- unsigned int count = host->health_log.max * 2 / 3;
- for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
- if(ae && last && last->next == ae)
- last->next = NULL;
- else
- ae = NULL;
- while(ae) {
- debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
- ALARM_ENTRY *t = ae->next;
- if(likely(!alarm_entry_isrepeating(host, ae))) {
- health_alarm_wait_for_execution(ae);
- health_alarm_log_free_one_nochecks_nounlink(ae);
- host->health_log.count--;
- }
- ae = t;
- }
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- }
- static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
- if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
- if(unlikely(rc->next_update > now)) {
- if (unlikely(*next_run > rc->next_update)) {
- // update the next_run time of the main loop
- // to run this alarm precisely the time required
- *next_run = rc->next_update;
- }
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
- return 0;
- }
- if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
- if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
- if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
- if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
- int update_every = rc->rrdset->update_every;
- rrdset_rdlock(rc->rrdset);
- time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
- time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
- rrdset_unlock(rc->rrdset);
- if(unlikely(now + update_every < first /* || now - update_every > last */)) {
- debug(D_HEALTH
- , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
- , (unsigned long) last);
- return 0;
- }
- if(RRDCALC_HAS_DB_LOOKUP(rc)) {
- time_t needed = now + rc->before + rc->after;
- if(needed + update_every < first || needed - update_every > last) {
- debug(D_HEALTH
- , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
- , (unsigned long) last);
- return 0;
- }
- }
- return 1;
- }
- static inline int check_if_resumed_from_suspension(void) {
- static usec_t last_realtime = 0, last_monotonic = 0;
- usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
- int ret = 0;
- // detect if monotonic and realtime have twice the difference
- // in which case we assume the system was just waken from hibernation
- if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
- ret = 1;
- last_realtime = realtime;
- last_monotonic = monotonic;
- return ret;
- }
- static void health_main_cleanup(void *ptr) {
- worker_unregister();
- struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
- info("cleaning up...");
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- }
- static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
- SILENCER *s;
- debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
- rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
- for (s = silencers->silencers; s!=NULL; s=s->next){
- if (
- (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
- (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
- (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
- (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
- (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
- ) {
- debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
- if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
- } else {
- debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
- , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
- , rc->name
- , (rc->rrdset)?rc->rrdset->context:""
- , rc->chart
- , host
- , (rc->rrdset)?rc->rrdset->family:""
- );
- }
- return silencers->stype;
- }
- }
- return STYPE_NONE;
- }
- /**
- * Update Disabled Silenced
- *
- * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
- *
- * @param host structure that contains information about the host monitored.
- * @param rc structure with information about the alarm
- *
- * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
- */
- static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
- uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
- // Clear the flags
- rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
- if (unlikely(silencers->all_alarms)) {
- if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
- } else {
- SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
- if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
- }
- if (rrdcalc_flags_old != rc->rrdcalc_flags) {
- info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
- host->hostname,
- rc->name,
- (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
- );
- }
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
- return 1;
- else
- return 0;
- }
- // Create alarms for dimensions that have been added to charts
- // since the previous iteration.
- static void init_pending_foreach_alarms(RRDHOST *host) {
- RRDSET *st;
- RRDDIM *rd;
- if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
- return;
- rrdhost_wrlock(host);
- rrdset_foreach_write(st, host) {
- if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
- continue;
- rrdset_rdlock(st);
- rrddim_foreach_read(rd, st) {
- if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
- continue;
- rrdcalc_link_to_rrddim(rd, st, host);
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- rrdset_unlock(st);
- }
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
- rrdhost_unlock(host);
- }
- /**
- * Health Main
- *
- * The main thread of the health system. In this function all the alarms will be processed.
- *
- * @param ptr is a pointer to the netdata_static_thread structure.
- *
- * @return It always returns NULL
- */
- #define WORKER_HEALTH_JOB_RRD_LOCK 0
- #define WORKER_HEALTH_JOB_HOST_LOCK 1
- #define WORKER_HEALTH_JOB_DB_QUERY 2
- #define WORKER_HEALTH_JOB_CALC_EVAL 3
- #define WORKER_HEALTH_JOB_WARNING_EVAL 4
- #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
- #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
- #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
- #if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
- #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
- #endif
- void *health_main(void *ptr) {
- worker_register("HEALTH");
- worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
- worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
- worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
- worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
- worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
- worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
- worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
- worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
- netdata_thread_cleanup_push(health_main_cleanup, ptr);
- int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
- if(min_run_every < 1) min_run_every = 1;
- int cleanup_sql_every_loop = 7200 / min_run_every;
- time_t now = now_realtime_sec();
- time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
- rrdcalc_labels_unlink();
- unsigned int loop = 0;
- #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
- unsigned int marked_aclk_reload_loop = 0;
- #endif
- while(!netdata_exit) {
- loop++;
- debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
- int runnable = 0, apply_hibernation_delay = 0;
- time_t next_run = now + min_run_every;
- RRDCALC *rc;
- if (unlikely(check_if_resumed_from_suspension())) {
- apply_hibernation_delay = 1;
- info(
- "Postponing alarm checks for %"PRId64" seconds, "
- "because it seems that the system was just resumed from suspension.",
- (int64_t)hibernation_delay);
- }
- if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
- static int logged=0;
- if (!logged) {
- info("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
- logged = 1;
- }
- }
- #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
- if (aclk_alert_reloaded && !marked_aclk_reload_loop)
- marked_aclk_reload_loop = loop;
- #endif
- worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
- rrd_rdlock();
- RRDHOST *host;
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
- continue;
- if (unlikely(apply_hibernation_delay)) {
- info(
- "Postponing health checks for %"PRId64" seconds, on host '%s'.",
- (int64_t)hibernation_delay,
- host->hostname);
- host->health_delay_up_to = now + hibernation_delay;
- }
- if (unlikely(host->health_delay_up_to)) {
- if (unlikely(now < host->health_delay_up_to))
- continue;
- info("Resuming health checks on host '%s'.", host->hostname);
- host->health_delay_up_to = 0;
- }
- if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
- sql_health_alarm_log_cleanup(host);
- init_pending_foreach_alarms(host);
- worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- rrdhost_rdlock(host);
- // the first loop is to lookup values from the db
- for (rc = host->alarms; rc; rc = rc->next) {
- if (update_disabled_silenced(host, rc))
- continue;
- // create an alert removed event if the chart is obsolete and
- // has stopped being collected for 60 seconds
- if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
- rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
- now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
- if (!rrdcalc_isrepeating(rc)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- time_t now = now_realtime_sec();
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
- if (ae) {
- health_alarm_log(host, ae);
- rc->old_status = rc->status;
- rc->status = RRDCALC_STATUS_REMOVED;
- rc->last_status_change = now;
- rc->last_updated = now;
- rc->value = NAN;
- #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
- if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
- sql_queue_alarm_to_aclk(host, ae, 1);
- #endif
- }
- }
- }
- if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
- if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
- continue;
- }
- runnable++;
- rc->old_value = rc->value;
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
- // ------------------------------------------------------------
- // if there is database lookup, do it
- if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
- worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
- /* time_t old_db_timestamp = rc->db_before; */
- int value_is_null = 0;
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
- rc->before, rc->group, 0, rc->options, &rc->db_after,
- &rc->db_before, &value_is_null, 0
- );
- if (unlikely(ret != 200)) {
- // database lookup failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
- /* - RRDCALC_FLAG_DB_STALE not currently used
- if (unlikely(old_db_timestamp == rc->db_before)) {
- // database is stale
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
- error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
- }
- }
- else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
- */
- if (unlikely(value_is_null)) {
- // collected value is null
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
- CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->value
- );
- }
- // ------------------------------------------------------------
- // if there is calculation expression, run it
- if (unlikely(rc->calculation)) {
- worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
- if (unlikely(!expression_evaluate(rc->calculation))) {
- // calculation failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
- CALCULATED_NUMBER_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, rc->calculation->result,
- buffer_tostring(rc->calculation->error_msg), rc->source
- );
- rc->value = rc->calculation->result;
- if (rc->local) rc->local->last_updated = now;
- if (rc->family) rc->family->last_updated = now;
- if (rc->hostid) rc->hostid->last_updated = now;
- if (rc->hostname) rc->hostname->last_updated = now;
- }
- }
- }
- rrdhost_unlock(host);
- if (unlikely(runnable && !netdata_exit)) {
- rrdhost_rdlock(host);
- for (rc = host->alarms; rc; rc = rc->next) {
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
- continue;
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
- continue;
- }
- RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
- RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
- // --------------------------------------------------------
- // check the warning expression
- if (likely(rc->warning)) {
- worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
- if (unlikely(!expression_evaluate(rc->warning))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->warning->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
- CALCULATED_NUMBER_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
- );
- warning_status = rrdcalc_value2status(rc->warning->result);
- }
- }
- // --------------------------------------------------------
- // check the critical expression
- if (likely(rc->critical)) {
- worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
- if (unlikely(!expression_evaluate(rc->critical))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->critical->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
- CALCULATED_NUMBER_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
- rc->source
- );
- critical_status = rrdcalc_value2status(rc->critical->result);
- }
- }
- // --------------------------------------------------------
- // decide the final alarm status
- RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
- switch (warning_status) {
- case RRDCALC_STATUS_CLEAR:
- status = RRDCALC_STATUS_CLEAR;
- break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_WARNING;
- break;
- default:
- break;
- }
- switch (critical_status) {
- case RRDCALC_STATUS_CLEAR:
- if (status == RRDCALC_STATUS_UNDEFINED)
- status = RRDCALC_STATUS_CLEAR;
- break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_CRITICAL;
- break;
- default:
- break;
- }
- // --------------------------------------------------------
- // check if the new status and the old differ
- if (status != rc->status) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- int delay = 0;
- // apply trigger hysteresis
- if (now > rc->delay_up_to_timestamp) {
- rc->delay_up_current = rc->delay_up_duration;
- rc->delay_down_current = rc->delay_down_duration;
- rc->delay_last = 0;
- rc->delay_up_to_timestamp = 0;
- } else {
- rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
- if (rc->delay_up_current > rc->delay_max_duration)
- rc->delay_up_current = rc->delay_max_duration;
- rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
- if (rc->delay_down_current > rc->delay_max_duration)
- rc->delay_down_current = rc->delay_max_duration;
- }
- if (status > rc->status)
- delay = rc->delay_up_current;
- else
- delay = rc->delay_down_current;
- // COMMENTED: because we do need to send raising alarms
- // if(now + delay < rc->delay_up_to_timestamp)
- // delay = (int)(rc->delay_up_to_timestamp - now);
- rc->delay_last = delay;
- rc->delay_up_to_timestamp = now + delay;
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- health_alarm_log(host, ae);
- rc->last_status_change = now;
- rc->old_status = rc->status;
- rc->status = status;
- }
- rc->last_updated = now;
- rc->next_update = now + rc->update_every;
- if (next_run > rc->next_update)
- next_run = rc->next_update;
- }
- // process repeating alarms
- RRDCALC *rc;
- for(rc = host->alarms; rc ; rc = rc->next) {
- int repeat_every = 0;
- if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->warn_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->crit_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
- if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
- repeat_every = 1;
- } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
- repeat_every = 1;
- }
- }
- }
- } else {
- continue;
- }
- if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- rc->last_repeat = now;
- if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- ae->last_repeat = rc->last_repeat;
- if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
- ae->flags |= HEALTH_ENTRY_RUN_ONCE;
- }
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
- health_process_notifications(host, ae);
- debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
- health_alarm_wait_for_execution(ae);
- health_alarm_log_free_one_nochecks_nounlink(ae);
- }
- }
- rrdhost_unlock(host);
- }
- if (unlikely(netdata_exit))
- break;
- // execute notifications
- // and cleanup
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
- health_alarm_log_process(host);
- if (unlikely(netdata_exit)) {
- // wait for all notifications to finish before allowing health to be cleaned up
- ALARM_ENTRY *ae;
- while (NULL != (ae = alarm_notifications_in_progress.head)) {
- health_alarm_wait_for_execution(ae);
- }
- break;
- }
- } /* rrdhost_foreach */
- // wait for all notifications to finish before allowing health to be cleaned up
- ALARM_ENTRY *ae;
- while (NULL != (ae = alarm_notifications_in_progress.head)) {
- health_alarm_wait_for_execution(ae);
- }
- #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
- if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
- continue;
- sql_queue_removed_alerts_to_aclk(host);
- }
- aclk_alert_reloaded = 0;
- marked_aclk_reload_loop = 0;
- }
- #endif
- rrd_unlock();
- if(unlikely(netdata_exit))
- break;
- now = now_realtime_sec();
- if(now < next_run) {
- worker_is_idle();
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
- sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
- now = now_realtime_sec();
- }
- else
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
- } // forever
- netdata_thread_cleanup_pop(1);
- return NULL;
- }
|