health.c 47 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. unsigned int default_health_enabled = 1;
  4. char *silencers_filename;
  5. // the queue of executed alarm notifications that haven't been waited for yet
  6. static struct {
  7. ALARM_ENTRY *head; // oldest
  8. ALARM_ENTRY *tail; // latest
  9. } alarm_notifications_in_progress = {NULL, NULL};
  10. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  11. {
  12. ae->prev_in_progress = NULL;
  13. ae->next_in_progress = NULL;
  14. if (NULL != alarm_notifications_in_progress.tail) {
  15. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  16. alarm_notifications_in_progress.tail->next_in_progress = ae;
  17. }
  18. if (NULL == alarm_notifications_in_progress.head) {
  19. alarm_notifications_in_progress.head = ae;
  20. }
  21. alarm_notifications_in_progress.tail = ae;
  22. }
  23. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  24. {
  25. struct alarm_entry *prev = ae->prev_in_progress;
  26. struct alarm_entry *next = ae->next_in_progress;
  27. if (NULL != prev) {
  28. prev->next_in_progress = next;
  29. }
  30. if (NULL != next) {
  31. next->prev_in_progress = prev;
  32. }
  33. if (ae == alarm_notifications_in_progress.head) {
  34. alarm_notifications_in_progress.head = next;
  35. }
  36. if (ae == alarm_notifications_in_progress.tail) {
  37. alarm_notifications_in_progress.tail = prev;
  38. }
  39. }
  40. // ----------------------------------------------------------------------------
  41. // health initialization
  42. /**
  43. * User Config directory
  44. *
  45. * Get the config directory for health and return it.
  46. *
  47. * @return a pointer to the user config directory
  48. */
  49. inline char *health_user_config_dir(void) {
  50. char buffer[FILENAME_MAX + 1];
  51. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  52. return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
  53. }
  54. /**
  55. * Stock Config Directory
  56. *
  57. * Get the Stock config directory and return it.
  58. *
  59. * @return a pointer to the stock config directory.
  60. */
  61. inline char *health_stock_config_dir(void) {
  62. char buffer[FILENAME_MAX + 1];
  63. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  64. return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer);
  65. }
  66. /**
  67. * Silencers init
  68. *
  69. * Function used to initialize the silencer structure.
  70. */
  71. static void health_silencers_init(void) {
  72. FILE *fd = fopen(silencers_filename, "r");
  73. if (fd) {
  74. fseek(fd, 0 , SEEK_END);
  75. off_t length = (off_t) ftell(fd);
  76. fseek(fd, 0 , SEEK_SET);
  77. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  78. char *str = mallocz((length+1)* sizeof(char));
  79. if(str) {
  80. size_t copied;
  81. copied = fread(str, sizeof(char), length, fd);
  82. if (copied == (length* sizeof(char))) {
  83. str[length] = 0x00;
  84. json_parse(str, NULL, health_silencers_json_read_callback);
  85. info("Parsed health silencers file %s", silencers_filename);
  86. } else {
  87. error("Cannot read the data from health silencers file %s", silencers_filename);
  88. }
  89. freez(str);
  90. }
  91. } else {
  92. error(
  93. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  94. silencers_filename,
  95. (int64_t)length,
  96. HEALTH_SILENCERS_MAX_FILE_LEN);
  97. }
  98. fclose(fd);
  99. } else {
  100. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  101. }
  102. }
  103. /**
  104. * Health Init
  105. *
  106. * Initialize the health thread.
  107. */
  108. void health_init(void) {
  109. debug(D_HEALTH, "Health configuration initializing");
  110. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  111. debug(D_HEALTH, "Health is disabled.");
  112. return;
  113. }
  114. health_silencers_init();
  115. }
  116. // ----------------------------------------------------------------------------
  117. // re-load health configuration
  118. /**
  119. * Reload host
  120. *
  121. * Reload configuration for a specific host.
  122. *
  123. * @param host the structure of the host that the function will reload the configuration.
  124. */
  125. static void health_reload_host(RRDHOST *host) {
  126. if(unlikely(!host->health_enabled))
  127. return;
  128. char *user_path = health_user_config_dir();
  129. char *stock_path = health_stock_config_dir();
  130. // free all running alarms
  131. rrdhost_wrlock(host);
  132. while(host->templates)
  133. rrdcalctemplate_unlink_and_free(host, host->templates);
  134. RRDCALCTEMPLATE *rt,*next;
  135. for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
  136. next = rt->next;
  137. rrdcalctemplate_free(rt);
  138. }
  139. host->alarms_template_with_foreach = NULL;
  140. while(host->alarms)
  141. rrdcalc_unlink_and_free(host, host->alarms);
  142. RRDCALC *rc,*nc;
  143. for(rc = host->alarms_with_foreach; rc ; rc = nc) {
  144. nc = rc->next;
  145. rrdcalc_free(rc);
  146. }
  147. host->alarms_with_foreach = NULL;
  148. rrdhost_unlock(host);
  149. // invalidate all previous entries in the alarm log
  150. ALARM_ENTRY *t;
  151. for(t = host->health_log.alarms ; t ; t = t->next) {
  152. if(t->new_status != RRDCALC_STATUS_REMOVED)
  153. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  154. }
  155. rrdhost_rdlock(host);
  156. // reset all thresholds to all charts
  157. RRDSET *st;
  158. rrdset_foreach_read(st, host) {
  159. st->green = NAN;
  160. st->red = NAN;
  161. }
  162. rrdhost_unlock(host);
  163. // load the new alarms
  164. rrdhost_wrlock(host);
  165. health_readdir(host, user_path, stock_path, NULL);
  166. //Discard alarms with labels that do not apply to host
  167. rrdcalc_labels_unlink_alarm_from_host(host);
  168. // link the loaded alarms to their charts
  169. RRDDIM *rd;
  170. rrdset_foreach_write(st, host) {
  171. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  172. continue;
  173. rrdsetcalc_link_matching(st);
  174. rrdcalctemplate_link_matching(st);
  175. //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
  176. rrdset_rdlock(st);
  177. rrddim_foreach_read(rd, st) {
  178. rrdcalc_link_to_rrddim(rd, st, host);
  179. }
  180. rrdset_unlock(st);
  181. }
  182. rrdhost_unlock(host);
  183. }
  184. /**
  185. * Reload
  186. *
  187. * Reload the host configuration for all hosts.
  188. */
  189. void health_reload(void) {
  190. #ifdef ENABLE_ACLK
  191. if (netdata_cloud_setting)
  192. aclk_single_update_disable();
  193. #endif
  194. sql_refresh_hashes();
  195. rrd_rdlock();
  196. RRDHOST *host;
  197. rrdhost_foreach_read(host)
  198. health_reload_host(host);
  199. rrd_unlock();
  200. #ifdef ENABLE_ACLK
  201. if (netdata_cloud_setting) {
  202. aclk_single_update_enable();
  203. aclk_alarm_reload();
  204. #ifdef ENABLE_NEW_CLOUD_PROTOCOL
  205. aclk_alert_reloaded = 1;
  206. #endif
  207. }
  208. #endif
  209. }
  210. // ----------------------------------------------------------------------------
  211. // health main thread and friends
  212. static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
  213. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  214. if(n) return RRDCALC_STATUS_RAISED;
  215. return RRDCALC_STATUS_CLEAR;
  216. }
  217. #define ALARM_EXEC_COMMAND_LENGTH 8192
  218. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  219. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  220. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  221. // do not send notifications for internal statuses
  222. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  223. goto done;
  224. }
  225. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  226. // do not send notifications for disabled statuses
  227. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  228. // mark it as run, so that we will send the same alarm if it happens again
  229. goto done;
  230. }
  231. // find the previous notification for the same alarm
  232. // which we have run the exec script
  233. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  234. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  235. uint32_t id = ae->alarm_id;
  236. ALARM_ENTRY *t;
  237. for(t = ae->next; t ; t = t->next) {
  238. if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  239. break;
  240. }
  241. if(likely(t)) {
  242. // we have executed this alarm notification in the past
  243. if(t && t->new_status == ae->new_status) {
  244. // don't send the notification for the same status again
  245. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
  246. , rrdcalc_status2string(ae->new_status));
  247. goto done;
  248. }
  249. }
  250. else {
  251. // we have not executed this alarm notification in the past
  252. // so, don't send CLEAR notifications
  253. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  254. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  255. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  256. , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  257. goto done;
  258. }
  259. }
  260. }
  261. }
  262. // Check if alarm notifications are silenced
  263. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  264. info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  265. goto done;
  266. }
  267. static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
  268. const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
  269. const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
  270. int n_warn=0, n_crit=0;
  271. RRDCALC *rc;
  272. EVAL_EXPRESSION *expr=NULL;
  273. BUFFER *warn_alarms, *crit_alarms;
  274. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  275. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  276. for(rc = host->alarms; rc ; rc = rc->next) {
  277. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  278. continue;
  279. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  280. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  281. if (n_warn)
  282. buffer_strcat(warn_alarms, ",");
  283. buffer_strcat(warn_alarms, rc->name);
  284. buffer_strcat(warn_alarms, "=");
  285. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  286. n_warn++;
  287. } else if (ae->alarm_id == rc->id)
  288. expr = rc->warning;
  289. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  290. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  291. if (n_crit)
  292. buffer_strcat(crit_alarms, ",");
  293. buffer_strcat(crit_alarms, rc->name);
  294. buffer_strcat(crit_alarms, "=");
  295. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  296. n_crit++;
  297. } else if (ae->alarm_id == rc->id)
  298. expr = rc->critical;
  299. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  300. if (ae->alarm_id == rc->id)
  301. expr = rc->warning;
  302. }
  303. }
  304. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  305. snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s'",
  306. exec,
  307. recipient,
  308. host->registry_hostname,
  309. ae->unique_id,
  310. ae->alarm_id,
  311. ae->alarm_event_id,
  312. (unsigned long)ae->when,
  313. ae->name,
  314. ae->chart?ae->chart:"NOCHART",
  315. ae->family?ae->family:"NOFAMILY",
  316. rrdcalc_status2string(ae->new_status),
  317. rrdcalc_status2string(ae->old_status),
  318. ae->new_value,
  319. ae->old_value,
  320. ae->source?ae->source:"UNKNOWN",
  321. (uint32_t)ae->duration,
  322. (uint32_t)ae->non_clear_duration,
  323. ae->units?ae->units:"",
  324. ae->info?ae->info:"",
  325. ae->new_value_string,
  326. ae->old_value_string,
  327. (expr && expr->source)?expr->source:"NOSOURCE",
  328. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  329. n_warn,
  330. n_crit,
  331. buffer_tostring(warn_alarms),
  332. buffer_tostring(crit_alarms),
  333. ae->classification?ae->classification:"Unknown",
  334. edit_command
  335. );
  336. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  337. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  338. debug(D_HEALTH, "executing command '%s'", command_to_run);
  339. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  340. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  341. enqueue_alarm_notify_in_progress(ae);
  342. freez(edit_command);
  343. buffer_free(warn_alarms);
  344. buffer_free(crit_alarms);
  345. return; //health_alarm_wait_for_execution
  346. done:
  347. health_alarm_log_save(host, ae);
  348. }
  349. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  350. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  351. return;
  352. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  353. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  354. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  355. if(ae->exec_code != 0)
  356. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  357. unlink_alarm_notify_in_progress(ae);
  358. }
  359. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  360. debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
  361. ae->chart?ae->chart:"NOCHART", ae->name,
  362. ae->new_value,
  363. rrdcalc_status2string(ae->old_status),
  364. rrdcalc_status2string(ae->new_status)
  365. );
  366. health_alarm_execute(host, ae);
  367. }
  368. static inline void health_alarm_log_process(RRDHOST *host) {
  369. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  370. time_t now = now_realtime_sec();
  371. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  372. ALARM_ENTRY *ae;
  373. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  374. if(likely(!alarm_entry_isrepeating(host, ae))) {
  375. if(unlikely(
  376. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  377. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  378. )) {
  379. if(unlikely(ae->unique_id < first_waiting))
  380. first_waiting = ae->unique_id;
  381. if(likely(now >= ae->delay_up_to_timestamp))
  382. health_process_notifications(host, ae);
  383. }
  384. }
  385. }
  386. // remember this for the next iteration
  387. host->health_last_processed_id = first_waiting;
  388. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  389. if(host->health_log.count <= host->health_log.max)
  390. return;
  391. // cleanup excess entries in the log
  392. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  393. ALARM_ENTRY *last = NULL;
  394. unsigned int count = host->health_log.max * 2 / 3;
  395. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  396. if(ae && last && last->next == ae)
  397. last->next = NULL;
  398. else
  399. ae = NULL;
  400. while(ae) {
  401. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  402. ALARM_ENTRY *t = ae->next;
  403. if(likely(!alarm_entry_isrepeating(host, ae))) {
  404. health_alarm_wait_for_execution(ae);
  405. health_alarm_log_free_one_nochecks_nounlink(ae);
  406. host->health_log.count--;
  407. }
  408. ae = t;
  409. }
  410. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  411. }
  412. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  413. if(unlikely(!rc->rrdset)) {
  414. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
  415. return 0;
  416. }
  417. if(unlikely(rc->next_update > now)) {
  418. if (unlikely(*next_run > rc->next_update)) {
  419. // update the next_run time of the main loop
  420. // to run this alarm precisely the time required
  421. *next_run = rc->next_update;
  422. }
  423. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
  424. return 0;
  425. }
  426. if(unlikely(!rc->update_every)) {
  427. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
  428. return 0;
  429. }
  430. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  431. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
  432. return 0;
  433. }
  434. if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
  435. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
  436. return 0;
  437. }
  438. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  439. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
  440. return 0;
  441. }
  442. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  443. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
  444. return 0;
  445. }
  446. int update_every = rc->rrdset->update_every;
  447. rrdset_rdlock(rc->rrdset);
  448. time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
  449. time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
  450. rrdset_unlock(rc->rrdset);
  451. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  452. debug(D_HEALTH
  453. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  454. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
  455. , (unsigned long) last);
  456. return 0;
  457. }
  458. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  459. time_t needed = now + rc->before + rc->after;
  460. if(needed + update_every < first || needed - update_every > last) {
  461. debug(D_HEALTH
  462. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  463. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
  464. , (unsigned long) last);
  465. return 0;
  466. }
  467. }
  468. return 1;
  469. }
  470. static inline int check_if_resumed_from_suspension(void) {
  471. static usec_t last_realtime = 0, last_monotonic = 0;
  472. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  473. int ret = 0;
  474. // detect if monotonic and realtime have twice the difference
  475. // in which case we assume the system was just waken from hibernation
  476. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  477. ret = 1;
  478. last_realtime = realtime;
  479. last_monotonic = monotonic;
  480. return ret;
  481. }
  482. static void health_main_cleanup(void *ptr) {
  483. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  484. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  485. info("cleaning up...");
  486. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  487. }
  488. static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
  489. SILENCER *s;
  490. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  491. rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
  492. for (s = silencers->silencers; s!=NULL; s=s->next){
  493. if (
  494. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
  495. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
  496. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  497. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
  498. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
  499. ) {
  500. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  501. if (unlikely(silencers->stype == STYPE_NONE)) {
  502. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
  503. } else {
  504. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  505. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  506. , rc->name
  507. , (rc->rrdset)?rc->rrdset->context:""
  508. , rc->chart
  509. , host
  510. , (rc->rrdset)?rc->rrdset->family:""
  511. );
  512. }
  513. return silencers->stype;
  514. }
  515. }
  516. return STYPE_NONE;
  517. }
  518. /**
  519. * Update Disabled Silenced
  520. *
  521. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  522. *
  523. * @param host structure that contains information about the host monitored.
  524. * @param rc structure with information about the alarm
  525. *
  526. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  527. */
  528. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  529. uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
  530. // Clear the flags
  531. rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  532. if (unlikely(silencers->all_alarms)) {
  533. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  534. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  535. } else {
  536. SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
  537. if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  538. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  539. }
  540. if (rrdcalc_flags_old != rc->rrdcalc_flags) {
  541. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  542. host->hostname,
  543. rc->name,
  544. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  545. (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  546. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  547. (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  548. );
  549. }
  550. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
  551. return 1;
  552. else
  553. return 0;
  554. }
  555. // Create alarms for dimensions that have been added to charts
  556. // since the previous iteration.
  557. static void init_pending_foreach_alarms(RRDHOST *host) {
  558. rrdhost_wrlock(host);
  559. if (host->alarms_with_foreach || host->alarms_template_with_foreach) {
  560. if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) {
  561. RRDSET *st;
  562. rrdset_foreach_read(st, host) {
  563. rrdset_wrlock(st);
  564. if (rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) {
  565. RRDDIM *rd;
  566. rrddim_foreach_write(rd, st) {
  567. if (rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) {
  568. rrdcalc_link_to_rrddim(rd, st, host);
  569. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
  570. }
  571. }
  572. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
  573. }
  574. rrdset_unlock(st);
  575. }
  576. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
  577. }
  578. }
  579. rrdhost_unlock(host);
  580. }
  581. /**
  582. * Health Main
  583. *
  584. * The main thread of the health system. In this function all the alarms will be processed.
  585. *
  586. * @param ptr is a pointer to the netdata_static_thread structure.
  587. *
  588. * @return It always returns NULL
  589. */
  590. void *health_main(void *ptr) {
  591. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  592. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  593. if(min_run_every < 1) min_run_every = 1;
  594. int cleanup_sql_every_loop = 7200 / min_run_every;
  595. time_t now = now_realtime_sec();
  596. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  597. rrdcalc_labels_unlink();
  598. unsigned int loop = 0;
  599. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  600. unsigned int marked_aclk_reload_loop = 0;
  601. #endif
  602. while(!netdata_exit) {
  603. loop++;
  604. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  605. int runnable = 0, apply_hibernation_delay = 0;
  606. time_t next_run = now + min_run_every;
  607. RRDCALC *rc;
  608. if (unlikely(check_if_resumed_from_suspension())) {
  609. apply_hibernation_delay = 1;
  610. info(
  611. "Postponing alarm checks for %"PRId64" seconds, "
  612. "because it seems that the system was just resumed from suspension.",
  613. (int64_t)hibernation_delay);
  614. }
  615. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  616. static int logged=0;
  617. if (!logged) {
  618. info("Skipping health checks, because all alarms are disabled via a %s command.",
  619. HEALTH_CMDAPI_CMD_DISABLEALL);
  620. logged = 1;
  621. }
  622. }
  623. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  624. if (aclk_alert_reloaded && !marked_aclk_reload_loop)
  625. marked_aclk_reload_loop = loop;
  626. #endif
  627. rrd_rdlock();
  628. RRDHOST *host;
  629. rrdhost_foreach_read(host) {
  630. if (unlikely(!host->health_enabled))
  631. continue;
  632. if (unlikely(apply_hibernation_delay)) {
  633. info(
  634. "Postponing health checks for %"PRId64" seconds, on host '%s'.",
  635. (int64_t)hibernation_delay,
  636. host->hostname);
  637. host->health_delay_up_to = now + hibernation_delay;
  638. }
  639. if (unlikely(host->health_delay_up_to)) {
  640. if (unlikely(now < host->health_delay_up_to))
  641. continue;
  642. info("Resuming health checks on host '%s'.", host->hostname);
  643. host->health_delay_up_to = 0;
  644. }
  645. if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
  646. sql_health_alarm_log_cleanup(host);
  647. init_pending_foreach_alarms(host);
  648. rrdhost_rdlock(host);
  649. // the first loop is to lookup values from the db
  650. for (rc = host->alarms; rc; rc = rc->next) {
  651. if (update_disabled_silenced(host, rc))
  652. continue;
  653. // create an alert removed event if the chart is obsolete and
  654. // has stopped being collected for 60 seconds
  655. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  656. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  657. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  658. if (!rrdcalc_isrepeating(rc)) {
  659. time_t now = now_realtime_sec();
  660. ALARM_ENTRY *ae = health_create_alarm_entry(
  661. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  662. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  663. rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
  664. if (ae) {
  665. health_alarm_log(host, ae);
  666. rc->old_status = rc->status;
  667. rc->status = RRDCALC_STATUS_REMOVED;
  668. rc->last_status_change = now;
  669. rc->last_updated = now;
  670. rc->value = NAN;
  671. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  672. if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
  673. sql_queue_removed_alerts_to_aclk(host);
  674. #endif
  675. }
  676. }
  677. continue;
  678. }
  679. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  680. if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
  681. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
  682. continue;
  683. }
  684. runnable++;
  685. rc->old_value = rc->value;
  686. rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
  687. // ------------------------------------------------------------
  688. // if there is database lookup, do it
  689. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  690. /* time_t old_db_timestamp = rc->db_before; */
  691. int value_is_null = 0;
  692. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
  693. rc->before, rc->group, 0, rc->options, &rc->db_after,
  694. &rc->db_before, &value_is_null, 0
  695. );
  696. if (unlikely(ret != 200)) {
  697. // database lookup failed
  698. rc->value = NAN;
  699. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
  700. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  701. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
  702. );
  703. } else
  704. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
  705. /* - RRDCALC_FLAG_DB_STALE not currently used
  706. if (unlikely(old_db_timestamp == rc->db_before)) {
  707. // database is stale
  708. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  709. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
  710. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
  711. error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  712. }
  713. }
  714. else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
  715. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
  716. */
  717. if (unlikely(value_is_null)) {
  718. // collected value is null
  719. rc->value = NAN;
  720. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
  721. debug(D_HEALTH,
  722. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  723. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
  724. );
  725. } else
  726. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
  727. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
  728. CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  729. rc->value
  730. );
  731. }
  732. // ------------------------------------------------------------
  733. // if there is calculation expression, run it
  734. if (unlikely(rc->calculation)) {
  735. if (unlikely(!expression_evaluate(rc->calculation))) {
  736. // calculation failed
  737. rc->value = NAN;
  738. rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
  739. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  740. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  741. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  742. );
  743. } else {
  744. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  745. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  746. CALCULATED_NUMBER_FORMAT
  747. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  748. rc->calculation->parsed_as, rc->calculation->result,
  749. buffer_tostring(rc->calculation->error_msg), rc->source
  750. );
  751. rc->value = rc->calculation->result;
  752. if (rc->local) rc->local->last_updated = now;
  753. if (rc->family) rc->family->last_updated = now;
  754. if (rc->hostid) rc->hostid->last_updated = now;
  755. if (rc->hostname) rc->hostname->last_updated = now;
  756. }
  757. }
  758. }
  759. rrdhost_unlock(host);
  760. if (unlikely(runnable && !netdata_exit)) {
  761. rrdhost_rdlock(host);
  762. for (rc = host->alarms; rc; rc = rc->next) {
  763. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
  764. continue;
  765. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
  766. continue;
  767. }
  768. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  769. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  770. // --------------------------------------------------------
  771. // check the warning expression
  772. if (likely(rc->warning)) {
  773. if (unlikely(!expression_evaluate(rc->warning))) {
  774. // calculation failed
  775. rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
  776. debug(D_HEALTH,
  777. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  778. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  779. buffer_tostring(rc->warning->error_msg)
  780. );
  781. } else {
  782. rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  783. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  784. CALCULATED_NUMBER_FORMAT
  785. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  786. rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
  787. );
  788. warning_status = rrdcalc_value2status(rc->warning->result);
  789. }
  790. }
  791. // --------------------------------------------------------
  792. // check the critical expression
  793. if (likely(rc->critical)) {
  794. if (unlikely(!expression_evaluate(rc->critical))) {
  795. // calculation failed
  796. rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
  797. debug(D_HEALTH,
  798. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  799. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  800. buffer_tostring(rc->critical->error_msg)
  801. );
  802. } else {
  803. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  804. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  805. CALCULATED_NUMBER_FORMAT
  806. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  807. rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
  808. rc->source
  809. );
  810. critical_status = rrdcalc_value2status(rc->critical->result);
  811. }
  812. }
  813. // --------------------------------------------------------
  814. // decide the final alarm status
  815. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  816. switch (warning_status) {
  817. case RRDCALC_STATUS_CLEAR:
  818. status = RRDCALC_STATUS_CLEAR;
  819. break;
  820. case RRDCALC_STATUS_RAISED:
  821. status = RRDCALC_STATUS_WARNING;
  822. break;
  823. default:
  824. break;
  825. }
  826. switch (critical_status) {
  827. case RRDCALC_STATUS_CLEAR:
  828. if (status == RRDCALC_STATUS_UNDEFINED)
  829. status = RRDCALC_STATUS_CLEAR;
  830. break;
  831. case RRDCALC_STATUS_RAISED:
  832. status = RRDCALC_STATUS_CRITICAL;
  833. break;
  834. default:
  835. break;
  836. }
  837. // --------------------------------------------------------
  838. // check if the new status and the old differ
  839. if (status != rc->status) {
  840. int delay = 0;
  841. // apply trigger hysteresis
  842. if (now > rc->delay_up_to_timestamp) {
  843. rc->delay_up_current = rc->delay_up_duration;
  844. rc->delay_down_current = rc->delay_down_duration;
  845. rc->delay_last = 0;
  846. rc->delay_up_to_timestamp = 0;
  847. } else {
  848. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  849. if (rc->delay_up_current > rc->delay_max_duration)
  850. rc->delay_up_current = rc->delay_max_duration;
  851. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  852. if (rc->delay_down_current > rc->delay_max_duration)
  853. rc->delay_down_current = rc->delay_max_duration;
  854. }
  855. if (status > rc->status)
  856. delay = rc->delay_up_current;
  857. else
  858. delay = rc->delay_down_current;
  859. // COMMENTED: because we do need to send raising alarms
  860. // if(now + delay < rc->delay_up_to_timestamp)
  861. // delay = (int)(rc->delay_up_to_timestamp - now);
  862. rc->delay_last = delay;
  863. rc->delay_up_to_timestamp = now + delay;
  864. if(likely(!rrdcalc_isrepeating(rc))) {
  865. ALARM_ENTRY *ae = health_create_alarm_entry(
  866. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  867. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  868. rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
  869. rc->delay_last,
  870. (
  871. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  872. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  873. )
  874. );
  875. health_alarm_log(host, ae);
  876. }
  877. rc->last_status_change = now;
  878. rc->old_status = rc->status;
  879. rc->status = status;
  880. }
  881. rc->last_updated = now;
  882. rc->next_update = now + rc->update_every;
  883. if (next_run > rc->next_update)
  884. next_run = rc->next_update;
  885. }
  886. // process repeating alarms
  887. RRDCALC *rc;
  888. for(rc = host->alarms; rc ; rc = rc->next) {
  889. int repeat_every = 0;
  890. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  891. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  892. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  893. repeat_every = rc->warn_repeat_every;
  894. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  895. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  896. repeat_every = rc->crit_repeat_every;
  897. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  898. if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
  899. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  900. repeat_every = 1;
  901. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  902. repeat_every = 1;
  903. }
  904. }
  905. }
  906. } else {
  907. continue;
  908. }
  909. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  910. rc->last_repeat = now;
  911. ALARM_ENTRY *ae = health_create_alarm_entry(
  912. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  913. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  914. rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
  915. rc->delay_last,
  916. (
  917. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  918. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  919. )
  920. );
  921. ae->last_repeat = rc->last_repeat;
  922. if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  923. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  924. }
  925. rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
  926. health_process_notifications(host, ae);
  927. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  928. health_alarm_wait_for_execution(ae);
  929. health_alarm_log_free_one_nochecks_nounlink(ae);
  930. }
  931. }
  932. rrdhost_unlock(host);
  933. }
  934. if (unlikely(netdata_exit))
  935. break;
  936. // execute notifications
  937. // and cleanup
  938. health_alarm_log_process(host);
  939. if (unlikely(netdata_exit)) {
  940. // wait for all notifications to finish before allowing health to be cleaned up
  941. ALARM_ENTRY *ae;
  942. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  943. health_alarm_wait_for_execution(ae);
  944. }
  945. break;
  946. }
  947. } /* rrdhost_foreach */
  948. // wait for all notifications to finish before allowing health to be cleaned up
  949. ALARM_ENTRY *ae;
  950. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  951. health_alarm_wait_for_execution(ae);
  952. }
  953. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  954. if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  955. rrdhost_foreach_read(host) {
  956. if (unlikely(!host->health_enabled))
  957. continue;
  958. sql_queue_removed_alerts_to_aclk(host);
  959. }
  960. aclk_alert_reloaded = 0;
  961. marked_aclk_reload_loop = 0;
  962. }
  963. #endif
  964. rrd_unlock();
  965. if(unlikely(netdata_exit))
  966. break;
  967. now = now_realtime_sec();
  968. if(now < next_run) {
  969. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  970. sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
  971. now = now_realtime_sec();
  972. }
  973. else
  974. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  975. } // forever
  976. netdata_thread_cleanup_pop(1);
  977. return NULL;
  978. }