health.c 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. unsigned int default_health_enabled = 1;
  4. char *silencers_filename;
  5. // the queue of executed alarm notifications that haven't been waited for yet
  6. static struct {
  7. ALARM_ENTRY *head; // oldest
  8. ALARM_ENTRY *tail; // latest
  9. } alarm_notifications_in_progress = {NULL, NULL};
  10. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  11. {
  12. ae->prev_in_progress = NULL;
  13. ae->next_in_progress = NULL;
  14. if (NULL != alarm_notifications_in_progress.tail) {
  15. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  16. alarm_notifications_in_progress.tail->next_in_progress = ae;
  17. }
  18. if (NULL == alarm_notifications_in_progress.head) {
  19. alarm_notifications_in_progress.head = ae;
  20. }
  21. alarm_notifications_in_progress.tail = ae;
  22. }
  23. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  24. {
  25. struct alarm_entry *prev = ae->prev_in_progress;
  26. struct alarm_entry *next = ae->next_in_progress;
  27. if (NULL != prev) {
  28. prev->next_in_progress = next;
  29. }
  30. if (NULL != next) {
  31. next->prev_in_progress = prev;
  32. }
  33. if (ae == alarm_notifications_in_progress.head) {
  34. alarm_notifications_in_progress.head = next;
  35. }
  36. if (ae == alarm_notifications_in_progress.tail) {
  37. alarm_notifications_in_progress.tail = prev;
  38. }
  39. }
  40. // ----------------------------------------------------------------------------
  41. // health initialization
  42. /**
  43. * User Config directory
  44. *
  45. * Get the config directory for health and return it.
  46. *
  47. * @return a pointer to the user config directory
  48. */
  49. inline char *health_user_config_dir(void) {
  50. char buffer[FILENAME_MAX + 1];
  51. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  52. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  53. }
  54. /**
  55. * Stock Config Directory
  56. *
  57. * Get the Stock config directory and return it.
  58. *
  59. * @return a pointer to the stock config directory.
  60. */
  61. inline char *health_stock_config_dir(void) {
  62. char buffer[FILENAME_MAX + 1];
  63. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  64. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  65. }
  66. /**
  67. * Silencers init
  68. *
  69. * Function used to initialize the silencer structure.
  70. */
  71. static void health_silencers_init(void) {
  72. FILE *fd = fopen(silencers_filename, "r");
  73. if (fd) {
  74. fseek(fd, 0 , SEEK_END);
  75. off_t length = (off_t) ftell(fd);
  76. fseek(fd, 0 , SEEK_SET);
  77. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  78. char *str = mallocz((length+1)* sizeof(char));
  79. if(str) {
  80. size_t copied;
  81. copied = fread(str, sizeof(char), length, fd);
  82. if (copied == (length* sizeof(char))) {
  83. str[length] = 0x00;
  84. json_parse(str, NULL, health_silencers_json_read_callback);
  85. info("Parsed health silencers file %s", silencers_filename);
  86. } else {
  87. error("Cannot read the data from health silencers file %s", silencers_filename);
  88. }
  89. freez(str);
  90. }
  91. } else {
  92. error(
  93. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  94. silencers_filename,
  95. (int64_t)length,
  96. HEALTH_SILENCERS_MAX_FILE_LEN);
  97. }
  98. fclose(fd);
  99. } else {
  100. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  101. }
  102. }
  103. /**
  104. * Health Init
  105. *
  106. * Initialize the health thread.
  107. */
  108. void health_init(void) {
  109. debug(D_HEALTH, "Health configuration initializing");
  110. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  111. debug(D_HEALTH, "Health is disabled.");
  112. return;
  113. }
  114. health_silencers_init();
  115. }
  116. // ----------------------------------------------------------------------------
  117. // re-load health configuration
  118. /**
  119. * Reload host
  120. *
  121. * Reload configuration for a specific host.
  122. *
  123. * @param host the structure of the host that the function will reload the configuration.
  124. */
  125. static void health_reload_host(RRDHOST *host) {
  126. if(unlikely(!host->health_enabled))
  127. return;
  128. char *user_path = health_user_config_dir();
  129. char *stock_path = health_stock_config_dir();
  130. // free all running alarms
  131. rrdhost_wrlock(host);
  132. while(host->templates)
  133. rrdcalctemplate_unlink_and_free(host, host->templates);
  134. RRDCALCTEMPLATE *rt,*next;
  135. for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
  136. next = rt->next;
  137. rrdcalctemplate_free(rt);
  138. }
  139. host->alarms_template_with_foreach = NULL;
  140. while(host->alarms)
  141. rrdcalc_unlink_and_free(host, host->alarms);
  142. RRDCALC *rc,*nc;
  143. for(rc = host->alarms_with_foreach; rc ; rc = nc) {
  144. nc = rc->next;
  145. rrdcalc_free(rc);
  146. }
  147. host->alarms_with_foreach = NULL;
  148. rrdhost_unlock(host);
  149. // invalidate all previous entries in the alarm log
  150. ALARM_ENTRY *t;
  151. for(t = host->health_log.alarms ; t ; t = t->next) {
  152. if(t->new_status != RRDCALC_STATUS_REMOVED)
  153. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  154. }
  155. rrdhost_rdlock(host);
  156. // reset all thresholds to all charts
  157. RRDSET *st;
  158. rrdset_foreach_read(st, host) {
  159. st->green = NAN;
  160. st->red = NAN;
  161. }
  162. rrdhost_unlock(host);
  163. // load the new alarms
  164. rrdhost_wrlock(host);
  165. health_readdir(host, user_path, stock_path, NULL);
  166. //Discard alarms with labels that do not apply to host
  167. rrdcalc_labels_unlink_alarm_from_host(host);
  168. // link the loaded alarms to their charts
  169. RRDDIM *rd;
  170. rrdset_foreach_write(st, host) {
  171. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  172. continue;
  173. rrdsetcalc_link_matching(st);
  174. rrdcalctemplate_link_matching(st);
  175. //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
  176. rrdset_rdlock(st);
  177. rrddim_foreach_read(rd, st) {
  178. rrdcalc_link_to_rrddim(rd, st, host);
  179. }
  180. rrdset_unlock(st);
  181. }
  182. rrdhost_unlock(host);
  183. }
  184. /**
  185. * Reload
  186. *
  187. * Reload the host configuration for all hosts.
  188. */
  189. void health_reload(void) {
  190. sql_refresh_hashes();
  191. rrd_rdlock();
  192. RRDHOST *host;
  193. rrdhost_foreach_read(host)
  194. health_reload_host(host);
  195. rrd_unlock();
  196. #ifdef ENABLE_ACLK
  197. if (netdata_cloud_setting) {
  198. aclk_alert_reloaded = 1;
  199. }
  200. #endif
  201. }
  202. // ----------------------------------------------------------------------------
  203. // health main thread and friends
  204. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  205. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  206. if(n) return RRDCALC_STATUS_RAISED;
  207. return RRDCALC_STATUS_CLEAR;
  208. }
  209. #define ALARM_EXEC_COMMAND_LENGTH 8192
  210. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  211. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  212. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  213. // do not send notifications for internal statuses
  214. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  215. goto done;
  216. }
  217. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  218. // do not send notifications for disabled statuses
  219. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  220. // mark it as run, so that we will send the same alarm if it happens again
  221. goto done;
  222. }
  223. // find the previous notification for the same alarm
  224. // which we have run the exec script
  225. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  226. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  227. uint32_t id = ae->alarm_id;
  228. ALARM_ENTRY *t;
  229. for(t = ae->next; t ; t = t->next) {
  230. if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  231. break;
  232. }
  233. if(likely(t)) {
  234. // we have executed this alarm notification in the past
  235. if(t && t->new_status == ae->new_status) {
  236. // don't send the notification for the same status again
  237. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
  238. , rrdcalc_status2string(ae->new_status));
  239. goto done;
  240. }
  241. }
  242. else {
  243. // we have not executed this alarm notification in the past
  244. // so, don't send CLEAR notifications
  245. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  246. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  247. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  248. , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  249. goto done;
  250. }
  251. }
  252. }
  253. }
  254. // Check if alarm notifications are silenced
  255. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  256. info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  257. goto done;
  258. }
  259. static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
  260. const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
  261. const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
  262. int n_warn=0, n_crit=0;
  263. RRDCALC *rc;
  264. EVAL_EXPRESSION *expr=NULL;
  265. BUFFER *warn_alarms, *crit_alarms;
  266. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  267. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  268. for(rc = host->alarms; rc ; rc = rc->next) {
  269. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  270. continue;
  271. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  272. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  273. if (n_warn)
  274. buffer_strcat(warn_alarms, ",");
  275. buffer_strcat(warn_alarms, rc->name);
  276. buffer_strcat(warn_alarms, "=");
  277. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  278. n_warn++;
  279. } else if (ae->alarm_id == rc->id)
  280. expr = rc->warning;
  281. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  282. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  283. if (n_crit)
  284. buffer_strcat(crit_alarms, ",");
  285. buffer_strcat(crit_alarms, rc->name);
  286. buffer_strcat(crit_alarms, "=");
  287. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  288. n_crit++;
  289. } else if (ae->alarm_id == rc->id)
  290. expr = rc->critical;
  291. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  292. if (ae->alarm_id == rc->id)
  293. expr = rc->warning;
  294. }
  295. }
  296. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  297. snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
  298. "' '" NETDATA_DOUBLE_FORMAT_ZERO
  299. "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
  300. exec,
  301. recipient,
  302. host->registry_hostname,
  303. ae->unique_id,
  304. ae->alarm_id,
  305. ae->alarm_event_id,
  306. (unsigned long)ae->when,
  307. ae->name,
  308. ae->chart?ae->chart:"NOCHART",
  309. ae->family?ae->family:"NOFAMILY",
  310. rrdcalc_status2string(ae->new_status),
  311. rrdcalc_status2string(ae->old_status),
  312. ae->new_value,
  313. ae->old_value,
  314. ae->source?ae->source:"UNKNOWN",
  315. (uint32_t)ae->duration,
  316. (uint32_t)ae->non_clear_duration,
  317. ae->units?ae->units:"",
  318. ae->info?ae->info:"",
  319. ae->new_value_string,
  320. ae->old_value_string,
  321. (expr && expr->source)?expr->source:"NOSOURCE",
  322. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  323. n_warn,
  324. n_crit,
  325. buffer_tostring(warn_alarms),
  326. buffer_tostring(crit_alarms),
  327. ae->classification?ae->classification:"Unknown",
  328. edit_command,
  329. host != localhost ? host->machine_guid:""
  330. );
  331. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  332. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  333. debug(D_HEALTH, "executing command '%s'", command_to_run);
  334. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  335. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  336. enqueue_alarm_notify_in_progress(ae);
  337. freez(edit_command);
  338. buffer_free(warn_alarms);
  339. buffer_free(crit_alarms);
  340. return; //health_alarm_wait_for_execution
  341. done:
  342. health_alarm_log_save(host, ae);
  343. }
  344. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  345. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  346. return;
  347. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  348. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  349. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  350. if(ae->exec_code != 0)
  351. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  352. unlink_alarm_notify_in_progress(ae);
  353. }
  354. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  355. debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
  356. ae->chart?ae->chart:"NOCHART", ae->name,
  357. ae->new_value,
  358. rrdcalc_status2string(ae->old_status),
  359. rrdcalc_status2string(ae->new_status)
  360. );
  361. health_alarm_execute(host, ae);
  362. }
  363. static inline void health_alarm_log_process(RRDHOST *host) {
  364. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  365. time_t now = now_realtime_sec();
  366. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  367. ALARM_ENTRY *ae;
  368. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  369. if(likely(!alarm_entry_isrepeating(host, ae))) {
  370. if(unlikely(
  371. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  372. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  373. )) {
  374. if(unlikely(ae->unique_id < first_waiting))
  375. first_waiting = ae->unique_id;
  376. if(likely(now >= ae->delay_up_to_timestamp))
  377. health_process_notifications(host, ae);
  378. }
  379. }
  380. }
  381. // remember this for the next iteration
  382. host->health_last_processed_id = first_waiting;
  383. bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
  384. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  385. if (!cleanup_excess_log_entries)
  386. return;
  387. // cleanup excess entries in the log
  388. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  389. ALARM_ENTRY *last = NULL;
  390. unsigned int count = host->health_log.max * 2 / 3;
  391. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  392. if(ae && last && last->next == ae)
  393. last->next = NULL;
  394. else
  395. ae = NULL;
  396. while(ae) {
  397. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  398. ALARM_ENTRY *t = ae->next;
  399. if(likely(!alarm_entry_isrepeating(host, ae))) {
  400. health_alarm_wait_for_execution(ae);
  401. health_alarm_log_free_one_nochecks_nounlink(ae);
  402. host->health_log.count--;
  403. }
  404. ae = t;
  405. }
  406. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  407. }
  408. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  409. if(unlikely(!rc->rrdset)) {
  410. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
  411. return 0;
  412. }
  413. if(unlikely(rc->next_update > now)) {
  414. if (unlikely(*next_run > rc->next_update)) {
  415. // update the next_run time of the main loop
  416. // to run this alarm precisely the time required
  417. *next_run = rc->next_update;
  418. }
  419. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
  420. return 0;
  421. }
  422. if(unlikely(!rc->update_every)) {
  423. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
  424. return 0;
  425. }
  426. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  427. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
  428. return 0;
  429. }
  430. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  431. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
  432. return 0;
  433. }
  434. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  435. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
  436. return 0;
  437. }
  438. int update_every = rc->rrdset->update_every;
  439. rrdset_rdlock(rc->rrdset);
  440. time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
  441. time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
  442. rrdset_unlock(rc->rrdset);
  443. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  444. debug(D_HEALTH
  445. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  446. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
  447. , (unsigned long) last);
  448. return 0;
  449. }
  450. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  451. time_t needed = now + rc->before + rc->after;
  452. if(needed + update_every < first || needed - update_every > last) {
  453. debug(D_HEALTH
  454. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  455. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
  456. , (unsigned long) last);
  457. return 0;
  458. }
  459. }
  460. return 1;
  461. }
  462. static inline int check_if_resumed_from_suspension(void) {
  463. static usec_t last_realtime = 0, last_monotonic = 0;
  464. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  465. int ret = 0;
  466. // detect if monotonic and realtime have twice the difference
  467. // in which case we assume the system was just waken from hibernation
  468. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  469. ret = 1;
  470. last_realtime = realtime;
  471. last_monotonic = monotonic;
  472. return ret;
  473. }
  474. static void health_main_cleanup(void *ptr) {
  475. worker_unregister();
  476. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  477. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  478. info("cleaning up...");
  479. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  480. }
  481. static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
  482. SILENCER *s;
  483. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  484. rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
  485. for (s = silencers->silencers; s!=NULL; s=s->next){
  486. if (
  487. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
  488. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
  489. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  490. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
  491. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
  492. ) {
  493. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  494. if (unlikely(silencers->stype == STYPE_NONE)) {
  495. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
  496. } else {
  497. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  498. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  499. , rc->name
  500. , (rc->rrdset)?rc->rrdset->context:""
  501. , rc->chart
  502. , host
  503. , (rc->rrdset)?rc->rrdset->family:""
  504. );
  505. }
  506. return silencers->stype;
  507. }
  508. }
  509. return STYPE_NONE;
  510. }
  511. /**
  512. * Update Disabled Silenced
  513. *
  514. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  515. *
  516. * @param host structure that contains information about the host monitored.
  517. * @param rc structure with information about the alarm
  518. *
  519. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  520. */
  521. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  522. uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
  523. // Clear the flags
  524. rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  525. if (unlikely(silencers->all_alarms)) {
  526. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  527. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  528. } else {
  529. SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
  530. if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  531. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  532. }
  533. if (rrdcalc_flags_old != rc->rrdcalc_flags) {
  534. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  535. host->hostname,
  536. rc->name,
  537. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  538. (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  539. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  540. (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  541. );
  542. }
  543. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
  544. return 1;
  545. else
  546. return 0;
  547. }
  548. // Create alarms for dimensions that have been added to charts
  549. // since the previous iteration.
  550. static void init_pending_foreach_alarms(RRDHOST *host) {
  551. RRDSET *st;
  552. RRDDIM *rd;
  553. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
  554. return;
  555. rrdhost_wrlock(host);
  556. rrdset_foreach_write(st, host) {
  557. if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
  558. continue;
  559. rrdset_rdlock(st);
  560. rrddim_foreach_read(rd, st) {
  561. if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
  562. continue;
  563. rrdcalc_link_to_rrddim(rd, st, host);
  564. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
  565. }
  566. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
  567. rrdset_unlock(st);
  568. }
  569. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
  570. rrdhost_unlock(host);
  571. }
  572. /**
  573. * Health Main
  574. *
  575. * The main thread of the health system. In this function all the alarms will be processed.
  576. *
  577. * @param ptr is a pointer to the netdata_static_thread structure.
  578. *
  579. * @return It always returns NULL
  580. */
  581. #define WORKER_HEALTH_JOB_RRD_LOCK 0
  582. #define WORKER_HEALTH_JOB_HOST_LOCK 1
  583. #define WORKER_HEALTH_JOB_DB_QUERY 2
  584. #define WORKER_HEALTH_JOB_CALC_EVAL 3
  585. #define WORKER_HEALTH_JOB_WARNING_EVAL 4
  586. #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
  587. #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
  588. #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
  589. #if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
  590. #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
  591. #endif
  592. void *health_main(void *ptr) {
  593. worker_register("HEALTH");
  594. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  595. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  596. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  597. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  598. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  599. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  600. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  601. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  602. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  603. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  604. if(min_run_every < 1) min_run_every = 1;
  605. int cleanup_sql_every_loop = 7200 / min_run_every;
  606. time_t now = now_realtime_sec();
  607. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  608. rrdcalc_labels_unlink();
  609. unsigned int loop = 0;
  610. #ifdef ENABLE_ACLK
  611. unsigned int marked_aclk_reload_loop = 0;
  612. #endif
  613. while(!netdata_exit) {
  614. loop++;
  615. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  616. int runnable = 0, apply_hibernation_delay = 0;
  617. time_t next_run = now + min_run_every;
  618. RRDCALC *rc;
  619. if (unlikely(check_if_resumed_from_suspension())) {
  620. apply_hibernation_delay = 1;
  621. info(
  622. "Postponing alarm checks for %"PRId64" seconds, "
  623. "because it seems that the system was just resumed from suspension.",
  624. (int64_t)hibernation_delay);
  625. }
  626. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  627. static int logged=0;
  628. if (!logged) {
  629. info("Skipping health checks, because all alarms are disabled via a %s command.",
  630. HEALTH_CMDAPI_CMD_DISABLEALL);
  631. logged = 1;
  632. }
  633. }
  634. #ifdef ENABLE_ACLK
  635. if (aclk_alert_reloaded && !marked_aclk_reload_loop)
  636. marked_aclk_reload_loop = loop;
  637. #endif
  638. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  639. rrd_rdlock();
  640. RRDHOST *host;
  641. rrdhost_foreach_read(host) {
  642. if (unlikely(!host->health_enabled))
  643. continue;
  644. if (unlikely(apply_hibernation_delay)) {
  645. info(
  646. "Postponing health checks for %"PRId64" seconds, on host '%s'.",
  647. (int64_t)hibernation_delay,
  648. host->hostname);
  649. host->health_delay_up_to = now + hibernation_delay;
  650. }
  651. if (unlikely(host->health_delay_up_to)) {
  652. if (unlikely(now < host->health_delay_up_to))
  653. continue;
  654. info("Resuming health checks on host '%s'.", host->hostname);
  655. host->health_delay_up_to = 0;
  656. }
  657. // wait until cleanup of obsolete charts on children is complete
  658. if (host != localhost)
  659. if (unlikely(host->trigger_chart_obsoletion_check == 1))
  660. continue;
  661. if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
  662. sql_health_alarm_log_cleanup(host);
  663. init_pending_foreach_alarms(host);
  664. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  665. rrdhost_rdlock(host);
  666. // the first loop is to look up values from the db
  667. for (rc = host->alarms; rc; rc = rc->next) {
  668. if (update_disabled_silenced(host, rc))
  669. continue;
  670. // create an alert removed event if the chart is obsolete and
  671. // has stopped being collected for 60 seconds
  672. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  673. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  674. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  675. if (!rrdcalc_isrepeating(rc)) {
  676. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  677. time_t now = now_realtime_sec();
  678. ALARM_ENTRY *ae = health_create_alarm_entry(
  679. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  680. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  681. rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
  682. if (ae) {
  683. health_alarm_log(host, ae);
  684. rc->old_status = rc->status;
  685. rc->status = RRDCALC_STATUS_REMOVED;
  686. rc->last_status_change = now;
  687. rc->last_updated = now;
  688. rc->value = NAN;
  689. #ifdef ENABLE_ACLK
  690. if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
  691. sql_queue_alarm_to_aclk(host, ae, 1);
  692. #endif
  693. }
  694. }
  695. }
  696. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  697. if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
  698. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
  699. continue;
  700. }
  701. runnable++;
  702. rc->old_value = rc->value;
  703. rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
  704. // ------------------------------------------------------------
  705. // if there is a database lookup, do it
  706. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  707. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  708. /* time_t old_db_timestamp = rc->db_before; */
  709. int value_is_null = 0;
  710. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
  711. rc->after, rc->before, rc->group, NULL,
  712. 0, rc->options,
  713. &rc->db_after,&rc->db_before,
  714. NULL, NULL,
  715. &value_is_null, NULL, 0, 0);
  716. if (unlikely(ret != 200)) {
  717. // database lookup failed
  718. rc->value = NAN;
  719. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
  720. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  721. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
  722. );
  723. } else
  724. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
  725. /* - RRDCALC_FLAG_DB_STALE not currently used
  726. if (unlikely(old_db_timestamp == rc->db_before)) {
  727. // database is stale
  728. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  729. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
  730. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
  731. error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  732. }
  733. }
  734. else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
  735. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
  736. */
  737. if (unlikely(value_is_null)) {
  738. // collected value is null
  739. rc->value = NAN;
  740. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
  741. debug(D_HEALTH,
  742. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  743. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
  744. );
  745. } else
  746. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
  747. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  748. rc->value
  749. );
  750. }
  751. // ------------------------------------------------------------
  752. // if there is a calculation expression, run it
  753. if (unlikely(rc->calculation)) {
  754. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  755. if (unlikely(!expression_evaluate(rc->calculation))) {
  756. // calculation failed
  757. rc->value = NAN;
  758. rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
  759. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  760. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  761. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  762. );
  763. } else {
  764. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  765. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  766. NETDATA_DOUBLE_FORMAT
  767. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  768. rc->calculation->parsed_as, rc->calculation->result,
  769. buffer_tostring(rc->calculation->error_msg), rc->source
  770. );
  771. rc->value = rc->calculation->result;
  772. if (rc->local) rc->local->last_updated = now;
  773. if (rc->family) rc->family->last_updated = now;
  774. if (rc->hostid) rc->hostid->last_updated = now;
  775. if (rc->hostname) rc->hostname->last_updated = now;
  776. }
  777. }
  778. }
  779. rrdhost_unlock(host);
  780. if (unlikely(runnable && !netdata_exit)) {
  781. rrdhost_rdlock(host);
  782. for (rc = host->alarms; rc; rc = rc->next) {
  783. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
  784. continue;
  785. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
  786. continue;
  787. }
  788. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  789. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  790. // --------------------------------------------------------
  791. // check the warning expression
  792. if (likely(rc->warning)) {
  793. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  794. if (unlikely(!expression_evaluate(rc->warning))) {
  795. // calculation failed
  796. rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
  797. debug(D_HEALTH,
  798. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  799. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  800. buffer_tostring(rc->warning->error_msg)
  801. );
  802. } else {
  803. rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  804. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  805. NETDATA_DOUBLE_FORMAT
  806. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  807. rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
  808. );
  809. warning_status = rrdcalc_value2status(rc->warning->result);
  810. }
  811. }
  812. // --------------------------------------------------------
  813. // check the critical expression
  814. if (likely(rc->critical)) {
  815. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  816. if (unlikely(!expression_evaluate(rc->critical))) {
  817. // calculation failed
  818. rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
  819. debug(D_HEALTH,
  820. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  821. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  822. buffer_tostring(rc->critical->error_msg)
  823. );
  824. } else {
  825. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  826. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  827. NETDATA_DOUBLE_FORMAT
  828. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  829. rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
  830. rc->source
  831. );
  832. critical_status = rrdcalc_value2status(rc->critical->result);
  833. }
  834. }
  835. // --------------------------------------------------------
  836. // decide the final alarm status
  837. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  838. switch (warning_status) {
  839. case RRDCALC_STATUS_CLEAR:
  840. status = RRDCALC_STATUS_CLEAR;
  841. break;
  842. case RRDCALC_STATUS_RAISED:
  843. status = RRDCALC_STATUS_WARNING;
  844. break;
  845. default:
  846. break;
  847. }
  848. switch (critical_status) {
  849. case RRDCALC_STATUS_CLEAR:
  850. if (status == RRDCALC_STATUS_UNDEFINED)
  851. status = RRDCALC_STATUS_CLEAR;
  852. break;
  853. case RRDCALC_STATUS_RAISED:
  854. status = RRDCALC_STATUS_CRITICAL;
  855. break;
  856. default:
  857. break;
  858. }
  859. // --------------------------------------------------------
  860. // check if the new status and the old differ
  861. if (status != rc->status) {
  862. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  863. int delay = 0;
  864. // apply trigger hysteresis
  865. if (now > rc->delay_up_to_timestamp) {
  866. rc->delay_up_current = rc->delay_up_duration;
  867. rc->delay_down_current = rc->delay_down_duration;
  868. rc->delay_last = 0;
  869. rc->delay_up_to_timestamp = 0;
  870. } else {
  871. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  872. if (rc->delay_up_current > rc->delay_max_duration)
  873. rc->delay_up_current = rc->delay_max_duration;
  874. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  875. if (rc->delay_down_current > rc->delay_max_duration)
  876. rc->delay_down_current = rc->delay_max_duration;
  877. }
  878. if (status > rc->status)
  879. delay = rc->delay_up_current;
  880. else
  881. delay = rc->delay_down_current;
  882. // COMMENTED: because we do need to send raising alarms
  883. // if(now + delay < rc->delay_up_to_timestamp)
  884. // delay = (int)(rc->delay_up_to_timestamp - now);
  885. rc->delay_last = delay;
  886. rc->delay_up_to_timestamp = now + delay;
  887. ALARM_ENTRY *ae = health_create_alarm_entry(
  888. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  889. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  890. rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
  891. rc->delay_last,
  892. (
  893. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  894. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  895. )
  896. );
  897. health_alarm_log(host, ae);
  898. rc->last_status_change = now;
  899. rc->old_status = rc->status;
  900. rc->status = status;
  901. }
  902. rc->last_updated = now;
  903. rc->next_update = now + rc->update_every;
  904. if (next_run > rc->next_update)
  905. next_run = rc->next_update;
  906. }
  907. // process repeating alarms
  908. RRDCALC *rc;
  909. for(rc = host->alarms; rc ; rc = rc->next) {
  910. int repeat_every = 0;
  911. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  912. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  913. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  914. repeat_every = rc->warn_repeat_every;
  915. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  916. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  917. repeat_every = rc->crit_repeat_every;
  918. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  919. if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
  920. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  921. repeat_every = 1;
  922. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  923. repeat_every = 1;
  924. }
  925. }
  926. }
  927. } else {
  928. continue;
  929. }
  930. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  931. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  932. rc->last_repeat = now;
  933. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  934. ALARM_ENTRY *ae = health_create_alarm_entry(
  935. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  936. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  937. rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
  938. rc->delay_last,
  939. (
  940. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  941. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  942. )
  943. );
  944. ae->last_repeat = rc->last_repeat;
  945. if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  946. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  947. }
  948. rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
  949. health_process_notifications(host, ae);
  950. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  951. health_alarm_wait_for_execution(ae);
  952. health_alarm_log_free_one_nochecks_nounlink(ae);
  953. }
  954. }
  955. rrdhost_unlock(host);
  956. }
  957. if (unlikely(netdata_exit))
  958. break;
  959. // execute notifications
  960. // and cleanup
  961. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  962. health_alarm_log_process(host);
  963. if (unlikely(netdata_exit)) {
  964. // wait for all notifications to finish before allowing health to be cleaned up
  965. ALARM_ENTRY *ae;
  966. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  967. health_alarm_wait_for_execution(ae);
  968. }
  969. break;
  970. }
  971. } /* rrdhost_foreach */
  972. // wait for all notifications to finish before allowing health to be cleaned up
  973. ALARM_ENTRY *ae;
  974. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  975. health_alarm_wait_for_execution(ae);
  976. }
  977. #ifdef ENABLE_ACLK
  978. if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  979. rrdhost_foreach_read(host) {
  980. if (unlikely(!host->health_enabled))
  981. continue;
  982. sql_queue_removed_alerts_to_aclk(host);
  983. }
  984. aclk_alert_reloaded = 0;
  985. marked_aclk_reload_loop = 0;
  986. }
  987. #endif
  988. rrd_unlock();
  989. if(unlikely(netdata_exit))
  990. break;
  991. now = now_realtime_sec();
  992. if(now < next_run) {
  993. worker_is_idle();
  994. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  995. sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
  996. now = now_realtime_sec();
  997. }
  998. else
  999. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  1000. } // forever
  1001. netdata_thread_cleanup_pop(1);
  1002. return NULL;
  1003. }