health.c 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. unsigned int default_health_enabled = 1;
  4. char *silencers_filename;
  5. // the queue of executed alarm notifications that haven't been waited for yet
  6. static struct {
  7. ALARM_ENTRY *head; // oldest
  8. ALARM_ENTRY *tail; // latest
  9. } alarm_notifications_in_progress = {NULL, NULL};
  10. typedef struct active_alerts {
  11. char *name;
  12. time_t last_status_change;
  13. RRDCALC_STATUS status;
  14. } active_alerts_t;
  15. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  16. {
  17. ae->prev_in_progress = NULL;
  18. ae->next_in_progress = NULL;
  19. if (NULL != alarm_notifications_in_progress.tail) {
  20. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  21. alarm_notifications_in_progress.tail->next_in_progress = ae;
  22. }
  23. if (NULL == alarm_notifications_in_progress.head) {
  24. alarm_notifications_in_progress.head = ae;
  25. }
  26. alarm_notifications_in_progress.tail = ae;
  27. }
  28. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  29. {
  30. struct alarm_entry *prev = ae->prev_in_progress;
  31. struct alarm_entry *next = ae->next_in_progress;
  32. if (NULL != prev) {
  33. prev->next_in_progress = next;
  34. }
  35. if (NULL != next) {
  36. next->prev_in_progress = prev;
  37. }
  38. if (ae == alarm_notifications_in_progress.head) {
  39. alarm_notifications_in_progress.head = next;
  40. }
  41. if (ae == alarm_notifications_in_progress.tail) {
  42. alarm_notifications_in_progress.tail = prev;
  43. }
  44. }
  45. // ----------------------------------------------------------------------------
  46. // health initialization
  47. /**
  48. * User Config directory
  49. *
  50. * Get the config directory for health and return it.
  51. *
  52. * @return a pointer to the user config directory
  53. */
  54. inline char *health_user_config_dir(void) {
  55. char buffer[FILENAME_MAX + 1];
  56. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  57. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  58. }
  59. /**
  60. * Stock Config Directory
  61. *
  62. * Get the Stock config directory and return it.
  63. *
  64. * @return a pointer to the stock config directory.
  65. */
  66. inline char *health_stock_config_dir(void) {
  67. char buffer[FILENAME_MAX + 1];
  68. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  69. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  70. }
  71. /**
  72. * Silencers init
  73. *
  74. * Function used to initialize the silencer structure.
  75. */
  76. static void health_silencers_init(void) {
  77. FILE *fd = fopen(silencers_filename, "r");
  78. if (fd) {
  79. fseek(fd, 0 , SEEK_END);
  80. off_t length = (off_t) ftell(fd);
  81. fseek(fd, 0 , SEEK_SET);
  82. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  83. char *str = mallocz((length+1)* sizeof(char));
  84. if(str) {
  85. size_t copied;
  86. copied = fread(str, sizeof(char), length, fd);
  87. if (copied == (length* sizeof(char))) {
  88. str[length] = 0x00;
  89. json_parse(str, NULL, health_silencers_json_read_callback);
  90. info("Parsed health silencers file %s", silencers_filename);
  91. } else {
  92. error("Cannot read the data from health silencers file %s", silencers_filename);
  93. }
  94. freez(str);
  95. }
  96. } else {
  97. error(
  98. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  99. silencers_filename,
  100. (int64_t)length,
  101. HEALTH_SILENCERS_MAX_FILE_LEN);
  102. }
  103. fclose(fd);
  104. } else {
  105. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  106. }
  107. }
  108. /**
  109. * Health Init
  110. *
  111. * Initialize the health thread.
  112. */
  113. void health_init(void) {
  114. debug(D_HEALTH, "Health configuration initializing");
  115. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  116. debug(D_HEALTH, "Health is disabled.");
  117. return;
  118. }
  119. health_silencers_init();
  120. }
  121. // ----------------------------------------------------------------------------
  122. // re-load health configuration
  123. /**
  124. * Reload host
  125. *
  126. * Reload configuration for a specific host.
  127. *
  128. * @param host the structure of the host that the function will reload the configuration.
  129. */
  130. static void health_reload_host(RRDHOST *host) {
  131. if(unlikely(!host->health_enabled))
  132. return;
  133. char *user_path = health_user_config_dir();
  134. char *stock_path = health_stock_config_dir();
  135. // free all running alarms
  136. rrdhost_wrlock(host);
  137. while(host->templates)
  138. rrdcalctemplate_unlink_and_free(host, host->templates);
  139. RRDCALCTEMPLATE *rt,*next;
  140. for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
  141. next = rt->next;
  142. rrdcalctemplate_free(rt);
  143. }
  144. host->alarms_template_with_foreach = NULL;
  145. while(host->alarms)
  146. rrdcalc_unlink_and_free(host, host->alarms);
  147. RRDCALC *rc,*nc;
  148. for(rc = host->alarms_with_foreach; rc ; rc = nc) {
  149. nc = rc->next;
  150. rrdcalc_free(rc);
  151. }
  152. host->alarms_with_foreach = NULL;
  153. rrdhost_unlock(host);
  154. // invalidate all previous entries in the alarm log
  155. ALARM_ENTRY *t;
  156. for(t = host->health_log.alarms ; t ; t = t->next) {
  157. if(t->new_status != RRDCALC_STATUS_REMOVED)
  158. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  159. }
  160. rrdhost_rdlock(host);
  161. // reset all thresholds to all charts
  162. RRDSET *st;
  163. rrdset_foreach_read(st, host) {
  164. st->green = NAN;
  165. st->red = NAN;
  166. }
  167. rrdhost_unlock(host);
  168. // load the new alarms
  169. rrdhost_wrlock(host);
  170. health_readdir(host, user_path, stock_path, NULL);
  171. //Discard alarms with labels that do not apply to host
  172. rrdcalc_labels_unlink_alarm_from_host(host);
  173. // link the loaded alarms to their charts
  174. RRDDIM *rd;
  175. rrdset_foreach_write(st, host) {
  176. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  177. continue;
  178. rrdsetcalc_link_matching(st);
  179. rrdcalctemplate_link_matching(st);
  180. //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
  181. rrdset_rdlock(st);
  182. rrddim_foreach_read(rd, st) {
  183. rrdcalc_link_to_rrddim(rd, st, host);
  184. }
  185. rrdset_unlock(st);
  186. }
  187. rrdhost_unlock(host);
  188. }
  189. /**
  190. * Reload
  191. *
  192. * Reload the host configuration for all hosts.
  193. */
  194. void health_reload(void) {
  195. sql_refresh_hashes();
  196. rrd_rdlock();
  197. RRDHOST *host;
  198. rrdhost_foreach_read(host)
  199. health_reload_host(host);
  200. rrd_unlock();
  201. #ifdef ENABLE_ACLK
  202. if (netdata_cloud_setting) {
  203. aclk_alert_reloaded = 1;
  204. }
  205. #endif
  206. }
  207. // ----------------------------------------------------------------------------
  208. // health main thread and friends
  209. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  210. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  211. if(n) return RRDCALC_STATUS_RAISED;
  212. return RRDCALC_STATUS_CLEAR;
  213. }
  214. #define ALARM_EXEC_COMMAND_LENGTH 8192
  215. #define ACTIVE_ALARMS_LIST_EXAMINE 500
  216. #define ACTIVE_ALARMS_LIST 15
  217. static inline int compare_active_alerts(const void * a, const void * b) {
  218. active_alerts_t *active_alerts_a = (active_alerts_t *)a;
  219. active_alerts_t *active_alerts_b = (active_alerts_t *)b;
  220. return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
  221. }
  222. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  223. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  224. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  225. // do not send notifications for internal statuses
  226. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  227. goto done;
  228. }
  229. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  230. // do not send notifications for disabled statuses
  231. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  232. // mark it as run, so that we will send the same alarm if it happens again
  233. goto done;
  234. }
  235. // find the previous notification for the same alarm
  236. // which we have run the exec script
  237. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  238. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  239. uint32_t id = ae->alarm_id;
  240. ALARM_ENTRY *t;
  241. for(t = ae->next; t ; t = t->next) {
  242. if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  243. break;
  244. }
  245. if(likely(t)) {
  246. // we have executed this alarm notification in the past
  247. if(t && t->new_status == ae->new_status) {
  248. // don't send the notification for the same status again
  249. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
  250. , rrdcalc_status2string(ae->new_status));
  251. goto done;
  252. }
  253. }
  254. else {
  255. // we have not executed this alarm notification in the past
  256. // so, don't send CLEAR notifications
  257. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  258. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  259. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  260. , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  261. goto done;
  262. }
  263. }
  264. }
  265. }
  266. // Check if alarm notifications are silenced
  267. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  268. info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  269. goto done;
  270. }
  271. static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
  272. const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
  273. const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
  274. int n_warn=0, n_crit=0;
  275. RRDCALC *rc;
  276. EVAL_EXPRESSION *expr=NULL;
  277. BUFFER *warn_alarms, *crit_alarms;
  278. active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
  279. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  280. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  281. for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
  282. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  283. continue;
  284. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  285. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  286. active_alerts[n_warn+n_crit].name = rc->name;
  287. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  288. active_alerts[n_warn+n_crit].status = rc->status;
  289. n_warn++;
  290. } else if (ae->alarm_id == rc->id)
  291. expr = rc->warning;
  292. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  293. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  294. active_alerts[n_warn+n_crit].name = rc->name;
  295. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  296. active_alerts[n_warn+n_crit].status = rc->status;
  297. n_crit++;
  298. } else if (ae->alarm_id == rc->id)
  299. expr = rc->critical;
  300. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  301. if (ae->alarm_id == rc->id)
  302. expr = rc->warning;
  303. }
  304. }
  305. if (n_warn+n_crit>1)
  306. qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
  307. int count_w = 0, count_c = 0;
  308. while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
  309. if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
  310. if (count_w)
  311. buffer_strcat(warn_alarms, ",");
  312. buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
  313. buffer_strcat(warn_alarms, "=");
  314. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  315. count_w++;
  316. }
  317. else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
  318. if (count_c)
  319. buffer_strcat(crit_alarms, ",");
  320. buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
  321. buffer_strcat(crit_alarms, "=");
  322. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  323. count_c++;
  324. }
  325. }
  326. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  327. snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
  328. "' '" NETDATA_DOUBLE_FORMAT_ZERO
  329. "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
  330. exec,
  331. recipient,
  332. host->registry_hostname,
  333. ae->unique_id,
  334. ae->alarm_id,
  335. ae->alarm_event_id,
  336. (unsigned long)ae->when,
  337. ae->name,
  338. ae->chart?ae->chart:"NOCHART",
  339. ae->family?ae->family:"NOFAMILY",
  340. rrdcalc_status2string(ae->new_status),
  341. rrdcalc_status2string(ae->old_status),
  342. ae->new_value,
  343. ae->old_value,
  344. ae->source?ae->source:"UNKNOWN",
  345. (uint32_t)ae->duration,
  346. (uint32_t)ae->non_clear_duration,
  347. ae->units?ae->units:"",
  348. ae->info?ae->info:"",
  349. ae->new_value_string,
  350. ae->old_value_string,
  351. (expr && expr->source)?expr->source:"NOSOURCE",
  352. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  353. n_warn,
  354. n_crit,
  355. buffer_tostring(warn_alarms),
  356. buffer_tostring(crit_alarms),
  357. ae->classification?ae->classification:"Unknown",
  358. edit_command,
  359. host != localhost ? host->machine_guid:""
  360. );
  361. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  362. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  363. debug(D_HEALTH, "executing command '%s'", command_to_run);
  364. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  365. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  366. enqueue_alarm_notify_in_progress(ae);
  367. freez(edit_command);
  368. buffer_free(warn_alarms);
  369. buffer_free(crit_alarms);
  370. freez(active_alerts);
  371. return; //health_alarm_wait_for_execution
  372. done:
  373. health_alarm_log_save(host, ae);
  374. }
  375. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  376. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  377. return;
  378. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  379. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  380. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  381. if(ae->exec_code != 0)
  382. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  383. unlink_alarm_notify_in_progress(ae);
  384. }
  385. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  386. debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
  387. ae->chart?ae->chart:"NOCHART", ae->name,
  388. ae->new_value,
  389. rrdcalc_status2string(ae->old_status),
  390. rrdcalc_status2string(ae->new_status)
  391. );
  392. health_alarm_execute(host, ae);
  393. }
  394. static inline void health_alarm_log_process(RRDHOST *host) {
  395. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  396. time_t now = now_realtime_sec();
  397. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  398. ALARM_ENTRY *ae;
  399. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  400. if(likely(!alarm_entry_isrepeating(host, ae))) {
  401. if(unlikely(
  402. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  403. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  404. )) {
  405. if(unlikely(ae->unique_id < first_waiting))
  406. first_waiting = ae->unique_id;
  407. if(likely(now >= ae->delay_up_to_timestamp))
  408. health_process_notifications(host, ae);
  409. }
  410. }
  411. }
  412. // remember this for the next iteration
  413. host->health_last_processed_id = first_waiting;
  414. bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
  415. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  416. if (!cleanup_excess_log_entries)
  417. return;
  418. // cleanup excess entries in the log
  419. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  420. ALARM_ENTRY *last = NULL;
  421. unsigned int count = host->health_log.max * 2 / 3;
  422. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  423. if(ae && last && last->next == ae)
  424. last->next = NULL;
  425. else
  426. ae = NULL;
  427. while(ae) {
  428. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  429. ALARM_ENTRY *t = ae->next;
  430. if(likely(!alarm_entry_isrepeating(host, ae))) {
  431. health_alarm_wait_for_execution(ae);
  432. health_alarm_log_free_one_nochecks_nounlink(ae);
  433. host->health_log.count--;
  434. }
  435. ae = t;
  436. }
  437. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  438. }
  439. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  440. if(unlikely(!rc->rrdset)) {
  441. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
  442. return 0;
  443. }
  444. if(unlikely(rc->next_update > now)) {
  445. if (unlikely(*next_run > rc->next_update)) {
  446. // update the next_run time of the main loop
  447. // to run this alarm precisely the time required
  448. *next_run = rc->next_update;
  449. }
  450. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
  451. return 0;
  452. }
  453. if(unlikely(!rc->update_every)) {
  454. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
  455. return 0;
  456. }
  457. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  458. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
  459. return 0;
  460. }
  461. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  462. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
  463. return 0;
  464. }
  465. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  466. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
  467. return 0;
  468. }
  469. int update_every = rc->rrdset->update_every;
  470. rrdset_rdlock(rc->rrdset);
  471. time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
  472. time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
  473. rrdset_unlock(rc->rrdset);
  474. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  475. debug(D_HEALTH
  476. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  477. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
  478. , (unsigned long) last);
  479. return 0;
  480. }
  481. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  482. time_t needed = now + rc->before + rc->after;
  483. if(needed + update_every < first || needed - update_every > last) {
  484. debug(D_HEALTH
  485. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  486. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
  487. , (unsigned long) last);
  488. return 0;
  489. }
  490. }
  491. return 1;
  492. }
  493. static inline int check_if_resumed_from_suspension(void) {
  494. static usec_t last_realtime = 0, last_monotonic = 0;
  495. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  496. int ret = 0;
  497. // detect if monotonic and realtime have twice the difference
  498. // in which case we assume the system was just waken from hibernation
  499. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  500. ret = 1;
  501. last_realtime = realtime;
  502. last_monotonic = monotonic;
  503. return ret;
  504. }
  505. static void health_main_cleanup(void *ptr) {
  506. worker_unregister();
  507. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  508. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  509. info("cleaning up...");
  510. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  511. }
  512. static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
  513. SILENCER *s;
  514. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  515. rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
  516. for (s = silencers->silencers; s!=NULL; s=s->next){
  517. if (
  518. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
  519. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
  520. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  521. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
  522. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
  523. ) {
  524. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  525. if (unlikely(silencers->stype == STYPE_NONE)) {
  526. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
  527. } else {
  528. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  529. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  530. , rc->name
  531. , (rc->rrdset)?rc->rrdset->context:""
  532. , rc->chart
  533. , host
  534. , (rc->rrdset)?rc->rrdset->family:""
  535. );
  536. }
  537. return silencers->stype;
  538. }
  539. }
  540. return STYPE_NONE;
  541. }
  542. /**
  543. * Update Disabled Silenced
  544. *
  545. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  546. *
  547. * @param host structure that contains information about the host monitored.
  548. * @param rc structure with information about the alarm
  549. *
  550. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  551. */
  552. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  553. uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
  554. // Clear the flags
  555. rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  556. if (unlikely(silencers->all_alarms)) {
  557. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  558. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  559. } else {
  560. SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
  561. if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  562. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  563. }
  564. if (rrdcalc_flags_old != rc->rrdcalc_flags) {
  565. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  566. host->hostname,
  567. rc->name,
  568. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  569. (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  570. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  571. (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  572. );
  573. }
  574. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
  575. return 1;
  576. else
  577. return 0;
  578. }
  579. // Create alarms for dimensions that have been added to charts
  580. // since the previous iteration.
  581. static void init_pending_foreach_alarms(RRDHOST *host) {
  582. RRDSET *st;
  583. RRDDIM *rd;
  584. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
  585. return;
  586. rrdhost_wrlock(host);
  587. rrdset_foreach_write(st, host) {
  588. if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
  589. continue;
  590. rrdset_rdlock(st);
  591. rrddim_foreach_read(rd, st) {
  592. if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
  593. continue;
  594. rrdcalc_link_to_rrddim(rd, st, host);
  595. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
  596. }
  597. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
  598. rrdset_unlock(st);
  599. }
  600. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
  601. rrdhost_unlock(host);
  602. }
  603. /**
  604. * Health Main
  605. *
  606. * The main thread of the health system. In this function all the alarms will be processed.
  607. *
  608. * @param ptr is a pointer to the netdata_static_thread structure.
  609. *
  610. * @return It always returns NULL
  611. */
  612. #define WORKER_HEALTH_JOB_RRD_LOCK 0
  613. #define WORKER_HEALTH_JOB_HOST_LOCK 1
  614. #define WORKER_HEALTH_JOB_DB_QUERY 2
  615. #define WORKER_HEALTH_JOB_CALC_EVAL 3
  616. #define WORKER_HEALTH_JOB_WARNING_EVAL 4
  617. #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
  618. #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
  619. #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
  620. #if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
  621. #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
  622. #endif
  623. void *health_main(void *ptr) {
  624. worker_register("HEALTH");
  625. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  626. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  627. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  628. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  629. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  630. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  631. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  632. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  633. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  634. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  635. if(min_run_every < 1) min_run_every = 1;
  636. int cleanup_sql_every_loop = 7200 / min_run_every;
  637. time_t now = now_realtime_sec();
  638. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  639. rrdcalc_labels_unlink();
  640. unsigned int loop = 0;
  641. #ifdef ENABLE_ACLK
  642. unsigned int marked_aclk_reload_loop = 0;
  643. #endif
  644. while(!netdata_exit) {
  645. loop++;
  646. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  647. int runnable = 0, apply_hibernation_delay = 0;
  648. time_t next_run = now + min_run_every;
  649. RRDCALC *rc;
  650. if (unlikely(check_if_resumed_from_suspension())) {
  651. apply_hibernation_delay = 1;
  652. info(
  653. "Postponing alarm checks for %"PRId64" seconds, "
  654. "because it seems that the system was just resumed from suspension.",
  655. (int64_t)hibernation_delay);
  656. }
  657. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  658. static int logged=0;
  659. if (!logged) {
  660. info("Skipping health checks, because all alarms are disabled via a %s command.",
  661. HEALTH_CMDAPI_CMD_DISABLEALL);
  662. logged = 1;
  663. }
  664. }
  665. #ifdef ENABLE_ACLK
  666. if (aclk_alert_reloaded && !marked_aclk_reload_loop)
  667. marked_aclk_reload_loop = loop;
  668. #endif
  669. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  670. rrd_rdlock();
  671. RRDHOST *host;
  672. rrdhost_foreach_read(host) {
  673. if (unlikely(!host->health_enabled))
  674. continue;
  675. if (unlikely(apply_hibernation_delay)) {
  676. info(
  677. "Postponing health checks for %"PRId64" seconds, on host '%s'.",
  678. (int64_t)hibernation_delay,
  679. host->hostname);
  680. host->health_delay_up_to = now + hibernation_delay;
  681. }
  682. if (unlikely(host->health_delay_up_to)) {
  683. if (unlikely(now < host->health_delay_up_to))
  684. continue;
  685. info("Resuming health checks on host '%s'.", host->hostname);
  686. host->health_delay_up_to = 0;
  687. }
  688. // wait until cleanup of obsolete charts on children is complete
  689. if (host != localhost)
  690. if (unlikely(host->trigger_chart_obsoletion_check == 1))
  691. continue;
  692. if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
  693. sql_health_alarm_log_cleanup(host);
  694. init_pending_foreach_alarms(host);
  695. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  696. rrdhost_rdlock(host);
  697. // the first loop is to lookup values from the db
  698. for (rc = host->alarms; rc; rc = rc->next) {
  699. if (update_disabled_silenced(host, rc))
  700. continue;
  701. // create an alert removed event if the chart is obsolete and
  702. // has stopped being collected for 60 seconds
  703. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  704. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  705. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  706. if (!rrdcalc_isrepeating(rc)) {
  707. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  708. time_t now = now_realtime_sec();
  709. ALARM_ENTRY *ae = health_create_alarm_entry(
  710. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
  711. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  712. rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
  713. if (ae) {
  714. health_alarm_log(host, ae);
  715. rc->old_status = rc->status;
  716. rc->status = RRDCALC_STATUS_REMOVED;
  717. rc->last_status_change = now;
  718. rc->last_updated = now;
  719. rc->value = NAN;
  720. #ifdef ENABLE_ACLK
  721. if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
  722. sql_queue_alarm_to_aclk(host, ae, 1);
  723. #endif
  724. }
  725. }
  726. }
  727. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  728. if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
  729. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
  730. continue;
  731. }
  732. runnable++;
  733. rc->old_value = rc->value;
  734. rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
  735. // ------------------------------------------------------------
  736. // if there is database lookup, do it
  737. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  738. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  739. /* time_t old_db_timestamp = rc->db_before; */
  740. int value_is_null = 0;
  741. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
  742. rc->after, rc->before, rc->group, NULL,
  743. 0, rc->options,
  744. &rc->db_after,&rc->db_before,
  745. NULL, NULL, NULL,
  746. &value_is_null, NULL, 0, 0);
  747. if (unlikely(ret != 200)) {
  748. // database lookup failed
  749. rc->value = NAN;
  750. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
  751. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  752. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
  753. );
  754. } else
  755. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
  756. /* - RRDCALC_FLAG_DB_STALE not currently used
  757. if (unlikely(old_db_timestamp == rc->db_before)) {
  758. // database is stale
  759. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  760. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
  761. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
  762. error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  763. }
  764. }
  765. else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
  766. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
  767. */
  768. if (unlikely(value_is_null)) {
  769. // collected value is null
  770. rc->value = NAN;
  771. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
  772. debug(D_HEALTH,
  773. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  774. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
  775. );
  776. } else
  777. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
  778. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  779. rc->value
  780. );
  781. }
  782. // ------------------------------------------------------------
  783. // if there is calculation expression, run it
  784. if (unlikely(rc->calculation)) {
  785. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  786. if (unlikely(!expression_evaluate(rc->calculation))) {
  787. // calculation failed
  788. rc->value = NAN;
  789. rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
  790. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  791. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  792. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  793. );
  794. } else {
  795. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  796. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  797. NETDATA_DOUBLE_FORMAT
  798. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  799. rc->calculation->parsed_as, rc->calculation->result,
  800. buffer_tostring(rc->calculation->error_msg), rc->source
  801. );
  802. rc->value = rc->calculation->result;
  803. if (rc->local) rc->local->last_updated = now;
  804. if (rc->family) rc->family->last_updated = now;
  805. if (rc->hostid) rc->hostid->last_updated = now;
  806. if (rc->hostname) rc->hostname->last_updated = now;
  807. }
  808. }
  809. }
  810. rrdhost_unlock(host);
  811. if (unlikely(runnable && !netdata_exit)) {
  812. rrdhost_rdlock(host);
  813. for (rc = host->alarms; rc; rc = rc->next) {
  814. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
  815. continue;
  816. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
  817. continue;
  818. }
  819. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  820. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  821. // --------------------------------------------------------
  822. // check the warning expression
  823. if (likely(rc->warning)) {
  824. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  825. if (unlikely(!expression_evaluate(rc->warning))) {
  826. // calculation failed
  827. rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
  828. debug(D_HEALTH,
  829. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  830. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  831. buffer_tostring(rc->warning->error_msg)
  832. );
  833. } else {
  834. rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  835. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  836. NETDATA_DOUBLE_FORMAT
  837. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  838. rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
  839. );
  840. warning_status = rrdcalc_value2status(rc->warning->result);
  841. }
  842. }
  843. // --------------------------------------------------------
  844. // check the critical expression
  845. if (likely(rc->critical)) {
  846. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  847. if (unlikely(!expression_evaluate(rc->critical))) {
  848. // calculation failed
  849. rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
  850. debug(D_HEALTH,
  851. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  852. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  853. buffer_tostring(rc->critical->error_msg)
  854. );
  855. } else {
  856. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  857. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  858. NETDATA_DOUBLE_FORMAT
  859. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  860. rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
  861. rc->source
  862. );
  863. critical_status = rrdcalc_value2status(rc->critical->result);
  864. }
  865. }
  866. // --------------------------------------------------------
  867. // decide the final alarm status
  868. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  869. switch (warning_status) {
  870. case RRDCALC_STATUS_CLEAR:
  871. status = RRDCALC_STATUS_CLEAR;
  872. break;
  873. case RRDCALC_STATUS_RAISED:
  874. status = RRDCALC_STATUS_WARNING;
  875. break;
  876. default:
  877. break;
  878. }
  879. switch (critical_status) {
  880. case RRDCALC_STATUS_CLEAR:
  881. if (status == RRDCALC_STATUS_UNDEFINED)
  882. status = RRDCALC_STATUS_CLEAR;
  883. break;
  884. case RRDCALC_STATUS_RAISED:
  885. status = RRDCALC_STATUS_CRITICAL;
  886. break;
  887. default:
  888. break;
  889. }
  890. // --------------------------------------------------------
  891. // check if the new status and the old differ
  892. if (status != rc->status) {
  893. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  894. int delay = 0;
  895. // apply trigger hysteresis
  896. if (now > rc->delay_up_to_timestamp) {
  897. rc->delay_up_current = rc->delay_up_duration;
  898. rc->delay_down_current = rc->delay_down_duration;
  899. rc->delay_last = 0;
  900. rc->delay_up_to_timestamp = 0;
  901. } else {
  902. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  903. if (rc->delay_up_current > rc->delay_max_duration)
  904. rc->delay_up_current = rc->delay_max_duration;
  905. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  906. if (rc->delay_down_current > rc->delay_max_duration)
  907. rc->delay_down_current = rc->delay_max_duration;
  908. }
  909. if (status > rc->status)
  910. delay = rc->delay_up_current;
  911. else
  912. delay = rc->delay_down_current;
  913. // COMMENTED: because we do need to send raising alarms
  914. // if(now + delay < rc->delay_up_to_timestamp)
  915. // delay = (int)(rc->delay_up_to_timestamp - now);
  916. rc->delay_last = delay;
  917. rc->delay_up_to_timestamp = now + delay;
  918. ALARM_ENTRY *ae = health_create_alarm_entry(
  919. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
  920. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  921. rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
  922. rc->delay_last,
  923. (
  924. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  925. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  926. )
  927. );
  928. health_alarm_log(host, ae);
  929. rc->last_status_change = now;
  930. rc->old_status = rc->status;
  931. rc->status = status;
  932. }
  933. rc->last_updated = now;
  934. rc->next_update = now + rc->update_every;
  935. if (next_run > rc->next_update)
  936. next_run = rc->next_update;
  937. }
  938. // process repeating alarms
  939. RRDCALC *rc;
  940. for(rc = host->alarms; rc ; rc = rc->next) {
  941. int repeat_every = 0;
  942. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  943. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  944. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  945. repeat_every = rc->warn_repeat_every;
  946. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  947. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  948. repeat_every = rc->crit_repeat_every;
  949. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  950. if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
  951. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  952. repeat_every = 1;
  953. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  954. repeat_every = 1;
  955. }
  956. }
  957. }
  958. } else {
  959. continue;
  960. }
  961. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  962. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  963. rc->last_repeat = now;
  964. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  965. ALARM_ENTRY *ae = health_create_alarm_entry(
  966. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
  967. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  968. rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
  969. rc->delay_last,
  970. (
  971. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  972. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  973. )
  974. );
  975. ae->last_repeat = rc->last_repeat;
  976. if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  977. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  978. }
  979. rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
  980. health_process_notifications(host, ae);
  981. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  982. health_alarm_wait_for_execution(ae);
  983. health_alarm_log_free_one_nochecks_nounlink(ae);
  984. }
  985. }
  986. rrdhost_unlock(host);
  987. }
  988. if (unlikely(netdata_exit))
  989. break;
  990. // execute notifications
  991. // and cleanup
  992. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  993. health_alarm_log_process(host);
  994. if (unlikely(netdata_exit)) {
  995. // wait for all notifications to finish before allowing health to be cleaned up
  996. ALARM_ENTRY *ae;
  997. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  998. health_alarm_wait_for_execution(ae);
  999. }
  1000. break;
  1001. }
  1002. } /* rrdhost_foreach */
  1003. // wait for all notifications to finish before allowing health to be cleaned up
  1004. ALARM_ENTRY *ae;
  1005. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1006. health_alarm_wait_for_execution(ae);
  1007. }
  1008. #ifdef ENABLE_ACLK
  1009. if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  1010. rrdhost_foreach_read(host) {
  1011. if (unlikely(!host->health_enabled))
  1012. continue;
  1013. sql_queue_removed_alerts_to_aclk(host);
  1014. }
  1015. aclk_alert_reloaded = 0;
  1016. marked_aclk_reload_loop = 0;
  1017. }
  1018. #endif
  1019. rrd_unlock();
  1020. if(unlikely(netdata_exit))
  1021. break;
  1022. now = now_realtime_sec();
  1023. if(now < next_run) {
  1024. worker_is_idle();
  1025. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  1026. sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
  1027. now = now_realtime_sec();
  1028. }
  1029. else
  1030. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  1031. } // forever
  1032. netdata_thread_cleanup_pop(1);
  1033. return NULL;
  1034. }