health.c 49 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. unsigned int default_health_enabled = 1;
  4. char *silencers_filename;
  5. // the queue of executed alarm notifications that haven't been waited for yet
  6. static struct {
  7. ALARM_ENTRY *head; // oldest
  8. ALARM_ENTRY *tail; // latest
  9. } alarm_notifications_in_progress = {NULL, NULL};
  10. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  11. {
  12. ae->prev_in_progress = NULL;
  13. ae->next_in_progress = NULL;
  14. if (NULL != alarm_notifications_in_progress.tail) {
  15. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  16. alarm_notifications_in_progress.tail->next_in_progress = ae;
  17. }
  18. if (NULL == alarm_notifications_in_progress.head) {
  19. alarm_notifications_in_progress.head = ae;
  20. }
  21. alarm_notifications_in_progress.tail = ae;
  22. }
  23. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  24. {
  25. struct alarm_entry *prev = ae->prev_in_progress;
  26. struct alarm_entry *next = ae->next_in_progress;
  27. if (NULL != prev) {
  28. prev->next_in_progress = next;
  29. }
  30. if (NULL != next) {
  31. next->prev_in_progress = prev;
  32. }
  33. if (ae == alarm_notifications_in_progress.head) {
  34. alarm_notifications_in_progress.head = next;
  35. }
  36. if (ae == alarm_notifications_in_progress.tail) {
  37. alarm_notifications_in_progress.tail = prev;
  38. }
  39. }
  40. // ----------------------------------------------------------------------------
  41. // health initialization
  42. /**
  43. * User Config directory
  44. *
  45. * Get the config directory for health and return it.
  46. *
  47. * @return a pointer to the user config directory
  48. */
  49. inline char *health_user_config_dir(void) {
  50. char buffer[FILENAME_MAX + 1];
  51. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  52. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  53. }
  54. /**
  55. * Stock Config Directory
  56. *
  57. * Get the Stock config directory and return it.
  58. *
  59. * @return a pointer to the stock config directory.
  60. */
  61. inline char *health_stock_config_dir(void) {
  62. char buffer[FILENAME_MAX + 1];
  63. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  64. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  65. }
  66. /**
  67. * Silencers init
  68. *
  69. * Function used to initialize the silencer structure.
  70. */
  71. static void health_silencers_init(void) {
  72. FILE *fd = fopen(silencers_filename, "r");
  73. if (fd) {
  74. fseek(fd, 0 , SEEK_END);
  75. off_t length = (off_t) ftell(fd);
  76. fseek(fd, 0 , SEEK_SET);
  77. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  78. char *str = mallocz((length+1)* sizeof(char));
  79. if(str) {
  80. size_t copied;
  81. copied = fread(str, sizeof(char), length, fd);
  82. if (copied == (length* sizeof(char))) {
  83. str[length] = 0x00;
  84. json_parse(str, NULL, health_silencers_json_read_callback);
  85. info("Parsed health silencers file %s", silencers_filename);
  86. } else {
  87. error("Cannot read the data from health silencers file %s", silencers_filename);
  88. }
  89. freez(str);
  90. }
  91. } else {
  92. error(
  93. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  94. silencers_filename,
  95. (int64_t)length,
  96. HEALTH_SILENCERS_MAX_FILE_LEN);
  97. }
  98. fclose(fd);
  99. } else {
  100. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  101. }
  102. }
  103. /**
  104. * Health Init
  105. *
  106. * Initialize the health thread.
  107. */
  108. void health_init(void) {
  109. debug(D_HEALTH, "Health configuration initializing");
  110. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  111. debug(D_HEALTH, "Health is disabled.");
  112. return;
  113. }
  114. health_silencers_init();
  115. }
  116. // ----------------------------------------------------------------------------
  117. // re-load health configuration
  118. /**
  119. * Reload host
  120. *
  121. * Reload configuration for a specific host.
  122. *
  123. * @param host the structure of the host that the function will reload the configuration.
  124. */
  125. static void health_reload_host(RRDHOST *host) {
  126. if(unlikely(!host->health_enabled))
  127. return;
  128. char *user_path = health_user_config_dir();
  129. char *stock_path = health_stock_config_dir();
  130. // free all running alarms
  131. rrdhost_wrlock(host);
  132. while(host->templates)
  133. rrdcalctemplate_unlink_and_free(host, host->templates);
  134. RRDCALCTEMPLATE *rt,*next;
  135. for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
  136. next = rt->next;
  137. rrdcalctemplate_free(rt);
  138. }
  139. host->alarms_template_with_foreach = NULL;
  140. while(host->alarms)
  141. rrdcalc_unlink_and_free(host, host->alarms);
  142. RRDCALC *rc,*nc;
  143. for(rc = host->alarms_with_foreach; rc ; rc = nc) {
  144. nc = rc->next;
  145. rrdcalc_free(rc);
  146. }
  147. host->alarms_with_foreach = NULL;
  148. rrdhost_unlock(host);
  149. // invalidate all previous entries in the alarm log
  150. ALARM_ENTRY *t;
  151. for(t = host->health_log.alarms ; t ; t = t->next) {
  152. if(t->new_status != RRDCALC_STATUS_REMOVED)
  153. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  154. }
  155. rrdhost_rdlock(host);
  156. // reset all thresholds to all charts
  157. RRDSET *st;
  158. rrdset_foreach_read(st, host) {
  159. st->green = NAN;
  160. st->red = NAN;
  161. }
  162. rrdhost_unlock(host);
  163. // load the new alarms
  164. rrdhost_wrlock(host);
  165. health_readdir(host, user_path, stock_path, NULL);
  166. //Discard alarms with labels that do not apply to host
  167. rrdcalc_labels_unlink_alarm_from_host(host);
  168. // link the loaded alarms to their charts
  169. RRDDIM *rd;
  170. rrdset_foreach_write(st, host) {
  171. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  172. continue;
  173. rrdsetcalc_link_matching(st);
  174. rrdcalctemplate_link_matching(st);
  175. //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
  176. rrdset_rdlock(st);
  177. rrddim_foreach_read(rd, st) {
  178. rrdcalc_link_to_rrddim(rd, st, host);
  179. }
  180. rrdset_unlock(st);
  181. }
  182. rrdhost_unlock(host);
  183. }
  184. /**
  185. * Reload
  186. *
  187. * Reload the host configuration for all hosts.
  188. */
  189. void health_reload(void) {
  190. #ifdef ENABLE_ACLK
  191. if (netdata_cloud_setting)
  192. aclk_single_update_disable();
  193. #endif
  194. sql_refresh_hashes();
  195. rrd_rdlock();
  196. RRDHOST *host;
  197. rrdhost_foreach_read(host)
  198. health_reload_host(host);
  199. rrd_unlock();
  200. #ifdef ENABLE_ACLK
  201. if (netdata_cloud_setting) {
  202. aclk_single_update_enable();
  203. aclk_alarm_reload();
  204. #ifdef ENABLE_NEW_CLOUD_PROTOCOL
  205. aclk_alert_reloaded = 1;
  206. #endif
  207. }
  208. #endif
  209. }
  210. // ----------------------------------------------------------------------------
  211. // health main thread and friends
  212. static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
  213. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  214. if(n) return RRDCALC_STATUS_RAISED;
  215. return RRDCALC_STATUS_CLEAR;
  216. }
  217. #define ALARM_EXEC_COMMAND_LENGTH 8192
  218. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  219. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  220. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  221. // do not send notifications for internal statuses
  222. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  223. goto done;
  224. }
  225. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  226. // do not send notifications for disabled statuses
  227. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  228. // mark it as run, so that we will send the same alarm if it happens again
  229. goto done;
  230. }
  231. // find the previous notification for the same alarm
  232. // which we have run the exec script
  233. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  234. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  235. uint32_t id = ae->alarm_id;
  236. ALARM_ENTRY *t;
  237. for(t = ae->next; t ; t = t->next) {
  238. if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  239. break;
  240. }
  241. if(likely(t)) {
  242. // we have executed this alarm notification in the past
  243. if(t && t->new_status == ae->new_status) {
  244. // don't send the notification for the same status again
  245. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
  246. , rrdcalc_status2string(ae->new_status));
  247. goto done;
  248. }
  249. }
  250. else {
  251. // we have not executed this alarm notification in the past
  252. // so, don't send CLEAR notifications
  253. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  254. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  255. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  256. , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  257. goto done;
  258. }
  259. }
  260. }
  261. }
  262. // Check if alarm notifications are silenced
  263. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  264. info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
  265. goto done;
  266. }
  267. static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
  268. const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
  269. const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
  270. int n_warn=0, n_crit=0;
  271. RRDCALC *rc;
  272. EVAL_EXPRESSION *expr=NULL;
  273. BUFFER *warn_alarms, *crit_alarms;
  274. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  275. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  276. for(rc = host->alarms; rc ; rc = rc->next) {
  277. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  278. continue;
  279. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  280. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  281. if (n_warn)
  282. buffer_strcat(warn_alarms, ",");
  283. buffer_strcat(warn_alarms, rc->name);
  284. buffer_strcat(warn_alarms, "=");
  285. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  286. n_warn++;
  287. } else if (ae->alarm_id == rc->id)
  288. expr = rc->warning;
  289. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  290. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  291. if (n_crit)
  292. buffer_strcat(crit_alarms, ",");
  293. buffer_strcat(crit_alarms, rc->name);
  294. buffer_strcat(crit_alarms, "=");
  295. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
  296. n_crit++;
  297. } else if (ae->alarm_id == rc->id)
  298. expr = rc->critical;
  299. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  300. if (ae->alarm_id == rc->id)
  301. expr = rc->warning;
  302. }
  303. }
  304. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  305. snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
  306. exec,
  307. recipient,
  308. host->registry_hostname,
  309. ae->unique_id,
  310. ae->alarm_id,
  311. ae->alarm_event_id,
  312. (unsigned long)ae->when,
  313. ae->name,
  314. ae->chart?ae->chart:"NOCHART",
  315. ae->family?ae->family:"NOFAMILY",
  316. rrdcalc_status2string(ae->new_status),
  317. rrdcalc_status2string(ae->old_status),
  318. ae->new_value,
  319. ae->old_value,
  320. ae->source?ae->source:"UNKNOWN",
  321. (uint32_t)ae->duration,
  322. (uint32_t)ae->non_clear_duration,
  323. ae->units?ae->units:"",
  324. ae->info?ae->info:"",
  325. ae->new_value_string,
  326. ae->old_value_string,
  327. (expr && expr->source)?expr->source:"NOSOURCE",
  328. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  329. n_warn,
  330. n_crit,
  331. buffer_tostring(warn_alarms),
  332. buffer_tostring(crit_alarms),
  333. ae->classification?ae->classification:"Unknown",
  334. edit_command,
  335. host != localhost ? host->machine_guid:""
  336. );
  337. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  338. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  339. debug(D_HEALTH, "executing command '%s'", command_to_run);
  340. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  341. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  342. enqueue_alarm_notify_in_progress(ae);
  343. freez(edit_command);
  344. buffer_free(warn_alarms);
  345. buffer_free(crit_alarms);
  346. return; //health_alarm_wait_for_execution
  347. done:
  348. health_alarm_log_save(host, ae);
  349. }
  350. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  351. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  352. return;
  353. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  354. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  355. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  356. if(ae->exec_code != 0)
  357. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  358. unlink_alarm_notify_in_progress(ae);
  359. }
  360. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  361. debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
  362. ae->chart?ae->chart:"NOCHART", ae->name,
  363. ae->new_value,
  364. rrdcalc_status2string(ae->old_status),
  365. rrdcalc_status2string(ae->new_status)
  366. );
  367. health_alarm_execute(host, ae);
  368. }
  369. static inline void health_alarm_log_process(RRDHOST *host) {
  370. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  371. time_t now = now_realtime_sec();
  372. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  373. ALARM_ENTRY *ae;
  374. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  375. if(likely(!alarm_entry_isrepeating(host, ae))) {
  376. if(unlikely(
  377. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  378. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  379. )) {
  380. if(unlikely(ae->unique_id < first_waiting))
  381. first_waiting = ae->unique_id;
  382. if(likely(now >= ae->delay_up_to_timestamp))
  383. health_process_notifications(host, ae);
  384. }
  385. }
  386. }
  387. // remember this for the next iteration
  388. host->health_last_processed_id = first_waiting;
  389. bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
  390. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  391. if (!cleanup_excess_log_entries)
  392. return;
  393. // cleanup excess entries in the log
  394. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  395. ALARM_ENTRY *last = NULL;
  396. unsigned int count = host->health_log.max * 2 / 3;
  397. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  398. if(ae && last && last->next == ae)
  399. last->next = NULL;
  400. else
  401. ae = NULL;
  402. while(ae) {
  403. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  404. ALARM_ENTRY *t = ae->next;
  405. if(likely(!alarm_entry_isrepeating(host, ae))) {
  406. health_alarm_wait_for_execution(ae);
  407. health_alarm_log_free_one_nochecks_nounlink(ae);
  408. host->health_log.count--;
  409. }
  410. ae = t;
  411. }
  412. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  413. }
  414. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  415. if(unlikely(!rc->rrdset)) {
  416. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
  417. return 0;
  418. }
  419. if(unlikely(rc->next_update > now)) {
  420. if (unlikely(*next_run > rc->next_update)) {
  421. // update the next_run time of the main loop
  422. // to run this alarm precisely the time required
  423. *next_run = rc->next_update;
  424. }
  425. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
  426. return 0;
  427. }
  428. if(unlikely(!rc->update_every)) {
  429. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
  430. return 0;
  431. }
  432. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  433. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
  434. return 0;
  435. }
  436. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  437. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
  438. return 0;
  439. }
  440. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  441. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
  442. return 0;
  443. }
  444. int update_every = rc->rrdset->update_every;
  445. rrdset_rdlock(rc->rrdset);
  446. time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
  447. time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
  448. rrdset_unlock(rc->rrdset);
  449. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  450. debug(D_HEALTH
  451. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  452. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
  453. , (unsigned long) last);
  454. return 0;
  455. }
  456. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  457. time_t needed = now + rc->before + rc->after;
  458. if(needed + update_every < first || needed - update_every > last) {
  459. debug(D_HEALTH
  460. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  461. , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
  462. , (unsigned long) last);
  463. return 0;
  464. }
  465. }
  466. return 1;
  467. }
  468. static inline int check_if_resumed_from_suspension(void) {
  469. static usec_t last_realtime = 0, last_monotonic = 0;
  470. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  471. int ret = 0;
  472. // detect if monotonic and realtime have twice the difference
  473. // in which case we assume the system was just waken from hibernation
  474. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  475. ret = 1;
  476. last_realtime = realtime;
  477. last_monotonic = monotonic;
  478. return ret;
  479. }
  480. static void health_main_cleanup(void *ptr) {
  481. worker_unregister();
  482. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  483. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  484. info("cleaning up...");
  485. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  486. }
  487. static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
  488. SILENCER *s;
  489. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  490. rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
  491. for (s = silencers->silencers; s!=NULL; s=s->next){
  492. if (
  493. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
  494. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
  495. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  496. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
  497. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
  498. ) {
  499. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  500. if (unlikely(silencers->stype == STYPE_NONE)) {
  501. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
  502. } else {
  503. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  504. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  505. , rc->name
  506. , (rc->rrdset)?rc->rrdset->context:""
  507. , rc->chart
  508. , host
  509. , (rc->rrdset)?rc->rrdset->family:""
  510. );
  511. }
  512. return silencers->stype;
  513. }
  514. }
  515. return STYPE_NONE;
  516. }
  517. /**
  518. * Update Disabled Silenced
  519. *
  520. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  521. *
  522. * @param host structure that contains information about the host monitored.
  523. * @param rc structure with information about the alarm
  524. *
  525. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  526. */
  527. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  528. uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
  529. // Clear the flags
  530. rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  531. if (unlikely(silencers->all_alarms)) {
  532. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  533. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  534. } else {
  535. SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
  536. if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
  537. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
  538. }
  539. if (rrdcalc_flags_old != rc->rrdcalc_flags) {
  540. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  541. host->hostname,
  542. rc->name,
  543. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  544. (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  545. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  546. (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  547. );
  548. }
  549. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
  550. return 1;
  551. else
  552. return 0;
  553. }
  554. // Create alarms for dimensions that have been added to charts
  555. // since the previous iteration.
  556. static void init_pending_foreach_alarms(RRDHOST *host) {
  557. RRDSET *st;
  558. RRDDIM *rd;
  559. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
  560. return;
  561. rrdhost_wrlock(host);
  562. rrdset_foreach_write(st, host) {
  563. if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
  564. continue;
  565. rrdset_rdlock(st);
  566. rrddim_foreach_read(rd, st) {
  567. if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
  568. continue;
  569. rrdcalc_link_to_rrddim(rd, st, host);
  570. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
  571. }
  572. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
  573. rrdset_unlock(st);
  574. }
  575. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
  576. rrdhost_unlock(host);
  577. }
  578. /**
  579. * Health Main
  580. *
  581. * The main thread of the health system. In this function all the alarms will be processed.
  582. *
  583. * @param ptr is a pointer to the netdata_static_thread structure.
  584. *
  585. * @return It always returns NULL
  586. */
  587. #define WORKER_HEALTH_JOB_RRD_LOCK 0
  588. #define WORKER_HEALTH_JOB_HOST_LOCK 1
  589. #define WORKER_HEALTH_JOB_DB_QUERY 2
  590. #define WORKER_HEALTH_JOB_CALC_EVAL 3
  591. #define WORKER_HEALTH_JOB_WARNING_EVAL 4
  592. #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
  593. #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
  594. #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
  595. #if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
  596. #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
  597. #endif
  598. void *health_main(void *ptr) {
  599. worker_register("HEALTH");
  600. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  601. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  602. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  603. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  604. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  605. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  606. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  607. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  608. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  609. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  610. if(min_run_every < 1) min_run_every = 1;
  611. int cleanup_sql_every_loop = 7200 / min_run_every;
  612. time_t now = now_realtime_sec();
  613. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  614. rrdcalc_labels_unlink();
  615. unsigned int loop = 0;
  616. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  617. unsigned int marked_aclk_reload_loop = 0;
  618. #endif
  619. while(!netdata_exit) {
  620. loop++;
  621. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  622. int runnable = 0, apply_hibernation_delay = 0;
  623. time_t next_run = now + min_run_every;
  624. RRDCALC *rc;
  625. if (unlikely(check_if_resumed_from_suspension())) {
  626. apply_hibernation_delay = 1;
  627. info(
  628. "Postponing alarm checks for %"PRId64" seconds, "
  629. "because it seems that the system was just resumed from suspension.",
  630. (int64_t)hibernation_delay);
  631. }
  632. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  633. static int logged=0;
  634. if (!logged) {
  635. info("Skipping health checks, because all alarms are disabled via a %s command.",
  636. HEALTH_CMDAPI_CMD_DISABLEALL);
  637. logged = 1;
  638. }
  639. }
  640. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  641. if (aclk_alert_reloaded && !marked_aclk_reload_loop)
  642. marked_aclk_reload_loop = loop;
  643. #endif
  644. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  645. rrd_rdlock();
  646. RRDHOST *host;
  647. rrdhost_foreach_read(host) {
  648. if (unlikely(!host->health_enabled))
  649. continue;
  650. if (unlikely(apply_hibernation_delay)) {
  651. info(
  652. "Postponing health checks for %"PRId64" seconds, on host '%s'.",
  653. (int64_t)hibernation_delay,
  654. host->hostname);
  655. host->health_delay_up_to = now + hibernation_delay;
  656. }
  657. if (unlikely(host->health_delay_up_to)) {
  658. if (unlikely(now < host->health_delay_up_to))
  659. continue;
  660. info("Resuming health checks on host '%s'.", host->hostname);
  661. host->health_delay_up_to = 0;
  662. }
  663. if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
  664. sql_health_alarm_log_cleanup(host);
  665. init_pending_foreach_alarms(host);
  666. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  667. rrdhost_rdlock(host);
  668. // the first loop is to lookup values from the db
  669. for (rc = host->alarms; rc; rc = rc->next) {
  670. if (update_disabled_silenced(host, rc))
  671. continue;
  672. // create an alert removed event if the chart is obsolete and
  673. // has stopped being collected for 60 seconds
  674. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  675. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  676. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  677. if (!rrdcalc_isrepeating(rc)) {
  678. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  679. time_t now = now_realtime_sec();
  680. ALARM_ENTRY *ae = health_create_alarm_entry(
  681. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  682. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  683. rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
  684. if (ae) {
  685. health_alarm_log(host, ae);
  686. rc->old_status = rc->status;
  687. rc->status = RRDCALC_STATUS_REMOVED;
  688. rc->last_status_change = now;
  689. rc->last_updated = now;
  690. rc->value = NAN;
  691. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  692. if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
  693. sql_queue_alarm_to_aclk(host, ae, 1);
  694. #endif
  695. }
  696. }
  697. }
  698. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  699. if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
  700. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
  701. continue;
  702. }
  703. runnable++;
  704. rc->old_value = rc->value;
  705. rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
  706. // ------------------------------------------------------------
  707. // if there is database lookup, do it
  708. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  709. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  710. /* time_t old_db_timestamp = rc->db_before; */
  711. int value_is_null = 0;
  712. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
  713. rc->before, rc->group, 0, rc->options, &rc->db_after,
  714. &rc->db_before, &value_is_null, 0
  715. );
  716. if (unlikely(ret != 200)) {
  717. // database lookup failed
  718. rc->value = NAN;
  719. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
  720. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  721. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
  722. );
  723. } else
  724. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
  725. /* - RRDCALC_FLAG_DB_STALE not currently used
  726. if (unlikely(old_db_timestamp == rc->db_before)) {
  727. // database is stale
  728. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  729. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
  730. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
  731. error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  732. }
  733. }
  734. else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
  735. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
  736. */
  737. if (unlikely(value_is_null)) {
  738. // collected value is null
  739. rc->value = NAN;
  740. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
  741. debug(D_HEALTH,
  742. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  743. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
  744. );
  745. } else
  746. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
  747. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
  748. CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  749. rc->value
  750. );
  751. }
  752. // ------------------------------------------------------------
  753. // if there is calculation expression, run it
  754. if (unlikely(rc->calculation)) {
  755. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  756. if (unlikely(!expression_evaluate(rc->calculation))) {
  757. // calculation failed
  758. rc->value = NAN;
  759. rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
  760. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  761. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  762. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  763. );
  764. } else {
  765. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  766. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  767. CALCULATED_NUMBER_FORMAT
  768. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  769. rc->calculation->parsed_as, rc->calculation->result,
  770. buffer_tostring(rc->calculation->error_msg), rc->source
  771. );
  772. rc->value = rc->calculation->result;
  773. if (rc->local) rc->local->last_updated = now;
  774. if (rc->family) rc->family->last_updated = now;
  775. if (rc->hostid) rc->hostid->last_updated = now;
  776. if (rc->hostname) rc->hostname->last_updated = now;
  777. }
  778. }
  779. }
  780. rrdhost_unlock(host);
  781. if (unlikely(runnable && !netdata_exit)) {
  782. rrdhost_rdlock(host);
  783. for (rc = host->alarms; rc; rc = rc->next) {
  784. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
  785. continue;
  786. if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
  787. continue;
  788. }
  789. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  790. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  791. // --------------------------------------------------------
  792. // check the warning expression
  793. if (likely(rc->warning)) {
  794. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  795. if (unlikely(!expression_evaluate(rc->warning))) {
  796. // calculation failed
  797. rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
  798. debug(D_HEALTH,
  799. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  800. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  801. buffer_tostring(rc->warning->error_msg)
  802. );
  803. } else {
  804. rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  805. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  806. CALCULATED_NUMBER_FORMAT
  807. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  808. rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
  809. );
  810. warning_status = rrdcalc_value2status(rc->warning->result);
  811. }
  812. }
  813. // --------------------------------------------------------
  814. // check the critical expression
  815. if (likely(rc->critical)) {
  816. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  817. if (unlikely(!expression_evaluate(rc->critical))) {
  818. // calculation failed
  819. rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
  820. debug(D_HEALTH,
  821. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  822. host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
  823. buffer_tostring(rc->critical->error_msg)
  824. );
  825. } else {
  826. rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  827. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  828. CALCULATED_NUMBER_FORMAT
  829. ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
  830. rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
  831. rc->source
  832. );
  833. critical_status = rrdcalc_value2status(rc->critical->result);
  834. }
  835. }
  836. // --------------------------------------------------------
  837. // decide the final alarm status
  838. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  839. switch (warning_status) {
  840. case RRDCALC_STATUS_CLEAR:
  841. status = RRDCALC_STATUS_CLEAR;
  842. break;
  843. case RRDCALC_STATUS_RAISED:
  844. status = RRDCALC_STATUS_WARNING;
  845. break;
  846. default:
  847. break;
  848. }
  849. switch (critical_status) {
  850. case RRDCALC_STATUS_CLEAR:
  851. if (status == RRDCALC_STATUS_UNDEFINED)
  852. status = RRDCALC_STATUS_CLEAR;
  853. break;
  854. case RRDCALC_STATUS_RAISED:
  855. status = RRDCALC_STATUS_CRITICAL;
  856. break;
  857. default:
  858. break;
  859. }
  860. // --------------------------------------------------------
  861. // check if the new status and the old differ
  862. if (status != rc->status) {
  863. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  864. int delay = 0;
  865. // apply trigger hysteresis
  866. if (now > rc->delay_up_to_timestamp) {
  867. rc->delay_up_current = rc->delay_up_duration;
  868. rc->delay_down_current = rc->delay_down_duration;
  869. rc->delay_last = 0;
  870. rc->delay_up_to_timestamp = 0;
  871. } else {
  872. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  873. if (rc->delay_up_current > rc->delay_max_duration)
  874. rc->delay_up_current = rc->delay_max_duration;
  875. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  876. if (rc->delay_down_current > rc->delay_max_duration)
  877. rc->delay_down_current = rc->delay_max_duration;
  878. }
  879. if (status > rc->status)
  880. delay = rc->delay_up_current;
  881. else
  882. delay = rc->delay_down_current;
  883. // COMMENTED: because we do need to send raising alarms
  884. // if(now + delay < rc->delay_up_to_timestamp)
  885. // delay = (int)(rc->delay_up_to_timestamp - now);
  886. rc->delay_last = delay;
  887. rc->delay_up_to_timestamp = now + delay;
  888. ALARM_ENTRY *ae = health_create_alarm_entry(
  889. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  890. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  891. rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
  892. rc->delay_last,
  893. (
  894. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  895. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  896. )
  897. );
  898. health_alarm_log(host, ae);
  899. rc->last_status_change = now;
  900. rc->old_status = rc->status;
  901. rc->status = status;
  902. }
  903. rc->last_updated = now;
  904. rc->next_update = now + rc->update_every;
  905. if (next_run > rc->next_update)
  906. next_run = rc->next_update;
  907. }
  908. // process repeating alarms
  909. RRDCALC *rc;
  910. for(rc = host->alarms; rc ; rc = rc->next) {
  911. int repeat_every = 0;
  912. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  913. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  914. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  915. repeat_every = rc->warn_repeat_every;
  916. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  917. rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  918. repeat_every = rc->crit_repeat_every;
  919. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  920. if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
  921. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  922. repeat_every = 1;
  923. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  924. repeat_every = 1;
  925. }
  926. }
  927. }
  928. } else {
  929. continue;
  930. }
  931. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  932. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  933. rc->last_repeat = now;
  934. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  935. ALARM_ENTRY *ae = health_create_alarm_entry(
  936. host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
  937. rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
  938. rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
  939. rc->delay_last,
  940. (
  941. ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  942. ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
  943. )
  944. );
  945. ae->last_repeat = rc->last_repeat;
  946. if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  947. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  948. }
  949. rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
  950. health_process_notifications(host, ae);
  951. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  952. health_alarm_wait_for_execution(ae);
  953. health_alarm_log_free_one_nochecks_nounlink(ae);
  954. }
  955. }
  956. rrdhost_unlock(host);
  957. }
  958. if (unlikely(netdata_exit))
  959. break;
  960. // execute notifications
  961. // and cleanup
  962. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  963. health_alarm_log_process(host);
  964. if (unlikely(netdata_exit)) {
  965. // wait for all notifications to finish before allowing health to be cleaned up
  966. ALARM_ENTRY *ae;
  967. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  968. health_alarm_wait_for_execution(ae);
  969. }
  970. break;
  971. }
  972. } /* rrdhost_foreach */
  973. // wait for all notifications to finish before allowing health to be cleaned up
  974. ALARM_ENTRY *ae;
  975. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  976. health_alarm_wait_for_execution(ae);
  977. }
  978. #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
  979. if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  980. rrdhost_foreach_read(host) {
  981. if (unlikely(!host->health_enabled))
  982. continue;
  983. sql_queue_removed_alerts_to_aclk(host);
  984. }
  985. aclk_alert_reloaded = 0;
  986. marked_aclk_reload_loop = 0;
  987. }
  988. #endif
  989. rrd_unlock();
  990. if(unlikely(netdata_exit))
  991. break;
  992. now = now_realtime_sec();
  993. if(now < next_run) {
  994. worker_is_idle();
  995. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  996. sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
  997. now = now_realtime_sec();
  998. }
  999. else
  1000. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  1001. } // forever
  1002. netdata_thread_cleanup_pop(1);
  1003. return NULL;
  1004. }