health.c 62 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
// Job type identifiers of the health thread (ids 0..9).
// NOTE(review): presumably registered with / reported to the worker
// utilization subsystem elsewhere in this file - confirm against the callers.
#define WORKER_HEALTH_JOB_RRD_LOCK 0
#define WORKER_HEALTH_JOB_HOST_LOCK 1
#define WORKER_HEALTH_JOB_DB_QUERY 2
#define WORKER_HEALTH_JOB_CALC_EVAL 3
#define WORKER_HEALTH_JOB_WARNING_EVAL 4
#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
// compile-time guard: all 10 job types defined above must fit
// in the limit set by WORKER_UTILIZATION_MAX_JOB_TYPES
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
#endif
// global default for health monitoring; health_init() overwrites it with the
// boolean "enabled" option of the health configuration section
unsigned int default_health_enabled = 1;
// path of the file parsed by health_silencers_init()
// (it is not assigned anywhere in this part of the file)
char *silencers_filename;
// the queue of executed alarm notifications that haven't been waited for yet
// It is a doubly linked list (via prev_in_progress / next_in_progress of each
// ALARM_ENTRY) and it is thread-local: every thread has its own head and tail.
static __thread struct {
    ALARM_ENTRY *head; // oldest
    ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
// One raised alert, as collected by health_alarm_execute() in order to build
// the lists of active WARNING / CRITICAL alarms given to the notification command.
typedef struct active_alerts {
    char *name;                 // alert name; borrowed from rrdcalc_name(), never freed through this struct
    time_t last_status_change;  // copied from the last_status_change of the RRDCALC
    RRDCALC_STATUS status;      // status of the RRDCALC when collected (WARNING or CRITICAL)
} active_alerts_t;
  28. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  29. {
  30. ae->prev_in_progress = NULL;
  31. ae->next_in_progress = NULL;
  32. if (NULL != alarm_notifications_in_progress.tail) {
  33. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  34. alarm_notifications_in_progress.tail->next_in_progress = ae;
  35. }
  36. if (NULL == alarm_notifications_in_progress.head) {
  37. alarm_notifications_in_progress.head = ae;
  38. }
  39. alarm_notifications_in_progress.tail = ae;
  40. }
  41. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  42. {
  43. struct alarm_entry *prev = ae->prev_in_progress;
  44. struct alarm_entry *next = ae->next_in_progress;
  45. if (NULL != prev) {
  46. prev->next_in_progress = next;
  47. }
  48. if (NULL != next) {
  49. next->prev_in_progress = prev;
  50. }
  51. if (ae == alarm_notifications_in_progress.head) {
  52. alarm_notifications_in_progress.head = next;
  53. }
  54. if (ae == alarm_notifications_in_progress.tail) {
  55. alarm_notifications_in_progress.tail = prev;
  56. }
  57. }
  58. // ----------------------------------------------------------------------------
  59. // health initialization
  60. /**
  61. * User Config directory
  62. *
  63. * Get the config directory for health and return it.
  64. *
  65. * @return a pointer to the user config directory
  66. */
  67. inline char *health_user_config_dir(void) {
  68. char buffer[FILENAME_MAX + 1];
  69. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  70. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  71. }
  72. /**
  73. * Stock Config Directory
  74. *
  75. * Get the Stock config directory and return it.
  76. *
  77. * @return a pointer to the stock config directory.
  78. */
  79. inline char *health_stock_config_dir(void) {
  80. char buffer[FILENAME_MAX + 1];
  81. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  82. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  83. }
  84. /**
  85. * Silencers init
  86. *
  87. * Function used to initialize the silencer structure.
  88. */
  89. static void health_silencers_init(void) {
  90. FILE *fd = fopen(silencers_filename, "r");
  91. if (fd) {
  92. fseek(fd, 0 , SEEK_END);
  93. off_t length = (off_t) ftell(fd);
  94. fseek(fd, 0 , SEEK_SET);
  95. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  96. char *str = mallocz((length+1)* sizeof(char));
  97. if(str) {
  98. size_t copied;
  99. copied = fread(str, sizeof(char), length, fd);
  100. if (copied == (length* sizeof(char))) {
  101. str[length] = 0x00;
  102. json_parse(str, NULL, health_silencers_json_read_callback);
  103. info("Parsed health silencers file %s", silencers_filename);
  104. } else {
  105. error("Cannot read the data from health silencers file %s", silencers_filename);
  106. }
  107. freez(str);
  108. }
  109. } else {
  110. error(
  111. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  112. silencers_filename,
  113. (int64_t)length,
  114. HEALTH_SILENCERS_MAX_FILE_LEN);
  115. }
  116. fclose(fd);
  117. } else {
  118. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  119. }
  120. }
  121. /**
  122. * Health Init
  123. *
  124. * Initialize the health thread.
  125. */
  126. void health_init(void) {
  127. debug(D_HEALTH, "Health configuration initializing");
  128. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  129. debug(D_HEALTH, "Health is disabled.");
  130. return;
  131. }
  132. health_silencers_init();
  133. }
  134. // ----------------------------------------------------------------------------
  135. // re-load health configuration
/**
 * Reload host
 *
 * Reload the health configuration of a specific host: delete its alarms and
 * templates, flag the existing alarm log entries as updated, reset the chart
 * thresholds, read the configuration again and link the new alarms to the charts.
 *
 * @param host the structure of the host that the function will reload the configuration.
 */
static void health_reload_host(RRDHOST *host) {
    // NOTE(review): the reload is skipped only when health is disabled AND it was
    // never initialized for this host; a host with health enabled but not yet
    // initialized still gets reloaded - confirm this is intended (|| vs &&).
    if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
        return;

    log_health("[%s]: Reloading health.", rrdhost_hostname(host));

    char *user_path = health_user_config_dir();
    char *stock_path = health_stock_config_dir();

    // free all running alarms
    rrdcalc_delete_all(host);
    rrdcalctemplate_delete_all(host);

    // invalidate all previous entries in the alarm log
    // NOTE(review): the entries' flags are modified while only the read lock is held.
    netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);

    ALARM_ENTRY *t;
    for(t = host->health_log.alarms ; t ; t = t->next) {
        // entries already in REMOVED status are left as they are
        if(t->new_status != RRDCALC_STATUS_REMOVED)
            t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
    }

    netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);

    // reset all thresholds to all charts
    RRDSET *st;
    rrdset_foreach_read(st, host) {
        st->green = NAN;
        st->red = NAN;
    }
    rrdset_foreach_done(st);

    // load the new alarms
    health_readdir(host, user_path, stock_path, NULL);

    //Discard alarms with labels that do not apply to host
    rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);

    // link the loaded alarms to their charts
    rrdset_foreach_write(st, host) {
        // archived charts do not get alarms linked
        if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
            continue;

        rrdcalc_link_matching_alerts_to_rrdset(st);
        rrdcalctemplate_link_matching_templates_to_rrdset(st);
    }
    rrdset_foreach_done(st);

    // flag the reload on the host
    // NOTE(review): presumably consumed by the ACLK alert code - confirm
    host->aclk_alert_reloaded = 1;
}
  181. /**
  182. * Reload
  183. *
  184. * Reload the host configuration for all hosts.
  185. */
  186. void health_reload(void) {
  187. sql_refresh_hashes();
  188. rrd_rdlock();
  189. RRDHOST *host;
  190. rrdhost_foreach_read(host)
  191. health_reload_host(host);
  192. rrd_unlock();
  193. }
  194. // ----------------------------------------------------------------------------
  195. // health main thread and friends
  196. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  197. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  198. if(n) return RRDCALC_STATUS_RAISED;
  199. return RRDCALC_STATUS_CLEAR;
  200. }
// maximum length (without the terminating NUL) of the notification
// command line built by health_alarm_execute()
#define ALARM_EXEC_COMMAND_LENGTH 8192
// maximum number of raised (WARNING / CRITICAL) alerts that
// health_alarm_execute() collects from the host for one notification
#define ACTIVE_ALARMS_LIST_EXAMINE 500
// maximum number of the collected alerts (after sorting, most recent first)
// that are listed in the notification command line
#define ACTIVE_ALARMS_LIST 15
  204. static inline int compare_active_alerts(const void * a, const void * b) {
  205. active_alerts_t *active_alerts_a = (active_alerts_t *)a;
  206. active_alerts_t *active_alerts_b = (active_alerts_t *)b;
  207. return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
  208. }
  209. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  210. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  211. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  212. // do not send notifications for internal statuses
  213. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  214. goto done;
  215. }
  216. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  217. // do not send notifications for disabled statuses
  218. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  219. log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  220. // mark it as run, so that we will send the same alarm if it happens again
  221. goto done;
  222. }
  223. // find the previous notification for the same alarm
  224. // which we have run the exec script
  225. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  226. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  227. uint32_t id = ae->alarm_id;
  228. ALARM_ENTRY *t;
  229. for(t = ae->next; t ; t = t->next) {
  230. if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  231. break;
  232. }
  233. if(likely(t)) {
  234. // we have executed this alarm notification in the past
  235. if(t && t->new_status == ae->new_status) {
  236. // don't send the notification for the same status again
  237. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
  238. , rrdcalc_status2string(ae->new_status));
  239. log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
  240. , rrdcalc_status2string(ae->new_status));
  241. goto done;
  242. }
  243. }
  244. else {
  245. // we have not executed this alarm notification in the past
  246. // so, don't send CLEAR notifications
  247. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  248. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  249. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  250. , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  251. goto done;
  252. }
  253. }
  254. }
  255. }
  256. // Check if alarm notifications are silenced
  257. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  258. log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  259. goto done;
  260. }
  261. log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  262. static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
  263. const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
  264. const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
  265. int n_warn=0, n_crit=0;
  266. RRDCALC *rc;
  267. EVAL_EXPRESSION *expr=NULL;
  268. BUFFER *warn_alarms, *crit_alarms;
  269. active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
  270. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  271. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
  272. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  273. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  274. continue;
  275. if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
  276. break;
  277. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  278. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  279. active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
  280. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  281. active_alerts[n_warn+n_crit].status = rc->status;
  282. n_warn++;
  283. } else if (ae->alarm_id == rc->id)
  284. expr = rc->warning;
  285. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  286. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  287. active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
  288. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  289. active_alerts[n_warn+n_crit].status = rc->status;
  290. n_crit++;
  291. } else if (ae->alarm_id == rc->id)
  292. expr = rc->critical;
  293. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  294. if (ae->alarm_id == rc->id)
  295. expr = rc->warning;
  296. }
  297. }
  298. foreach_rrdcalc_in_rrdhost_done(rc);
  299. if (n_warn+n_crit>1)
  300. qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
  301. int count_w = 0, count_c = 0;
  302. while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
  303. if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
  304. if (count_w)
  305. buffer_strcat(warn_alarms, ",");
  306. buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
  307. buffer_strcat(warn_alarms, "=");
  308. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  309. count_w++;
  310. }
  311. else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
  312. if (count_c)
  313. buffer_strcat(crit_alarms, ",");
  314. buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
  315. buffer_strcat(crit_alarms, "=");
  316. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  317. count_c++;
  318. }
  319. }
  320. char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
  321. snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
  322. "' '" NETDATA_DOUBLE_FORMAT_ZERO
  323. "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
  324. exec,
  325. recipient,
  326. rrdhost_registry_hostname(host),
  327. ae->unique_id,
  328. ae->alarm_id,
  329. ae->alarm_event_id,
  330. (unsigned long)ae->when,
  331. ae_name(ae),
  332. ae->chart?ae_chart_name(ae):"NOCHART",
  333. ae->family?ae_family(ae):"NOFAMILY",
  334. rrdcalc_status2string(ae->new_status),
  335. rrdcalc_status2string(ae->old_status),
  336. ae->new_value,
  337. ae->old_value,
  338. ae->source?ae_source(ae):"UNKNOWN",
  339. (uint32_t)ae->duration,
  340. (uint32_t)ae->non_clear_duration,
  341. ae_units(ae),
  342. ae_info(ae),
  343. ae_new_value_string(ae),
  344. ae_old_value_string(ae),
  345. (expr && expr->source)?expr->source:"NOSOURCE",
  346. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  347. n_warn,
  348. n_crit,
  349. buffer_tostring(warn_alarms),
  350. buffer_tostring(crit_alarms),
  351. ae->classification?ae_classification(ae):"Unknown",
  352. edit_command,
  353. host != localhost ? host->machine_guid:""
  354. );
  355. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  356. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  357. debug(D_HEALTH, "executing command '%s'", command_to_run);
  358. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  359. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  360. enqueue_alarm_notify_in_progress(ae);
  361. freez(edit_command);
  362. buffer_free(warn_alarms);
  363. buffer_free(crit_alarms);
  364. freez(active_alerts);
  365. return; //health_alarm_wait_for_execution
  366. done:
  367. health_alarm_log_save(host, ae);
  368. }
  369. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  370. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  371. return;
  372. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  373. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  374. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  375. if(ae->exec_code != 0)
  376. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  377. unlink_alarm_notify_in_progress(ae);
  378. }
  379. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  380. debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
  381. ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
  382. ae->new_value,
  383. rrdcalc_status2string(ae->old_status),
  384. rrdcalc_status2string(ae->new_status)
  385. );
  386. health_alarm_execute(host, ae);
  387. }
  388. static inline void health_alarm_log_process(RRDHOST *host) {
  389. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  390. time_t now = now_realtime_sec();
  391. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  392. ALARM_ENTRY *ae;
  393. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  394. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
  395. if(unlikely(
  396. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  397. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  398. )) {
  399. if(unlikely(ae->unique_id < first_waiting))
  400. first_waiting = ae->unique_id;
  401. if(likely(now >= ae->delay_up_to_timestamp))
  402. health_process_notifications(host, ae);
  403. }
  404. }
  405. }
  406. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  407. // remember this for the next iteration
  408. host->health_last_processed_id = first_waiting;
  409. bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
  410. if (!cleanup_excess_log_entries)
  411. return;
  412. // cleanup excess entries in the log
  413. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  414. ALARM_ENTRY *last = NULL;
  415. unsigned int count = host->health_log.max * 2 / 3;
  416. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  417. if(ae && last && last->next == ae)
  418. last->next = NULL;
  419. else
  420. ae = NULL;
  421. while(ae) {
  422. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  423. ALARM_ENTRY *t = ae->next;
  424. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
  425. health_alarm_wait_for_execution(ae);
  426. health_alarm_log_free_one_nochecks_nounlink(ae);
  427. host->health_log.count--;
  428. }
  429. ae = t;
  430. }
  431. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  432. }
  433. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  434. if(unlikely(!rc->rrdset)) {
  435. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  436. return 0;
  437. }
  438. if(unlikely(rc->next_update > now)) {
  439. if (unlikely(*next_run > rc->next_update)) {
  440. // update the next_run time of the main loop
  441. // to run this alarm precisely the time required
  442. *next_run = rc->next_update;
  443. }
  444. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
  445. return 0;
  446. }
  447. if(unlikely(!rc->update_every)) {
  448. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  449. return 0;
  450. }
  451. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  452. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  453. return 0;
  454. }
  455. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  456. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  457. return 0;
  458. }
  459. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  460. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  461. return 0;
  462. }
  463. int update_every = rc->rrdset->update_every;
  464. time_t first = rrdset_first_entry_t(rc->rrdset);
  465. time_t last = rrdset_last_entry_t(rc->rrdset);
  466. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  467. debug(D_HEALTH
  468. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  469. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
  470. , (unsigned long) last);
  471. return 0;
  472. }
  473. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  474. time_t needed = now + rc->before + rc->after;
  475. if(needed + update_every < first || needed - update_every > last) {
  476. debug(D_HEALTH
  477. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  478. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
  479. , (unsigned long) last);
  480. return 0;
  481. }
  482. }
  483. return 1;
  484. }
  485. static inline int check_if_resumed_from_suspension(void) {
  486. static __thread usec_t last_realtime = 0, last_monotonic = 0;
  487. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  488. int ret = 0;
  489. // detect if monotonic and realtime have twice the difference
  490. // in which case we assume the system was just waken from hibernation
  491. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  492. ret = 1;
  493. last_realtime = realtime;
  494. last_monotonic = monotonic;
  495. return ret;
  496. }
  497. static void health_thread_cleanup(void *ptr) {
  498. worker_unregister();
  499. struct health_state *h = ptr;
  500. h->host->health_spawn = 0;
  501. netdata_thread_cancel(netdata_thread_self());
  502. log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host));
  503. debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host));
  504. }
  505. static void initialize_health(RRDHOST *host, int is_localhost) {
  506. if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return;
  507. rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
  508. log_health("[%s]: Initializing health.", rrdhost_hostname(host));
  509. host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
  510. host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
  511. host->health_log.next_log_id = 1;
  512. host->health_log.next_alarm_id = 1;
  513. host->health_log.max = 1000;
  514. host->health_log.next_log_id = (uint32_t)now_realtime_sec();
  515. host->health_log.next_alarm_id = 0;
  516. long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
  517. if(n < 10) {
  518. error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
  519. config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
  520. }
  521. else
  522. host->health_log.max = (unsigned int)n;
  523. netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
  524. char filename[FILENAME_MAX + 1];
  525. if(!is_localhost) {
  526. int r = mkdir(host->varlib_dir, 0775);
  527. if (r != 0 && errno != EEXIST)
  528. error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
  529. }
  530. {
  531. snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
  532. int r = mkdir(filename, 0775);
  533. if(r != 0 && errno != EEXIST)
  534. error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
  535. }
  536. snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
  537. host->health_log_filename = strdupz(filename);
  538. snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
  539. host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
  540. host->health_default_recipient = string_strdupz("root");
  541. if (!file_is_migrated(host->health_log_filename)) {
  542. int rc = sql_create_health_log_table(host);
  543. if (unlikely(rc)) {
  544. log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host));
  545. health_alarm_log_load(host);
  546. health_alarm_log_open(host);
  547. }
  548. else {
  549. health_alarm_log_load(host);
  550. add_migrated_file(host->health_log_filename, 0);
  551. }
  552. } else {
  553. // TODO: This needs to go to the metadata thread
  554. // Health should wait before accessing the table (needs to be created by the metadata thread)
  555. sql_create_health_log_table(host);
  556. sql_health_alarm_log_load(host);
  557. }
  558. // ------------------------------------------------------------------------
  559. // load health configuration
  560. health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
  561. // link the loaded alarms to their charts
  562. RRDSET *st;
  563. rrdset_foreach_write(st, host) {
  564. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  565. continue;
  566. rrdcalc_link_matching_alerts_to_rrdset(st);
  567. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  568. }
  569. rrdset_foreach_done(st);
  570. //Discard alarms with labels that do not apply to host
  571. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  572. health_silencers_init();
  573. }
  574. static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) {
  575. time_t now = now_realtime_sec();
  576. if(now < next_run) {
  577. worker_is_idle();
  578. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  579. while (now < next_run && host->health_enabled && !netdata_exit) {
  580. sleep_usec(USEC_PER_SEC);
  581. now = now_realtime_sec();
  582. }
  583. }
  584. else {
  585. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  586. }
  587. }
  588. static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
  589. SILENCER *s;
  590. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  591. rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
  592. for (s = silencers->silencers; s!=NULL; s=s->next){
  593. if (
  594. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
  595. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
  596. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  597. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
  598. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
  599. ) {
  600. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  601. if (unlikely(silencers->stype == STYPE_NONE)) {
  602. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
  603. } else {
  604. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  605. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  606. , rrdcalc_name(rc)
  607. , (rc->rrdset)?rrdset_context(rc->rrdset):""
  608. , rrdcalc_chart_name(rc)
  609. , host
  610. , (rc->rrdset)?rrdset_family(rc->rrdset):""
  611. );
  612. }
  613. return silencers->stype;
  614. }
  615. }
  616. return STYPE_NONE;
  617. }
  618. /**
  619. * Update Disabled Silenced
  620. *
  621. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  622. *
  623. * @param host structure that contains information about the host monitored.
  624. * @param rc structure with information about the alarm
  625. *
  626. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  627. */
  628. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  629. uint32_t rrdcalc_flags_old = rc->run_flags;
  630. // Clear the flags
  631. rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  632. if (unlikely(silencers->all_alarms)) {
  633. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  634. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  635. } else {
  636. SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
  637. if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  638. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  639. }
  640. if (rrdcalc_flags_old != rc->run_flags) {
  641. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  642. rrdhost_hostname(host),
  643. rrdcalc_name(rc),
  644. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  645. (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  646. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  647. (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  648. );
  649. }
  650. if (rc->run_flags & RRDCALC_FLAG_DISABLED)
  651. return 1;
  652. else
  653. return 0;
  654. }
  655. static void health_execute_delayed_initializations(RRDHOST *host) {
  656. RRDSET *st;
  657. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
  658. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
  659. rrdset_foreach_reentrant(st, host) {
  660. if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  661. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
  662. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
  663. if(!st->rrdfamily)
  664. st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
  665. if(!st->rrdvars)
  666. st->rrdvars = rrdvariables_create();
  667. rrddimvar_index_init(st);
  668. rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
  669. rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
  670. rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
  671. rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
  672. rrdcalc_link_matching_alerts_to_rrdset(st);
  673. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  674. RRDDIM *rd;
  675. rrddim_foreach_read(rd, st) {
  676. if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  677. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
  678. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
  679. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
  680. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
  681. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
  682. RRDCALCTEMPLATE *rt;
  683. foreach_rrdcalctemplate_read(host, rt) {
  684. if(!rt->foreach_dimension_pattern)
  685. continue;
  686. if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
  687. rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
  688. }
  689. foreach_rrdcalctemplate_done(rt);
  690. }
  691. rrddim_foreach_done(rd);
  692. }
  693. rrdset_foreach_done(st);
  694. }
  695. /**
  696. * Health Main
  697. *
  698. * The main thread of the health system. In this function all the alarms will be processed.
  699. *
  700. * @param ptr is a pointer to the netdata_static_thread structure.
  701. *
  702. * @return It always returns NULL
  703. */
  704. void *health_main(void *ptr) {
  705. worker_register("HEALTH");
  706. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  707. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  708. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  709. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  710. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  711. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  712. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  713. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  714. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
  715. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
  716. struct health_state *h = ptr;
  717. netdata_thread_cleanup_push(health_thread_cleanup, ptr);
  718. RRDHOST *host = h->host;
  719. initialize_health(host, host == localhost);
  720. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  721. if(min_run_every < 1) min_run_every = 1;
  722. int cleanup_sql_every_loop = 7200 / min_run_every;
  723. time_t now = now_realtime_sec();
  724. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  725. bool health_running_logged = false;
  726. rrdhost_rdlock(host); //CHECK
  727. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  728. rrdhost_unlock(host);
  729. unsigned int loop = 0;
  730. #ifdef ENABLE_ACLK
  731. unsigned int marked_aclk_reload_loop = 0;
  732. #endif
  733. while(!netdata_exit && host->health_enabled) {
  734. loop++;
  735. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  736. now = now_realtime_sec();
  737. int runnable = 0, apply_hibernation_delay = 0;
  738. time_t next_run = now + min_run_every;
  739. RRDCALC *rc;
  740. if (unlikely(check_if_resumed_from_suspension())) {
  741. apply_hibernation_delay = 1;
  742. log_health(
  743. "[%s]: Postponing alarm checks for %"PRId64" seconds, "
  744. "because it seems that the system was just resumed from suspension.",
  745. rrdhost_hostname(host),
  746. (int64_t)hibernation_delay);
  747. }
  748. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  749. static __thread int logged=0;
  750. if (!logged) {
  751. log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.",
  752. rrdhost_hostname(host),
  753. HEALTH_CMDAPI_CMD_DISABLEALL);
  754. logged = 1;
  755. }
  756. }
  757. #ifdef ENABLE_ACLK
  758. if (host->aclk_alert_reloaded && !marked_aclk_reload_loop)
  759. marked_aclk_reload_loop = loop;
  760. #endif
  761. if (unlikely(apply_hibernation_delay)) {
  762. log_health(
  763. "[%s]: Postponing health checks for %"PRId64" seconds.",
  764. rrdhost_hostname(host),
  765. (int64_t)hibernation_delay);
  766. host->health_delay_up_to = now + hibernation_delay;
  767. next_run = now + hibernation_delay;
  768. health_sleep(next_run, loop, host);
  769. }
  770. if (unlikely(host->health_delay_up_to)) {
  771. if (unlikely(now < host->health_delay_up_to)) {
  772. next_run = host->health_delay_up_to;
  773. health_sleep(next_run, loop, host);
  774. continue;
  775. }
  776. log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
  777. host->health_delay_up_to = 0;
  778. }
  779. // wait until cleanup of obsolete charts on children is complete
  780. if (host != localhost) {
  781. if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
  782. log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
  783. health_sleep(next_run, loop, host);
  784. continue;
  785. }
  786. }
  787. if (!health_running_logged) {
  788. log_health("[%s]: Health is running.", rrdhost_hostname(host));
  789. health_running_logged = true;
  790. }
  791. if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
  792. sql_health_alarm_log_cleanup(host);
  793. health_execute_delayed_initializations(host);
  794. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  795. // the first loop is to lookup values from the db
  796. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  797. rrdcalc_update_info_using_rrdset_labels(rc);
  798. if (update_disabled_silenced(host, rc))
  799. continue;
  800. // create an alert removed event if the chart is obsolete and
  801. // has stopped being collected for 60 seconds
  802. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  803. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  804. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  805. if (!rrdcalc_isrepeating(rc)) {
  806. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  807. time_t now = now_realtime_sec();
  808. ALARM_ENTRY *ae = health_create_alarm_entry(
  809. host,
  810. rc->id,
  811. rc->next_event_id++,
  812. rc->config_hash_id,
  813. now,
  814. rc->name,
  815. rc->rrdset->id,
  816. rc->rrdset->context,
  817. rc->rrdset->family,
  818. rc->classification,
  819. rc->component,
  820. rc->type,
  821. rc->exec,
  822. rc->recipient,
  823. now - rc->last_status_change,
  824. rc->value,
  825. NAN,
  826. rc->status,
  827. RRDCALC_STATUS_REMOVED,
  828. rc->source,
  829. rc->units,
  830. rc->info,
  831. 0,
  832. rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
  833. if (ae) {
  834. health_alarm_log_add_entry(host, ae);
  835. rc->old_status = rc->status;
  836. rc->status = RRDCALC_STATUS_REMOVED;
  837. rc->last_status_change = now;
  838. rc->last_updated = now;
  839. rc->value = NAN;
  840. #ifdef ENABLE_ACLK
  841. if (netdata_cloud_setting && likely(!host->aclk_alert_reloaded))
  842. sql_queue_alarm_to_aclk(host, ae, 1);
  843. #endif
  844. }
  845. }
  846. }
  847. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  848. if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
  849. rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
  850. continue;
  851. }
  852. runnable++;
  853. rc->old_value = rc->value;
  854. rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
  855. // ------------------------------------------------------------
  856. // if there is database lookup, do it
  857. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  858. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  859. /* time_t old_db_timestamp = rc->db_before; */
  860. int value_is_null = 0;
  861. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
  862. rc->after, rc->before, rc->group, NULL,
  863. 0, rc->options,
  864. &rc->db_after,&rc->db_before,
  865. NULL, NULL, NULL,
  866. &value_is_null, NULL, 0, 0,
  867. QUERY_SOURCE_HEALTH);
  868. if (unlikely(ret != 200)) {
  869. // database lookup failed
  870. rc->value = NAN;
  871. rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
  872. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  873. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
  874. );
  875. } else
  876. rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
  877. /* - RRDCALC_FLAG_DB_STALE not currently used
  878. if (unlikely(old_db_timestamp == rc->db_before)) {
  879. // database is stale
  880. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  881. if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
  882. rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
  883. error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
  884. }
  885. }
  886. else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
  887. rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
  888. */
  889. if (unlikely(value_is_null)) {
  890. // collected value is null
  891. rc->value = NAN;
  892. rc->run_flags |= RRDCALC_FLAG_DB_NAN;
  893. debug(D_HEALTH,
  894. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  895. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
  896. );
  897. } else
  898. rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
  899. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
  900. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
  901. );
  902. }
  903. // ------------------------------------------------------------
  904. // if there is calculation expression, run it
  905. if (unlikely(rc->calculation)) {
  906. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  907. if (unlikely(!expression_evaluate(rc->calculation))) {
  908. // calculation failed
  909. rc->value = NAN;
  910. rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
  911. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  912. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  913. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  914. );
  915. } else {
  916. rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  917. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  918. NETDATA_DOUBLE_FORMAT
  919. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  920. rc->calculation->parsed_as, rc->calculation->result,
  921. buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
  922. );
  923. rc->value = rc->calculation->result;
  924. }
  925. }
  926. }
  927. foreach_rrdcalc_in_rrdhost_done(rc);
  928. if (unlikely(runnable && !netdata_exit)) {
  929. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  930. if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
  931. continue;
  932. if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
  933. continue;
  934. }
  935. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  936. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  937. // --------------------------------------------------------
  938. // check the warning expression
  939. if (likely(rc->warning)) {
  940. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  941. if (unlikely(!expression_evaluate(rc->warning))) {
  942. // calculation failed
  943. rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
  944. debug(D_HEALTH,
  945. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  946. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  947. buffer_tostring(rc->warning->error_msg)
  948. );
  949. } else {
  950. rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  951. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  952. NETDATA_DOUBLE_FORMAT
  953. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  954. rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
  955. );
  956. warning_status = rrdcalc_value2status(rc->warning->result);
  957. }
  958. }
  959. // --------------------------------------------------------
  960. // check the critical expression
  961. if (likely(rc->critical)) {
  962. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  963. if (unlikely(!expression_evaluate(rc->critical))) {
  964. // calculation failed
  965. rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
  966. debug(D_HEALTH,
  967. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  968. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  969. buffer_tostring(rc->critical->error_msg)
  970. );
  971. } else {
  972. rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  973. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  974. NETDATA_DOUBLE_FORMAT
  975. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  976. rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
  977. rrdcalc_source(rc)
  978. );
  979. critical_status = rrdcalc_value2status(rc->critical->result);
  980. }
  981. }
  982. // --------------------------------------------------------
  983. // decide the final alarm status
  984. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  985. switch (warning_status) {
  986. case RRDCALC_STATUS_CLEAR:
  987. status = RRDCALC_STATUS_CLEAR;
  988. break;
  989. case RRDCALC_STATUS_RAISED:
  990. status = RRDCALC_STATUS_WARNING;
  991. break;
  992. default:
  993. break;
  994. }
  995. switch (critical_status) {
  996. case RRDCALC_STATUS_CLEAR:
  997. if (status == RRDCALC_STATUS_UNDEFINED)
  998. status = RRDCALC_STATUS_CLEAR;
  999. break;
  1000. case RRDCALC_STATUS_RAISED:
  1001. status = RRDCALC_STATUS_CRITICAL;
  1002. break;
  1003. default:
  1004. break;
  1005. }
  1006. // --------------------------------------------------------
  1007. // check if the new status and the old differ
  1008. if (status != rc->status) {
  1009. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1010. int delay = 0;
  1011. // apply trigger hysteresis
  1012. if (now > rc->delay_up_to_timestamp) {
  1013. rc->delay_up_current = rc->delay_up_duration;
  1014. rc->delay_down_current = rc->delay_down_duration;
  1015. rc->delay_last = 0;
  1016. rc->delay_up_to_timestamp = 0;
  1017. } else {
  1018. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  1019. if (rc->delay_up_current > rc->delay_max_duration)
  1020. rc->delay_up_current = rc->delay_max_duration;
  1021. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  1022. if (rc->delay_down_current > rc->delay_max_duration)
  1023. rc->delay_down_current = rc->delay_max_duration;
  1024. }
  1025. if (status > rc->status)
  1026. delay = rc->delay_up_current;
  1027. else
  1028. delay = rc->delay_down_current;
  1029. // COMMENTED: because we do need to send raising alarms
  1030. // if(now + delay < rc->delay_up_to_timestamp)
  1031. // delay = (int)(rc->delay_up_to_timestamp - now);
  1032. rc->delay_last = delay;
  1033. rc->delay_up_to_timestamp = now + delay;
  1034. ALARM_ENTRY *ae = health_create_alarm_entry(
  1035. host,
  1036. rc->id,
  1037. rc->next_event_id++,
  1038. rc->config_hash_id,
  1039. now,
  1040. rc->name,
  1041. rc->rrdset->id,
  1042. rc->rrdset->context,
  1043. rc->rrdset->family,
  1044. rc->classification,
  1045. rc->component,
  1046. rc->type,
  1047. rc->exec,
  1048. rc->recipient,
  1049. now - rc->last_status_change,
  1050. rc->old_value,
  1051. rc->value,
  1052. rc->status,
  1053. status,
  1054. rc->source,
  1055. rc->units,
  1056. rc->info,
  1057. rc->delay_last,
  1058. (
  1059. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1060. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1061. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1062. )
  1063. );
  1064. health_alarm_log_add_entry(host, ae);
  1065. log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
  1066. rc->last_status_change = now;
  1067. rc->old_status = rc->status;
  1068. rc->status = status;
  1069. }
  1070. rc->last_updated = now;
  1071. rc->next_update = now + rc->update_every;
  1072. if (next_run > rc->next_update)
  1073. next_run = rc->next_update;
  1074. }
  1075. foreach_rrdcalc_in_rrdhost_done(rc);
  1076. // process repeating alarms
  1077. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1078. int repeat_every = 0;
  1079. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  1080. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  1081. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1082. repeat_every = rc->warn_repeat_every;
  1083. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  1084. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1085. repeat_every = rc->crit_repeat_every;
  1086. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  1087. if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
  1088. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  1089. repeat_every = 1;
  1090. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  1091. repeat_every = 1;
  1092. }
  1093. }
  1094. }
  1095. } else {
  1096. continue;
  1097. }
  1098. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  1099. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1100. rc->last_repeat = now;
  1101. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  1102. ALARM_ENTRY *ae = health_create_alarm_entry(
  1103. host,
  1104. rc->id,
  1105. rc->next_event_id++,
  1106. rc->config_hash_id,
  1107. now,
  1108. rc->name,
  1109. rc->rrdset->id,
  1110. rc->rrdset->context,
  1111. rc->rrdset->family,
  1112. rc->classification,
  1113. rc->component,
  1114. rc->type,
  1115. rc->exec,
  1116. rc->recipient,
  1117. now - rc->last_status_change,
  1118. rc->old_value,
  1119. rc->value,
  1120. rc->old_status,
  1121. rc->status,
  1122. rc->source,
  1123. rc->units,
  1124. rc->info,
  1125. rc->delay_last,
  1126. (
  1127. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1128. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1129. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1130. )
  1131. );
  1132. ae->last_repeat = rc->last_repeat;
  1133. if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  1134. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  1135. }
  1136. rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
  1137. health_process_notifications(host, ae);
  1138. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  1139. health_alarm_wait_for_execution(ae);
  1140. health_alarm_log_free_one_nochecks_nounlink(ae);
  1141. }
  1142. }
  1143. foreach_rrdcalc_in_rrdhost_done(rc);
  1144. }
  1145. if (unlikely(netdata_exit))
  1146. break;
  1147. // execute notifications
  1148. // and cleanup
  1149. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  1150. health_alarm_log_process(host);
  1151. if (unlikely(netdata_exit)) {
  1152. // wait for all notifications to finish before allowing health to be cleaned up
  1153. ALARM_ENTRY *ae;
  1154. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1155. health_alarm_wait_for_execution(ae);
  1156. }
  1157. break;
  1158. }
  1159. // wait for all notifications to finish before allowing health to be cleaned up
  1160. ALARM_ENTRY *ae;
  1161. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1162. health_alarm_wait_for_execution(ae);
  1163. }
  1164. #ifdef ENABLE_ACLK
  1165. if (netdata_cloud_setting && unlikely(host->aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  1166. sql_queue_removed_alerts_to_aclk(host);
  1167. host->aclk_alert_reloaded = 0;
  1168. marked_aclk_reload_loop = 0;
  1169. }
  1170. #endif
  1171. if(unlikely(netdata_exit))
  1172. break;
  1173. health_sleep(next_run, loop, host);
  1174. } // forever
  1175. netdata_thread_cleanup_pop(1);
  1176. return NULL;
  1177. }
  1178. void health_add_host_labels(void) {
  1179. DICTIONARY *labels = localhost->rrdlabels;
  1180. int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO);
  1181. rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_CONFIG);
  1182. int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO);
  1183. rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_CONFIG);
  1184. }
  1185. void health_thread_spawn(RRDHOST * host) {
  1186. if(!host->health_spawn) {
  1187. char tag[NETDATA_THREAD_TAG_MAX + 1];
  1188. snprintfz(tag, NETDATA_THREAD_TAG_MAX, "HEALTH[%s]", rrdhost_hostname(host));
  1189. struct health_state *health = callocz(1, sizeof(*health));
  1190. health->host = host;
  1191. if(netdata_thread_create(&host->health_thread, tag, NETDATA_THREAD_OPTION_JOINABLE, health_main, (void *) health)) {
  1192. log_health("[%s]: Failed to create new thread for client.", rrdhost_hostname(host));
  1193. error("HEALTH [%s]: Failed to create new thread for client.", rrdhost_hostname(host));
  1194. }
  1195. else {
  1196. log_health("[%s]: Created new thread for client.", rrdhost_hostname(host));
  1197. host->health_spawn = 1;
  1198. host->aclk_alert_reloaded = 1;
  1199. }
  1200. }
  1201. }