health.c 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
// Job type identifiers (0..9) used by the health code.
// NOTE(review): presumably these are registered with the worker utilization
// framework by the health thread; the registration is not visible in this section.
#define WORKER_HEALTH_JOB_RRD_LOCK 0
#define WORKER_HEALTH_JOB_HOST_LOCK 1
#define WORKER_HEALTH_JOB_DB_QUERY 2
#define WORKER_HEALTH_JOB_CALC_EVAL 3
#define WORKER_HEALTH_JOB_WARNING_EVAL 4
#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9

// Compile-time guard: the ten job ids above need WORKER_UTILIZATION_MAX_JOB_TYPES >= 10.
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
#endif
  16. static bool prepare_command(BUFFER *wb,
  17. const char *exec,
  18. const char *recipient,
  19. const char *registry_hostname,
  20. uint32_t unique_id,
  21. uint32_t alarm_id,
  22. uint32_t alarm_event_id,
  23. uint32_t when,
  24. const char *alert_name,
  25. const char *alert_chart_name,
  26. const char *alert_family,
  27. const char *new_status,
  28. const char *old_status,
  29. NETDATA_DOUBLE new_value,
  30. NETDATA_DOUBLE old_value,
  31. const char *alert_source,
  32. uint32_t duration,
  33. uint32_t non_clear_duration,
  34. const char *alert_units,
  35. const char *alert_info,
  36. const char *new_value_string,
  37. const char *old_value_string,
  38. const char *source,
  39. const char *error_msg,
  40. int n_warn,
  41. int n_crit,
  42. const char *warn_alarms,
  43. const char *crit_alarms,
  44. const char *classification,
  45. const char *edit_command,
  46. const char *machine_guid)
  47. {
  48. char buf[8192];
  49. size_t n = 8192 - 1;
  50. buffer_strcat(wb, "exec");
  51. if (!sanitize_command_argument_string(buf, exec, n))
  52. return false;
  53. buffer_sprintf(wb, " '%s'", buf);
  54. if (!sanitize_command_argument_string(buf, recipient, n))
  55. return false;
  56. buffer_sprintf(wb, " '%s'", buf);
  57. if (!sanitize_command_argument_string(buf, registry_hostname, n))
  58. return false;
  59. buffer_sprintf(wb, " '%s'", buf);
  60. buffer_sprintf(wb, " '%u'", unique_id);
  61. buffer_sprintf(wb, " '%u'", alarm_id);
  62. buffer_sprintf(wb, " '%u'", alarm_event_id);
  63. buffer_sprintf(wb, " '%u'", when);
  64. if (!sanitize_command_argument_string(buf, alert_name, n))
  65. return false;
  66. buffer_sprintf(wb, " '%s'", buf);
  67. if (!sanitize_command_argument_string(buf, alert_chart_name, n))
  68. return false;
  69. buffer_sprintf(wb, " '%s'", buf);
  70. if (!sanitize_command_argument_string(buf, alert_family, n))
  71. return false;
  72. buffer_sprintf(wb, " '%s'", buf);
  73. if (!sanitize_command_argument_string(buf, new_status, n))
  74. return false;
  75. buffer_sprintf(wb, " '%s'", buf);
  76. if (!sanitize_command_argument_string(buf, old_status, n))
  77. return false;
  78. buffer_sprintf(wb, " '%s'", buf);
  79. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
  80. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
  81. if (!sanitize_command_argument_string(buf, alert_source, n))
  82. return false;
  83. buffer_sprintf(wb, " '%s'", buf);
  84. buffer_sprintf(wb, " '%u'", duration);
  85. buffer_sprintf(wb, " '%u'", non_clear_duration);
  86. if (!sanitize_command_argument_string(buf, alert_units, n))
  87. return false;
  88. buffer_sprintf(wb, " '%s'", buf);
  89. if (!sanitize_command_argument_string(buf, alert_info, n))
  90. return false;
  91. buffer_sprintf(wb, " '%s'", buf);
  92. if (!sanitize_command_argument_string(buf, new_value_string, n))
  93. return false;
  94. buffer_sprintf(wb, " '%s'", buf);
  95. if (!sanitize_command_argument_string(buf, old_value_string, n))
  96. return false;
  97. buffer_sprintf(wb, " '%s'", buf);
  98. if (!sanitize_command_argument_string(buf, source, n))
  99. return false;
  100. buffer_sprintf(wb, " '%s'", buf);
  101. if (!sanitize_command_argument_string(buf, error_msg, n))
  102. return false;
  103. buffer_sprintf(wb, " '%s'", buf);
  104. buffer_sprintf(wb, " '%d'", n_warn);
  105. buffer_sprintf(wb, " '%d'", n_crit);
  106. if (!sanitize_command_argument_string(buf, warn_alarms, n))
  107. return false;
  108. buffer_sprintf(wb, " '%s'", buf);
  109. if (!sanitize_command_argument_string(buf, crit_alarms, n))
  110. return false;
  111. buffer_sprintf(wb, " '%s'", buf);
  112. if (!sanitize_command_argument_string(buf, classification, n))
  113. return false;
  114. buffer_sprintf(wb, " '%s'", buf);
  115. if (!sanitize_command_argument_string(buf, edit_command, n))
  116. return false;
  117. buffer_sprintf(wb, " '%s'", buf);
  118. if (!sanitize_command_argument_string(buf, machine_guid, n))
  119. return false;
  120. buffer_sprintf(wb, " '%s'", buf);
  121. return true;
  122. }
// Health is on by default; health_init() replaces this with the value of the
// "enabled" option of the health configuration section.
unsigned int default_health_enabled = 1;
// Path of the silencers file opened by health_silencers_init().
// It is not assigned in the visible part of this file.
char *silencers_filename;
// Pattern created from the "enabled alarms" health option in initialize_health().
SIMPLE_PATTERN *conf_enabled_alarms = NULL;
// the queue of executed alarm notifications that haven't been waited for yet
// (doubly linked through ALARM_ENTRY.prev_in_progress / next_in_progress;
// entries are appended at the tail by enqueue_alarm_notify_in_progress()
// and removed by unlink_alarm_notify_in_progress())
static struct {
    ALARM_ENTRY *head; // oldest
    ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
// One currently raised alert, as collected by health_alarm_execute() to build
// the lists of warning / critical alerts passed to the notification command.
typedef struct active_alerts {
    char *name;                 // alert name, taken from rrdcalc_name(); not owned by this struct
    time_t last_status_change;  // copied from the alert's last_status_change; used as the sort key
    RRDCALC_STATUS status;      // RRDCALC_STATUS_WARNING or RRDCALC_STATUS_CRITICAL
} active_alerts_t;
  136. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  137. {
  138. ae->prev_in_progress = NULL;
  139. ae->next_in_progress = NULL;
  140. if (NULL != alarm_notifications_in_progress.tail) {
  141. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  142. alarm_notifications_in_progress.tail->next_in_progress = ae;
  143. }
  144. if (NULL == alarm_notifications_in_progress.head) {
  145. alarm_notifications_in_progress.head = ae;
  146. }
  147. alarm_notifications_in_progress.tail = ae;
  148. }
  149. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  150. {
  151. struct alarm_entry *prev = ae->prev_in_progress;
  152. struct alarm_entry *next = ae->next_in_progress;
  153. if (NULL != prev) {
  154. prev->next_in_progress = next;
  155. }
  156. if (NULL != next) {
  157. next->prev_in_progress = prev;
  158. }
  159. if (ae == alarm_notifications_in_progress.head) {
  160. alarm_notifications_in_progress.head = next;
  161. }
  162. if (ae == alarm_notifications_in_progress.tail) {
  163. alarm_notifications_in_progress.tail = prev;
  164. }
  165. }
  166. // ----------------------------------------------------------------------------
  167. // health initialization
  168. /**
  169. * User Config directory
  170. *
  171. * Get the config directory for health and return it.
  172. *
  173. * @return a pointer to the user config directory
  174. */
  175. inline char *health_user_config_dir(void) {
  176. char buffer[FILENAME_MAX + 1];
  177. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  178. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  179. }
  180. /**
  181. * Stock Config Directory
  182. *
  183. * Get the Stock config directory and return it.
  184. *
  185. * @return a pointer to the stock config directory.
  186. */
  187. inline char *health_stock_config_dir(void) {
  188. char buffer[FILENAME_MAX + 1];
  189. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  190. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  191. }
  192. /**
  193. * Silencers init
  194. *
  195. * Function used to initialize the silencer structure.
  196. */
  197. static void health_silencers_init(void) {
  198. FILE *fd = fopen(silencers_filename, "r");
  199. if (fd) {
  200. fseek(fd, 0 , SEEK_END);
  201. off_t length = (off_t) ftell(fd);
  202. fseek(fd, 0 , SEEK_SET);
  203. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  204. char *str = mallocz((length+1)* sizeof(char));
  205. if(str) {
  206. size_t copied;
  207. copied = fread(str, sizeof(char), length, fd);
  208. if (copied == (length* sizeof(char))) {
  209. str[length] = 0x00;
  210. json_parse(str, NULL, health_silencers_json_read_callback);
  211. info("Parsed health silencers file %s", silencers_filename);
  212. } else {
  213. error("Cannot read the data from health silencers file %s", silencers_filename);
  214. }
  215. freez(str);
  216. }
  217. } else {
  218. error(
  219. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  220. silencers_filename,
  221. (int64_t)length,
  222. HEALTH_SILENCERS_MAX_FILE_LEN);
  223. }
  224. fclose(fd);
  225. } else {
  226. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  227. }
  228. }
  229. /**
  230. * Health Init
  231. *
  232. * Initialize the health thread.
  233. */
  234. void health_init(void) {
  235. debug(D_HEALTH, "Health configuration initializing");
  236. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  237. debug(D_HEALTH, "Health is disabled.");
  238. return;
  239. }
  240. health_silencers_init();
  241. }
  242. // ----------------------------------------------------------------------------
  243. // re-load health configuration
  244. /**
  245. * Reload host
  246. *
  247. * Reload configuration for a specific host.
  248. *
  249. * @param host the structure of the host that the function will reload the configuration.
  250. */
  251. static void health_reload_host(RRDHOST *host) {
  252. if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
  253. return;
  254. log_health("[%s]: Reloading health.", rrdhost_hostname(host));
  255. char *user_path = health_user_config_dir();
  256. char *stock_path = health_stock_config_dir();
  257. // free all running alarms
  258. rrdcalc_delete_all(host);
  259. rrdcalctemplate_delete_all(host);
  260. // invalidate all previous entries in the alarm log
  261. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  262. ALARM_ENTRY *t;
  263. for(t = host->health_log.alarms ; t ; t = t->next) {
  264. if(t->new_status != RRDCALC_STATUS_REMOVED)
  265. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  266. }
  267. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  268. // reset all thresholds to all charts
  269. RRDSET *st;
  270. rrdset_foreach_read(st, host) {
  271. st->green = NAN;
  272. st->red = NAN;
  273. }
  274. rrdset_foreach_done(st);
  275. // load the new alarms
  276. health_readdir(host, user_path, stock_path, NULL);
  277. //Discard alarms with labels that do not apply to host
  278. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  279. // link the loaded alarms to their charts
  280. rrdset_foreach_write(st, host) {
  281. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  282. continue;
  283. rrdcalc_link_matching_alerts_to_rrdset(st);
  284. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  285. }
  286. rrdset_foreach_done(st);
  287. }
  288. /**
  289. * Reload
  290. *
  291. * Reload the host configuration for all hosts.
  292. */
  293. void health_reload(void) {
  294. sql_refresh_hashes();
  295. rrd_rdlock();
  296. RRDHOST *host;
  297. rrdhost_foreach_read(host)
  298. health_reload_host(host);
  299. rrd_unlock();
  300. #ifdef ENABLE_ACLK
  301. if (netdata_cloud_setting) {
  302. aclk_alert_reloaded = 1;
  303. }
  304. #endif
  305. }
  306. // ----------------------------------------------------------------------------
  307. // health main thread and friends
  308. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  309. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  310. if(n) return RRDCALC_STATUS_RAISED;
  311. return RRDCALC_STATUS_CLEAR;
  312. }
// maximum number of raised (warning + critical) alerts health_alarm_execute() collects per notification
#define ACTIVE_ALARMS_LIST_EXAMINE 500
// maximum number of collected alerts that are written to the warning / critical lists of the command
#define ACTIVE_ALARMS_LIST 15
  315. static inline int compare_active_alerts(const void * a, const void * b) {
  316. active_alerts_t *active_alerts_a = (active_alerts_t *)a;
  317. active_alerts_t *active_alerts_b = (active_alerts_t *)b;
  318. return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
  319. }
/*
 * Decide whether an alarm log entry needs a notification and, if it does,
 * build the notification command and queue it for execution.
 *
 * The entry is always flagged as PROCESSED. When no notification is sent
 * (internal status, no-clear-notification, same status already notified,
 * first CLEAR, silenced) the entry is saved with health_alarm_log_save().
 * When a command is queued, the entry is flagged EXEC_RUN | EXEC_IN_PROGRESS,
 * added to the in-progress queue, and the function returns without saving;
 * the result is collected later by health_alarm_wait_for_execution().
 * When the command cannot be formatted, an error is logged and the function
 * also returns without saving.
 */
static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
    ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;

    if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
        // do not send notifications for internal statuses
        debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
        goto done;
    }

    if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
        // do not send notifications for disabled statuses
        debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
        log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
        // mark it as run, so that we will send the same alarm if it happens again
        goto done;
    }

    // find the previous notification for the same alarm
    // which we have run the exec script
    // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
    if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
        uint32_t id = ae->alarm_id;
        ALARM_ENTRY *t;
        // walk the entries linked after this one
        // (presumably the older ones - confirm the ordering of the alarm log)
        for(t = ae->next; t ; t = t->next) {
            if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
                break;
        }

        if(likely(t)) {
            // we have executed this alarm notification in the past
            if(t && t->new_status == ae->new_status) {
                // don't send the notification for the same status again
                debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
                      , rrdcalc_status2string(ae->new_status));
                log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
                           , rrdcalc_status2string(ae->new_status));
                goto done;
            }
        }
        else {
            // we have not executed this alarm notification in the past
            // so, don't send CLEAR notifications
            if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
                // a run-once entry is still notified when its old status was at least RAISED
                if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
                    debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
                          , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
                    goto done;
                }
            }
        }
    }

    // Check if alarm notifications are silenced
    if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
        log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
        goto done;
    }

    log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));

    // the entry's own exec / recipient take precedence over the host defaults
    const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
    const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);

    int n_warn=0, n_crit=0;
    RRDCALC *rc;
    EVAL_EXPRESSION *expr=NULL;
    BUFFER *warn_alarms, *crit_alarms;
    active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));

    warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
    crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);

    // collect the other alerts of the host that are currently in warning or critical
    // (at most ACTIVE_ALARMS_LIST_EXAMINE); the alert this entry belongs to is not
    // collected when the entry is its latest event - its expression is remembered in expr
    foreach_rrdcalc_in_rrdhost_read(host, rc) {
        if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
            continue;

        if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
            break;

        if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
            if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
                active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
                active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
                active_alerts[n_warn+n_crit].status = rc->status;
                n_warn++;
            } else if (ae->alarm_id == rc->id)
                expr = rc->warning;
        } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
            if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
                active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
                active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
                active_alerts[n_warn+n_crit].status = rc->status;
                n_crit++;
            } else if (ae->alarm_id == rc->id)
                expr = rc->critical;
        } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
            if (ae->alarm_id == rc->id)
                expr = rc->warning;
        }
    }
    foreach_rrdcalc_in_rrdhost_done(rc);

    // order the collected alerts with compare_active_alerts() (by last status change)
    if (n_warn+n_crit>1)
        qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);

    // write the first ACTIVE_ALARMS_LIST collected alerts as comma separated
    // "name=timestamp" pairs, warnings and criticals in separate buffers
    int count_w = 0, count_c = 0;
    while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
        if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
            if (count_w)
                buffer_strcat(warn_alarms, ",");
            buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
            buffer_strcat(warn_alarms, "=");
            buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
            count_w++;
        }
        else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
            if (count_c)
                buffer_strcat(crit_alarms, ",");
            buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
            buffer_strcat(crit_alarms, "=");
            buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
            count_c++;
        }
    }

    // edit_command is heap allocated in both cases and released with freez() below
    char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");

    BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health);
    // note: n_warn / n_crit are the numbers of collected alerts, not of the listed ones
    bool ok = prepare_command(wb,
                              exec,
                              recipient,
                              rrdhost_registry_hostname(host),
                              ae->unique_id,
                              ae->alarm_id,
                              ae->alarm_event_id,
                              (unsigned long)ae->when,
                              ae_name(ae),
                              ae->chart?ae_chart_name(ae):"NOCHART",
                              ae->family?ae_family(ae):"NOFAMILY",
                              rrdcalc_status2string(ae->new_status),
                              rrdcalc_status2string(ae->old_status),
                              ae->new_value,
                              ae->old_value,
                              ae->source?ae_source(ae):"UNKNOWN",
                              (uint32_t)ae->duration,
                              (uint32_t)ae->non_clear_duration,
                              ae_units(ae),
                              ae_info(ae),
                              ae_new_value_string(ae),
                              ae_old_value_string(ae),
                              (expr && expr->source)?expr->source:"NOSOURCE",
                              (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
                              n_warn,
                              n_crit,
                              buffer_tostring(warn_alarms),
                              buffer_tostring(crit_alarms),
                              ae->classification?ae_classification(ae):"Unknown",
                              edit_command,
                              host != localhost ? host->machine_guid:"");

    const char *command_to_run = buffer_tostring(wb);
    if (ok) {
        ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
        ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */

        debug(D_HEALTH, "executing command '%s'", command_to_run);
        ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
        // queue the command; the serial is what health_alarm_wait_for_execution() waits on
        ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
        enqueue_alarm_notify_in_progress(ae);
    } else {
        error("Failed to format command arguments");
    }

    // release everything allocated for building the command
    buffer_free(wb);
    freez(edit_command);
    buffer_free(warn_alarms);
    buffer_free(crit_alarms);
    freez(active_alerts);

    return; //health_alarm_wait_for_execution
done:
    // no notification is sent for this entry: just save it
    health_alarm_log_save(host, ae);
}
  483. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  484. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  485. return;
  486. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  487. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  488. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  489. if(ae->exec_code != 0)
  490. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  491. unlink_alarm_notify_in_progress(ae);
  492. }
/*
 * Log (at debug level) the status transition of an alarm entry and pass the
 * entry to health_alarm_execute(), which decides whether a notification is sent.
 */
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
    debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
          ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
          ae->new_value,
          rrdcalc_status2string(ae->old_status),
          rrdcalc_status2string(ae->new_status)
    );

    health_alarm_execute(host, ae);
}
  502. static inline void health_alarm_log_process(RRDHOST *host) {
  503. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  504. time_t now = now_realtime_sec();
  505. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  506. ALARM_ENTRY *ae;
  507. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  508. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
  509. if(unlikely(
  510. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  511. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  512. )) {
  513. if(unlikely(ae->unique_id < first_waiting))
  514. first_waiting = ae->unique_id;
  515. if(likely(now >= ae->delay_up_to_timestamp))
  516. health_process_notifications(host, ae);
  517. }
  518. }
  519. }
  520. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  521. // remember this for the next iteration
  522. host->health_last_processed_id = first_waiting;
  523. bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
  524. if (!cleanup_excess_log_entries)
  525. return;
  526. // cleanup excess entries in the log
  527. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  528. ALARM_ENTRY *last = NULL;
  529. unsigned int count = host->health_log.max * 2 / 3;
  530. for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
  531. if(ae && last && last->next == ae)
  532. last->next = NULL;
  533. else
  534. ae = NULL;
  535. while(ae) {
  536. debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
  537. ALARM_ENTRY *t = ae->next;
  538. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
  539. health_alarm_wait_for_execution(ae);
  540. health_alarm_log_free_one_nochecks_nounlink(ae);
  541. host->health_log.count--;
  542. }
  543. ae = t;
  544. }
  545. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  546. }
  547. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  548. if(unlikely(!rc->rrdset)) {
  549. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  550. return 0;
  551. }
  552. if(unlikely(rc->next_update > now)) {
  553. if (unlikely(*next_run > rc->next_update)) {
  554. // update the next_run time of the main loop
  555. // to run this alarm precisely the time required
  556. *next_run = rc->next_update;
  557. }
  558. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
  559. return 0;
  560. }
  561. if(unlikely(!rc->update_every)) {
  562. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  563. return 0;
  564. }
  565. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  566. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  567. return 0;
  568. }
  569. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  570. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  571. return 0;
  572. }
  573. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  574. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  575. return 0;
  576. }
  577. int update_every = rc->rrdset->update_every;
  578. time_t first = rrdset_first_entry_s(rc->rrdset);
  579. time_t last = rrdset_last_entry_s(rc->rrdset);
  580. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  581. debug(D_HEALTH
  582. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  583. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
  584. , (unsigned long) last);
  585. return 0;
  586. }
  587. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  588. time_t needed = now + rc->before + rc->after;
  589. if(needed + update_every < first || needed - update_every > last) {
  590. debug(D_HEALTH
  591. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  592. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
  593. , (unsigned long) last);
  594. return 0;
  595. }
  596. }
  597. return 1;
  598. }
  599. static inline int check_if_resumed_from_suspension(void) {
  600. static usec_t last_realtime = 0, last_monotonic = 0;
  601. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  602. int ret = 0;
  603. // detect if monotonic and realtime have twice the difference
  604. // in which case we assume the system was just waken from hibernation
  605. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  606. ret = 1;
  607. last_realtime = realtime;
  608. last_monotonic = monotonic;
  609. return ret;
  610. }
  611. static void health_main_cleanup(void *ptr) {
  612. worker_unregister();
  613. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  614. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  615. info("cleaning up...");
  616. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  617. log_health("Health thread ended.");
  618. }
  619. static void initialize_health(RRDHOST *host, int is_localhost) {
  620. if(!host->health.health_enabled ||
  621. rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) ||
  622. !service_running(SERVICE_HEALTH))
  623. return;
  624. rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
  625. log_health("[%s]: Initializing health.", rrdhost_hostname(host));
  626. host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
  627. host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
  628. host->health_log.next_log_id = 1;
  629. host->health_log.next_alarm_id = 1;
  630. host->health_log.max = 1000;
  631. host->health_log.next_log_id = (uint32_t)now_realtime_sec();
  632. host->health_log.next_alarm_id = 0;
  633. long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
  634. if(n < 10) {
  635. error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
  636. config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
  637. }
  638. else
  639. host->health_log.max = (unsigned int)n;
  640. conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT);
  641. netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
  642. char filename[FILENAME_MAX + 1];
  643. if(!is_localhost) {
  644. int r = mkdir(host->varlib_dir, 0775);
  645. if (r != 0 && errno != EEXIST)
  646. error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
  647. }
  648. {
  649. snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
  650. int r = mkdir(filename, 0775);
  651. if(r != 0 && errno != EEXIST)
  652. error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
  653. }
  654. snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
  655. host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
  656. host->health.health_default_recipient = string_strdupz("root");
  657. // TODO: This needs to go to the metadata thread
  658. // Health should wait before accessing the table (needs to be created by the metadata thread)
  659. sql_create_health_log_table(host);
  660. sql_health_alarm_log_load(host);
  661. // ------------------------------------------------------------------------
  662. // load health configuration
  663. health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
  664. // link the loaded alarms to their charts
  665. RRDSET *st;
  666. rrdset_foreach_write(st, host) {
  667. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  668. continue;
  669. rrdcalc_link_matching_alerts_to_rrdset(st);
  670. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  671. }
  672. rrdset_foreach_done(st);
  673. //Discard alarms with labels that do not apply to host
  674. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  675. }
  676. static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
  677. time_t now = now_realtime_sec();
  678. if(now < next_run) {
  679. worker_is_idle();
  680. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  681. while (now < next_run && service_running(SERVICE_HEALTH)) {
  682. sleep_usec(USEC_PER_SEC);
  683. now = now_realtime_sec();
  684. }
  685. }
  686. else {
  687. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  688. }
  689. }
  690. static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
  691. SILENCER *s;
  692. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  693. rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
  694. for (s = silencers->silencers; s!=NULL; s=s->next){
  695. if (
  696. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
  697. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
  698. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
  699. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
  700. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
  701. ) {
  702. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  703. if (unlikely(silencers->stype == STYPE_NONE)) {
  704. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
  705. } else {
  706. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  707. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  708. , rrdcalc_name(rc)
  709. , (rc->rrdset)?rrdset_context(rc->rrdset):""
  710. , rrdcalc_chart_name(rc)
  711. , host
  712. , (rc->rrdset)?rrdset_family(rc->rrdset):""
  713. );
  714. }
  715. return silencers->stype;
  716. }
  717. }
  718. return STYPE_NONE;
  719. }
  720. /**
  721. * Update Disabled Silenced
  722. *
  723. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  724. *
  725. * @param host structure that contains information about the host monitored.
  726. * @param rc structure with information about the alarm
  727. *
  728. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  729. */
  730. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  731. uint32_t rrdcalc_flags_old = rc->run_flags;
  732. // Clear the flags
  733. rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  734. if (unlikely(silencers->all_alarms)) {
  735. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  736. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  737. } else {
  738. SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
  739. if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  740. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  741. }
  742. if (rrdcalc_flags_old != rc->run_flags) {
  743. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  744. rrdhost_hostname(host),
  745. rrdcalc_name(rc),
  746. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  747. (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  748. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  749. (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  750. );
  751. }
  752. if (rc->run_flags & RRDCALC_FLAG_DISABLED)
  753. return 1;
  754. else
  755. return 0;
  756. }
  757. static void health_execute_delayed_initializations(RRDHOST *host) {
  758. RRDSET *st;
  759. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
  760. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
  761. rrdset_foreach_reentrant(st, host) {
  762. if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  763. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
  764. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
  765. if(!st->rrdfamily)
  766. st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
  767. if(!st->rrdvars)
  768. st->rrdvars = rrdvariables_create();
  769. rrddimvar_index_init(st);
  770. rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
  771. rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
  772. rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
  773. rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
  774. rrdcalc_link_matching_alerts_to_rrdset(st);
  775. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  776. RRDDIM *rd;
  777. rrddim_foreach_read(rd, st) {
  778. if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  779. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
  780. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
  781. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
  782. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
  783. rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
  784. RRDCALCTEMPLATE *rt;
  785. foreach_rrdcalctemplate_read(host, rt) {
  786. if(!rt->foreach_dimension_pattern)
  787. continue;
  788. if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
  789. rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
  790. }
  791. foreach_rrdcalctemplate_done(rt);
  792. }
  793. rrddim_foreach_done(rd);
  794. }
  795. rrdset_foreach_done(st);
  796. }
  797. /**
  798. * Health Main
  799. *
  800. * The main thread of the health system. In this function all the alarms will be processed.
  801. *
  802. * @param ptr is a pointer to the netdata_static_thread structure.
  803. *
  804. * @return It always returns NULL
  805. */
  806. void *health_main(void *ptr) {
  807. worker_register("HEALTH");
  808. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  809. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  810. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  811. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  812. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  813. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  814. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  815. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  816. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
  817. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
  818. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  819. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  820. if(min_run_every < 1) min_run_every = 1;
  821. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  822. bool health_running_logged = false;
  823. rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
  824. unsigned int loop = 0;
  825. #ifdef ENABLE_ACLK
  826. unsigned int marked_aclk_reload_loop = 0;
  827. #endif
  828. while(service_running(SERVICE_HEALTH)) {
  829. loop++;
  830. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  831. time_t now = now_realtime_sec();
  832. int runnable = 0, apply_hibernation_delay = 0;
  833. time_t next_run = now + min_run_every;
  834. RRDCALC *rc;
  835. RRDHOST *host;
  836. if (unlikely(check_if_resumed_from_suspension())) {
  837. apply_hibernation_delay = 1;
  838. log_health(
  839. "Postponing alarm checks for %"PRId64" seconds, "
  840. "because it seems that the system was just resumed from suspension.",
  841. (int64_t)hibernation_delay);
  842. }
  843. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  844. static int logged=0;
  845. if (!logged) {
  846. log_health("Skipping health checks, because all alarms are disabled via a %s command.",
  847. HEALTH_CMDAPI_CMD_DISABLEALL);
  848. logged = 1;
  849. }
  850. }
  851. #ifdef ENABLE_ACLK
  852. if (aclk_alert_reloaded && !marked_aclk_reload_loop)
  853. marked_aclk_reload_loop = loop;
  854. #endif
  855. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  856. rrd_rdlock();
  857. rrdhost_foreach_read(host) {
  858. if(unlikely(!service_running(SERVICE_HEALTH)))
  859. break;
  860. if (unlikely(!host->health.health_enabled))
  861. continue;
  862. if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) {
  863. rrd_unlock();
  864. initialize_health(host, host == localhost);
  865. rrd_rdlock();
  866. }
  867. health_execute_delayed_initializations(host);
  868. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  869. if (unlikely(apply_hibernation_delay)) {
  870. log_health(
  871. "[%s]: Postponing health checks for %"PRId64" seconds.",
  872. rrdhost_hostname(host),
  873. (int64_t)hibernation_delay);
  874. host->health.health_delay_up_to = now + hibernation_delay;
  875. }
  876. if (unlikely(host->health.health_delay_up_to)) {
  877. if (unlikely(now < host->health.health_delay_up_to)) {
  878. continue;
  879. }
  880. log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
  881. host->health.health_delay_up_to = 0;
  882. }
  883. // wait until cleanup of obsolete charts on children is complete
  884. if (host != localhost) {
  885. if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
  886. log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
  887. continue;
  888. }
  889. }
  890. if (!health_running_logged) {
  891. log_health("[%s]: Health is running.", rrdhost_hostname(host));
  892. health_running_logged = true;
  893. }
  894. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  895. // the first loop is to lookup values from the db
  896. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  897. if(unlikely(!service_running(SERVICE_HEALTH)))
  898. break;
  899. rrdcalc_update_info_using_rrdset_labels(rc);
  900. if (update_disabled_silenced(host, rc))
  901. continue;
  902. // create an alert removed event if the chart is obsolete and
  903. // has stopped being collected for 60 seconds
  904. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  905. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  906. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  907. if (!rrdcalc_isrepeating(rc)) {
  908. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  909. time_t now = now_realtime_sec();
  910. ALARM_ENTRY *ae = health_create_alarm_entry(
  911. host,
  912. rc->id,
  913. rc->next_event_id++,
  914. rc->config_hash_id,
  915. now,
  916. rc->name,
  917. rc->rrdset->id,
  918. rc->rrdset->context,
  919. rc->rrdset->family,
  920. rc->classification,
  921. rc->component,
  922. rc->type,
  923. rc->exec,
  924. rc->recipient,
  925. now - rc->last_status_change,
  926. rc->value,
  927. NAN,
  928. rc->status,
  929. RRDCALC_STATUS_REMOVED,
  930. rc->source,
  931. rc->units,
  932. rc->info,
  933. 0,
  934. rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
  935. if (ae) {
  936. health_alarm_log_add_entry(host, ae);
  937. rc->old_status = rc->status;
  938. rc->status = RRDCALC_STATUS_REMOVED;
  939. rc->last_status_change = now;
  940. rc->last_updated = now;
  941. rc->value = NAN;
  942. #ifdef ENABLE_ACLK
  943. if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
  944. sql_queue_alarm_to_aclk(host, ae, 1);
  945. #endif
  946. }
  947. }
  948. }
  949. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  950. if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
  951. rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
  952. continue;
  953. }
  954. runnable++;
  955. rc->old_value = rc->value;
  956. rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
  957. // ------------------------------------------------------------
  958. // if there is database lookup, do it
  959. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  960. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  961. /* time_t old_db_timestamp = rc->db_before; */
  962. int value_is_null = 0;
  963. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
  964. rc->after, rc->before, rc->group, NULL,
  965. 0, rc->options,
  966. &rc->db_after,&rc->db_before,
  967. NULL, NULL, NULL,
  968. &value_is_null, NULL, 0, 0,
  969. QUERY_SOURCE_HEALTH, STORAGE_PRIORITY_LOW);
  970. if (unlikely(ret != 200)) {
  971. // database lookup failed
  972. rc->value = NAN;
  973. rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
  974. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  975. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
  976. );
  977. } else
  978. rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
  979. if (unlikely(value_is_null)) {
  980. // collected value is null
  981. rc->value = NAN;
  982. rc->run_flags |= RRDCALC_FLAG_DB_NAN;
  983. debug(D_HEALTH,
  984. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  985. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
  986. );
  987. } else
  988. rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
  989. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
  990. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
  991. );
  992. }
  993. // ------------------------------------------------------------
  994. // if there is calculation expression, run it
  995. if (unlikely(rc->calculation)) {
  996. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  997. if (unlikely(!expression_evaluate(rc->calculation))) {
  998. // calculation failed
  999. rc->value = NAN;
  1000. rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
  1001. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  1002. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1003. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  1004. );
  1005. } else {
  1006. rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  1007. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  1008. NETDATA_DOUBLE_FORMAT
  1009. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1010. rc->calculation->parsed_as, rc->calculation->result,
  1011. buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
  1012. );
  1013. rc->value = rc->calculation->result;
  1014. }
  1015. }
  1016. }
  1017. foreach_rrdcalc_in_rrdhost_done(rc);
  1018. if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
  1019. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1020. if(unlikely(!service_running(SERVICE_HEALTH)))
  1021. break;
  1022. if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
  1023. continue;
  1024. if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
  1025. continue;
  1026. }
  1027. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  1028. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  1029. // --------------------------------------------------------
  1030. // check the warning expression
  1031. if (likely(rc->warning)) {
  1032. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  1033. if (unlikely(!expression_evaluate(rc->warning))) {
  1034. // calculation failed
  1035. rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
  1036. debug(D_HEALTH,
  1037. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  1038. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1039. buffer_tostring(rc->warning->error_msg)
  1040. );
  1041. } else {
  1042. rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  1043. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  1044. NETDATA_DOUBLE_FORMAT
  1045. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1046. rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
  1047. );
  1048. warning_status = rrdcalc_value2status(rc->warning->result);
  1049. }
  1050. }
  1051. // --------------------------------------------------------
  1052. // check the critical expression
  1053. if (likely(rc->critical)) {
  1054. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  1055. if (unlikely(!expression_evaluate(rc->critical))) {
  1056. // calculation failed
  1057. rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
  1058. debug(D_HEALTH,
  1059. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  1060. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1061. buffer_tostring(rc->critical->error_msg)
  1062. );
  1063. } else {
  1064. rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  1065. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  1066. NETDATA_DOUBLE_FORMAT
  1067. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1068. rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
  1069. rrdcalc_source(rc)
  1070. );
  1071. critical_status = rrdcalc_value2status(rc->critical->result);
  1072. }
  1073. }
  1074. // --------------------------------------------------------
  1075. // decide the final alarm status
  1076. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  1077. switch (warning_status) {
  1078. case RRDCALC_STATUS_CLEAR:
  1079. status = RRDCALC_STATUS_CLEAR;
  1080. break;
  1081. case RRDCALC_STATUS_RAISED:
  1082. status = RRDCALC_STATUS_WARNING;
  1083. break;
  1084. default:
  1085. break;
  1086. }
  1087. switch (critical_status) {
  1088. case RRDCALC_STATUS_CLEAR:
  1089. if (status == RRDCALC_STATUS_UNDEFINED)
  1090. status = RRDCALC_STATUS_CLEAR;
  1091. break;
  1092. case RRDCALC_STATUS_RAISED:
  1093. status = RRDCALC_STATUS_CRITICAL;
  1094. break;
  1095. default:
  1096. break;
  1097. }
  1098. // --------------------------------------------------------
  1099. // check if the new status and the old differ
  1100. if (status != rc->status) {
  1101. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1102. int delay = 0;
  1103. // apply trigger hysteresis
  1104. if (now > rc->delay_up_to_timestamp) {
  1105. rc->delay_up_current = rc->delay_up_duration;
  1106. rc->delay_down_current = rc->delay_down_duration;
  1107. rc->delay_last = 0;
  1108. rc->delay_up_to_timestamp = 0;
  1109. } else {
  1110. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  1111. if (rc->delay_up_current > rc->delay_max_duration)
  1112. rc->delay_up_current = rc->delay_max_duration;
  1113. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  1114. if (rc->delay_down_current > rc->delay_max_duration)
  1115. rc->delay_down_current = rc->delay_max_duration;
  1116. }
  1117. if (status > rc->status)
  1118. delay = rc->delay_up_current;
  1119. else
  1120. delay = rc->delay_down_current;
  1121. // COMMENTED: because we do need to send raising alarms
  1122. // if(now + delay < rc->delay_up_to_timestamp)
  1123. // delay = (int)(rc->delay_up_to_timestamp - now);
  1124. rc->delay_last = delay;
  1125. rc->delay_up_to_timestamp = now + delay;
  1126. ALARM_ENTRY *ae = health_create_alarm_entry(
  1127. host,
  1128. rc->id,
  1129. rc->next_event_id++,
  1130. rc->config_hash_id,
  1131. now,
  1132. rc->name,
  1133. rc->rrdset->id,
  1134. rc->rrdset->context,
  1135. rc->rrdset->family,
  1136. rc->classification,
  1137. rc->component,
  1138. rc->type,
  1139. rc->exec,
  1140. rc->recipient,
  1141. now - rc->last_status_change,
  1142. rc->old_value,
  1143. rc->value,
  1144. rc->status,
  1145. status,
  1146. rc->source,
  1147. rc->units,
  1148. rc->info,
  1149. rc->delay_last,
  1150. (
  1151. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1152. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1153. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1154. )
  1155. );
  1156. health_alarm_log_add_entry(host, ae);
  1157. log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
  1158. rc->last_status_change = now;
  1159. rc->old_status = rc->status;
  1160. rc->status = status;
  1161. }
  1162. rc->last_updated = now;
  1163. rc->next_update = now + rc->update_every;
  1164. if (next_run > rc->next_update)
  1165. next_run = rc->next_update;
  1166. }
  1167. foreach_rrdcalc_in_rrdhost_done(rc);
  1168. // process repeating alarms
  1169. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1170. if(unlikely(!service_running(SERVICE_HEALTH)))
  1171. break;
  1172. int repeat_every = 0;
  1173. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  1174. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  1175. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1176. repeat_every = rc->warn_repeat_every;
  1177. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  1178. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1179. repeat_every = rc->crit_repeat_every;
  1180. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  1181. if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
  1182. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  1183. repeat_every = 1;
  1184. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  1185. repeat_every = 1;
  1186. }
  1187. }
  1188. }
  1189. } else {
  1190. continue;
  1191. }
  1192. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  1193. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1194. rc->last_repeat = now;
  1195. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  1196. ALARM_ENTRY *ae = health_create_alarm_entry(
  1197. host,
  1198. rc->id,
  1199. rc->next_event_id++,
  1200. rc->config_hash_id,
  1201. now,
  1202. rc->name,
  1203. rc->rrdset->id,
  1204. rc->rrdset->context,
  1205. rc->rrdset->family,
  1206. rc->classification,
  1207. rc->component,
  1208. rc->type,
  1209. rc->exec,
  1210. rc->recipient,
  1211. now - rc->last_status_change,
  1212. rc->old_value,
  1213. rc->value,
  1214. rc->old_status,
  1215. rc->status,
  1216. rc->source,
  1217. rc->units,
  1218. rc->info,
  1219. rc->delay_last,
  1220. (
  1221. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1222. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1223. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1224. )
  1225. );
  1226. ae->last_repeat = rc->last_repeat;
  1227. if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  1228. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  1229. }
  1230. rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
  1231. health_process_notifications(host, ae);
  1232. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  1233. health_alarm_wait_for_execution(ae);
  1234. health_alarm_log_free_one_nochecks_nounlink(ae);
  1235. }
  1236. }
  1237. foreach_rrdcalc_in_rrdhost_done(rc);
  1238. }
  1239. if (unlikely(!service_running(SERVICE_HEALTH)))
  1240. break;
  1241. // execute notifications
  1242. // and cleanup
  1243. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  1244. health_alarm_log_process(host);
  1245. if (unlikely(!service_running(SERVICE_HEALTH))) {
  1246. // wait for all notifications to finish before allowing health to be cleaned up
  1247. ALARM_ENTRY *ae;
  1248. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1249. if(unlikely(!service_running(SERVICE_HEALTH)))
  1250. break;
  1251. health_alarm_wait_for_execution(ae);
  1252. }
  1253. break;
  1254. }
  1255. } //for each host
  1256. rrd_unlock();
  1257. // wait for all notifications to finish before allowing health to be cleaned up
  1258. ALARM_ENTRY *ae;
  1259. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1260. if(unlikely(!service_running(SERVICE_HEALTH)))
  1261. break;
  1262. health_alarm_wait_for_execution(ae);
  1263. }
  1264. #ifdef ENABLE_ACLK
  1265. if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
  1266. rrdhost_foreach_read(host) {
  1267. if(unlikely(!service_running(SERVICE_HEALTH)))
  1268. break;
  1269. if (unlikely(!host->health.health_enabled))
  1270. continue;
  1271. sql_queue_removed_alerts_to_aclk(host);
  1272. }
  1273. aclk_alert_reloaded = 0;
  1274. marked_aclk_reload_loop = 0;
  1275. }
  1276. #endif
  1277. if(unlikely(!service_running(SERVICE_HEALTH)))
  1278. break;
  1279. health_sleep(next_run, loop);
  1280. } // forever
  1281. netdata_thread_cleanup_pop(1);
  1282. return NULL;
  1283. }
  1284. void health_add_host_labels(void) {
  1285. DICTIONARY *labels = localhost->rrdlabels;
  1286. // The source should be CONF, but when it is set, these labels are exported by default ('send configured labels' in exporting.conf).
  1287. // Their export seems to break exporting to Graphite, see https://github.com/netdata/netdata/issues/14084.
  1288. int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO);
  1289. rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_AUTO);
  1290. int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO);
  1291. rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_AUTO);
  1292. }