health.c 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  // Job type identifiers of the health worker, numbered 0-9.
  // NOTE(review): presumably registered with the worker utilization API further
  // down in this file - confirm against the health main loop.
  #define WORKER_HEALTH_JOB_RRD_LOCK 0
  #define WORKER_HEALTH_JOB_HOST_LOCK 1
  #define WORKER_HEALTH_JOB_DB_QUERY 2
  #define WORKER_HEALTH_JOB_CALC_EVAL 3
  #define WORKER_HEALTH_JOB_WARNING_EVAL 4
  #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
  #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
  #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
  #define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
  #define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
  // Compile-time guard: the ten job ids above need at least ten job type slots.
  #if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
  #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
  #endif
  // Default for whether health monitoring is enabled (1 = enabled).
  // health_init() overwrites it with the boolean "enabled" option of the health section.
  unsigned int default_health_enabled = 1;
  // Path of the health silencers file that health_silencers_init() opens.
  // It has no initializer here.
  // NOTE(review): presumably assigned during startup before health_init() runs - confirm.
  char *silencers_filename;
  // NOTE(review): presumably the pattern of alarms enabled by configuration; it is not
  // referenced in the visible part of this file - confirm against its users.
  SIMPLE_PATTERN *conf_enabled_alarms = NULL;
  // NOTE(review): presumably a dictionary of health-related variables; not referenced
  // in the visible part of this file - confirm against its users.
  DICTIONARY *health_rrdvars;
  20. bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, uuid_t *transitions_id) {
  21. if(!rc->rrdset)
  22. return false;
  23. RRDHOST *host = rc->rrdset->rrdhost;
  24. rw_spinlock_read_lock(&host->health_log.spinlock);
  25. ALARM_ENTRY *ae;
  26. for(ae = host->health_log.alarms; ae ; ae = ae->next) {
  27. if(unlikely(ae->alarm_id == rc->id))
  28. break;
  29. }
  30. if(ae) {
  31. *global_id = ae->global_id;
  32. uuid_copy(*transitions_id, ae->transition_id);
  33. }
  34. else {
  35. *global_id = 0;
  36. uuid_clear(*transitions_id);
  37. }
  38. rw_spinlock_read_unlock(&host->health_log.spinlock);
  39. return ae != NULL;
  40. }
  41. void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
  42. buffer_json_member_add_array(wb, key);
  43. if(flags & HEALTH_ENTRY_FLAG_PROCESSED)
  44. buffer_json_add_array_item_string(wb, "PROCESSED");
  45. if(flags & HEALTH_ENTRY_FLAG_UPDATED)
  46. buffer_json_add_array_item_string(wb, "UPDATED");
  47. if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
  48. buffer_json_add_array_item_string(wb, "EXEC_RUN");
  49. if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)
  50. buffer_json_add_array_item_string(wb, "EXEC_FAILED");
  51. if(flags & HEALTH_ENTRY_FLAG_SILENCED)
  52. buffer_json_add_array_item_string(wb, "SILENCED");
  53. if(flags & HEALTH_ENTRY_RUN_ONCE)
  54. buffer_json_add_array_item_string(wb, "RUN_ONCE");
  55. if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)
  56. buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS");
  57. if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING)
  58. buffer_json_add_array_item_string(wb, "RECURRING");
  59. if(flags & HEALTH_ENTRY_FLAG_SAVED)
  60. buffer_json_add_array_item_string(wb, "SAVED");
  61. if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED)
  62. buffer_json_add_array_item_string(wb, "ACLK_QUEUED");
  63. if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)
  64. buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION");
  65. buffer_json_array_close(wb);
  66. }
  67. static bool prepare_command(BUFFER *wb,
  68. const char *exec,
  69. const char *recipient,
  70. const char *registry_hostname,
  71. uint32_t unique_id,
  72. uint32_t alarm_id,
  73. uint32_t alarm_event_id,
  74. uint32_t when,
  75. const char *alert_name,
  76. const char *alert_chart_name,
  77. const char *new_status,
  78. const char *old_status,
  79. NETDATA_DOUBLE new_value,
  80. NETDATA_DOUBLE old_value,
  81. const char *alert_source,
  82. uint32_t duration,
  83. uint32_t non_clear_duration,
  84. const char *alert_units,
  85. const char *alert_info,
  86. const char *new_value_string,
  87. const char *old_value_string,
  88. const char *source,
  89. const char *error_msg,
  90. int n_warn,
  91. int n_crit,
  92. const char *warn_alarms,
  93. const char *crit_alarms,
  94. const char *classification,
  95. const char *edit_command,
  96. const char *machine_guid,
  97. uuid_t *transition_id,
  98. const char *summary,
  99. const char *context,
  100. const char *component,
  101. const char *type
  102. ) {
  103. char buf[8192];
  104. size_t n = sizeof(buf) - 1;
  105. buffer_strcat(wb, "exec");
  106. if (!sanitize_command_argument_string(buf, exec, n))
  107. return false;
  108. buffer_sprintf(wb, " '%s'", buf);
  109. if (!sanitize_command_argument_string(buf, recipient, n))
  110. return false;
  111. buffer_sprintf(wb, " '%s'", buf);
  112. if (!sanitize_command_argument_string(buf, registry_hostname, n))
  113. return false;
  114. buffer_sprintf(wb, " '%s'", buf);
  115. buffer_sprintf(wb, " '%u'", unique_id);
  116. buffer_sprintf(wb, " '%u'", alarm_id);
  117. buffer_sprintf(wb, " '%u'", alarm_event_id);
  118. buffer_sprintf(wb, " '%u'", when);
  119. if (!sanitize_command_argument_string(buf, alert_name, n))
  120. return false;
  121. buffer_sprintf(wb, " '%s'", buf);
  122. if (!sanitize_command_argument_string(buf, alert_chart_name, n))
  123. return false;
  124. buffer_sprintf(wb, " '%s'", buf);
  125. if (!sanitize_command_argument_string(buf, new_status, n))
  126. return false;
  127. buffer_sprintf(wb, " '%s'", buf);
  128. if (!sanitize_command_argument_string(buf, old_status, n))
  129. return false;
  130. buffer_sprintf(wb, " '%s'", buf);
  131. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
  132. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
  133. if (!sanitize_command_argument_string(buf, alert_source, n))
  134. return false;
  135. buffer_sprintf(wb, " '%s'", buf);
  136. buffer_sprintf(wb, " '%u'", duration);
  137. buffer_sprintf(wb, " '%u'", non_clear_duration);
  138. if (!sanitize_command_argument_string(buf, alert_units, n))
  139. return false;
  140. buffer_sprintf(wb, " '%s'", buf);
  141. if (!sanitize_command_argument_string(buf, alert_info, n))
  142. return false;
  143. buffer_sprintf(wb, " '%s'", buf);
  144. if (!sanitize_command_argument_string(buf, new_value_string, n))
  145. return false;
  146. buffer_sprintf(wb, " '%s'", buf);
  147. if (!sanitize_command_argument_string(buf, old_value_string, n))
  148. return false;
  149. buffer_sprintf(wb, " '%s'", buf);
  150. if (!sanitize_command_argument_string(buf, source, n))
  151. return false;
  152. buffer_sprintf(wb, " '%s'", buf);
  153. if (!sanitize_command_argument_string(buf, error_msg, n))
  154. return false;
  155. buffer_sprintf(wb, " '%s'", buf);
  156. buffer_sprintf(wb, " '%d'", n_warn);
  157. buffer_sprintf(wb, " '%d'", n_crit);
  158. if (!sanitize_command_argument_string(buf, warn_alarms, n))
  159. return false;
  160. buffer_sprintf(wb, " '%s'", buf);
  161. if (!sanitize_command_argument_string(buf, crit_alarms, n))
  162. return false;
  163. buffer_sprintf(wb, " '%s'", buf);
  164. if (!sanitize_command_argument_string(buf, classification, n))
  165. return false;
  166. buffer_sprintf(wb, " '%s'", buf);
  167. if (!sanitize_command_argument_string(buf, edit_command, n))
  168. return false;
  169. buffer_sprintf(wb, " '%s'", buf);
  170. if (!sanitize_command_argument_string(buf, machine_guid, n))
  171. return false;
  172. buffer_sprintf(wb, " '%s'", buf);
  173. char tr_id[UUID_STR_LEN];
  174. uuid_unparse_lower(*transition_id, tr_id);
  175. if (!sanitize_command_argument_string(buf, tr_id, n))
  176. return false;
  177. buffer_sprintf(wb, " '%s'", buf);
  178. if (!sanitize_command_argument_string(buf, summary, n))
  179. return false;
  180. buffer_sprintf(wb, " '%s'", buf);
  181. if (!sanitize_command_argument_string(buf, context, n))
  182. return false;
  183. buffer_sprintf(wb, " '%s'", buf);
  184. if (!sanitize_command_argument_string(buf, component, n))
  185. return false;
  186. buffer_sprintf(wb, " '%s'", buf);
  187. if (!sanitize_command_argument_string(buf, type, n))
  188. return false;
  189. buffer_sprintf(wb, " '%s'", buf);
  190. return true;
  191. }
  192. // the queue of executed alarm notifications that haven't been waited for yet
  193. static struct {
  194. ALARM_ENTRY *head; // oldest
  195. ALARM_ENTRY *tail; // latest
  196. } alarm_notifications_in_progress = {NULL, NULL};
  // One currently raised (warning or critical) alert, as collected by
  // health_alarm_execute() to build the lists passed to the notification command.
  typedef struct active_alerts {
      char *name;                 // alert name; borrowed from the alert, not owned by this struct
      time_t last_status_change;  // time of the alert's last status change; used as the sort key
      RRDCALC_STATUS status;      // status of the alert at the time it was collected
  } active_alerts_t;
  202. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  203. {
  204. ae->prev_in_progress = NULL;
  205. ae->next_in_progress = NULL;
  206. if (NULL != alarm_notifications_in_progress.tail) {
  207. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  208. alarm_notifications_in_progress.tail->next_in_progress = ae;
  209. }
  210. if (NULL == alarm_notifications_in_progress.head) {
  211. alarm_notifications_in_progress.head = ae;
  212. }
  213. alarm_notifications_in_progress.tail = ae;
  214. }
  215. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  216. {
  217. struct alarm_entry *prev = ae->prev_in_progress;
  218. struct alarm_entry *next = ae->next_in_progress;
  219. if (NULL != prev) {
  220. prev->next_in_progress = next;
  221. }
  222. if (NULL != next) {
  223. next->prev_in_progress = prev;
  224. }
  225. if (ae == alarm_notifications_in_progress.head) {
  226. alarm_notifications_in_progress.head = next;
  227. }
  228. if (ae == alarm_notifications_in_progress.tail) {
  229. alarm_notifications_in_progress.tail = prev;
  230. }
  231. }
  232. // ----------------------------------------------------------------------------
  233. // health initialization
  234. /**
  235. * User Config directory
  236. *
  237. * Get the config directory for health and return it.
  238. *
  239. * @return a pointer to the user config directory
  240. */
  241. inline char *health_user_config_dir(void) {
  242. char buffer[FILENAME_MAX + 1];
  243. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  244. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  245. }
  246. /**
  247. * Stock Config Directory
  248. *
  249. * Get the Stock config directory and return it.
  250. *
  251. * @return a pointer to the stock config directory.
  252. */
  253. inline char *health_stock_config_dir(void) {
  254. char buffer[FILENAME_MAX + 1];
  255. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  256. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  257. }
  258. /**
  259. * Silencers init
  260. *
  261. * Function used to initialize the silencer structure.
  262. */
  263. static void health_silencers_init(void) {
  264. FILE *fd = fopen(silencers_filename, "r");
  265. if (fd) {
  266. fseek(fd, 0 , SEEK_END);
  267. off_t length = (off_t) ftell(fd);
  268. fseek(fd, 0 , SEEK_SET);
  269. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  270. char *str = mallocz((length+1)* sizeof(char));
  271. if(str) {
  272. size_t copied;
  273. copied = fread(str, sizeof(char), length, fd);
  274. if (copied == (length* sizeof(char))) {
  275. str[length] = 0x00;
  276. json_parse(str, NULL, health_silencers_json_read_callback);
  277. netdata_log_info("Parsed health silencers file %s", silencers_filename);
  278. } else {
  279. netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename);
  280. }
  281. freez(str);
  282. }
  283. } else {
  284. netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  285. silencers_filename,
  286. (int64_t)length,
  287. HEALTH_SILENCERS_MAX_FILE_LEN);
  288. }
  289. fclose(fd);
  290. } else {
  291. netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.",
  292. silencers_filename);
  293. }
  294. }
  295. /**
  296. * Health Init
  297. *
  298. * Initialize the health thread.
  299. */
  300. void health_init(void) {
  301. netdata_log_debug(D_HEALTH, "Health configuration initializing");
  302. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  303. netdata_log_debug(D_HEALTH, "Health is disabled.");
  304. return;
  305. }
  306. health_silencers_init();
  307. }
  308. // ----------------------------------------------------------------------------
  309. // re-load health configuration
  310. /**
  311. * Reload host
  312. *
  313. * Reload configuration for a specific host.
  314. *
  315. * @param host the structure of the host that the function will reload the configuration.
  316. */
  317. static void health_reload_host(RRDHOST *host) {
  318. if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
  319. return;
  320. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  321. "[%s]: Reloading health.",
  322. rrdhost_hostname(host));
  323. char *user_path = health_user_config_dir();
  324. char *stock_path = health_stock_config_dir();
  325. // free all running alarms
  326. rrdcalc_delete_all(host);
  327. rrdcalctemplate_delete_all(host);
  328. // invalidate all previous entries in the alarm log
  329. rw_spinlock_read_lock(&host->health_log.spinlock);
  330. ALARM_ENTRY *t;
  331. for(t = host->health_log.alarms ; t ; t = t->next) {
  332. if(t->new_status != RRDCALC_STATUS_REMOVED)
  333. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  334. }
  335. rw_spinlock_read_unlock(&host->health_log.spinlock);
  336. // reset all thresholds to all charts
  337. RRDSET *st;
  338. rrdset_foreach_read(st, host) {
  339. st->green = NAN;
  340. st->red = NAN;
  341. }
  342. rrdset_foreach_done(st);
  343. // load the new alarms
  344. health_readdir(host, user_path, stock_path, NULL);
  345. //Discard alarms with labels that do not apply to host
  346. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  347. // link the loaded alarms to their charts
  348. rrdset_foreach_write(st, host) {
  349. rrdcalc_link_matching_alerts_to_rrdset(st);
  350. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  351. }
  352. rrdset_foreach_done(st);
  353. #ifdef ENABLE_ACLK
  354. if (netdata_cloud_enabled) {
  355. struct aclk_sync_cfg_t *wc = host->aclk_config;
  356. if (likely(wc)) {
  357. wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
  358. }
  359. }
  360. #endif
  361. }
  362. /**
  363. * Reload
  364. *
  365. * Reload the host configuration for all hosts.
  366. */
  367. void health_reload(void) {
  368. sql_refresh_hashes();
  369. RRDHOST *host;
  370. dfe_start_reentrant(rrdhost_root_index, host){
  371. health_reload_host(host);
  372. }
  373. dfe_done(host);
  374. }
  375. // ----------------------------------------------------------------------------
  376. // health main thread and friends
  377. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  378. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  379. if(n) return RRDCALC_STATUS_RAISED;
  380. return RRDCALC_STATUS_CLEAR;
  381. }
  // Upper bound on the raised (warning + critical) alerts health_alarm_execute()
  // collects per notification; it is also the size of the array it allocates for them.
  #define ACTIVE_ALARMS_LIST_EXAMINE 500
  // Upper bound on the collected alerts that are written into the warning / critical
  // lists passed to the notification command.
  #define ACTIVE_ALARMS_LIST 15
  384. static inline int compare_active_alerts(const void * a, const void * b) {
  385. active_alerts_t *active_alerts_a = (active_alerts_t *)a;
  386. active_alerts_t *active_alerts_b = (active_alerts_t *)b;
  387. return (int) ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
  388. }
  /**
   * Decide whether an alarm log entry needs a notification and, if so, queue
   * the notification command for execution.
   *
   * The entry is always flagged as processed. No command is run (and the entry
   * is just saved through the `done` label) when:
   *  - the new status is below CLEAR (internal statuses);
   *  - the new status is CLEAR or below and the entry has no-clear-notification set;
   *  - a notification was already executed for the same status and the entry is
   *    not repeating;
   *  - nothing was executed before and this is a CLEAR that does not satisfy the
   *    run-once exception below;
   *  - the entry is silenced.
   *
   * Otherwise the other raised alerts of the host are collected, the command
   * line is built with prepare_command(), handed to spawn_enq_cmd() and the
   * entry is appended to the in-progress queue and saved.
   *
   * @param host the host the entry belongs to.
   * @param ae   the alarm log entry to process.
   */
  static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
      ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;

      if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
          // do not send notifications for internal statuses
          netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
          goto done;
      }

      if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
          // do not send notifications for disabled statuses
          nd_log(NDLS_DAEMON, NDLP_DEBUG,
                 "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
                 rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));

          // NOTE(review): the original comment here said "mark it as run, so that we
          // will send the same alarm if it happens again", but no statement on this
          // path sets HEALTH_ENTRY_FLAG_EXEC_RUN - the entry is only saved.
          goto done;
      }

      // find the previous notification for the same alarm
      // which we have run the exec script
      // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
      // -3 is a placeholder that is only meaningful until the query below fills it in
      RRDCALC_STATUS last_executed_status = -3;
      if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
          int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);

          if (likely(ret == 1)) {
              // we have executed this alarm notification in the past
              if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
                  // don't send the notification for the same status again
                  nd_log(NDLS_DAEMON, NDLP_DEBUG,
                         "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
                         rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
                         rrdcalc_status2string(ae->new_status));
                  goto done;
              }
          }
          else {
              // we have not executed this alarm notification in the past
              // so, don't send CLEAR notifications
              if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
                  // run-once entries still notify the CLEAR when the old status was RAISED or above
                  if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
                      netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
                      , ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
                      goto done;
                  }
              }
          }
      }

      // Check if alarm notifications are silenced
      if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
          nd_log(NDLS_DAEMON, NDLP_DEBUG,
                 "[%s]: Health not sending notification for alarm '%s.%s' status %s "
                 "(command API has disabled notifications)",
                 rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
          goto done;
      }

      nd_log(NDLS_DAEMON, NDLP_DEBUG,
             "[%s]: Sending notification for alarm '%s.%s' status %s.",
             rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));

      // the entry's own script / recipient take precedence over the host defaults
      const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
      const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);

      int n_warn=0, n_crit=0;
      RRDCALC *rc;
      EVAL_EXPRESSION *expr=NULL;
      BUFFER *warn_alarms, *crit_alarms;
      active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));

      warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
      crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);

      // Collect the other warning / critical alerts of the host, and pick the
      // expression (warning or critical) of the alert this entry belongs to.
      foreach_rrdcalc_in_rrdhost_read(host, rc) {
          // skip alerts without a chart, or whose chart has never been collected
          if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
              continue;

          // the array holds ACTIVE_ALARMS_LIST_EXAMINE items - stop when it is full
          if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
              break;

          if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
              // counted unless it is this very alert at the event being notified
              if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
                  active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
                  active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
                  active_alerts[n_warn+n_crit].status = rc->status;
                  n_warn++;
              } else if (ae->alarm_id == rc->id)
                  expr = rc->warning;
          } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
              if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
                  active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
                  active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
                  active_alerts[n_warn+n_crit].status = rc->status;
                  n_crit++;
              } else if (ae->alarm_id == rc->id)
                  expr = rc->critical;
          } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
              if (ae->alarm_id == rc->id)
                  expr = rc->warning;
          }
      }
      foreach_rrdcalc_in_rrdhost_done(rc);

      // order the collected alerts with compare_active_alerts()
      if (n_warn+n_crit>1)
          qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);

      // Write at most ACTIVE_ALARMS_LIST of the sorted alerts into two
      // comma-separated lists of "name=timestamp" items, one per status.
      // Every collected item is WARNING or CRITICAL, so each iteration advances.
      int count_w = 0, count_c = 0;
      while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
          if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
              if (count_w)
                  buffer_strcat(warn_alarms, ",");
              buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
              buffer_strcat(warn_alarms, "=");
              buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
              count_w++;
          }
          else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
              if (count_c)
                  buffer_strcat(crit_alarms, ",");
              buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
              buffer_strcat(crit_alarms, "=");
              buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
              count_c++;
          }
      }

      // edit_command is owned by this function in both cases and released below
      char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");

      BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health);
      // note: n_warn / n_crit are the counts of all collected alerts, not only of
      // the ones written to the two lists
      bool ok = prepare_command(wb,
                                exec,
                                recipient,
                                rrdhost_registry_hostname(host),
                                ae->unique_id,
                                ae->alarm_id,
                                ae->alarm_event_id,
                                (unsigned long)ae->when,
                                ae_name(ae),
                                ae->chart?ae_chart_id(ae):"NOCHART",
                                rrdcalc_status2string(ae->new_status),
                                rrdcalc_status2string(ae->old_status),
                                ae->new_value,
                                ae->old_value,
                                ae->source?ae_source(ae):"UNKNOWN",
                                (uint32_t)ae->duration,
                                (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
                                ae_units(ae),
                                ae_info(ae),
                                ae_new_value_string(ae),
                                ae_old_value_string(ae),
                                (expr && expr->source)?expr->source:"NOSOURCE",
                                (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
                                n_warn,
                                n_crit,
                                buffer_tostring(warn_alarms),
                                buffer_tostring(crit_alarms),
                                ae->classification?ae_classification(ae):"Unknown",
                                edit_command,
                                host->machine_guid,
                                &ae->transition_id,
                                host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
                                string2str(ae->chart_context),
                                string2str(ae->component),
                                string2str(ae->type)
      );

      const char *command_to_run = buffer_tostring(wb);
      if (ok) {
          ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
          ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */

          netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
          ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
          ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
          enqueue_alarm_notify_in_progress(ae);
          health_alarm_log_save(host, ae);
      } else {
          // NOTE(review): on this path the entry is flagged as processed but
          // health_alarm_log_save() is not called - confirm this is intended.
          netdata_log_error("Failed to format command arguments");
      }

      buffer_free(wb);
      freez(edit_command);
      buffer_free(warn_alarms);
      buffer_free(crit_alarms);
      freez(active_alerts);

      // the result of the command is collected later by health_alarm_wait_for_execution()
      return; //health_alarm_wait_for_execution

  done:
      // reached by every "no notification" path above; none of the buffers exist yet
      health_alarm_log_save(host, ae);
  }
  560. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  561. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  562. return;
  563. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  564. netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  565. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  566. if(ae->exec_code != 0)
  567. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  568. unlink_alarm_notify_in_progress(ae);
  569. }
  /**
   * Log the status transition of an alarm log entry (debug level) and pass the
   * entry to health_alarm_execute().
   *
   * @param host the host the entry belongs to.
   * @param ae   the alarm log entry to process.
   */
  static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
      // entries without a chart are logged as "NOCHART"
      netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
                        ae->chart?ae_chart_id(ae):"NOCHART", ae_name(ae),
                        ae->new_value,
                        rrdcalc_status2string(ae->old_status),
                        rrdcalc_status2string(ae->new_status)
      );
      health_alarm_execute(host, ae);
  }
  579. static inline void health_alarm_log_process(RRDHOST *host) {
  580. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  581. time_t now = now_realtime_sec();
  582. rw_spinlock_read_lock(&host->health_log.spinlock);
  583. ALARM_ENTRY *ae;
  584. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  585. if(unlikely(
  586. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  587. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  588. )) {
  589. if(unlikely(ae->unique_id < first_waiting))
  590. first_waiting = ae->unique_id;
  591. if(likely(now >= ae->delay_up_to_timestamp))
  592. health_process_notifications(host, ae);
  593. }
  594. }
  595. rw_spinlock_read_unlock(&host->health_log.spinlock);
  596. // remember this for the next iteration
  597. host->health_last_processed_id = first_waiting;
  598. //delete those that are updated, no in progress execution, and is not repeating
  599. rw_spinlock_write_lock(&host->health_log.spinlock);
  600. ALARM_ENTRY *prev = NULL, *next = NULL;
  601. for(ae = host->health_log.alarms; ae ; ae = next) {
  602. next = ae->next; // set it here, for the next iteration
  603. if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
  604. (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
  605. (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
  606. !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  607. ||
  608. ((ae->new_status == RRDCALC_STATUS_REMOVED) &&
  609. (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
  610. (ae->when + 86400 < now_realtime_sec())))
  611. {
  612. if(host->health_log.alarms == ae) {
  613. host->health_log.alarms = next;
  614. // prev is also NULL here
  615. }
  616. else {
  617. prev->next = next;
  618. // prev should not be touched here - we need it for the next iteration
  619. // because we may have to also remove the next item
  620. }
  621. health_alarm_log_free_one_nochecks_nounlink(ae);
  622. }
  623. else
  624. prev = ae;
  625. }
  626. rw_spinlock_write_unlock(&host->health_log.spinlock);
  627. }
  628. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  629. if(unlikely(!rc->rrdset)) {
  630. netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  631. return 0;
  632. }
  633. if(unlikely(rc->next_update > now)) {
  634. if (unlikely(*next_run > rc->next_update)) {
  635. // update the next_run time of the main loop
  636. // to run this alarm precisely the time required
  637. *next_run = rc->next_update;
  638. }
  639. netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
  640. return 0;
  641. }
  642. if(unlikely(!rc->update_every)) {
  643. netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  644. return 0;
  645. }
  646. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  647. netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  648. return 0;
  649. }
  650. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  651. netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  652. return 0;
  653. }
  654. int update_every = rc->rrdset->update_every;
  655. time_t first = rrdset_first_entry_s(rc->rrdset);
  656. time_t last = rrdset_last_entry_s(rc->rrdset);
  657. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  658. netdata_log_debug(D_HEALTH
  659. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  660. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
  661. , (unsigned long) last);
  662. return 0;
  663. }
  664. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  665. time_t needed = now + rc->before + rc->after;
  666. if(needed + update_every < first || needed - update_every > last) {
  667. netdata_log_debug(D_HEALTH
  668. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  669. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
  670. , (unsigned long) last);
  671. return 0;
  672. }
  673. }
  674. return 1;
  675. }
  676. static inline int check_if_resumed_from_suspension(void) {
  677. static usec_t last_realtime = 0, last_monotonic = 0;
  678. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  679. int ret = 0;
  680. // detect if monotonic and realtime have twice the difference
  681. // in which case we assume the system was just waken from hibernation
  682. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  683. ret = 1;
  684. last_realtime = realtime;
  685. last_monotonic = monotonic;
  686. return ret;
  687. }
  688. static void health_main_cleanup(void *ptr) {
  689. worker_unregister();
  690. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  691. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  692. netdata_log_info("cleaning up...");
  693. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  694. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  695. "Health thread ended.");
  696. }
  697. static void initialize_health(RRDHOST *host)
  698. {
  699. if(!host->health.health_enabled ||
  700. rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) ||
  701. !service_running(SERVICE_HEALTH))
  702. return;
  703. rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
  704. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  705. "[%s]: Initializing health.",
  706. rrdhost_hostname(host));
  707. host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
  708. host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
  709. host->health_log.next_log_id = 1;
  710. host->health_log.next_alarm_id = 1;
  711. host->health_log.max = 1000;
  712. host->health_log.next_log_id = (uint32_t)now_realtime_sec();
  713. host->health_log.next_alarm_id = 0;
  714. long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
  715. if(n < 10) {
  716. nd_log(NDLS_DAEMON, NDLP_WARNING,
  717. "Host '%s': health configuration has invalid max log entries %ld. "
  718. "Using default %u",
  719. rrdhost_hostname(host), n, host->health_log.max);
  720. config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
  721. }
  722. else
  723. host->health_log.max = (unsigned int)n;
  724. uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
  725. if (m < HEALTH_LOG_MINIMUM_HISTORY) {
  726. nd_log(NDLS_DAEMON, NDLP_WARNING,
  727. "Host '%s': health configuration has invalid health log history %u. "
  728. "Using minimum %d",
  729. rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
  730. config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
  731. m = HEALTH_LOG_MINIMUM_HISTORY;
  732. }
  733. //default health log history is 5 days and not less than a day
  734. if (host->health_log.health_log_history) {
  735. if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY)
  736. host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY;
  737. } else
  738. host->health_log.health_log_history = m;
  739. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  740. "[%s]: Health log history is set to %u seconds (%u days)",
  741. rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
  742. conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
  743. SIMPLE_PATTERN_EXACT, true);
  744. rw_spinlock_init(&host->health_log.spinlock);
  745. char filename[FILENAME_MAX + 1];
  746. snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
  747. host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
  748. host->health.health_default_recipient = string_strdupz("root");
  749. host->health.use_summary_for_notifications = config_get_boolean(CONFIG_SECTION_HEALTH, "use summary for notifications", CONFIG_BOOLEAN_YES);
  750. sql_health_alarm_log_load(host);
  751. // ------------------------------------------------------------------------
  752. // load health configuration
  753. health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
  754. // link the loaded alarms to their charts
  755. RRDSET *st;
  756. rrdset_foreach_reentrant(st, host) {
  757. rrdcalc_link_matching_alerts_to_rrdset(st);
  758. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  759. }
  760. rrdset_foreach_done(st);
  761. //Discard alarms with labels that do not apply to host
  762. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  763. }
  764. static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
  765. time_t now = now_realtime_sec();
  766. if(now < next_run) {
  767. worker_is_idle();
  768. netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  769. while (now < next_run && service_running(SERVICE_HEALTH)) {
  770. sleep_usec(USEC_PER_SEC);
  771. now = now_realtime_sec();
  772. }
  773. }
  774. else {
  775. netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  776. }
  777. }
  778. static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host)
  779. {
  780. SILENCER *s;
  781. for (s = silencers->silencers; s!=NULL; s=s->next){
  782. if (
  783. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches_string(s->alarms_pattern, rc->name))) &&
  784. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches_string(s->contexts_pattern, rc->rrdset->context))) &&
  785. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern, host))) &&
  786. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart)))
  787. ) {
  788. netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts);
  789. if (unlikely(silencers->stype == STYPE_NONE)) {
  790. netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
  791. } else {
  792. netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s"
  793. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  794. , rrdcalc_name(rc)
  795. , (rc->rrdset)?rrdset_context(rc->rrdset):""
  796. , rrdcalc_chart_name(rc)
  797. , host
  798. );
  799. }
  800. return silencers->stype;
  801. }
  802. }
  803. return STYPE_NONE;
  804. }
  805. /**
  806. * Update Disabled Silenced
  807. *
  808. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  809. *
  810. * @param host structure that contains information about the host monitored.
  811. * @param rc structure with information about the alarm
  812. *
  813. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  814. */
  815. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  816. uint32_t rrdcalc_flags_old = rc->run_flags;
  817. // Clear the flags
  818. rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  819. if (unlikely(silencers->all_alarms)) {
  820. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  821. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  822. } else {
  823. SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host));
  824. if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  825. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  826. }
  827. if (rrdcalc_flags_old != rc->run_flags) {
  828. netdata_log_info(
  829. "Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  830. rrdhost_hostname(host),
  831. rrdcalc_name(rc),
  832. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED) ? "true" : "false",
  833. (rc->run_flags & RRDCALC_FLAG_DISABLED) ? "true" : "false",
  834. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED) ? "true" : "false",
  835. (rc->run_flags & RRDCALC_FLAG_SILENCED) ? "true" : "false");
  836. }
  837. if (rc->run_flags & RRDCALC_FLAG_DISABLED)
  838. return 1;
  839. else
  840. return 0;
  841. }
  842. static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
  843. #ifdef ENABLE_ACLK
  844. if (netdata_cloud_enabled) {
  845. struct aclk_sync_cfg_t *wc = host->aclk_config;
  846. if (unlikely(!wc)) {
  847. return;
  848. }
  849. if (wc->alert_queue_removed >= 1) {
  850. wc->alert_queue_removed+=6;
  851. }
  852. }
  853. #endif
  854. }
  855. static void health_execute_delayed_initializations(RRDHOST *host) {
  856. RRDSET *st;
  857. bool must_postpone = false;
  858. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
  859. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
  860. rrdset_foreach_reentrant(st, host) {
  861. if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  862. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
  863. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
  864. rrdcalc_link_matching_alerts_to_rrdset(st);
  865. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  866. RRDDIM *rd;
  867. rrddim_foreach_read(rd, st) {
  868. if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  869. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
  870. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
  871. RRDCALCTEMPLATE *rt;
  872. foreach_rrdcalctemplate_read(host, rt) {
  873. if(!rt->foreach_dimension_pattern)
  874. continue;
  875. if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) {
  876. rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
  877. }
  878. }
  879. foreach_rrdcalctemplate_done(rt);
  880. if (health_variable_check(health_rrdvars, st, rd) || rrdset_flag_check(st, RRDSET_FLAG_HAS_RRDCALC_LINKED))
  881. rrdvar_store_for_chart(host, st);
  882. }
  883. rrddim_foreach_done(rd);
  884. must_postpone = true;
  885. }
  886. rrdset_foreach_done(st);
  887. if (must_postpone)
  888. sql_health_postpone_queue_removed(host);
  889. }
  890. /**
  891. * Health Main
  892. *
  893. * The main thread of the health system. In this function all the alarms will be processed.
  894. *
  895. * @param ptr is a pointer to the netdata_static_thread structure.
  896. *
  897. * @return It always returns NULL
  898. */
  899. void *health_main(void *ptr) {
  900. worker_register("HEALTH");
  901. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  902. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  903. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  904. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  905. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  906. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  907. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  908. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  909. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
  910. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
  911. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  912. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  913. if(min_run_every < 1) min_run_every = 1;
  914. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  915. bool health_running_logged = false;
  916. rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
  917. unsigned int loop = 0;
  918. while(service_running(SERVICE_HEALTH)) {
  919. loop++;
  920. netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  921. time_t now = now_realtime_sec();
  922. int runnable = 0, apply_hibernation_delay = 0;
  923. time_t next_run = now + min_run_every;
  924. RRDCALC *rc;
  925. RRDHOST *host;
  926. if (unlikely(check_if_resumed_from_suspension())) {
  927. apply_hibernation_delay = 1;
  928. nd_log(NDLS_DAEMON, NDLP_NOTICE,
  929. "Postponing alarm checks for %"PRId64" seconds, "
  930. "because it seems that the system was just resumed from suspension.",
  931. (int64_t)hibernation_delay);
  932. }
  933. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  934. static int logged=0;
  935. if (!logged) {
  936. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  937. "Skipping health checks, because all alarms are disabled via a %s command.",
  938. HEALTH_CMDAPI_CMD_DISABLEALL);
  939. logged = 1;
  940. }
  941. }
  942. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  943. dfe_start_reentrant(rrdhost_root_index, host) {
  944. if(unlikely(!service_running(SERVICE_HEALTH)))
  945. break;
  946. if (unlikely(!host->health.health_enabled))
  947. continue;
  948. if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)))
  949. initialize_health(host);
  950. health_execute_delayed_initializations(host);
  951. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  952. if (unlikely(apply_hibernation_delay)) {
  953. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  954. "[%s]: Postponing health checks for %"PRId64" seconds.",
  955. rrdhost_hostname(host),
  956. (int64_t)hibernation_delay);
  957. host->health.health_delay_up_to = now + hibernation_delay;
  958. }
  959. if (unlikely(host->health.health_delay_up_to)) {
  960. if (unlikely(now < host->health.health_delay_up_to)) {
  961. continue;
  962. }
  963. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  964. "[%s]: Resuming health checks after delay.",
  965. rrdhost_hostname(host));
  966. host->health.health_delay_up_to = 0;
  967. }
  968. // wait until cleanup of obsolete charts on children is complete
  969. if (host != localhost) {
  970. if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
  971. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  972. "[%s]: Waiting for chart obsoletion check.",
  973. rrdhost_hostname(host));
  974. continue;
  975. }
  976. }
  977. if (!health_running_logged) {
  978. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  979. "[%s]: Health is running.",
  980. rrdhost_hostname(host));
  981. health_running_logged = true;
  982. }
  983. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  984. // the first loop is to lookup values from the db
  985. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  986. if(unlikely(!service_running(SERVICE_HEALTH)))
  987. break;
  988. rrdcalc_update_info_using_rrdset_labels(rc);
  989. if (update_disabled_silenced(host, rc))
  990. continue;
  991. // create an alert removed event if the chart is obsolete and
  992. // has stopped being collected for 60 seconds
  993. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  994. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  995. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  996. if (!rrdcalc_isrepeating(rc)) {
  997. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  998. time_t now = now_realtime_sec();
  999. ALARM_ENTRY *ae = health_create_alarm_entry(
  1000. host,
  1001. rc->id,
  1002. rc->next_event_id++,
  1003. rc->config_hash_id,
  1004. now,
  1005. rc->name,
  1006. rc->rrdset->id,
  1007. rc->rrdset->context,
  1008. rc->rrdset->name,
  1009. rc->classification,
  1010. rc->component,
  1011. rc->type,
  1012. rc->exec,
  1013. rc->recipient,
  1014. now - rc->last_status_change,
  1015. rc->value,
  1016. NAN,
  1017. rc->status,
  1018. RRDCALC_STATUS_REMOVED,
  1019. rc->source,
  1020. rc->units,
  1021. rc->summary,
  1022. rc->info,
  1023. 0,
  1024. rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
  1025. if (ae) {
  1026. health_log_alert(host, ae);
  1027. health_alarm_log_add_entry(host, ae);
  1028. rc->old_status = rc->status;
  1029. rc->status = RRDCALC_STATUS_REMOVED;
  1030. rc->last_status_change = now;
  1031. rc->last_status_change_value = rc->value;
  1032. rc->last_updated = now;
  1033. rc->value = NAN;
  1034. #ifdef ENABLE_ACLK
  1035. if (netdata_cloud_enabled)
  1036. sql_queue_alarm_to_aclk(host, ae, true);
  1037. #endif
  1038. }
  1039. }
  1040. }
  1041. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  1042. if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
  1043. rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
  1044. continue;
  1045. }
  1046. runnable++;
  1047. rc->old_value = rc->value;
  1048. rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
  1049. // ------------------------------------------------------------
  1050. // if there is database lookup, do it
  1051. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  1052. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  1053. /* time_t old_db_timestamp = rc->db_before; */
  1054. int value_is_null = 0;
  1055. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
  1056. rc->after, rc->before, rc->group, NULL,
  1057. 0, rc->options | RRDR_OPTION_SELECTED_TIER,
  1058. &rc->db_after,&rc->db_before,
  1059. NULL, NULL, NULL,
  1060. &value_is_null, NULL, 0, 0,
  1061. QUERY_SOURCE_HEALTH, STORAGE_PRIORITY_LOW);
  1062. if (unlikely(ret != 200)) {
  1063. // database lookup failed
  1064. rc->value = NAN;
  1065. rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
  1066. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  1067. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
  1068. );
  1069. } else
  1070. rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
  1071. if (unlikely(value_is_null)) {
  1072. // collected value is null
  1073. rc->value = NAN;
  1074. rc->run_flags |= RRDCALC_FLAG_DB_NAN;
  1075. netdata_log_debug(D_HEALTH,
  1076. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  1077. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
  1078. );
  1079. } else
  1080. rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
  1081. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
  1082. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
  1083. );
  1084. }
  1085. // ------------------------------------------------------------
  1086. // if there is calculation expression, run it
  1087. if (unlikely(rc->calculation)) {
  1088. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  1089. if (unlikely(!expression_evaluate(rc->calculation))) {
  1090. // calculation failed
  1091. rc->value = NAN;
  1092. rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
  1093. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  1094. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1095. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  1096. );
  1097. } else {
  1098. rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  1099. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  1100. NETDATA_DOUBLE_FORMAT
  1101. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1102. rc->calculation->parsed_as, rc->calculation->result,
  1103. buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
  1104. );
  1105. rc->value = rc->calculation->result;
  1106. }
  1107. }
  1108. }
  1109. foreach_rrdcalc_in_rrdhost_done(rc);
  1110. if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
  1111. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1112. if(unlikely(!service_running(SERVICE_HEALTH)))
  1113. break;
  1114. if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
  1115. continue;
  1116. if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
  1117. continue;
  1118. }
  1119. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  1120. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  1121. // --------------------------------------------------------
  1122. // check the warning expression
  1123. if (likely(rc->warning)) {
  1124. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  1125. if (unlikely(!expression_evaluate(rc->warning))) {
  1126. // calculation failed
  1127. rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
  1128. netdata_log_debug(D_HEALTH,
  1129. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  1130. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1131. buffer_tostring(rc->warning->error_msg)
  1132. );
  1133. } else {
  1134. rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  1135. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  1136. NETDATA_DOUBLE_FORMAT
  1137. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1138. rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
  1139. );
  1140. warning_status = rrdcalc_value2status(rc->warning->result);
  1141. }
  1142. }
  1143. // --------------------------------------------------------
  1144. // check the critical expression
  1145. if (likely(rc->critical)) {
  1146. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  1147. if (unlikely(!expression_evaluate(rc->critical))) {
  1148. // calculation failed
  1149. rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
  1150. netdata_log_debug(D_HEALTH,
  1151. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  1152. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1153. buffer_tostring(rc->critical->error_msg)
  1154. );
  1155. } else {
  1156. rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  1157. netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  1158. NETDATA_DOUBLE_FORMAT
  1159. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1160. rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
  1161. rrdcalc_source(rc)
  1162. );
  1163. critical_status = rrdcalc_value2status(rc->critical->result);
  1164. }
  1165. }
  1166. // --------------------------------------------------------
  1167. // decide the final alarm status
  1168. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  1169. switch (warning_status) {
  1170. case RRDCALC_STATUS_CLEAR:
  1171. status = RRDCALC_STATUS_CLEAR;
  1172. break;
  1173. case RRDCALC_STATUS_RAISED:
  1174. status = RRDCALC_STATUS_WARNING;
  1175. break;
  1176. default:
  1177. break;
  1178. }
  1179. switch (critical_status) {
  1180. case RRDCALC_STATUS_CLEAR:
  1181. if (status == RRDCALC_STATUS_UNDEFINED)
  1182. status = RRDCALC_STATUS_CLEAR;
  1183. break;
  1184. case RRDCALC_STATUS_RAISED:
  1185. status = RRDCALC_STATUS_CRITICAL;
  1186. break;
  1187. default:
  1188. break;
  1189. }
  1190. // --------------------------------------------------------
  1191. // check if the new status and the old differ
  1192. if (status != rc->status) {
  1193. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1194. int delay = 0;
  1195. // apply trigger hysteresis
  1196. if (now > rc->delay_up_to_timestamp) {
  1197. rc->delay_up_current = rc->delay_up_duration;
  1198. rc->delay_down_current = rc->delay_down_duration;
  1199. rc->delay_last = 0;
  1200. rc->delay_up_to_timestamp = 0;
  1201. } else {
  1202. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  1203. if (rc->delay_up_current > rc->delay_max_duration)
  1204. rc->delay_up_current = rc->delay_max_duration;
  1205. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  1206. if (rc->delay_down_current > rc->delay_max_duration)
  1207. rc->delay_down_current = rc->delay_max_duration;
  1208. }
  1209. if (status > rc->status)
  1210. delay = rc->delay_up_current;
  1211. else
  1212. delay = rc->delay_down_current;
  1213. // COMMENTED: because we do need to send raising alarms
  1214. // if(now + delay < rc->delay_up_to_timestamp)
  1215. // delay = (int)(rc->delay_up_to_timestamp - now);
  1216. rc->delay_last = delay;
  1217. rc->delay_up_to_timestamp = now + delay;
  1218. ALARM_ENTRY *ae = health_create_alarm_entry(
  1219. host,
  1220. rc->id,
  1221. rc->next_event_id++,
  1222. rc->config_hash_id,
  1223. now,
  1224. rc->name,
  1225. rc->rrdset->id,
  1226. rc->rrdset->context,
  1227. rc->rrdset->name,
  1228. rc->classification,
  1229. rc->component,
  1230. rc->type,
  1231. rc->exec,
  1232. rc->recipient,
  1233. now - rc->last_status_change,
  1234. rc->old_value,
  1235. rc->value,
  1236. rc->status,
  1237. status,
  1238. rc->source,
  1239. rc->units,
  1240. rc->summary,
  1241. rc->info,
  1242. rc->delay_last,
  1243. (
  1244. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1245. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1246. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1247. )
  1248. );
  1249. health_log_alert(host, ae);
  1250. health_alarm_log_add_entry(host, ae);
  1251. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  1252. "[%s]: Alert event for [%s.%s], value [%s], status [%s].",
  1253. rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae),
  1254. rrdcalc_status2string(ae->new_status));
  1255. rc->last_status_change_value = rc->value;
  1256. rc->last_status_change = now;
  1257. rc->old_status = rc->status;
  1258. rc->status = status;
  1259. if(unlikely(rrdcalc_isrepeating(rc))) {
  1260. rc->last_repeat = now;
  1261. if (rc->status == RRDCALC_STATUS_CLEAR)
  1262. rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
  1263. }
  1264. }
  1265. rc->last_updated = now;
  1266. rc->next_update = now + rc->update_every;
  1267. if (next_run > rc->next_update)
  1268. next_run = rc->next_update;
  1269. }
  1270. foreach_rrdcalc_in_rrdhost_done(rc);
  1271. // process repeating alarms
  1272. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1273. if(unlikely(!service_running(SERVICE_HEALTH)))
  1274. break;
  1275. int repeat_every = 0;
  1276. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  1277. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  1278. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1279. repeat_every = rc->warn_repeat_every;
  1280. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  1281. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1282. repeat_every = rc->crit_repeat_every;
  1283. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  1284. if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
  1285. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  1286. repeat_every = 1;
  1287. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  1288. repeat_every = 1;
  1289. }
  1290. }
  1291. }
  1292. } else {
  1293. continue;
  1294. }
  1295. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  1296. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1297. rc->last_repeat = now;
  1298. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  1299. ALARM_ENTRY *ae = health_create_alarm_entry(
  1300. host,
  1301. rc->id,
  1302. rc->next_event_id++,
  1303. rc->config_hash_id,
  1304. now,
  1305. rc->name,
  1306. rc->rrdset->id,
  1307. rc->rrdset->context,
  1308. rc->rrdset->name,
  1309. rc->classification,
  1310. rc->component,
  1311. rc->type,
  1312. rc->exec,
  1313. rc->recipient,
  1314. now - rc->last_status_change,
  1315. rc->old_value,
  1316. rc->value,
  1317. rc->old_status,
  1318. rc->status,
  1319. rc->source,
  1320. rc->units,
  1321. rc->summary,
  1322. rc->info,
  1323. rc->delay_last,
  1324. (
  1325. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1326. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1327. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1328. )
  1329. );
  1330. health_log_alert(host, ae);
  1331. ae->last_repeat = rc->last_repeat;
  1332. if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  1333. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  1334. }
  1335. rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
  1336. health_process_notifications(host, ae);
  1337. netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  1338. health_alarm_wait_for_execution(ae);
  1339. health_alarm_log_free_one_nochecks_nounlink(ae);
  1340. }
  1341. }
  1342. foreach_rrdcalc_in_rrdhost_done(rc);
  1343. }
  1344. if (unlikely(!service_running(SERVICE_HEALTH)))
  1345. break;
  1346. // execute notifications
  1347. // and cleanup
  1348. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  1349. health_alarm_log_process(host);
  1350. if (unlikely(!service_running(SERVICE_HEALTH))) {
  1351. // wait for all notifications to finish before allowing health to be cleaned up
  1352. ALARM_ENTRY *ae;
  1353. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1354. if(unlikely(!service_running(SERVICE_HEALTH)))
  1355. break;
  1356. health_alarm_wait_for_execution(ae);
  1357. }
  1358. break;
  1359. }
  1360. #ifdef ENABLE_ACLK
  1361. if (netdata_cloud_enabled) {
  1362. struct aclk_sync_cfg_t *wc = host->aclk_config;
  1363. if (unlikely(!wc))
  1364. continue;
  1365. if (wc->alert_queue_removed == 1) {
  1366. sql_queue_removed_alerts_to_aclk(host);
  1367. } else if (wc->alert_queue_removed > 1) {
  1368. wc->alert_queue_removed--;
  1369. }
  1370. if (wc->alert_checkpoint_req == 1) {
  1371. aclk_push_alarm_checkpoint(host);
  1372. } else if (wc->alert_checkpoint_req > 1) {
  1373. wc->alert_checkpoint_req--;
  1374. }
  1375. }
  1376. #endif
  1377. }
  1378. dfe_done(host);
  1379. // wait for all notifications to finish before allowing health to be cleaned up
  1380. ALARM_ENTRY *ae;
  1381. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1382. if(unlikely(!service_running(SERVICE_HEALTH)))
  1383. break;
  1384. health_alarm_wait_for_execution(ae);
  1385. }
  1386. if(unlikely(!service_running(SERVICE_HEALTH)))
  1387. break;
  1388. health_sleep(next_run, loop);
  1389. } // forever
  1390. netdata_thread_cleanup_pop(1);
  1391. return NULL;
  1392. }