health.c 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. #define WORKER_HEALTH_JOB_RRD_LOCK 0
  4. #define WORKER_HEALTH_JOB_HOST_LOCK 1
  5. #define WORKER_HEALTH_JOB_DB_QUERY 2
  6. #define WORKER_HEALTH_JOB_CALC_EVAL 3
  7. #define WORKER_HEALTH_JOB_WARNING_EVAL 4
  8. #define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
  9. #define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
  10. #define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
  11. #define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
  12. #define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
  13. #if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
  14. #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
  15. #endif
  16. unsigned int default_health_enabled = 1;
  17. char *silencers_filename;
  18. SIMPLE_PATTERN *conf_enabled_alarms = NULL;
  19. DICTIONARY *health_rrdvars;
  20. static bool prepare_command(BUFFER *wb,
  21. const char *exec,
  22. const char *recipient,
  23. const char *registry_hostname,
  24. uint32_t unique_id,
  25. uint32_t alarm_id,
  26. uint32_t alarm_event_id,
  27. uint32_t when,
  28. const char *alert_name,
  29. const char *alert_chart_name,
  30. const char *alert_family,
  31. const char *new_status,
  32. const char *old_status,
  33. NETDATA_DOUBLE new_value,
  34. NETDATA_DOUBLE old_value,
  35. const char *alert_source,
  36. uint32_t duration,
  37. uint32_t non_clear_duration,
  38. const char *alert_units,
  39. const char *alert_info,
  40. const char *new_value_string,
  41. const char *old_value_string,
  42. const char *source,
  43. const char *error_msg,
  44. int n_warn,
  45. int n_crit,
  46. const char *warn_alarms,
  47. const char *crit_alarms,
  48. const char *classification,
  49. const char *edit_command,
  50. const char *machine_guid)
  51. {
  52. char buf[8192];
  53. size_t n = 8192 - 1;
  54. buffer_strcat(wb, "exec");
  55. if (!sanitize_command_argument_string(buf, exec, n))
  56. return false;
  57. buffer_sprintf(wb, " '%s'", buf);
  58. if (!sanitize_command_argument_string(buf, recipient, n))
  59. return false;
  60. buffer_sprintf(wb, " '%s'", buf);
  61. if (!sanitize_command_argument_string(buf, registry_hostname, n))
  62. return false;
  63. buffer_sprintf(wb, " '%s'", buf);
  64. buffer_sprintf(wb, " '%u'", unique_id);
  65. buffer_sprintf(wb, " '%u'", alarm_id);
  66. buffer_sprintf(wb, " '%u'", alarm_event_id);
  67. buffer_sprintf(wb, " '%u'", when);
  68. if (!sanitize_command_argument_string(buf, alert_name, n))
  69. return false;
  70. buffer_sprintf(wb, " '%s'", buf);
  71. if (!sanitize_command_argument_string(buf, alert_chart_name, n))
  72. return false;
  73. buffer_sprintf(wb, " '%s'", buf);
  74. if (!sanitize_command_argument_string(buf, alert_family, n))
  75. return false;
  76. buffer_sprintf(wb, " '%s'", buf);
  77. if (!sanitize_command_argument_string(buf, new_status, n))
  78. return false;
  79. buffer_sprintf(wb, " '%s'", buf);
  80. if (!sanitize_command_argument_string(buf, old_status, n))
  81. return false;
  82. buffer_sprintf(wb, " '%s'", buf);
  83. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
  84. buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
  85. if (!sanitize_command_argument_string(buf, alert_source, n))
  86. return false;
  87. buffer_sprintf(wb, " '%s'", buf);
  88. buffer_sprintf(wb, " '%u'", duration);
  89. buffer_sprintf(wb, " '%u'", non_clear_duration);
  90. if (!sanitize_command_argument_string(buf, alert_units, n))
  91. return false;
  92. buffer_sprintf(wb, " '%s'", buf);
  93. if (!sanitize_command_argument_string(buf, alert_info, n))
  94. return false;
  95. buffer_sprintf(wb, " '%s'", buf);
  96. if (!sanitize_command_argument_string(buf, new_value_string, n))
  97. return false;
  98. buffer_sprintf(wb, " '%s'", buf);
  99. if (!sanitize_command_argument_string(buf, old_value_string, n))
  100. return false;
  101. buffer_sprintf(wb, " '%s'", buf);
  102. if (!sanitize_command_argument_string(buf, source, n))
  103. return false;
  104. buffer_sprintf(wb, " '%s'", buf);
  105. if (!sanitize_command_argument_string(buf, error_msg, n))
  106. return false;
  107. buffer_sprintf(wb, " '%s'", buf);
  108. buffer_sprintf(wb, " '%d'", n_warn);
  109. buffer_sprintf(wb, " '%d'", n_crit);
  110. if (!sanitize_command_argument_string(buf, warn_alarms, n))
  111. return false;
  112. buffer_sprintf(wb, " '%s'", buf);
  113. if (!sanitize_command_argument_string(buf, crit_alarms, n))
  114. return false;
  115. buffer_sprintf(wb, " '%s'", buf);
  116. if (!sanitize_command_argument_string(buf, classification, n))
  117. return false;
  118. buffer_sprintf(wb, " '%s'", buf);
  119. if (!sanitize_command_argument_string(buf, edit_command, n))
  120. return false;
  121. buffer_sprintf(wb, " '%s'", buf);
  122. if (!sanitize_command_argument_string(buf, machine_guid, n))
  123. return false;
  124. buffer_sprintf(wb, " '%s'", buf);
  125. return true;
  126. }
  127. // the queue of executed alarm notifications that haven't been waited for yet
  128. static struct {
  129. ALARM_ENTRY *head; // oldest
  130. ALARM_ENTRY *tail; // latest
  131. } alarm_notifications_in_progress = {NULL, NULL};
  132. typedef struct active_alerts {
  133. char *name;
  134. time_t last_status_change;
  135. RRDCALC_STATUS status;
  136. } active_alerts_t;
  137. static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
  138. {
  139. ae->prev_in_progress = NULL;
  140. ae->next_in_progress = NULL;
  141. if (NULL != alarm_notifications_in_progress.tail) {
  142. ae->prev_in_progress = alarm_notifications_in_progress.tail;
  143. alarm_notifications_in_progress.tail->next_in_progress = ae;
  144. }
  145. if (NULL == alarm_notifications_in_progress.head) {
  146. alarm_notifications_in_progress.head = ae;
  147. }
  148. alarm_notifications_in_progress.tail = ae;
  149. }
  150. static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
  151. {
  152. struct alarm_entry *prev = ae->prev_in_progress;
  153. struct alarm_entry *next = ae->next_in_progress;
  154. if (NULL != prev) {
  155. prev->next_in_progress = next;
  156. }
  157. if (NULL != next) {
  158. next->prev_in_progress = prev;
  159. }
  160. if (ae == alarm_notifications_in_progress.head) {
  161. alarm_notifications_in_progress.head = next;
  162. }
  163. if (ae == alarm_notifications_in_progress.tail) {
  164. alarm_notifications_in_progress.tail = prev;
  165. }
  166. }
  167. // ----------------------------------------------------------------------------
  168. // health initialization
  169. /**
  170. * User Config directory
  171. *
  172. * Get the config directory for health and return it.
  173. *
  174. * @return a pointer to the user config directory
  175. */
  176. inline char *health_user_config_dir(void) {
  177. char buffer[FILENAME_MAX + 1];
  178. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
  179. return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
  180. }
  181. /**
  182. * Stock Config Directory
  183. *
  184. * Get the Stock config directory and return it.
  185. *
  186. * @return a pointer to the stock config directory.
  187. */
  188. inline char *health_stock_config_dir(void) {
  189. char buffer[FILENAME_MAX + 1];
  190. snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
  191. return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
  192. }
  193. /**
  194. * Silencers init
  195. *
  196. * Function used to initialize the silencer structure.
  197. */
  198. static void health_silencers_init(void) {
  199. FILE *fd = fopen(silencers_filename, "r");
  200. if (fd) {
  201. fseek(fd, 0 , SEEK_END);
  202. off_t length = (off_t) ftell(fd);
  203. fseek(fd, 0 , SEEK_SET);
  204. if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
  205. char *str = mallocz((length+1)* sizeof(char));
  206. if(str) {
  207. size_t copied;
  208. copied = fread(str, sizeof(char), length, fd);
  209. if (copied == (length* sizeof(char))) {
  210. str[length] = 0x00;
  211. json_parse(str, NULL, health_silencers_json_read_callback);
  212. info("Parsed health silencers file %s", silencers_filename);
  213. } else {
  214. error("Cannot read the data from health silencers file %s", silencers_filename);
  215. }
  216. freez(str);
  217. }
  218. } else {
  219. error(
  220. "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
  221. silencers_filename,
  222. (int64_t)length,
  223. HEALTH_SILENCERS_MAX_FILE_LEN);
  224. }
  225. fclose(fd);
  226. } else {
  227. info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
  228. }
  229. }
  230. /**
  231. * Health Init
  232. *
  233. * Initialize the health thread.
  234. */
  235. void health_init(void) {
  236. debug(D_HEALTH, "Health configuration initializing");
  237. if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
  238. debug(D_HEALTH, "Health is disabled.");
  239. return;
  240. }
  241. health_silencers_init();
  242. }
  243. // ----------------------------------------------------------------------------
  244. // re-load health configuration
  245. /**
  246. * Reload host
  247. *
  248. * Reload configuration for a specific host.
  249. *
  250. * @param host the structure of the host that the function will reload the configuration.
  251. */
  252. static void health_reload_host(RRDHOST *host) {
  253. if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
  254. return;
  255. log_health("[%s]: Reloading health.", rrdhost_hostname(host));
  256. char *user_path = health_user_config_dir();
  257. char *stock_path = health_stock_config_dir();
  258. // free all running alarms
  259. rrdcalc_delete_all(host);
  260. rrdcalctemplate_delete_all(host);
  261. // invalidate all previous entries in the alarm log
  262. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  263. ALARM_ENTRY *t;
  264. for(t = host->health_log.alarms ; t ; t = t->next) {
  265. if(t->new_status != RRDCALC_STATUS_REMOVED)
  266. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  267. }
  268. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  269. // reset all thresholds to all charts
  270. RRDSET *st;
  271. rrdset_foreach_read(st, host) {
  272. st->green = NAN;
  273. st->red = NAN;
  274. }
  275. rrdset_foreach_done(st);
  276. // load the new alarms
  277. health_readdir(host, user_path, stock_path, NULL);
  278. //Discard alarms with labels that do not apply to host
  279. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  280. // link the loaded alarms to their charts
  281. rrdset_foreach_write(st, host) {
  282. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  283. continue;
  284. rrdcalc_link_matching_alerts_to_rrdset(st);
  285. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  286. }
  287. rrdset_foreach_done(st);
  288. #ifdef ENABLE_ACLK
  289. if (netdata_cloud_setting) {
  290. struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
  291. if (likely(wc)) {
  292. wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
  293. }
  294. }
  295. #endif
  296. }
  297. /**
  298. * Reload
  299. *
  300. * Reload the host configuration for all hosts.
  301. */
  302. void health_reload(void) {
  303. sql_refresh_hashes();
  304. RRDHOST *host;
  305. dfe_start_reentrant(rrdhost_root_index, host){
  306. health_reload_host(host);
  307. }
  308. dfe_done(host);
  309. }
  310. // ----------------------------------------------------------------------------
  311. // health main thread and friends
  312. static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
  313. if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
  314. if(n) return RRDCALC_STATUS_RAISED;
  315. return RRDCALC_STATUS_CLEAR;
  316. }
  // maximum number of raised alerts health_alarm_execute() collects per notification
  #define ACTIVE_ALARMS_LIST_EXAMINE 500

  // maximum number of collected alerts that are listed in the notification arguments
  #define ACTIVE_ALARMS_LIST 15
  319. static inline int compare_active_alerts(const void * a, const void * b) {
  320. active_alerts_t *active_alerts_a = (active_alerts_t *)a;
  321. active_alerts_t *active_alerts_b = (active_alerts_t *)b;
  322. return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
  323. }
  324. static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
  325. ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
  326. if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
  327. // do not send notifications for internal statuses
  328. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  329. goto done;
  330. }
  331. if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  332. // do not send notifications for disabled statuses
  333. debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  334. log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  335. // mark it as run, so that we will send the same alarm if it happens again
  336. goto done;
  337. }
  338. // find the previous notification for the same alarm
  339. // which we have run the exec script
  340. // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
  341. RRDCALC_STATUS last_executed_status = -3;
  342. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
  343. int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);
  344. if (likely(ret == 1)) {
  345. // we have executed this alarm notification in the past
  346. if(last_executed_status == ae->new_status) {
  347. // don't send the notification for the same status again
  348. debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
  349. , rrdcalc_status2string(ae->new_status));
  350. log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
  351. , rrdcalc_status2string(ae->new_status));
  352. goto done;
  353. }
  354. }
  355. else {
  356. // we have not executed this alarm notification in the past
  357. // so, don't send CLEAR notifications
  358. if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
  359. if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
  360. debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
  361. , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  362. goto done;
  363. }
  364. }
  365. }
  366. }
  367. // Check if alarm notifications are silenced
  368. if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
  369. log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  370. goto done;
  371. }
  372. log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
  373. const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
  374. const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
  375. int n_warn=0, n_crit=0;
  376. RRDCALC *rc;
  377. EVAL_EXPRESSION *expr=NULL;
  378. BUFFER *warn_alarms, *crit_alarms;
  379. active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
  380. warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
  381. crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
  382. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  383. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  384. continue;
  385. if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
  386. break;
  387. if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  388. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  389. active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
  390. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  391. active_alerts[n_warn+n_crit].status = rc->status;
  392. n_warn++;
  393. } else if (ae->alarm_id == rc->id)
  394. expr = rc->warning;
  395. } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  396. if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
  397. active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
  398. active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
  399. active_alerts[n_warn+n_crit].status = rc->status;
  400. n_crit++;
  401. } else if (ae->alarm_id == rc->id)
  402. expr = rc->critical;
  403. } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  404. if (ae->alarm_id == rc->id)
  405. expr = rc->warning;
  406. }
  407. }
  408. foreach_rrdcalc_in_rrdhost_done(rc);
  409. if (n_warn+n_crit>1)
  410. qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
  411. int count_w = 0, count_c = 0;
  412. while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
  413. if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
  414. if (count_w)
  415. buffer_strcat(warn_alarms, ",");
  416. buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
  417. buffer_strcat(warn_alarms, "=");
  418. buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  419. count_w++;
  420. }
  421. else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
  422. if (count_c)
  423. buffer_strcat(crit_alarms, ",");
  424. buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
  425. buffer_strcat(crit_alarms, "=");
  426. buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
  427. count_c++;
  428. }
  429. }
  430. char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
  431. BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health);
  432. bool ok = prepare_command(wb,
  433. exec,
  434. recipient,
  435. rrdhost_registry_hostname(host),
  436. ae->unique_id,
  437. ae->alarm_id,
  438. ae->alarm_event_id,
  439. (unsigned long)ae->when,
  440. ae_name(ae),
  441. ae->chart?ae_chart_name(ae):"NOCHART",
  442. ae->family?ae_family(ae):"NOFAMILY",
  443. rrdcalc_status2string(ae->new_status),
  444. rrdcalc_status2string(ae->old_status),
  445. ae->new_value,
  446. ae->old_value,
  447. ae->source?ae_source(ae):"UNKNOWN",
  448. (uint32_t)ae->duration,
  449. (uint32_t)ae->non_clear_duration,
  450. ae_units(ae),
  451. ae_info(ae),
  452. ae_new_value_string(ae),
  453. ae_old_value_string(ae),
  454. (expr && expr->source)?expr->source:"NOSOURCE",
  455. (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
  456. n_warn,
  457. n_crit,
  458. buffer_tostring(warn_alarms),
  459. buffer_tostring(crit_alarms),
  460. ae->classification?ae_classification(ae):"Unknown",
  461. edit_command,
  462. host != localhost ? host->machine_guid:"");
  463. const char *command_to_run = buffer_tostring(wb);
  464. if (ok) {
  465. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
  466. ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
  467. debug(D_HEALTH, "executing command '%s'", command_to_run);
  468. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  469. ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
  470. enqueue_alarm_notify_in_progress(ae);
  471. health_alarm_log_save(host, ae);
  472. } else {
  473. error("Failed to format command arguments");
  474. }
  475. buffer_free(wb);
  476. freez(edit_command);
  477. buffer_free(warn_alarms);
  478. buffer_free(crit_alarms);
  479. freez(active_alerts);
  480. return; //health_alarm_wait_for_execution
  481. done:
  482. health_alarm_log_save(host, ae);
  483. }
  484. static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
  485. if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  486. return;
  487. spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
  488. debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
  489. ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
  490. if(ae->exec_code != 0)
  491. ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
  492. unlink_alarm_notify_in_progress(ae);
  493. }
  494. static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
  495. debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
  496. ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
  497. ae->new_value,
  498. rrdcalc_status2string(ae->old_status),
  499. rrdcalc_status2string(ae->new_status)
  500. );
  501. health_alarm_execute(host, ae);
  502. }
  503. static inline void health_alarm_log_process(RRDHOST *host) {
  504. uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
  505. time_t now = now_realtime_sec();
  506. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  507. ALARM_ENTRY *ae;
  508. for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
  509. if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
  510. if(unlikely(
  511. !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
  512. !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
  513. )) {
  514. if(unlikely(ae->unique_id < first_waiting))
  515. first_waiting = ae->unique_id;
  516. if(likely(now >= ae->delay_up_to_timestamp))
  517. health_process_notifications(host, ae);
  518. }
  519. }
  520. }
  521. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  522. // remember this for the next iteration
  523. host->health_last_processed_id = first_waiting;
  524. //delete those that are updated, no in progress execution, and is not repeating
  525. netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
  526. ALARM_ENTRY *prev = NULL, *next = NULL;
  527. for(ae = host->health_log.alarms; ae ; ae = next) {
  528. next = ae->next; // set it here, for the next iteration
  529. if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
  530. (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
  531. (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
  532. !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
  533. ||
  534. ((ae->new_status == RRDCALC_STATUS_REMOVED) &&
  535. (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
  536. (ae->when + 3600 < now_realtime_sec())))
  537. {
  538. if(host->health_log.alarms == ae) {
  539. host->health_log.alarms = next;
  540. // prev is also NULL here
  541. }
  542. else {
  543. prev->next = next;
  544. // prev should not be touched here - we need it for the next iteration
  545. // because we may have to also remove the next item
  546. }
  547. health_alarm_log_free_one_nochecks_nounlink(ae);
  548. }
  549. else
  550. prev = ae;
  551. }
  552. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  553. }
  554. static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
  555. if(unlikely(!rc->rrdset)) {
  556. debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  557. return 0;
  558. }
  559. if(unlikely(rc->next_update > now)) {
  560. if (unlikely(*next_run > rc->next_update)) {
  561. // update the next_run time of the main loop
  562. // to run this alarm precisely the time required
  563. *next_run = rc->next_update;
  564. }
  565. debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
  566. return 0;
  567. }
  568. if(unlikely(!rc->update_every)) {
  569. debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  570. return 0;
  571. }
  572. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
  573. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  574. return 0;
  575. }
  576. if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
  577. debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  578. return 0;
  579. }
  580. if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
  581. debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
  582. return 0;
  583. }
  584. int update_every = rc->rrdset->update_every;
  585. time_t first = rrdset_first_entry_s(rc->rrdset);
  586. time_t last = rrdset_last_entry_s(rc->rrdset);
  587. if(unlikely(now + update_every < first /* || now - update_every > last */)) {
  588. debug(D_HEALTH
  589. , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
  590. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
  591. , (unsigned long) last);
  592. return 0;
  593. }
  594. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  595. time_t needed = now + rc->before + rc->after;
  596. if(needed + update_every < first || needed - update_every > last) {
  597. debug(D_HEALTH
  598. , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
  599. , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
  600. , (unsigned long) last);
  601. return 0;
  602. }
  603. }
  604. return 1;
  605. }
  606. static inline int check_if_resumed_from_suspension(void) {
  607. static usec_t last_realtime = 0, last_monotonic = 0;
  608. usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
  609. int ret = 0;
  610. // detect if monotonic and realtime have twice the difference
  611. // in which case we assume the system was just waken from hibernation
  612. if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
  613. ret = 1;
  614. last_realtime = realtime;
  615. last_monotonic = monotonic;
  616. return ret;
  617. }
  618. static void health_main_cleanup(void *ptr) {
  619. worker_unregister();
  620. struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
  621. static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
  622. info("cleaning up...");
  623. static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
  624. log_health("Health thread ended.");
  625. }
  626. static void initialize_health(RRDHOST *host)
  627. {
  628. if(!host->health.health_enabled ||
  629. rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) ||
  630. !service_running(SERVICE_HEALTH))
  631. return;
  632. rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
  633. log_health("[%s]: Initializing health.", rrdhost_hostname(host));
  634. host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
  635. host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
  636. host->health_log.next_log_id = 1;
  637. host->health_log.next_alarm_id = 1;
  638. host->health_log.max = 1000;
  639. host->health_log.next_log_id = (uint32_t)now_realtime_sec();
  640. host->health_log.next_alarm_id = 0;
  641. long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
  642. if(n < 10) {
  643. error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
  644. config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
  645. }
  646. else
  647. host->health_log.max = (unsigned int)n;
  648. conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
  649. SIMPLE_PATTERN_EXACT, true);
  650. netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
  651. char filename[FILENAME_MAX + 1];
  652. snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
  653. host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
  654. host->health.health_default_recipient = string_strdupz("root");
  655. // TODO: This needs to go to the metadata thread
  656. // Health should wait before accessing the table (needs to be created by the metadata thread)
  657. sql_create_health_log_table(host);
  658. sql_health_alarm_log_load(host);
  659. // ------------------------------------------------------------------------
  660. // load health configuration
  661. health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
  662. // link the loaded alarms to their charts
  663. RRDSET *st;
  664. rrdset_foreach_reentrant(st, host) {
  665. if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
  666. continue;
  667. rrdcalc_link_matching_alerts_to_rrdset(st);
  668. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  669. }
  670. rrdset_foreach_done(st);
  671. //Discard alarms with labels that do not apply to host
  672. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  673. }
  674. static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
  675. time_t now = now_realtime_sec();
  676. if(now < next_run) {
  677. worker_is_idle();
  678. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
  679. while (now < next_run && service_running(SERVICE_HEALTH)) {
  680. sleep_usec(USEC_PER_SEC);
  681. now = now_realtime_sec();
  682. }
  683. }
  684. else {
  685. debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
  686. }
  687. }
  688. static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
  689. SILENCER *s;
  690. debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
  691. rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
  692. for (s = silencers->silencers; s!=NULL; s=s->next){
  693. if (
  694. (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches_string(s->alarms_pattern, rc->name))) &&
  695. (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches_string(s->contexts_pattern, rc->rrdset->context))) &&
  696. (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern, host))) &&
  697. (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) &&
  698. (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family)))
  699. ) {
  700. debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
  701. if (unlikely(silencers->stype == STYPE_NONE)) {
  702. debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
  703. } else {
  704. debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
  705. , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
  706. , rrdcalc_name(rc)
  707. , (rc->rrdset)?rrdset_context(rc->rrdset):""
  708. , rrdcalc_chart_name(rc)
  709. , host
  710. , (rc->rrdset)?rrdset_family(rc->rrdset):""
  711. );
  712. }
  713. return silencers->stype;
  714. }
  715. }
  716. return STYPE_NONE;
  717. }
  718. /**
  719. * Update Disabled Silenced
  720. *
  721. * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
  722. *
  723. * @param host structure that contains information about the host monitored.
  724. * @param rc structure with information about the alarm
  725. *
  726. * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
  727. */
  728. static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
  729. uint32_t rrdcalc_flags_old = rc->run_flags;
  730. // Clear the flags
  731. rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
  732. if (unlikely(silencers->all_alarms)) {
  733. if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  734. else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  735. } else {
  736. SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
  737. if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
  738. else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
  739. }
  740. if (rrdcalc_flags_old != rc->run_flags) {
  741. info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
  742. rrdhost_hostname(host),
  743. rrdcalc_name(rc),
  744. (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
  745. (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
  746. (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
  747. (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  748. );
  749. }
  750. if (rc->run_flags & RRDCALC_FLAG_DISABLED)
  751. return 1;
  752. else
  753. return 0;
  754. }
  755. static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
  756. #ifdef ENABLE_ACLK
  757. if (netdata_cloud_setting) {
  758. struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
  759. if (unlikely(!wc)) {
  760. return;
  761. }
  762. if (wc->alert_queue_removed >= 1) {
  763. wc->alert_queue_removed+=6;
  764. }
  765. }
  766. #endif
  767. }
  768. static void health_execute_delayed_initializations(RRDHOST *host) {
  769. RRDSET *st;
  770. bool must_postpone = false;
  771. if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
  772. rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
  773. rrdset_foreach_reentrant(st, host) {
  774. if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  775. rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
  776. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
  777. rrdcalc_link_matching_alerts_to_rrdset(st);
  778. rrdcalctemplate_link_matching_templates_to_rrdset(st);
  779. RRDDIM *rd;
  780. rrddim_foreach_read(rd, st) {
  781. if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
  782. rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
  783. worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
  784. RRDCALCTEMPLATE *rt;
  785. foreach_rrdcalctemplate_read(host, rt) {
  786. if(!rt->foreach_dimension_pattern)
  787. continue;
  788. if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) {
  789. rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
  790. }
  791. }
  792. foreach_rrdcalctemplate_done(rt);
  793. if (health_variable_check(health_rrdvars, st, rd))
  794. rrdvar_store_for_chart(host, st);
  795. }
  796. rrddim_foreach_done(rd);
  797. must_postpone = true;
  798. }
  799. rrdset_foreach_done(st);
  800. if (must_postpone)
  801. sql_health_postpone_queue_removed(host);
  802. }
  803. /**
  804. * Health Main
  805. *
  806. * The main thread of the health system. In this function all the alarms will be processed.
  *
  * The outer loop runs for as long as service_running(SERVICE_HEALTH) is true. Every
  * iteration walks all the hosts in rrdhost_root_index and, for each host that has
  * health enabled and is not currently delayed:
  *
  *  1. updates the value of every runnable alarm (database lookup and/or calculation),
  *  2. evaluates the warning and critical expressions and logs status transitions,
  *  3. creates and sends the notifications of repeating alarms,
  *  4. processes the alarm log of the host (health_alarm_log_process()).
  *
  * Steps 2 and 3 are executed only when step 1 found at least one runnable alarm.
  * At the end of the iteration the thread sleeps via health_sleep() until next_run.
  807. *
  808. * @param ptr is a pointer to the netdata_static_thread structure.
  809. *
  810. * @return It always returns NULL
  811. */
  812. void *health_main(void *ptr) {
  // register this thread as a worker and give a name to every job id
  // that is later reported with worker_is_busy()
  813. worker_register("HEALTH");
  814. worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
  815. worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
  816. worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
  817. worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
  818. worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
  819. worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
  820. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
  821. worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
  822. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
  823. worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
  // health_main_cleanup(ptr) is the cleanup handler of this thread;
  // it is paired with the netdata_thread_cleanup_pop() at the end of the function
  824. netdata_thread_cleanup_push(health_main_cleanup, ptr);
  // the longest time (in seconds) between two iterations, never below 1 second
  825. int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
  826. if(min_run_every < 1) min_run_every = 1;
  // for how long (in seconds) the health checks of every host are postponed
  // after a resume from suspension has been detected
  827. time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
  // "Health is running" is logged only once, for the first host that reaches the checks
  828. bool health_running_logged = false;
  829. rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
  830. unsigned int loop = 0;
  831. while(service_running(SERVICE_HEALTH)) {
  832. loop++;
  833. debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
  // 'now' is taken once and is the timestamp used by the whole iteration
  834. time_t now = now_realtime_sec();
  // runnable counts the alarms found runnable by the lookup loop.
  // NOTE(review): it is declared here, outside the hosts loop, and is not reset per host.
  835. int runnable = 0, apply_hibernation_delay = 0;
  // next_run starts at the latest allowed time; the alarms evaluated below can only bring it earlier
  836. time_t next_run = now + min_run_every;
  837. RRDCALC *rc;
  838. RRDHOST *host;
  839. if (unlikely(check_if_resumed_from_suspension())) {
  840. apply_hibernation_delay = 1;
  841. log_health(
  842. "Postponing alarm checks for %"PRId64" seconds, "
  843. "because it seems that the system was just resumed from suspension.",
  844. (int64_t)hibernation_delay);
  845. }
  // this block only logs (once, because of the static flag); the disabling itself
  // is applied per alarm by update_disabled_silenced() in the lookup loop below
  846. if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
  847. static int logged=0;
  848. if (!logged) {
  849. log_health("Skipping health checks, because all alarms are disabled via a %s command.",
  850. HEALTH_CMDAPI_CMD_DISABLEALL);
  851. logged = 1;
  852. }
  853. }
  854. worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
  // ------------------------------------------------------------------------
  // walk all the hosts
  855. dfe_start_reentrant(rrdhost_root_index, host) {
  856. if(unlikely(!service_running(SERVICE_HEALTH)))
  857. break;
  858. if (unlikely(!host->health.health_enabled))
  859. continue;
  // lazy, one-time initialization of the health of this host
  860. if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)))
  861. initialize_health(host);
  862. health_execute_delayed_initializations(host);
  863. rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
  // a resume from suspension (re)starts the delay of every host
  864. if (unlikely(apply_hibernation_delay)) {
  865. log_health(
  866. "[%s]: Postponing health checks for %"PRId64" seconds.",
  867. rrdhost_hostname(host),
  868. (int64_t)hibernation_delay);
  869. host->health.health_delay_up_to = now + hibernation_delay;
  870. }
  // skip the host while its delay has not expired; clear the delay once it has
  871. if (unlikely(host->health.health_delay_up_to)) {
  872. if (unlikely(now < host->health.health_delay_up_to)) {
  873. continue;
  874. }
  875. log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
  876. host->health.health_delay_up_to = 0;
  877. }
  878. // wait until cleanup of obsolete charts on children is complete
  879. if (host != localhost) {
  880. if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
  881. log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
  882. continue;
  883. }
  884. }
  885. if (!health_running_logged) {
  886. log_health("[%s]: Health is running.", rrdhost_hostname(host));
  887. health_running_logged = true;
  888. }
  889. worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
  890. // the first loop is to lookup values from the db
  891. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  892. if(unlikely(!service_running(SERVICE_HEALTH)))
  893. break;
  894. rrdcalc_update_info_using_rrdset_labels(rc);
  // alarms that end up DISABLED are not examined at all
  895. if (update_disabled_silenced(host, rc))
  896. continue;
  897. // create an alert removed event if the chart is obsolete and
  898. // has stopped being collected for 60 seconds
  899. if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
  900. rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
  901. now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
  902. if (!rrdcalc_isrepeating(rc)) {
  903. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  // NOTE(review): this declaration shadows the 'now' of the iteration;
  // inside this block a fresh timestamp is used
  904. time_t now = now_realtime_sec();
  905. ALARM_ENTRY *ae = health_create_alarm_entry(
  906. host,
  907. rc->id,
  908. rc->next_event_id++,
  909. rc->config_hash_id,
  910. now,
  911. rc->name,
  912. rc->rrdset->id,
  913. rc->rrdset->context,
  914. rc->rrdset->family,
  915. rc->classification,
  916. rc->component,
  917. rc->type,
  918. rc->exec,
  919. rc->recipient,
  920. now - rc->last_status_change,
  921. rc->value,
  922. NAN,
  923. rc->status,
  924. RRDCALC_STATUS_REMOVED,
  925. rc->source,
  926. rc->units,
  927. rc->info,
  928. 0,
  929. rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
  // the alarm switches to REMOVED only when the entry was created
  930. if (ae) {
  931. health_alarm_log_add_entry(host, ae);
  932. rc->old_status = rc->status;
  933. rc->status = RRDCALC_STATUS_REMOVED;
  934. rc->last_status_change = now;
  935. rc->last_updated = now;
  936. rc->value = NAN;
  937. #ifdef ENABLE_ACLK
  938. if (netdata_cloud_setting)
  939. sql_queue_alarm_to_aclk(host, ae, 1);
  940. #endif
  941. }
  942. }
  943. }
  // alarms that are not runnable lose the RUNNABLE flag, so that
  // the evaluation loop below skips them
  944. if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
  945. if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
  946. rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
  947. continue;
  948. }
  949. runnable++;
  // keep the previous value; it is reported as the old value of status change events
  950. rc->old_value = rc->value;
  951. rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
  952. // ------------------------------------------------------------
  953. // if there is database lookup, do it
  954. if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
  955. worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
  956. /* time_t old_db_timestamp = rc->db_before; */
  957. int value_is_null = 0;
  // the query writes its result to rc->value and the time-frame it used to rc->db_after / rc->db_before
  958. int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
  959. rc->after, rc->before, rc->group, NULL,
  960. 0, rc->options,
  961. &rc->db_after,&rc->db_before,
  962. NULL, NULL, NULL,
  963. &value_is_null, NULL, 0, 0,
  964. QUERY_SOURCE_HEALTH, STORAGE_PRIORITY_LOW);
  // any return code other than 200 is treated as a failed lookup
  965. if (unlikely(ret != 200)) {
  966. // database lookup failed
  967. rc->value = NAN;
  968. rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
  969. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
  970. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
  971. );
  972. } else
  973. rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
  974. if (unlikely(value_is_null)) {
  975. // collected value is null
  976. rc->value = NAN;
  977. rc->run_flags |= RRDCALC_FLAG_DB_NAN;
  978. debug(D_HEALTH,
  979. "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
  980. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
  981. );
  982. } else
  983. rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
  984. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
  985. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
  986. );
  987. }
  988. // ------------------------------------------------------------
  989. // if there is calculation expression, run it
  // (on success its result replaces rc->value, on failure rc->value becomes NAN)
  990. if (unlikely(rc->calculation)) {
  991. worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
  992. if (unlikely(!expression_evaluate(rc->calculation))) {
  993. // calculation failed
  994. rc->value = NAN;
  995. rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
  996. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
  997. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  998. rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
  999. );
  1000. } else {
  1001. rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
  1002. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
  1003. NETDATA_DOUBLE_FORMAT
  1004. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1005. rc->calculation->parsed_as, rc->calculation->result,
  1006. buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
  1007. );
  1008. rc->value = rc->calculation->result;
  1009. }
  1010. }
  1011. }
  1012. foreach_rrdcalc_in_rrdhost_done(rc);
  // the second loop evaluates the warning and critical expressions of the
  // alarms the first loop flagged as RUNNABLE
  1013. if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
  1014. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1015. if(unlikely(!service_running(SERVICE_HEALTH)))
  1016. break;
  1017. if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
  1018. continue;
  1019. if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
  1020. continue;
  1021. }
  // a missing or failed expression leaves its status UNDEFINED
  1022. RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
  1023. RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
  1024. // --------------------------------------------------------
  1025. // check the warning expression
  1026. if (likely(rc->warning)) {
  1027. worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
  1028. if (unlikely(!expression_evaluate(rc->warning))) {
  1029. // calculation failed
  1030. rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
  1031. debug(D_HEALTH,
  1032. "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
  1033. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1034. buffer_tostring(rc->warning->error_msg)
  1035. );
  1036. } else {
  1037. rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
  1038. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
  1039. NETDATA_DOUBLE_FORMAT
  1040. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1041. rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
  1042. );
  1043. warning_status = rrdcalc_value2status(rc->warning->result);
  1044. }
  1045. }
  1046. // --------------------------------------------------------
  1047. // check the critical expression
  1048. if (likely(rc->critical)) {
  1049. worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
  1050. if (unlikely(!expression_evaluate(rc->critical))) {
  1051. // calculation failed
  1052. rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
  1053. debug(D_HEALTH,
  1054. "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
  1055. rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
  1056. buffer_tostring(rc->critical->error_msg)
  1057. );
  1058. } else {
  1059. rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
  1060. debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
  1061. NETDATA_DOUBLE_FORMAT
  1062. ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
  1063. rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
  1064. rrdcalc_source(rc)
  1065. );
  1066. critical_status = rrdcalc_value2status(rc->critical->result);
  1067. }
  1068. }
  1069. // --------------------------------------------------------
  1070. // decide the final alarm status
  // the warning expression is applied first; a raised critical always wins,
  // while a clear critical counts only when the warning left the status undefined
  1071. RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
  1072. switch (warning_status) {
  1073. case RRDCALC_STATUS_CLEAR:
  1074. status = RRDCALC_STATUS_CLEAR;
  1075. break;
  1076. case RRDCALC_STATUS_RAISED:
  1077. status = RRDCALC_STATUS_WARNING;
  1078. break;
  1079. default:
  1080. break;
  1081. }
  1082. switch (critical_status) {
  1083. case RRDCALC_STATUS_CLEAR:
  1084. if (status == RRDCALC_STATUS_UNDEFINED)
  1085. status = RRDCALC_STATUS_CLEAR;
  1086. break;
  1087. case RRDCALC_STATUS_RAISED:
  1088. status = RRDCALC_STATUS_CRITICAL;
  1089. break;
  1090. default:
  1091. break;
  1092. }
  1093. // --------------------------------------------------------
  1094. // check if the new status and the old differ
  1095. if (status != rc->status) {
  1096. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1097. int delay = 0;
  1098. // apply trigger hysteresis
  // when the previous delay window has passed, the delays restart from their configured
  // durations; otherwise they grow by delay_multiplier, capped at delay_max_duration
  1099. if (now > rc->delay_up_to_timestamp) {
  1100. rc->delay_up_current = rc->delay_up_duration;
  1101. rc->delay_down_current = rc->delay_down_duration;
  1102. rc->delay_last = 0;
  1103. rc->delay_up_to_timestamp = 0;
  1104. } else {
  1105. rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
  1106. if (rc->delay_up_current > rc->delay_max_duration)
  1107. rc->delay_up_current = rc->delay_max_duration;
  1108. rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
  1109. if (rc->delay_down_current > rc->delay_max_duration)
  1110. rc->delay_down_current = rc->delay_max_duration;
  1111. }
  // a numerically higher status uses the "up" delay, anything else the "down" delay
  1112. if (status > rc->status)
  1113. delay = rc->delay_up_current;
  1114. else
  1115. delay = rc->delay_down_current;
  1116. // COMMENTED: because we do need to send raising alarms
  1117. // if(now + delay < rc->delay_up_to_timestamp)
  1118. // delay = (int)(rc->delay_up_to_timestamp - now);
  1119. rc->delay_last = delay;
  1120. rc->delay_up_to_timestamp = now + delay;
  1121. ALARM_ENTRY *ae = health_create_alarm_entry(
  1122. host,
  1123. rc->id,
  1124. rc->next_event_id++,
  1125. rc->config_hash_id,
  1126. now,
  1127. rc->name,
  1128. rc->rrdset->id,
  1129. rc->rrdset->context,
  1130. rc->rrdset->family,
  1131. rc->classification,
  1132. rc->component,
  1133. rc->type,
  1134. rc->exec,
  1135. rc->recipient,
  1136. now - rc->last_status_change,
  1137. rc->old_value,
  1138. rc->value,
  1139. rc->status,
  1140. status,
  1141. rc->source,
  1142. rc->units,
  1143. rc->info,
  1144. rc->delay_last,
  1145. (
  1146. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1147. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1148. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1149. )
  1150. );
  // NOTE(review): ae is used here without a NULL check, unlike the
  // "alert removed" path of the first loop - confirm that it cannot be NULL
  1151. health_alarm_log_add_entry(host, ae);
  1152. log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
  1153. rc->last_status_change = now;
  1154. rc->old_status = rc->status;
  1155. rc->status = status;
  1156. }
  // schedule the next evaluation of this alarm and wake up
  // early enough for the earliest one
  1157. rc->last_updated = now;
  1158. rc->next_update = now + rc->update_every;
  1159. if (next_run > rc->next_update)
  1160. next_run = rc->next_update;
  1161. }
  1162. foreach_rrdcalc_in_rrdhost_done(rc);
  1163. // process repeating alarms
  // (only repeating alarms whose delay window has expired are considered)
  1164. foreach_rrdcalc_in_rrdhost_read(host, rc) {
  1165. if(unlikely(!service_running(SERVICE_HEALTH)))
  1166. break;
  1167. int repeat_every = 0;
  1168. if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
  // WARNING and CRITICAL repeat at their configured intervals and clear the RUN_ONCE flag;
  // CLEAR gets repeat_every = 1 only while RUN_ONCE is not set and the previous status
  // was WARNING or CRITICAL
  1169. if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
  1170. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1171. repeat_every = rc->warn_repeat_every;
  1172. } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
  1173. rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
  1174. repeat_every = rc->crit_repeat_every;
  1175. } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
  1176. if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
  1177. if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
  1178. repeat_every = 1;
  1179. } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
  1180. repeat_every = 1;
  1181. }
  1182. }
  1183. }
  1184. } else {
  1185. continue;
  1186. }
  1187. if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
  1188. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
  1189. rc->last_repeat = now;
  1190. if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
  1191. ALARM_ENTRY *ae = health_create_alarm_entry(
  1192. host,
  1193. rc->id,
  1194. rc->next_event_id++,
  1195. rc->config_hash_id,
  1196. now,
  1197. rc->name,
  1198. rc->rrdset->id,
  1199. rc->rrdset->context,
  1200. rc->rrdset->family,
  1201. rc->classification,
  1202. rc->component,
  1203. rc->type,
  1204. rc->exec,
  1205. rc->recipient,
  1206. now - rc->last_status_change,
  1207. rc->old_value,
  1208. rc->value,
  1209. rc->old_status,
  1210. rc->status,
  1211. rc->source,
  1212. rc->units,
  1213. rc->info,
  1214. rc->delay_last,
  1215. (
  1216. ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
  1217. ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
  1218. (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
  1219. )
  1220. );
  // NOTE(review): ae is dereferenced without a NULL check - confirm that it cannot be NULL
  1221. ae->last_repeat = rc->last_repeat;
  1222. if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
  1223. ae->flags |= HEALTH_ENTRY_RUN_ONCE;
  1224. }
  1225. rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
  // the entry of a repeating alarm is not added to the alarm log: it is processed,
  // waited for and freed right here
  1226. health_process_notifications(host, ae);
  1227. debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
  1228. health_alarm_wait_for_execution(ae);
  1229. health_alarm_log_free_one_nochecks_nounlink(ae);
  1230. }
  1231. }
  1232. foreach_rrdcalc_in_rrdhost_done(rc);
  1233. }
  // leave the hosts loop when the service is shutting down
  1234. if (unlikely(!service_running(SERVICE_HEALTH)))
  1235. break;
  1236. // execute notifications
  1237. // and cleanup
  1238. worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
  1239. health_alarm_log_process(host);
  1240. if (unlikely(!service_running(SERVICE_HEALTH))) {
  1241. // wait for all notifications to finish before allowing health to be cleaned up
  // NOTE(review): this branch is entered only when the service is not running, and the
  // loop below breaks on that very condition - so it appears to never wait
  1242. ALARM_ENTRY *ae;
  1243. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1244. if(unlikely(!service_running(SERVICE_HEALTH)))
  1245. break;
  1246. health_alarm_wait_for_execution(ae);
  1247. }
  1248. break;
  1249. }
  1250. #ifdef ENABLE_ACLK
  // alert_queue_removed and alert_checkpoint_req work as countdowns: values above 1 are
  // decremented once per pass over this host, and the value 1 triggers the action
  1251. if (netdata_cloud_setting) {
  1252. struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
  1253. if (unlikely(!wc)) {
  1254. continue;
  1255. }
  1256. if (wc->alert_queue_removed == 1) {
  1257. sql_queue_removed_alerts_to_aclk(host);
  1258. } else if (wc->alert_queue_removed > 1) {
  1259. wc->alert_queue_removed--;
  1260. }
  1261. if (wc->alert_checkpoint_req == 1) {
  1262. aclk_push_alarm_checkpoint(host);
  1263. } else if (wc->alert_checkpoint_req > 1) {
  1264. wc->alert_checkpoint_req--;
  1265. }
  1266. }
  1267. #endif
  1268. }
  1269. dfe_done(host);
  1270. // wait for all notifications to finish before allowing health to be cleaned up
  // (the wait is abandoned as soon as the service stops running)
  1271. ALARM_ENTRY *ae;
  1272. while (NULL != (ae = alarm_notifications_in_progress.head)) {
  1273. if(unlikely(!service_running(SERVICE_HEALTH)))
  1274. break;
  1275. health_alarm_wait_for_execution(ae);
  1276. }
  1277. if(unlikely(!service_running(SERVICE_HEALTH)))
  1278. break;
  // sleep until next_run: now + min_run_every, or earlier if an alarm lowered it above
  1279. health_sleep(next_run, loop);
  1280. } // forever
  // pairs with the netdata_thread_cleanup_push() at the top of the function
  1281. netdata_thread_cleanup_pop(1);
  1282. return NULL;
  1283. }
  1284. void health_add_host_labels(void) {
  1285. DICTIONARY *labels = localhost->rrdlabels;
  1286. // The source should be CONF, but when it is set, these labels are exported by default ('send configured labels' in exporting.conf).
  1287. // Their export seems to break exporting to Graphite, see https://github.com/netdata/netdata/issues/14084.
  1288. int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO);
  1289. rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_AUTO);
  1290. int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO);
  1291. rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_AUTO);
  1292. }