health_json.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
  4. if(value && *value) {
  5. buffer_sprintf(wb, "%s\"%s\":\"", prefix, label);
  6. buffer_strcat_htmlescape(wb, value);
  7. buffer_strcat(wb, "\"");
  8. buffer_strcat(wb, suffix);
  9. }
  10. else
  11. buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
  12. }
  13. void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
  14. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  15. char config_hash_id[GUID_LEN + 1];
  16. uuid_unparse_lower(ae->config_hash_id, config_hash_id);
  17. buffer_sprintf(wb,
  18. "\n\t{\n"
  19. "\t\t\"hostname\": \"%s\",\n"
  20. "\t\t\"utc_offset\": %d,\n"
  21. "\t\t\"timezone\": \"%s\",\n"
  22. "\t\t\"unique_id\": %u,\n"
  23. "\t\t\"alarm_id\": %u,\n"
  24. "\t\t\"alarm_event_id\": %u,\n"
  25. "\t\t\"config_hash_id\": \"%s\",\n"
  26. "\t\t\"name\": \"%s\",\n"
  27. "\t\t\"chart\": \"%s\",\n"
  28. "\t\t\"context\": \"%s\",\n"
  29. "\t\t\"family\": \"%s\",\n"
  30. "\t\t\"class\": \"%s\",\n"
  31. "\t\t\"component\": \"%s\",\n"
  32. "\t\t\"type\": \"%s\",\n"
  33. "\t\t\"processed\": %s,\n"
  34. "\t\t\"updated\": %s,\n"
  35. "\t\t\"exec_run\": %lu,\n"
  36. "\t\t\"exec_failed\": %s,\n"
  37. "\t\t\"exec\": \"%s\",\n"
  38. "\t\t\"recipient\": \"%s\",\n"
  39. "\t\t\"exec_code\": %d,\n"
  40. "\t\t\"source\": \"%s\",\n"
  41. "\t\t\"command\": \"%s\",\n"
  42. "\t\t\"units\": \"%s\",\n"
  43. "\t\t\"when\": %lu,\n"
  44. "\t\t\"duration\": %lu,\n"
  45. "\t\t\"non_clear_duration\": %lu,\n"
  46. "\t\t\"status\": \"%s\",\n"
  47. "\t\t\"old_status\": \"%s\",\n"
  48. "\t\t\"delay\": %d,\n"
  49. "\t\t\"delay_up_to_timestamp\": %lu,\n"
  50. "\t\t\"updated_by_id\": %u,\n"
  51. "\t\t\"updates_id\": %u,\n"
  52. "\t\t\"value_string\": \"%s\",\n"
  53. "\t\t\"old_value_string\": \"%s\",\n"
  54. "\t\t\"last_repeat\": \"%lu\",\n"
  55. "\t\t\"silenced\": \"%s\",\n"
  56. , host->hostname
  57. , host->utc_offset
  58. , host->abbrev_timezone
  59. , ae->unique_id
  60. , ae->alarm_id
  61. , ae->alarm_event_id
  62. , config_hash_id
  63. , ae->name
  64. , ae->chart
  65. , ae->chart_context
  66. , ae->family
  67. , ae->classification?ae->classification:"Unknown"
  68. , ae->component?ae->component:"Unknown"
  69. , ae->type?ae->type:"Unknown"
  70. , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
  71. , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
  72. , (unsigned long)ae->exec_run_timestamp
  73. , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
  74. , ae->exec?ae->exec:host->health_default_exec
  75. , ae->recipient?ae->recipient:host->health_default_recipient
  76. , ae->exec_code
  77. , ae->source
  78. , edit_command
  79. , ae->units?ae->units:""
  80. , (unsigned long)ae->when
  81. , (unsigned long)ae->duration
  82. , (unsigned long)ae->non_clear_duration
  83. , rrdcalc_status2string(ae->new_status)
  84. , rrdcalc_status2string(ae->old_status)
  85. , ae->delay
  86. , (unsigned long)ae->delay_up_to_timestamp
  87. , ae->updated_by_id
  88. , ae->updates_id
  89. , ae->new_value_string
  90. , ae->old_value_string
  91. , (unsigned long)ae->last_repeat
  92. , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
  93. );
  94. char *replaced_info = NULL;
  95. if (likely(ae->info)) {
  96. char *m = NULL;
  97. replaced_info = strdupz(ae->info);
  98. size_t pos = 0;
  99. while ((m = strstr(replaced_info + pos, "$family"))) {
  100. char *buf = NULL;
  101. pos = m - replaced_info;
  102. buf = find_and_replace(replaced_info, "$family", ae->family ? ae->family : "", m);
  103. freez(replaced_info);
  104. replaced_info = strdupz(buf);
  105. freez(buf);
  106. }
  107. }
  108. health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n");
  109. if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
  110. buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
  111. }
  112. buffer_strcat(wb, "\t\t\"value\":");
  113. buffer_rrd_value(wb, ae->new_value);
  114. buffer_strcat(wb, ",\n");
  115. buffer_strcat(wb, "\t\t\"old_value\":");
  116. buffer_rrd_value(wb, ae->old_value);
  117. buffer_strcat(wb, "\n");
  118. buffer_strcat(wb, "\t}");
  119. freez(replaced_info);
  120. freez(edit_command);
  121. }
  122. void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
  123. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  124. buffer_strcat(wb, "[");
  125. unsigned int max = host->health_log.max;
  126. unsigned int count = 0;
  127. uint32_t hash_chart = 0;
  128. if (chart) hash_chart = simple_hash(chart);
  129. ALARM_ENTRY *ae;
  130. for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
  131. if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) {
  132. if (likely(count))
  133. buffer_strcat(wb, ",");
  134. health_alarm_entry2json_nolock(wb, ae, host);
  135. count++;
  136. }
  137. }
  138. buffer_strcat(wb, "\n]\n");
  139. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  140. }
  141. static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
  142. (void)host;
  143. buffer_sprintf(wb,
  144. "\t\t\"%s.%s\": {\n"
  145. "\t\t\t\"id\": %lu,\n"
  146. , rc->chart, rc->name
  147. , (unsigned long)rc->id);
  148. buffer_strcat(wb, "\t\t\t\"value\":");
  149. buffer_rrd_value(wb, rc->value);
  150. buffer_strcat(wb, ",\n");
  151. buffer_strcat(wb, "\t\t\t\"last_updated\":");
  152. buffer_sprintf(wb, "%lu", (unsigned long)rc->last_updated);
  153. buffer_strcat(wb, ",\n");
  154. buffer_sprintf(wb,
  155. "\t\t\t\"status\": \"%s\"\n"
  156. , rrdcalc_status2string(rc->status));
  157. buffer_strcat(wb, "\t\t}");
  158. }
  159. static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
  160. char value_string[100 + 1];
  161. format_value_and_unit(value_string, 100, rc->value, rc->units, -1);
  162. char *replaced_info = NULL;
  163. if (likely(rc->info)) {
  164. char *m;
  165. replaced_info = strdupz(rc->info);
  166. size_t pos = 0;
  167. while ((m = strstr(replaced_info + pos, "$family"))) {
  168. char *buf = NULL;
  169. pos = m - replaced_info;
  170. buf = find_and_replace(replaced_info, "$family", (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "", m);
  171. freez(replaced_info);
  172. replaced_info = strdupz(buf);
  173. freez(buf);
  174. }
  175. }
  176. char hash_id[GUID_LEN + 1];
  177. uuid_unparse_lower(rc->config_hash_id, hash_id);
  178. buffer_sprintf(wb,
  179. "\t\t\"%s.%s\": {\n"
  180. "\t\t\t\"id\": %lu,\n"
  181. "\t\t\t\"config_hash_id\": \"%s\",\n"
  182. "\t\t\t\"name\": \"%s\",\n"
  183. "\t\t\t\"chart\": \"%s\",\n"
  184. "\t\t\t\"family\": \"%s\",\n"
  185. "\t\t\t\"class\": \"%s\",\n"
  186. "\t\t\t\"component\": \"%s\",\n"
  187. "\t\t\t\"type\": \"%s\",\n"
  188. "\t\t\t\"active\": %s,\n"
  189. "\t\t\t\"disabled\": %s,\n"
  190. "\t\t\t\"silenced\": %s,\n"
  191. "\t\t\t\"exec\": \"%s\",\n"
  192. "\t\t\t\"recipient\": \"%s\",\n"
  193. "\t\t\t\"source\": \"%s\",\n"
  194. "\t\t\t\"units\": \"%s\",\n"
  195. "\t\t\t\"info\": \"%s\",\n"
  196. "\t\t\t\"status\": \"%s\",\n"
  197. "\t\t\t\"last_status_change\": %lu,\n"
  198. "\t\t\t\"last_updated\": %lu,\n"
  199. "\t\t\t\"next_update\": %lu,\n"
  200. "\t\t\t\"update_every\": %d,\n"
  201. "\t\t\t\"delay_up_duration\": %d,\n"
  202. "\t\t\t\"delay_down_duration\": %d,\n"
  203. "\t\t\t\"delay_max_duration\": %d,\n"
  204. "\t\t\t\"delay_multiplier\": %f,\n"
  205. "\t\t\t\"delay\": %d,\n"
  206. "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
  207. "\t\t\t\"warn_repeat_every\": \"%u\",\n"
  208. "\t\t\t\"crit_repeat_every\": \"%u\",\n"
  209. "\t\t\t\"value_string\": \"%s\",\n"
  210. "\t\t\t\"last_repeat\": \"%lu\",\n"
  211. "\t\t\t\"times_repeat\": %lu,\n"
  212. , rc->chart, rc->name
  213. , (unsigned long)rc->id
  214. , hash_id
  215. , rc->name
  216. , rc->chart
  217. , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
  218. , rc->classification?rc->classification:"Unknown"
  219. , rc->component?rc->component:"Unknown"
  220. , rc->type?rc->type:"Unknown"
  221. , (rc->rrdset)?"true":"false"
  222. , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
  223. , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  224. , rc->exec?rc->exec:host->health_default_exec
  225. , rc->recipient?rc->recipient:host->health_default_recipient
  226. , rc->source
  227. , rc->units?rc->units:""
  228. , replaced_info?replaced_info:""
  229. , rrdcalc_status2string(rc->status)
  230. , (unsigned long)rc->last_status_change
  231. , (unsigned long)rc->last_updated
  232. , (unsigned long)rc->next_update
  233. , rc->update_every
  234. , rc->delay_up_duration
  235. , rc->delay_down_duration
  236. , rc->delay_max_duration
  237. , rc->delay_multiplier
  238. , rc->delay_last
  239. , (unsigned long)rc->delay_up_to_timestamp
  240. , rc->warn_repeat_every
  241. , rc->crit_repeat_every
  242. , value_string
  243. , (unsigned long)rc->last_repeat
  244. , (unsigned long)rc->times_repeat
  245. );
  246. if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
  247. buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
  248. }
  249. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  250. if(rc->dimensions && *rc->dimensions)
  251. health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
  252. buffer_sprintf(wb,
  253. "\t\t\t\"db_after\": %lu,\n"
  254. "\t\t\t\"db_before\": %lu,\n"
  255. "\t\t\t\"lookup_method\": \"%s\",\n"
  256. "\t\t\t\"lookup_after\": %d,\n"
  257. "\t\t\t\"lookup_before\": %d,\n"
  258. "\t\t\t\"lookup_options\": \"",
  259. (unsigned long) rc->db_after,
  260. (unsigned long) rc->db_before,
  261. group_method2string(rc->group),
  262. rc->after,
  263. rc->before
  264. );
  265. buffer_data_options2string(wb, rc->options);
  266. buffer_strcat(wb, "\",\n");
  267. }
  268. if(rc->calculation) {
  269. health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
  270. health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
  271. }
  272. if(rc->warning) {
  273. health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
  274. health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
  275. }
  276. if(rc->critical) {
  277. health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
  278. health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
  279. }
  280. buffer_strcat(wb, "\t\t\t\"green\":");
  281. buffer_rrd_value(wb, rc->green);
  282. buffer_strcat(wb, ",\n");
  283. buffer_strcat(wb, "\t\t\t\"red\":");
  284. buffer_rrd_value(wb, rc->red);
  285. buffer_strcat(wb, ",\n");
  286. buffer_strcat(wb, "\t\t\t\"value\":");
  287. buffer_rrd_value(wb, rc->value);
  288. buffer_strcat(wb, "\n");
  289. buffer_strcat(wb, "\t\t}");
  290. freez(replaced_info);
  291. }
  292. //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
  293. //
  294. //}
  295. void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) {
  296. RRDCALC *rc;
  297. int numberOfAlarms = 0;
  298. char *tok = NULL;
  299. char *p = NULL;
  300. rrdhost_rdlock(host);
  301. if (contexts) {
  302. p = (char*)buffer_tostring(contexts);
  303. while(p && *p && (tok = mystrsep(&p, ", |"))) {
  304. if(!*tok) continue;
  305. for(rc = host->alarms; rc ; rc = rc->next) {
  306. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  307. continue;
  308. if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
  309. continue;
  310. if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok)
  311. && !strcmp(rc->rrdset->context, tok)
  312. && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
  313. numberOfAlarms++;
  314. }
  315. }
  316. }
  317. else {
  318. for(rc = host->alarms; rc ; rc = rc->next) {
  319. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  320. continue;
  321. if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
  322. continue;
  323. if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))
  324. numberOfAlarms++;
  325. }
  326. }
  327. buffer_sprintf(wb, "%d", numberOfAlarms);
  328. rrdhost_unlock(host);
  329. }
  330. static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) {
  331. RRDCALC *rc;
  332. int i;
  333. for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
  334. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  335. continue;
  336. if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
  337. continue;
  338. if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
  339. continue;
  340. if(likely(i)) buffer_strcat(wb, ",\n");
  341. fp(host, wb, rc);
  342. i++;
  343. }
  344. }
  345. void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
  346. rrdhost_rdlock(host);
  347. buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
  348. "\n\t\"latest_alarm_log_unique_id\": %u,"
  349. "\n\t\"status\": %s,"
  350. "\n\t\"now\": %lu,"
  351. "\n\t\"alarms\": {\n",
  352. host->hostname,
  353. (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
  354. host->health_enabled?"true":"false",
  355. (unsigned long)now_realtime_sec());
  356. health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock);
  357. // buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
  358. // RRDCALCTEMPLATE *rt;
  359. // for(rt = host->templates; rt ; rt = rt->next)
  360. // health_rrdcalctemplate2json_nolock(wb, rt);
  361. buffer_strcat(wb, "\n\t}\n}\n");
  362. rrdhost_unlock(host);
  363. }
  364. void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) {
  365. rrdhost_rdlock(host);
  366. buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
  367. "\n\t\"alarms\": {\n",
  368. host->hostname);
  369. health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock);
  370. buffer_strcat(wb, "\n\t}\n}\n");
  371. rrdhost_unlock(host);
  372. }
  373. static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, time_t mark)
  374. {
  375. ALARM_ENTRY *ae = host->health_log.alarms;
  376. while(ae) {
  377. if (ae->alarm_id == alarm_id && ae->unique_id > mark &&
  378. (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL))
  379. return 1;
  380. ae = ae->next;
  381. }
  382. return 0;
  383. }
  384. void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) {
  385. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  386. buffer_sprintf(wb, "[\n");
  387. unsigned int max = host->health_log.max;
  388. unsigned int count = 0;
  389. ALARM_ENTRY *ae;
  390. for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) {
  391. if (!ae->updated_by_id &&
  392. ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) ||
  393. ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) &&
  394. ae->new_status == RRDCALC_STATUS_REMOVED))) {
  395. if (have_recent_alarm(host, ae->alarm_id, ae->unique_id))
  396. continue;
  397. if (likely(count))
  398. buffer_strcat(wb, ",");
  399. health_alarm_entry2json_nolock(wb, ae, host);
  400. count++;
  401. }
  402. }
  403. buffer_strcat(wb, "]");
  404. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  405. }