// SPDX-License-Identifier: GPL-3.0-or-later
// File: health_json.c
  2. #include "health.h"
  3. void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
  4. if(value && *value) {
  5. buffer_sprintf(wb, "%s\"%s\":\"", prefix, label);
  6. buffer_strcat_htmlescape(wb, value);
  7. buffer_strcat(wb, "\"");
  8. buffer_strcat(wb, suffix);
  9. }
  10. else
  11. buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
  12. }
  13. void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
  14. char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
  15. char config_hash_id[GUID_LEN + 1];
  16. uuid_unparse_lower(ae->config_hash_id, config_hash_id);
  17. buffer_sprintf(wb,
  18. "\n\t{\n"
  19. "\t\t\"hostname\": \"%s\",\n"
  20. "\t\t\"utc_offset\": %d,\n"
  21. "\t\t\"timezone\": \"%s\",\n"
  22. "\t\t\"unique_id\": %u,\n"
  23. "\t\t\"alarm_id\": %u,\n"
  24. "\t\t\"alarm_event_id\": %u,\n"
  25. "\t\t\"config_hash_id\": \"%s\",\n"
  26. "\t\t\"name\": \"%s\",\n"
  27. "\t\t\"chart\": \"%s\",\n"
  28. "\t\t\"family\": \"%s\",\n"
  29. "\t\t\"class\": \"%s\",\n"
  30. "\t\t\"component\": \"%s\",\n"
  31. "\t\t\"type\": \"%s\",\n"
  32. "\t\t\"processed\": %s,\n"
  33. "\t\t\"updated\": %s,\n"
  34. "\t\t\"exec_run\": %lu,\n"
  35. "\t\t\"exec_failed\": %s,\n"
  36. "\t\t\"exec\": \"%s\",\n"
  37. "\t\t\"recipient\": \"%s\",\n"
  38. "\t\t\"exec_code\": %d,\n"
  39. "\t\t\"source\": \"%s\",\n"
  40. "\t\t\"command\": \"%s\",\n"
  41. "\t\t\"units\": \"%s\",\n"
  42. "\t\t\"when\": %lu,\n"
  43. "\t\t\"duration\": %lu,\n"
  44. "\t\t\"non_clear_duration\": %lu,\n"
  45. "\t\t\"status\": \"%s\",\n"
  46. "\t\t\"old_status\": \"%s\",\n"
  47. "\t\t\"delay\": %d,\n"
  48. "\t\t\"delay_up_to_timestamp\": %lu,\n"
  49. "\t\t\"updated_by_id\": %u,\n"
  50. "\t\t\"updates_id\": %u,\n"
  51. "\t\t\"value_string\": \"%s\",\n"
  52. "\t\t\"old_value_string\": \"%s\",\n"
  53. "\t\t\"last_repeat\": \"%lu\",\n"
  54. "\t\t\"silenced\": \"%s\",\n"
  55. , host->hostname
  56. , host->utc_offset
  57. , host->abbrev_timezone
  58. , ae->unique_id
  59. , ae->alarm_id
  60. , ae->alarm_event_id
  61. , config_hash_id
  62. , ae->name
  63. , ae->chart
  64. , ae->family
  65. , ae->classification?ae->classification:"Unknown"
  66. , ae->component?ae->component:"Unknown"
  67. , ae->type?ae->type:"Unknown"
  68. , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
  69. , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
  70. , (unsigned long)ae->exec_run_timestamp
  71. , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
  72. , ae->exec?ae->exec:host->health_default_exec
  73. , ae->recipient?ae->recipient:host->health_default_recipient
  74. , ae->exec_code
  75. , ae->source
  76. , edit_command
  77. , ae->units?ae->units:""
  78. , (unsigned long)ae->when
  79. , (unsigned long)ae->duration
  80. , (unsigned long)ae->non_clear_duration
  81. , rrdcalc_status2string(ae->new_status)
  82. , rrdcalc_status2string(ae->old_status)
  83. , ae->delay
  84. , (unsigned long)ae->delay_up_to_timestamp
  85. , ae->updated_by_id
  86. , ae->updates_id
  87. , ae->new_value_string
  88. , ae->old_value_string
  89. , (unsigned long)ae->last_repeat
  90. , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
  91. );
  92. char *replaced_info = NULL;
  93. if (likely(ae->info)) {
  94. char *m = NULL;
  95. replaced_info = strdupz(ae->info);
  96. size_t pos = 0;
  97. while ((m = strstr(replaced_info + pos, "$family"))) {
  98. char *buf = NULL;
  99. pos = m - replaced_info;
  100. buf = find_and_replace(replaced_info, "$family", ae->family ? ae->family : "", m);
  101. freez(replaced_info);
  102. replaced_info = strdupz(buf);
  103. freez(buf);
  104. }
  105. }
  106. health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n");
  107. if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
  108. buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
  109. }
  110. buffer_strcat(wb, "\t\t\"value\":");
  111. buffer_rrd_value(wb, ae->new_value);
  112. buffer_strcat(wb, ",\n");
  113. buffer_strcat(wb, "\t\t\"old_value\":");
  114. buffer_rrd_value(wb, ae->old_value);
  115. buffer_strcat(wb, "\n");
  116. buffer_strcat(wb, "\t}");
  117. freez(replaced_info);
  118. freez(edit_command);
  119. }
  120. void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
  121. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  122. buffer_strcat(wb, "[");
  123. unsigned int max = host->health_log.max;
  124. unsigned int count = 0;
  125. uint32_t hash_chart = 0;
  126. if (chart) hash_chart = simple_hash(chart);
  127. ALARM_ENTRY *ae;
  128. for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
  129. if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) {
  130. if (likely(count))
  131. buffer_strcat(wb, ",");
  132. health_alarm_entry2json_nolock(wb, ae, host);
  133. count++;
  134. }
  135. }
  136. buffer_strcat(wb, "\n]\n");
  137. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  138. }
  139. static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
  140. (void)host;
  141. buffer_sprintf(wb,
  142. "\t\t\"%s.%s\": {\n"
  143. "\t\t\t\"id\": %lu,\n"
  144. , rc->chart, rc->name
  145. , (unsigned long)rc->id);
  146. buffer_strcat(wb, "\t\t\t\"value\":");
  147. buffer_rrd_value(wb, rc->value);
  148. buffer_strcat(wb, ",\n");
  149. buffer_sprintf(wb,
  150. "\t\t\t\"status\": \"%s\"\n"
  151. , rrdcalc_status2string(rc->status));
  152. buffer_strcat(wb, "\t\t}");
  153. }
  154. static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
  155. char value_string[100 + 1];
  156. format_value_and_unit(value_string, 100, rc->value, rc->units, -1);
  157. char *replaced_info = NULL;
  158. if (likely(rc->info)) {
  159. char *m;
  160. replaced_info = strdupz(rc->info);
  161. size_t pos = 0;
  162. while ((m = strstr(replaced_info + pos, "$family"))) {
  163. char *buf = NULL;
  164. pos = m - replaced_info;
  165. buf = find_and_replace(replaced_info, "$family", (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "", m);
  166. freez(replaced_info);
  167. replaced_info = strdupz(buf);
  168. freez(buf);
  169. }
  170. }
  171. char hash_id[GUID_LEN + 1];
  172. uuid_unparse_lower(rc->config_hash_id, hash_id);
  173. buffer_sprintf(wb,
  174. "\t\t\"%s.%s\": {\n"
  175. "\t\t\t\"id\": %lu,\n"
  176. "\t\t\t\"config_hash_id\": \"%s\",\n"
  177. "\t\t\t\"name\": \"%s\",\n"
  178. "\t\t\t\"chart\": \"%s\",\n"
  179. "\t\t\t\"family\": \"%s\",\n"
  180. "\t\t\t\"class\": \"%s\",\n"
  181. "\t\t\t\"component\": \"%s\",\n"
  182. "\t\t\t\"type\": \"%s\",\n"
  183. "\t\t\t\"active\": %s,\n"
  184. "\t\t\t\"disabled\": %s,\n"
  185. "\t\t\t\"silenced\": %s,\n"
  186. "\t\t\t\"exec\": \"%s\",\n"
  187. "\t\t\t\"recipient\": \"%s\",\n"
  188. "\t\t\t\"source\": \"%s\",\n"
  189. "\t\t\t\"units\": \"%s\",\n"
  190. "\t\t\t\"info\": \"%s\",\n"
  191. "\t\t\t\"status\": \"%s\",\n"
  192. "\t\t\t\"last_status_change\": %lu,\n"
  193. "\t\t\t\"last_updated\": %lu,\n"
  194. "\t\t\t\"next_update\": %lu,\n"
  195. "\t\t\t\"update_every\": %d,\n"
  196. "\t\t\t\"delay_up_duration\": %d,\n"
  197. "\t\t\t\"delay_down_duration\": %d,\n"
  198. "\t\t\t\"delay_max_duration\": %d,\n"
  199. "\t\t\t\"delay_multiplier\": %f,\n"
  200. "\t\t\t\"delay\": %d,\n"
  201. "\t\t\t\"delay_up_to_timestamp\": %lu,\n"
  202. "\t\t\t\"warn_repeat_every\": \"%u\",\n"
  203. "\t\t\t\"crit_repeat_every\": \"%u\",\n"
  204. "\t\t\t\"value_string\": \"%s\",\n"
  205. "\t\t\t\"last_repeat\": \"%lu\",\n"
  206. , rc->chart, rc->name
  207. , (unsigned long)rc->id
  208. , hash_id
  209. , rc->name
  210. , rc->chart
  211. , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
  212. , rc->classification?rc->classification:"Unknown"
  213. , rc->component?rc->component:"Unknown"
  214. , rc->type?rc->type:"Unknown"
  215. , (rc->rrdset)?"true":"false"
  216. , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
  217. , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
  218. , rc->exec?rc->exec:host->health_default_exec
  219. , rc->recipient?rc->recipient:host->health_default_recipient
  220. , rc->source
  221. , rc->units?rc->units:""
  222. , replaced_info?replaced_info:""
  223. , rrdcalc_status2string(rc->status)
  224. , (unsigned long)rc->last_status_change
  225. , (unsigned long)rc->last_updated
  226. , (unsigned long)rc->next_update
  227. , rc->update_every
  228. , rc->delay_up_duration
  229. , rc->delay_down_duration
  230. , rc->delay_max_duration
  231. , rc->delay_multiplier
  232. , rc->delay_last
  233. , (unsigned long)rc->delay_up_to_timestamp
  234. , rc->warn_repeat_every
  235. , rc->crit_repeat_every
  236. , value_string
  237. , (unsigned long)rc->last_repeat
  238. );
  239. if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
  240. buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
  241. }
  242. if(RRDCALC_HAS_DB_LOOKUP(rc)) {
  243. if(rc->dimensions && *rc->dimensions)
  244. health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
  245. buffer_sprintf(wb,
  246. "\t\t\t\"db_after\": %lu,\n"
  247. "\t\t\t\"db_before\": %lu,\n"
  248. "\t\t\t\"lookup_method\": \"%s\",\n"
  249. "\t\t\t\"lookup_after\": %d,\n"
  250. "\t\t\t\"lookup_before\": %d,\n"
  251. "\t\t\t\"lookup_options\": \"",
  252. (unsigned long) rc->db_after,
  253. (unsigned long) rc->db_before,
  254. group_method2string(rc->group),
  255. rc->after,
  256. rc->before
  257. );
  258. buffer_data_options2string(wb, rc->options);
  259. buffer_strcat(wb, "\",\n");
  260. }
  261. if(rc->calculation) {
  262. health_string2json(wb, "\t\t\t", "calc", rc->calculation->source, ",\n");
  263. health_string2json(wb, "\t\t\t", "calc_parsed", rc->calculation->parsed_as, ",\n");
  264. }
  265. if(rc->warning) {
  266. health_string2json(wb, "\t\t\t", "warn", rc->warning->source, ",\n");
  267. health_string2json(wb, "\t\t\t", "warn_parsed", rc->warning->parsed_as, ",\n");
  268. }
  269. if(rc->critical) {
  270. health_string2json(wb, "\t\t\t", "crit", rc->critical->source, ",\n");
  271. health_string2json(wb, "\t\t\t", "crit_parsed", rc->critical->parsed_as, ",\n");
  272. }
  273. buffer_strcat(wb, "\t\t\t\"green\":");
  274. buffer_rrd_value(wb, rc->green);
  275. buffer_strcat(wb, ",\n");
  276. buffer_strcat(wb, "\t\t\t\"red\":");
  277. buffer_rrd_value(wb, rc->red);
  278. buffer_strcat(wb, ",\n");
  279. buffer_strcat(wb, "\t\t\t\"value\":");
  280. buffer_rrd_value(wb, rc->value);
  281. buffer_strcat(wb, "\n");
  282. buffer_strcat(wb, "\t\t}");
  283. freez(replaced_info);
  284. }
  285. //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
  286. //
  287. //}
  288. void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) {
  289. RRDCALC *rc;
  290. int numberOfAlarms = 0;
  291. char *tok = NULL;
  292. char *p = NULL;
  293. rrdhost_rdlock(host);
  294. if (contexts) {
  295. p = (char*)buffer_tostring(contexts);
  296. while(p && *p && (tok = mystrsep(&p, ", |"))) {
  297. if(!*tok) continue;
  298. for(rc = host->alarms; rc ; rc = rc->next) {
  299. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  300. continue;
  301. if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok)
  302. && !strcmp(rc->rrdset->context, tok)
  303. && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
  304. numberOfAlarms++;
  305. }
  306. }
  307. }
  308. else {
  309. for(rc = host->alarms; rc ; rc = rc->next) {
  310. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  311. continue;
  312. if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))
  313. numberOfAlarms++;
  314. }
  315. }
  316. buffer_sprintf(wb, "%d", numberOfAlarms);
  317. rrdhost_unlock(host);
  318. }
  319. static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) {
  320. RRDCALC *rc;
  321. int i;
  322. for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
  323. if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
  324. continue;
  325. if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
  326. continue;
  327. if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
  328. continue;
  329. if(likely(i)) buffer_strcat(wb, ",\n");
  330. fp(host, wb, rc);
  331. i++;
  332. }
  333. }
  334. void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
  335. rrdhost_rdlock(host);
  336. buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
  337. "\n\t\"latest_alarm_log_unique_id\": %u,"
  338. "\n\t\"status\": %s,"
  339. "\n\t\"now\": %lu,"
  340. "\n\t\"alarms\": {\n",
  341. host->hostname,
  342. (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
  343. host->health_enabled?"true":"false",
  344. (unsigned long)now_realtime_sec());
  345. health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock);
  346. // buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
  347. // RRDCALCTEMPLATE *rt;
  348. // for(rt = host->templates; rt ; rt = rt->next)
  349. // health_rrdcalctemplate2json_nolock(wb, rt);
  350. buffer_strcat(wb, "\n\t}\n}\n");
  351. rrdhost_unlock(host);
  352. }
  353. void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) {
  354. rrdhost_rdlock(host);
  355. buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
  356. "\n\t\"alarms\": {\n",
  357. host->hostname);
  358. health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock);
  359. buffer_strcat(wb, "\n\t}\n}\n");
  360. rrdhost_unlock(host);
  361. }
  362. static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, time_t mark)
  363. {
  364. ALARM_ENTRY *ae = host->health_log.alarms;
  365. while(ae) {
  366. if (ae->alarm_id == alarm_id && ae->unique_id > mark &&
  367. (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL))
  368. return 1;
  369. ae = ae->next;
  370. }
  371. return 0;
  372. }
  373. void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) {
  374. netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
  375. buffer_sprintf(wb, "[\n");
  376. unsigned int max = host->health_log.max;
  377. unsigned int count = 0;
  378. ALARM_ENTRY *ae;
  379. for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) {
  380. if (!ae->updated_by_id &&
  381. ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) ||
  382. ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) &&
  383. ae->new_status == RRDCALC_STATUS_REMOVED))) {
  384. if (have_recent_alarm(host, ae->alarm_id, ae->unique_id))
  385. continue;
  386. if (likely(count))
  387. buffer_strcat(wb, ",");
  388. health_alarm_entry2json_nolock(wb, ae, host);
  389. count++;
  390. }
  391. }
  392. buffer_strcat(wb, "]");
  393. netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
  394. }