health_log.c 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "health.h"
  3. // ----------------------------------------------------------------------------
  4. inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
  5. sql_health_alarm_log_save(host, ae);
  6. }
  7. void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) {
  8. ND_LOG_STACK lgs[] = {
  9. ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid),
  10. ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname),
  11. ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name),
  12. ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context),
  13. ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id),
  14. ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id),
  15. ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id),
  16. ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id),
  17. ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id),
  18. ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name),
  19. ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification),
  20. ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component),
  21. ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type),
  22. ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec),
  23. ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient),
  24. ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->exec),
  25. ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units),
  26. ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary),
  27. ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info),
  28. ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value),
  29. ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value),
  30. ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)),
  31. ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)),
  32. ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration),
  33. ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code),
  34. ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC),
  35. ND_LOG_FIELD_END(),
  36. };
  37. ND_LOG_STACK_PUSH(lgs);
  38. errno = 0;
  39. ND_LOG_FIELD_PRIORITY priority = NDLP_INFO;
  40. switch(ae->new_status) {
  41. case RRDCALC_STATUS_UNDEFINED:
  42. if(ae->old_status >= RRDCALC_STATUS_CLEAR)
  43. priority = NDLP_NOTICE;
  44. else
  45. priority = NDLP_DEBUG;
  46. break;
  47. default:
  48. case RRDCALC_STATUS_UNINITIALIZED:
  49. case RRDCALC_STATUS_REMOVED:
  50. priority = NDLP_DEBUG;
  51. break;
  52. case RRDCALC_STATUS_CLEAR:
  53. priority = NDLP_INFO;
  54. break;
  55. case RRDCALC_STATUS_WARNING:
  56. if(ae->old_status < RRDCALC_STATUS_WARNING)
  57. priority = NDLP_WARNING;
  58. break;
  59. case RRDCALC_STATUS_CRITICAL:
  60. if(ae->old_status < RRDCALC_STATUS_CRITICAL)
  61. priority = NDLP_CRIT;
  62. break;
  63. }
  64. netdata_logger(NDLS_HEALTH, priority, file, function, line,
  65. "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s",
  66. string2str(ae->name), string2str(ae->chart), string2str(host->hostname),
  67. rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status)
  68. );
  69. }
  70. // ----------------------------------------------------------------------------
  71. // health alarm log management
  72. inline ALARM_ENTRY* health_create_alarm_entry(
  73. RRDHOST *host,
  74. uint32_t alarm_id,
  75. uint32_t alarm_event_id,
  76. const uuid_t config_hash_id,
  77. time_t when,
  78. STRING *name,
  79. STRING *chart,
  80. STRING *chart_context,
  81. STRING *chart_name,
  82. STRING *class,
  83. STRING *component,
  84. STRING *type,
  85. STRING *exec,
  86. STRING *recipient,
  87. time_t duration,
  88. NETDATA_DOUBLE old_value,
  89. NETDATA_DOUBLE new_value,
  90. RRDCALC_STATUS old_status,
  91. RRDCALC_STATUS new_status,
  92. STRING *source,
  93. STRING *units,
  94. STRING *summary,
  95. STRING *info,
  96. int delay,
  97. HEALTH_ENTRY_FLAGS flags
  98. ) {
  99. if (duration < 0)
  100. duration = 0;
  101. netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
  102. ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
  103. ae->name = string_dup(name);
  104. ae->chart = string_dup(chart);
  105. ae->chart_context = string_dup(chart_context);
  106. ae->chart_name = string_dup(chart_name);
  107. uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
  108. uuid_generate_random(ae->transition_id);
  109. ae->global_id = now_realtime_usec();
  110. ae->classification = string_dup(class);
  111. ae->component = string_dup(component);
  112. ae->type = string_dup(type);
  113. ae->exec = string_dup(exec);
  114. ae->recipient = string_dup(recipient);
  115. ae->source = string_dup(source);
  116. ae->units = string_dup(units);
  117. ae->unique_id = host->health_log.next_log_id++;
  118. ae->alarm_id = alarm_id;
  119. ae->alarm_event_id = alarm_event_id;
  120. ae->when = when;
  121. ae->old_value = old_value;
  122. ae->new_value = new_value;
  123. char value_string[100 + 1];
  124. ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1));
  125. ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1));
  126. ae->summary = string_dup(summary);
  127. ae->info = string_dup(info);
  128. ae->old_status = old_status;
  129. ae->new_status = new_status;
  130. ae->duration = duration;
  131. ae->delay = delay;
  132. ae->delay_up_to_timestamp = when + delay;
  133. ae->flags |= flags;
  134. ae->last_repeat = 0;
  135. if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
  136. ae->non_clear_duration += ae->duration;
  137. return ae;
  138. }
  139. inline void health_alarm_log_add_entry(
  140. RRDHOST *host,
  141. ALARM_ENTRY *ae
  142. ) {
  143. netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
  144. __atomic_add_fetch(&host->health_transitions, 1, __ATOMIC_RELAXED);
  145. // link it
  146. rw_spinlock_write_lock(&host->health_log.spinlock);
  147. ae->next = host->health_log.alarms;
  148. host->health_log.alarms = ae;
  149. host->health_log.count++;
  150. rw_spinlock_write_unlock(&host->health_log.spinlock);
  151. // match previous alarms
  152. rw_spinlock_read_lock(&host->health_log.spinlock);
  153. ALARM_ENTRY *t;
  154. for(t = host->health_log.alarms ; t ; t = t->next) {
  155. if(t != ae && t->alarm_id == ae->alarm_id) {
  156. if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
  157. t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
  158. t->updated_by_id = ae->unique_id;
  159. ae->updates_id = t->unique_id;
  160. if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
  161. (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
  162. ae->non_clear_duration += t->non_clear_duration;
  163. health_alarm_log_save(host, t);
  164. }
  165. // no need to continue
  166. break;
  167. }
  168. }
  169. rw_spinlock_read_unlock(&host->health_log.spinlock);
  170. health_alarm_log_save(host, ae);
  171. }
  172. inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
  173. string_freez(ae->name);
  174. string_freez(ae->chart);
  175. string_freez(ae->chart_context);
  176. string_freez(ae->classification);
  177. string_freez(ae->component);
  178. string_freez(ae->type);
  179. string_freez(ae->exec);
  180. string_freez(ae->recipient);
  181. string_freez(ae->source);
  182. string_freez(ae->units);
  183. string_freez(ae->info);
  184. string_freez(ae->old_value_string);
  185. string_freez(ae->new_value_string);
  186. freez(ae);
  187. }
  188. inline void health_alarm_log_free(RRDHOST *host) {
  189. rw_spinlock_write_lock(&host->health_log.spinlock);
  190. ALARM_ENTRY *ae;
  191. while((ae = host->health_log.alarms)) {
  192. host->health_log.alarms = ae->next;
  193. health_alarm_log_free_one_nochecks_nounlink(ae);
  194. }
  195. rw_spinlock_write_unlock(&host->health_log.spinlock);
  196. }