|
@@ -2,35 +2,51 @@
|
|
|
|
|
|
#include "plugin_proc.h"
|
|
|
|
|
|
+struct edac_count {
|
|
|
+ bool updated;
|
|
|
+ char *filename;
|
|
|
+ procfile *ff;
|
|
|
+ kernel_uint_t count;
|
|
|
+ RRDDIM *rd;
|
|
|
+};
|
|
|
+
|
|
|
+struct edac_dimm {
|
|
|
+ char *name;
|
|
|
+
|
|
|
+ struct edac_count ce;
|
|
|
+ struct edac_count ue;
|
|
|
+
|
|
|
+ RRDSET *st;
|
|
|
+
|
|
|
+ struct edac_dimm *prev, *next;
|
|
|
+};
|
|
|
+
|
|
|
struct mc {
|
|
|
char *name;
|
|
|
- char ce_updated;
|
|
|
- char ue_updated;
|
|
|
|
|
|
- char *ce_count_filename;
|
|
|
- char *ue_count_filename;
|
|
|
+ struct edac_count ce;
|
|
|
+ struct edac_count ue;
|
|
|
+ struct edac_count ce_noinfo;
|
|
|
+ struct edac_count ue_noinfo;
|
|
|
|
|
|
- procfile *ce_ff;
|
|
|
- procfile *ue_ff;
|
|
|
+ RRDSET *st;
|
|
|
|
|
|
- collected_number ce_count;
|
|
|
- collected_number ue_count;
|
|
|
+ struct edac_dimm *dimms;
|
|
|
|
|
|
- RRDDIM *ce_rd;
|
|
|
- RRDDIM *ue_rd;
|
|
|
-
|
|
|
- struct mc *next;
|
|
|
+ struct mc *prev, *next;
|
|
|
};
|
|
|
+
|
|
|
static struct mc *mc_root = NULL;
|
|
|
+static char *mc_dirname = NULL;
|
|
|
|
|
|
static void find_all_mc() {
|
|
|
char name[FILENAME_MAX + 1];
|
|
|
snprintfz(name, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices/system/edac/mc");
|
|
|
- char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
|
|
|
+ mc_dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
|
|
|
|
|
|
- DIR *dir = opendir(dirname);
|
|
|
+ DIR *dir = opendir(mc_dirname);
|
|
|
if(unlikely(!dir)) {
|
|
|
- collector_error("Cannot read ECC memory errors directory '%s'", dirname);
|
|
|
+ collector_error("Cannot read EDAC memory errors directory '%s'", mc_dirname);
|
|
|
return;
|
|
|
}
|
|
|
|
|
@@ -42,162 +58,239 @@ static void find_all_mc() {
|
|
|
|
|
|
struct stat st;
|
|
|
|
|
|
- snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name);
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", mc_dirname, de->d_name);
|
|
|
if(stat(name, &st) != -1)
|
|
|
- m->ce_count_filename = strdupz(name);
|
|
|
+ m->ce.filename = strdupz(name);
|
|
|
|
|
|
- snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name);
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", mc_dirname, de->d_name);
|
|
|
if(stat(name, &st) != -1)
|
|
|
- m->ue_count_filename = strdupz(name);
|
|
|
+ m->ue.filename = strdupz(name);
|
|
|
|
|
|
- if(!m->ce_count_filename && !m->ue_count_filename) {
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/ce_noinfo_count", mc_dirname, de->d_name);
|
|
|
+ if(stat(name, &st) != -1)
|
|
|
+ m->ce_noinfo.filename = strdupz(name);
|
|
|
+
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/ue_noinfo_count", mc_dirname, de->d_name);
|
|
|
+ if(stat(name, &st) != -1)
|
|
|
+ m->ue_noinfo.filename = strdupz(name);
|
|
|
+
|
|
|
+ if(!m->ce.filename && !m->ue.filename && !m->ce_noinfo.filename && !m->ue_noinfo.filename) {
|
|
|
freez(m->name);
|
|
|
freez(m);
|
|
|
}
|
|
|
- else {
|
|
|
- m->next = mc_root;
|
|
|
- mc_root = m;
|
|
|
- }
|
|
|
+ else
|
|
|
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(mc_root, m, prev, next);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
closedir(dir);
|
|
|
-}
|
|
|
|
|
|
-int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) {
|
|
|
- (void)dt;
|
|
|
+ for(struct mc *m = mc_root; m ;m = m->next) {
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s", mc_dirname, m->name);
|
|
|
+ dir = opendir(name);
|
|
|
+ if(!dir) {
|
|
|
+ collector_error("Cannot read EDAC memory errors directory '%s'", name);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- if(unlikely(mc_root == NULL)) {
|
|
|
- find_all_mc();
|
|
|
- if(unlikely(mc_root == NULL))
|
|
|
- return 1;
|
|
|
- }
|
|
|
+ while((de = readdir(dir))) {
|
|
|
+ // it can be dimmX or rankX directory
|
|
|
+ // https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5
|
|
|
|
|
|
- static int do_ce = -1, do_ue = -1;
|
|
|
- NETDATA_DOUBLE ce_sum = 0, ue_sum = 0;
|
|
|
- struct mc *m;
|
|
|
+ if (de->d_type == DT_DIR &&
|
|
|
+ ((strncmp(de->d_name, "rank", 4) == 0 || strncmp(de->d_name, "dimm", 4) == 0)) &&
|
|
|
+ isdigit(de->d_name[4])) {
|
|
|
|
|
|
- if(unlikely(do_ce == -1)) {
|
|
|
- do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_BOOLEAN_YES);
|
|
|
- do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_BOOLEAN_YES);
|
|
|
- }
|
|
|
+ struct edac_dimm *d = callocz(1, sizeof(struct edac_dimm));
|
|
|
+ d->name = strdupz(de->d_name);
|
|
|
|
|
|
- if(do_ce != CONFIG_BOOLEAN_NO) {
|
|
|
- for(m = mc_root; m; m = m->next) {
|
|
|
- if(m->ce_count_filename) {
|
|
|
- m->ce_updated = 0;
|
|
|
+ struct stat st;
|
|
|
|
|
|
- if(unlikely(!m->ce_ff)) {
|
|
|
- m->ce_ff = procfile_open(m->ce_count_filename, " \t", PROCFILE_FLAG_DEFAULT);
|
|
|
- if(unlikely(!m->ce_ff))
|
|
|
- continue;
|
|
|
- }
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ce_count", mc_dirname, m->name, de->d_name);
|
|
|
+ if(stat(name, &st) != -1)
|
|
|
+ d->ce.filename = strdupz(name);
|
|
|
|
|
|
- m->ce_ff = procfile_readall(m->ce_ff);
|
|
|
- if(unlikely(!m->ce_ff || procfile_lines(m->ce_ff) < 1 || procfile_linewords(m->ce_ff, 0) < 1))
|
|
|
- continue;
|
|
|
+ snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ue_count", mc_dirname, m->name, de->d_name);
|
|
|
+ if(stat(name, &st) != -1)
|
|
|
+ d->ue.filename = strdupz(name);
|
|
|
|
|
|
- m->ce_count = str2ull(procfile_lineword(m->ce_ff, 0, 0), NULL);
|
|
|
- ce_sum += m->ce_count;
|
|
|
- m->ce_updated = 1;
|
|
|
+ if(!d->ce.filename && !d->ue.filename) {
|
|
|
+ freez(d->name);
|
|
|
+ freez(d);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(m->dimms, d, prev, next);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+}
|
|
|
|
|
|
- if(do_ue != CONFIG_BOOLEAN_NO) {
|
|
|
- for(m = mc_root; m; m = m->next) {
|
|
|
- if(m->ue_count_filename) {
|
|
|
- m->ue_updated = 0;
|
|
|
+static kernel_uint_t read_edac_count(struct edac_count *t) {
|
|
|
+ t->updated = false;
|
|
|
+ t->count = 0;
|
|
|
|
|
|
- if(unlikely(!m->ue_ff)) {
|
|
|
- m->ue_ff = procfile_open(m->ue_count_filename, " \t", PROCFILE_FLAG_DEFAULT);
|
|
|
- if(unlikely(!m->ue_ff))
|
|
|
- continue;
|
|
|
- }
|
|
|
+ if(t->filename) {
|
|
|
+ if(unlikely(!t->ff)) {
|
|
|
+ t->ff = procfile_open(t->filename, " \t", PROCFILE_FLAG_DEFAULT);
|
|
|
+ if(unlikely(!t->ff))
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
|
|
|
- m->ue_ff = procfile_readall(m->ue_ff);
|
|
|
- if(unlikely(!m->ue_ff || procfile_lines(m->ue_ff) < 1 || procfile_linewords(m->ue_ff, 0) < 1))
|
|
|
- continue;
|
|
|
+ t->ff = procfile_readall(t->ff);
|
|
|
+ if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1))
|
|
|
+ return 0;
|
|
|
|
|
|
- m->ue_count = str2ull(procfile_lineword(m->ue_ff, 0, 0), NULL);
|
|
|
- ue_sum += m->ue_count;
|
|
|
- m->ue_updated = 1;
|
|
|
- }
|
|
|
+ t->count = str2ull(procfile_lineword(t->ff, 0, 0), NULL);
|
|
|
+ t->updated = true;
|
|
|
+ }
|
|
|
+
|
|
|
+ return t->count;
|
|
|
+}
|
|
|
+
|
|
|
+static bool read_edac_mc_file(const char *mc, const char *filename, char *out, size_t out_size) {
|
|
|
+ char f[FILENAME_MAX + 1];
|
|
|
+ snprintfz(f, FILENAME_MAX, "%s/%s/%s", mc_dirname, mc, filename);
|
|
|
+ if(read_file(f, out, out_size) != 0) {
|
|
|
+ collector_error("EDAC: cannot read file '%s'", f);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+static bool read_edac_mc_rank_file(const char *mc, const char *rank, const char *filename, char *out, size_t out_size) {
|
|
|
+ char f[FILENAME_MAX + 1];
|
|
|
+ snprintfz(f, FILENAME_MAX, "%s/%s/%s/%s", mc_dirname, mc, rank, filename);
|
|
|
+ if(read_file(f, out, out_size) != 0) {
|
|
|
+ collector_error("EDAC: cannot read file '%s'", f);
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unused) {
|
|
|
+ if(unlikely(!mc_root)) {
|
|
|
+ find_all_mc();
|
|
|
+
|
|
|
+ if(!mc_root)
|
|
|
+ // don't call this again
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ for(struct mc *m = mc_root; m; m = m->next) {
|
|
|
+ read_edac_count(&m->ce);
|
|
|
+ read_edac_count(&m->ce_noinfo);
|
|
|
+ read_edac_count(&m->ue);
|
|
|
+ read_edac_count(&m->ue_noinfo);
|
|
|
+
|
|
|
+ for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
|
|
|
+ read_edac_count(&d->ce);
|
|
|
+ read_edac_count(&d->ue);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// --------------------------------------------------------------------
|
|
|
|
|
|
- if(do_ce == CONFIG_BOOLEAN_YES || (do_ce == CONFIG_BOOLEAN_AUTO &&
|
|
|
- (ce_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
|
|
|
- do_ce = CONFIG_BOOLEAN_YES;
|
|
|
+ for(struct mc *m = mc_root; m ; m = m->next) {
|
|
|
+ if(unlikely(!m->ce.updated && !m->ue.updated && !m->ce_noinfo.updated && !m->ue_noinfo.updated))
|
|
|
+ continue;
|
|
|
|
|
|
- static RRDSET *ce_st = NULL;
|
|
|
-
|
|
|
- if(unlikely(!ce_st)) {
|
|
|
- ce_st = rrdset_create_localhost(
|
|
|
+ if(unlikely(!m->st)) {
|
|
|
+ char id[RRD_ID_LENGTH_MAX + 1];
|
|
|
+ snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s", m->name);
|
|
|
+ m->st = rrdset_create_localhost(
|
|
|
"mem"
|
|
|
- , "ecc_ce"
|
|
|
- , NULL
|
|
|
- , "ecc"
|
|
|
+ , id
|
|
|
, NULL
|
|
|
- , "ECC Memory Correctable Errors"
|
|
|
- , "errors"
|
|
|
+ , "edac"
|
|
|
+ , "mem.edac_mc"
|
|
|
+ , "Memory Controller (MC) Error Detection And Correction (EDAC) Errors"
|
|
|
+ , "errors/s"
|
|
|
, PLUGIN_PROC_NAME
|
|
|
, "/sys/devices/system/edac/mc"
|
|
|
, NETDATA_CHART_PRIO_MEM_HW_ECC_CE
|
|
|
, update_every
|
|
|
, RRDSET_TYPE_LINE
|
|
|
);
|
|
|
- }
|
|
|
|
|
|
- for(m = mc_root; m; m = m->next) {
|
|
|
- if (m->ce_count_filename && m->ce_updated) {
|
|
|
- if(unlikely(!m->ce_rd))
|
|
|
- m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ rrdlabels_add(m->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
|
|
|
|
|
|
- rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count);
|
|
|
- }
|
|
|
+ char buffer[1024 + 1];
|
|
|
+
|
|
|
+ if(read_edac_mc_file(m->name, "mc_name", buffer, 1024))
|
|
|
+ rrdlabels_add(m->st->rrdlabels, "mc_name", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_file(m->name, "size_mb", buffer, 1024))
|
|
|
+ rrdlabels_add(m->st->rrdlabels, "size_mb", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_file(m->name, "max_location", buffer, 1024))
|
|
|
+ rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
}
|
|
|
|
|
|
- rrdset_done(ce_st);
|
|
|
- }
|
|
|
+ rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count);
|
|
|
+ rrddim_set_by_pointer(m->st, m->ue.rd, (collected_number)m->ue.count);
|
|
|
+ rrddim_set_by_pointer(m->st, m->ce_noinfo.rd, (collected_number)m->ce_noinfo.count);
|
|
|
+ rrddim_set_by_pointer(m->st, m->ue_noinfo.rd, (collected_number)m->ue_noinfo.count);
|
|
|
|
|
|
- // --------------------------------------------------------------------
|
|
|
+ rrdset_done(m->st);
|
|
|
|
|
|
- if(do_ue == CONFIG_BOOLEAN_YES || (do_ue == CONFIG_BOOLEAN_AUTO &&
|
|
|
- (ue_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
|
|
|
- do_ue = CONFIG_BOOLEAN_YES;
|
|
|
+ for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
|
|
|
+ if(unlikely(!d->ce.updated && !d->ue.updated))
|
|
|
+ continue;
|
|
|
|
|
|
- static RRDSET *ue_st = NULL;
|
|
|
+ if(unlikely(!d->st)) {
|
|
|
+ char id[RRD_ID_LENGTH_MAX + 1];
|
|
|
+ snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s_%s", m->name, d->name);
|
|
|
+ d->st = rrdset_create_localhost(
|
|
|
+ "mem"
|
|
|
+ , id
|
|
|
+ , NULL
|
|
|
+ , "edac"
|
|
|
+ , "mem.edac_mc_dimm"
|
|
|
+ , "DIMM Error Detection And Correction (EDAC) Errors"
|
|
|
+ , "errors/s"
|
|
|
+ , PLUGIN_PROC_NAME
|
|
|
+ , "/sys/devices/system/edac/mc"
|
|
|
+ , NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1
|
|
|
+ , update_every
|
|
|
+ , RRDSET_TYPE_LINE
|
|
|
+ );
|
|
|
|
|
|
- if(unlikely(!ue_st)) {
|
|
|
- ue_st = rrdset_create_localhost(
|
|
|
- "mem"
|
|
|
- , "ecc_ue"
|
|
|
- , NULL
|
|
|
- , "ecc"
|
|
|
- , NULL
|
|
|
- , "ECC Memory Uncorrectable Errors"
|
|
|
- , "errors"
|
|
|
- , PLUGIN_PROC_NAME
|
|
|
- , "/sys/devices/system/edac/mc"
|
|
|
- , NETDATA_CHART_PRIO_MEM_HW_ECC_UE
|
|
|
- , update_every
|
|
|
- , RRDSET_TYPE_LINE
|
|
|
- );
|
|
|
- }
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm", d->name, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ char buffer[1024 + 1];
|
|
|
|
|
|
- for(m = mc_root; m; m = m->next) {
|
|
|
- if (m->ue_count_filename && m->ue_updated) {
|
|
|
- if(unlikely(!m->ue_rd))
|
|
|
- m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "dimm_dev_type", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm_dev_type", buffer, RRDLABEL_SRC_AUTO);
|
|
|
|
|
|
- rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count);
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "dimm_edac_mode", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm_edac_mode", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "dimm_label", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm_label", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "dimm_location", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm_location", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "dimm_mem_type", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "dimm_mem_type", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ if(read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024))
|
|
|
+ rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO);
|
|
|
+
|
|
|
+ d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
+ d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- rrdset_done(ue_st);
|
|
|
+ rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count);
|
|
|
+ rrddim_set_by_pointer(d->st, d->ue.rd, (collected_number)d->ue.count);
|
|
|
+
|
|
|
+ rrdset_done(d->st);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
return 0;
|