Browse Source

Memory Controller (MC) and DIMM Error Detection And Correction (EDAC) (#15473)

Co-authored-by: ilyam8 <ilya@netdata.cloud>
Costa Tsaousis 1 year ago
parent
commit
173a3f9bed

+ 60 - 21
collectors/proc.plugin/multi_metadata.yaml

@@ -1307,7 +1307,7 @@ modules:
       plugin_name: proc.plugin
       module_name: /sys/devices/system/edac/mc
       monitored_instance:
-        name: System Memory Errors
+        name: Memory modules (DIMMs)
         link: ""
         categories:
           - data-collection.linux-systems.memory-metrics
@@ -1319,9 +1319,10 @@ modules:
         description: ""
       keywords:
         - edac
-        - error detection and correction memory controllers
         - ecc
+        - dimm
         - ram
+        - hardware
       most_popular: false
     overview:
       data_collection:
@@ -1362,16 +1363,22 @@ modules:
       problems:
         list: []
     alerts:
-      - name: 1hour_ecc_memory_correctable
+      - name: ecc_memory_mc_noinfo_correctable
+        metric: mem.edac_mc
+        info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes
         link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
-        metric: mem.ecc_ce
-        info: number of ECC correctable errors in the last 10 minutes
-        os: "linux"
-      - name: 1hour_ecc_memory_uncorrectable
+      - name: ecc_memory_mc_noinfo_uncorrectable
+        metric: mem.edac_mc
+        info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
+      - name: ecc_memory_dimm_correctable
+        metric: mem.edac_mc_dimm
+        info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
+      - name: ecc_memory_dimm_uncorrectable
+        metric: mem.edac_mc_dimm
+        info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
         link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
-        metric: mem.ecc_ue
-        info: number of ECC uncorrectable errors in the last 10 minutes
-        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -1379,22 +1386,54 @@ modules:
       description: ""
       availability: []
       scopes:
-        - name: global
-          description: ""
-          labels: []
+        - name: memory controller
+          description: These metrics refer to the memory controller.
+          labels:
+            - name: controller
+              description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller."
+            - name: mc_name
+              description: Memory controller type.
+            - name: size_mb
+              description: The amount of memory in megabytes that this memory controller manages.
+            - name: max_location
+              description: Last available memory slot in this memory controller.
           metrics:
-            - name: mem.ecc_ce
-              description: ECC Memory Correctable Errors
-              unit: "errors"
+            - name: mem.edac_mc
+              description: Memory Controller (MC) Error Detection And Correction (EDAC) Errors
+              unit: errors/s
               chart_type: line
               dimensions:
-                - name: a dimension per mem controller
-            - name: mem.ecc_ue
-              description: ECC Memory Uncorrectable Errors
-              unit: "errors"
+                - name: correctable
+                - name: uncorrectable
+                - name: correctable_noinfo
+                - name: uncorrectable_noinfo
+        - name: memory module
+          description: These metrics refer to the memory module (or rank, [depends on the memory controller](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5)).
+          labels:
+            - name: controller
+              description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller."
+            - name: dimm
+              description: "[dimmX or rankX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#dimmx-or-rankx-directories) directory name of this memory module."
+            - name: dimm_dev_type
+              description: Type of DRAM device used in this memory module. For example, x1, x2, x4, x8.
+            - name: dimm_edac_mode
+              description: Type of error detection and correction in use. For example, S4ECD4ED would mean a Chipkill with x4 DRAM.
+            - name: dimm_label
+              description: Label assigned to this memory module.
+            - name: dimm_location
+              description: Location of the memory module.
+            - name: dimm_mem_type
+              description: Type of the memory module. Usually either buffered or unbuffered memory.
+            - name: size
+              description: The amount of memory in megabytes that this memory module manages.
+          metrics:
+            - name: mem.edac_mc_dimm
+              description: DIMM Error Detection And Correction (EDAC) Errors
+              unit: errors/s
               chart_type: line
               dimensions:
-                - name: a dimension per mem controller
+                - name: correctable
+                - name: uncorrectable
   - meta:
       plugin_name: proc.plugin
       module_name: /sys/devices/system/node

+ 213 - 120
collectors/proc.plugin/sys_devices_system_edac_mc.c

@@ -2,35 +2,51 @@
 
 #include "plugin_proc.h"
 
+struct edac_count {
+    bool updated;
+    char *filename;
+    procfile *ff;
+    kernel_uint_t count;
+    RRDDIM *rd;
+};
+
+struct edac_dimm {
+	char *name;
+
+    struct edac_count ce;
+    struct edac_count ue;
+
+    RRDSET *st;
+
+    struct edac_dimm *prev, *next;
+};
+
 struct mc {
     char *name;
-    char ce_updated;
-    char ue_updated;
 
-    char *ce_count_filename;
-    char *ue_count_filename;
+    struct edac_count ce;
+    struct edac_count ue;
+    struct edac_count ce_noinfo;
+    struct edac_count ue_noinfo;
 
-    procfile *ce_ff;
-    procfile *ue_ff;
+    RRDSET *st;
 
-    collected_number ce_count;
-    collected_number ue_count;
+    struct edac_dimm *dimms;
 
-    RRDDIM *ce_rd;
-    RRDDIM *ue_rd;
-
-    struct mc *next;
+    struct mc *prev, *next;
 };
+
 static struct mc *mc_root = NULL;
+static char *mc_dirname = NULL;
 
 static void find_all_mc() {
     char name[FILENAME_MAX + 1];
     snprintfz(name, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices/system/edac/mc");
-    char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
+    mc_dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
 
-    DIR *dir = opendir(dirname);
+    DIR *dir = opendir(mc_dirname);
     if(unlikely(!dir)) {
-        collector_error("Cannot read ECC memory errors directory '%s'", dirname);
+        collector_error("Cannot read EDAC memory errors directory '%s'", mc_dirname);
         return;
     }
 
@@ -42,162 +58,239 @@ static void find_all_mc() {
 
             struct stat st;
 
-            snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name);
+            snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", mc_dirname, de->d_name);
             if(stat(name, &st) != -1)
-                m->ce_count_filename = strdupz(name);
+                m->ce.filename = strdupz(name);
 
-            snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name);
+            snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", mc_dirname, de->d_name);
             if(stat(name, &st) != -1)
-                m->ue_count_filename = strdupz(name);
+                m->ue.filename = strdupz(name);
 
-            if(!m->ce_count_filename && !m->ue_count_filename) {
+            snprintfz(name, FILENAME_MAX, "%s/%s/ce_noinfo_count", mc_dirname, de->d_name);
+            if(stat(name, &st) != -1)
+                m->ce_noinfo.filename = strdupz(name);
+
+            snprintfz(name, FILENAME_MAX, "%s/%s/ue_noinfo_count", mc_dirname, de->d_name);
+            if(stat(name, &st) != -1)
+                m->ue_noinfo.filename = strdupz(name);
+
+            if(!m->ce.filename && !m->ue.filename && !m->ce_noinfo.filename && !m->ue_noinfo.filename) {
                 freez(m->name);
                 freez(m);
             }
-            else {
-                m->next = mc_root;
-                mc_root = m;
-            }
+            else
+                DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(mc_root, m, prev, next);
         }
     }
-
     closedir(dir);
-}
 
-int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) {
-    (void)dt;
+    for(struct mc *m = mc_root; m ;m = m->next) {
+        snprintfz(name, FILENAME_MAX, "%s/%s", mc_dirname, m->name);
+        dir = opendir(name);
+        if(!dir) {
+            collector_error("Cannot read EDAC memory errors directory '%s'", name);
+            continue;
+        }
 
-    if(unlikely(mc_root == NULL)) {
-        find_all_mc();
-        if(unlikely(mc_root == NULL))
-            return 1;
-    }
+        while((de = readdir(dir))) {
+            // it can be dimmX or rankX directory
+            // https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5
 
-    static int do_ce = -1, do_ue = -1;
-    NETDATA_DOUBLE ce_sum = 0, ue_sum = 0;
-    struct mc *m;
+            if (de->d_type == DT_DIR &&
+                ((strncmp(de->d_name, "rank", 4) == 0 || strncmp(de->d_name, "dimm", 4) == 0)) &&
+                isdigit(de->d_name[4])) {
 
-    if(unlikely(do_ce == -1)) {
-        do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_BOOLEAN_YES);
-        do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_BOOLEAN_YES);
-    }
+                struct edac_dimm *d = callocz(1, sizeof(struct edac_dimm));
+                d->name = strdupz(de->d_name);
 
-    if(do_ce != CONFIG_BOOLEAN_NO) {
-        for(m = mc_root; m; m = m->next) {
-            if(m->ce_count_filename) {
-                m->ce_updated = 0;
+                struct stat st;
 
-                if(unlikely(!m->ce_ff)) {
-                    m->ce_ff = procfile_open(m->ce_count_filename, " \t", PROCFILE_FLAG_DEFAULT);
-                    if(unlikely(!m->ce_ff))
-                        continue;
-                }
+                snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ce_count", mc_dirname, m->name, de->d_name);
+                if(stat(name, &st) != -1)
+                    d->ce.filename = strdupz(name);
 
-                m->ce_ff = procfile_readall(m->ce_ff);
-                if(unlikely(!m->ce_ff || procfile_lines(m->ce_ff) < 1 || procfile_linewords(m->ce_ff, 0) < 1))
-                    continue;
+                snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ue_count", mc_dirname, m->name, de->d_name);
+                if(stat(name, &st) != -1)
+                    d->ue.filename = strdupz(name);
 
-                m->ce_count = str2ull(procfile_lineword(m->ce_ff, 0, 0), NULL);
-                ce_sum += m->ce_count;
-                m->ce_updated = 1;
+                if(!d->ce.filename && !d->ue.filename) {
+                    freez(d->name);
+                    freez(d);
+                }
+                else
+                    DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(m->dimms, d, prev, next);
             }
         }
     }
+}
 
-    if(do_ue != CONFIG_BOOLEAN_NO) {
-        for(m = mc_root; m; m = m->next) {
-            if(m->ue_count_filename) {
-                m->ue_updated = 0;
+static kernel_uint_t read_edac_count(struct edac_count *t) {
+    t->updated = false;
+    t->count = 0;
 
-                if(unlikely(!m->ue_ff)) {
-                    m->ue_ff = procfile_open(m->ue_count_filename, " \t", PROCFILE_FLAG_DEFAULT);
-                    if(unlikely(!m->ue_ff))
-                        continue;
-                }
+    if(t->filename) {
+        if(unlikely(!t->ff)) {
+            t->ff = procfile_open(t->filename, " \t", PROCFILE_FLAG_DEFAULT);
+            if(unlikely(!t->ff))
+                return 0;
+        }
 
-                m->ue_ff = procfile_readall(m->ue_ff);
-                if(unlikely(!m->ue_ff || procfile_lines(m->ue_ff) < 1 || procfile_linewords(m->ue_ff, 0) < 1))
-                    continue;
+        t->ff = procfile_readall(t->ff);
+        if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1))
+            return 0;
 
-                m->ue_count = str2ull(procfile_lineword(m->ue_ff, 0, 0), NULL);
-                ue_sum += m->ue_count;
-                m->ue_updated = 1;
-            }
+        t->count = str2ull(procfile_lineword(t->ff, 0, 0), NULL);
+        t->updated = true;
+    }
+
+    return t->count;
+}
+
+static bool read_edac_mc_file(const char *mc, const char *filename, char *out, size_t out_size) {
+    char f[FILENAME_MAX + 1];
+    snprintfz(f, FILENAME_MAX, "%s/%s/%s", mc_dirname, mc, filename);
+    if(read_file(f, out, out_size) != 0) {
+        collector_error("EDAC: cannot read file '%s'", f);
+        return false;
+    }
+    return true;
+}
+
+static bool read_edac_mc_rank_file(const char *mc, const char *rank, const char *filename, char *out, size_t out_size) {
+    char f[FILENAME_MAX + 1];
+    snprintfz(f, FILENAME_MAX, "%s/%s/%s/%s", mc_dirname, mc, rank, filename);
+    if(read_file(f, out, out_size) != 0) {
+        collector_error("EDAC: cannot read file '%s'", f);
+        return false;
+    }
+    return true;
+}
+
+int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unused) {
+    if(unlikely(!mc_root)) {
+        find_all_mc();
+
+        if(!mc_root)
+            // don't call this again
+            return 1;
+    }
+
+    for(struct mc *m = mc_root; m; m = m->next) {
+        read_edac_count(&m->ce);
+        read_edac_count(&m->ce_noinfo);
+        read_edac_count(&m->ue);
+        read_edac_count(&m->ue_noinfo);
+
+        for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
+            read_edac_count(&d->ce);
+            read_edac_count(&d->ue);
         }
     }
 
     // --------------------------------------------------------------------
 
-    if(do_ce == CONFIG_BOOLEAN_YES || (do_ce == CONFIG_BOOLEAN_AUTO &&
-                                       (ce_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
-        do_ce = CONFIG_BOOLEAN_YES;
+    for(struct mc *m = mc_root; m ; m = m->next) {
+        if(unlikely(!m->ce.updated && !m->ue.updated && !m->ce_noinfo.updated && !m->ue_noinfo.updated))
+            continue;
 
-        static RRDSET *ce_st = NULL;
-
-        if(unlikely(!ce_st)) {
-            ce_st = rrdset_create_localhost(
+        if(unlikely(!m->st)) {
+            char id[RRD_ID_LENGTH_MAX + 1];
+            snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s", m->name);
+            m->st = rrdset_create_localhost(
                     "mem"
-                    , "ecc_ce"
-                    , NULL
-                    , "ecc"
+                    , id
                     , NULL
-                    , "ECC Memory Correctable Errors"
-                    , "errors"
+                    , "edac"
+                    , "mem.edac_mc"
+                    , "Memory Controller (MC) Error Detection And Correction (EDAC) Errors"
+                    , "errors/s"
                     , PLUGIN_PROC_NAME
                     , "/sys/devices/system/edac/mc"
                     , NETDATA_CHART_PRIO_MEM_HW_ECC_CE
                     , update_every
                     , RRDSET_TYPE_LINE
             );
-        }
 
-        for(m = mc_root; m; m = m->next) {
-            if (m->ce_count_filename && m->ce_updated) {
-                if(unlikely(!m->ce_rd))
-                    m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            rrdlabels_add(m->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
 
-                rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count);
-            }
+            char buffer[1024 + 1];
+
+            if(read_edac_mc_file(m->name, "mc_name", buffer, 1024))
+                rrdlabels_add(m->st->rrdlabels, "mc_name", buffer, RRDLABEL_SRC_AUTO);
+
+            if(read_edac_mc_file(m->name, "size_mb", buffer, 1024))
+                rrdlabels_add(m->st->rrdlabels, "size_mb", buffer, RRDLABEL_SRC_AUTO);
+
+            if(read_edac_mc_file(m->name, "max_location", buffer, 1024))
+                rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO);
+
+            m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
         }
 
-        rrdset_done(ce_st);
-    }
+        rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count);
+        rrddim_set_by_pointer(m->st, m->ue.rd, (collected_number)m->ue.count);
+        rrddim_set_by_pointer(m->st, m->ce_noinfo.rd, (collected_number)m->ce_noinfo.count);
+        rrddim_set_by_pointer(m->st, m->ue_noinfo.rd, (collected_number)m->ue_noinfo.count);
 
-    // --------------------------------------------------------------------
+        rrdset_done(m->st);
 
-    if(do_ue == CONFIG_BOOLEAN_YES || (do_ue == CONFIG_BOOLEAN_AUTO &&
-                                       (ue_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
-        do_ue = CONFIG_BOOLEAN_YES;
+        for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
+            if(unlikely(!d->ce.updated && !d->ue.updated))
+                continue;
 
-        static RRDSET *ue_st = NULL;
+            if(unlikely(!d->st)) {
+                char id[RRD_ID_LENGTH_MAX + 1];
+                snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s_%s", m->name, d->name);
+                d->st = rrdset_create_localhost(
+                        "mem"
+                		, id
+                		, NULL
+                		, "edac"
+                        , "mem.edac_mc_dimm"
+                		, "DIMM Error Detection And Correction (EDAC) Errors"
+                        , "errors/s"
+                        , PLUGIN_PROC_NAME
+                        , "/sys/devices/system/edac/mc"
+                        , NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1
+                        , update_every
+                        , RRDSET_TYPE_LINE
+                );
 
-        if(unlikely(!ue_st)) {
-            ue_st = rrdset_create_localhost(
-                    "mem"
-                    , "ecc_ue"
-                    , NULL
-                    , "ecc"
-                    , NULL
-                    , "ECC Memory Uncorrectable Errors"
-                    , "errors"
-                    , PLUGIN_PROC_NAME
-                    , "/sys/devices/system/edac/mc"
-                    , NETDATA_CHART_PRIO_MEM_HW_ECC_UE
-                    , update_every
-                    , RRDSET_TYPE_LINE
-            );
-        }
+                rrdlabels_add(d->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
+                rrdlabels_add(d->st->rrdlabels, "dimm", d->name, RRDLABEL_SRC_AUTO);
+
+                char buffer[1024 + 1];
 
-        for(m = mc_root; m; m = m->next) {
-            if (m->ue_count_filename && m->ue_updated) {
-                if(unlikely(!m->ue_rd))
-                    m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+                if(read_edac_mc_rank_file(m->name, d->name, "dimm_dev_type", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "dimm_dev_type", buffer, RRDLABEL_SRC_AUTO);
 
-                rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count);
+                if(read_edac_mc_rank_file(m->name, d->name, "dimm_edac_mode", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "dimm_edac_mode", buffer, RRDLABEL_SRC_AUTO);
+
+                if(read_edac_mc_rank_file(m->name, d->name, "dimm_label", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "dimm_label", buffer, RRDLABEL_SRC_AUTO);
+
+                if(read_edac_mc_rank_file(m->name, d->name, "dimm_location", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "dimm_location", buffer, RRDLABEL_SRC_AUTO);
+
+                if(read_edac_mc_rank_file(m->name, d->name, "dimm_mem_type", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "dimm_mem_type", buffer, RRDLABEL_SRC_AUTO);
+
+                if(read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024))
+                    rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO);
+
+                d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+                d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
             }
-        }
 
-        rrdset_done(ue_st);
+            rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count);
+            rrddim_set_by_pointer(d->st, d->ue.rd, (collected_number)d->ue.count);
+
+            rrdset_done(d->st);
+        }
     }
 
     return 0;

+ 48 - 15
health/health.d/memory.conf

@@ -1,47 +1,80 @@
-
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-    alarm: 1hour_ecc_memory_correctable
-       on: mem.ecc_ce
+    alarm: 1hour_memory_hw_corrupted
+       on: mem.hwcorrupt
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-   lookup: sum -10m unaligned
+     calc: $HardwareCorrupted
+    units: MB
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: amount of memory corrupted due to a hardware failure
+       to: sysadmin
+
+## ECC Controller
+
+ template: ecc_memory_mc_correctable
+       on: mem.edac_mc
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned of correctable, correctable_noinfo
     units: errors
     every: 1m
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: number of ECC correctable errors in the last 10 minutes
+     info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes
        to: sysadmin
 
-    alarm: 1hour_ecc_memory_uncorrectable
-       on: mem.ecc_ue
+ template: ecc_memory_mc_uncorrectable
+       on: mem.edac_mc
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-   lookup: sum -10m unaligned
+   lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
     units: errors
     every: 1m
      crit: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: number of ECC uncorrectable errors in the last 10 minutes
+     info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
        to: sysadmin
 
-    alarm: 1hour_memory_hw_corrupted
-       on: mem.hwcorrupt
+## ECC DIMM
+
+ template: ecc_memory_dimm_correctable
+       on: mem.edac_mc_dimm
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-     calc: $HardwareCorrupted
-    units: MB
-    every: 10s
+   lookup: sum -10m unaligned of correctable
+    units: errors
+    every: 1m
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: amount of memory corrupted due to a hardware failure
+     info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+       to: sysadmin
+
+ template: ecc_memory_dimm_uncorrectable
+       on: mem.edac_mc_dimm
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned of uncorrectable
+    units: errors
+    every: 1m
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
        to: sysadmin