sys_devices_system_edac_mc.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "plugin_proc.h"
  3. struct edac_count {
  4. bool updated;
  5. char *filename;
  6. procfile *ff;
  7. kernel_uint_t count;
  8. RRDDIM *rd;
  9. };
  10. struct edac_dimm {
  11. char *name;
  12. struct edac_count ce;
  13. struct edac_count ue;
  14. RRDSET *st;
  15. struct edac_dimm *prev, *next;
  16. };
  17. struct mc {
  18. char *name;
  19. struct edac_count ce;
  20. struct edac_count ue;
  21. struct edac_count ce_noinfo;
  22. struct edac_count ue_noinfo;
  23. RRDSET *st;
  24. struct edac_dimm *dimms;
  25. struct mc *prev, *next;
  26. };
  27. static struct mc *mc_root = NULL;
  28. static char *mc_dirname = NULL;
  29. static void find_all_mc() {
  30. char name[FILENAME_MAX + 1];
  31. snprintfz(name, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices/system/edac/mc");
  32. mc_dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name);
  33. DIR *dir = opendir(mc_dirname);
  34. if(unlikely(!dir)) {
  35. collector_error("Cannot read EDAC memory errors directory '%s'", mc_dirname);
  36. return;
  37. }
  38. struct dirent *de = NULL;
  39. while((de = readdir(dir))) {
  40. if(de->d_type == DT_DIR && de->d_name[0] == 'm' && de->d_name[1] == 'c' && isdigit(de->d_name[2])) {
  41. struct mc *m = callocz(1, sizeof(struct mc));
  42. m->name = strdupz(de->d_name);
  43. struct stat st;
  44. snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", mc_dirname, de->d_name);
  45. if(stat(name, &st) != -1)
  46. m->ce.filename = strdupz(name);
  47. snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", mc_dirname, de->d_name);
  48. if(stat(name, &st) != -1)
  49. m->ue.filename = strdupz(name);
  50. snprintfz(name, FILENAME_MAX, "%s/%s/ce_noinfo_count", mc_dirname, de->d_name);
  51. if(stat(name, &st) != -1)
  52. m->ce_noinfo.filename = strdupz(name);
  53. snprintfz(name, FILENAME_MAX, "%s/%s/ue_noinfo_count", mc_dirname, de->d_name);
  54. if(stat(name, &st) != -1)
  55. m->ue_noinfo.filename = strdupz(name);
  56. if(!m->ce.filename && !m->ue.filename && !m->ce_noinfo.filename && !m->ue_noinfo.filename) {
  57. freez(m->name);
  58. freez(m);
  59. }
  60. else
  61. DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(mc_root, m, prev, next);
  62. }
  63. }
  64. closedir(dir);
  65. for(struct mc *m = mc_root; m ;m = m->next) {
  66. snprintfz(name, FILENAME_MAX, "%s/%s", mc_dirname, m->name);
  67. dir = opendir(name);
  68. if(!dir) {
  69. collector_error("Cannot read EDAC memory errors directory '%s'", name);
  70. continue;
  71. }
  72. while((de = readdir(dir))) {
  73. // it can be dimmX or rankX directory
  74. // https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5
  75. if (de->d_type == DT_DIR &&
  76. ((strncmp(de->d_name, "rank", 4) == 0 || strncmp(de->d_name, "dimm", 4) == 0)) &&
  77. isdigit(de->d_name[4])) {
  78. struct edac_dimm *d = callocz(1, sizeof(struct edac_dimm));
  79. d->name = strdupz(de->d_name);
  80. struct stat st;
  81. snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ce_count", mc_dirname, m->name, de->d_name);
  82. if(stat(name, &st) != -1)
  83. d->ce.filename = strdupz(name);
  84. snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ue_count", mc_dirname, m->name, de->d_name);
  85. if(stat(name, &st) != -1)
  86. d->ue.filename = strdupz(name);
  87. if(!d->ce.filename && !d->ue.filename) {
  88. freez(d->name);
  89. freez(d);
  90. }
  91. else
  92. DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(m->dimms, d, prev, next);
  93. }
  94. }
  95. closedir(dir);
  96. }
  97. }
  98. static kernel_uint_t read_edac_count(struct edac_count *t) {
  99. t->updated = false;
  100. t->count = 0;
  101. if(t->filename) {
  102. if(unlikely(!t->ff)) {
  103. t->ff = procfile_open(t->filename, " \t", PROCFILE_FLAG_DEFAULT);
  104. if(unlikely(!t->ff))
  105. return 0;
  106. }
  107. t->ff = procfile_readall(t->ff);
  108. if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1))
  109. return 0;
  110. t->count = str2ull(procfile_lineword(t->ff, 0, 0), NULL);
  111. t->updated = true;
  112. }
  113. return t->count;
  114. }
  115. static bool read_edac_mc_file(const char *mc, const char *filename, char *out, size_t out_size) {
  116. char f[FILENAME_MAX + 1];
  117. snprintfz(f, FILENAME_MAX, "%s/%s/%s", mc_dirname, mc, filename);
  118. if(read_file(f, out, out_size) != 0) {
  119. collector_error("EDAC: cannot read file '%s'", f);
  120. return false;
  121. }
  122. return true;
  123. }
  124. static bool read_edac_mc_rank_file(const char *mc, const char *rank, const char *filename, char *out, size_t out_size) {
  125. char f[FILENAME_MAX + 1];
  126. snprintfz(f, FILENAME_MAX, "%s/%s/%s/%s", mc_dirname, mc, rank, filename);
  127. if(read_file(f, out, out_size) != 0) {
  128. collector_error("EDAC: cannot read file '%s'", f);
  129. return false;
  130. }
  131. return true;
  132. }
  133. int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unused) {
  134. if(unlikely(!mc_root)) {
  135. find_all_mc();
  136. if(!mc_root)
  137. // don't call this again
  138. return 1;
  139. }
  140. for(struct mc *m = mc_root; m; m = m->next) {
  141. read_edac_count(&m->ce);
  142. read_edac_count(&m->ce_noinfo);
  143. read_edac_count(&m->ue);
  144. read_edac_count(&m->ue_noinfo);
  145. for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
  146. read_edac_count(&d->ce);
  147. read_edac_count(&d->ue);
  148. }
  149. }
  150. // --------------------------------------------------------------------
  151. for(struct mc *m = mc_root; m ; m = m->next) {
  152. if(unlikely(!m->ce.updated && !m->ue.updated && !m->ce_noinfo.updated && !m->ue_noinfo.updated))
  153. continue;
  154. if(unlikely(!m->st)) {
  155. char id[RRD_ID_LENGTH_MAX + 1];
  156. snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s", m->name);
  157. m->st = rrdset_create_localhost(
  158. "mem"
  159. , id
  160. , NULL
  161. , "edac"
  162. , "mem.edac_mc"
  163. , "Memory Controller (MC) Error Detection And Correction (EDAC) Errors"
  164. , "errors/s"
  165. , PLUGIN_PROC_NAME
  166. , "/sys/devices/system/edac/mc"
  167. , NETDATA_CHART_PRIO_MEM_HW_ECC_CE
  168. , update_every
  169. , RRDSET_TYPE_LINE
  170. );
  171. rrdlabels_add(m->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
  172. char buffer[1024 + 1];
  173. if(read_edac_mc_file(m->name, "mc_name", buffer, 1024))
  174. rrdlabels_add(m->st->rrdlabels, "mc_name", buffer, RRDLABEL_SRC_AUTO);
  175. if(read_edac_mc_file(m->name, "size_mb", buffer, 1024))
  176. rrdlabels_add(m->st->rrdlabels, "size_mb", buffer, RRDLABEL_SRC_AUTO);
  177. if(read_edac_mc_file(m->name, "max_location", buffer, 1024))
  178. rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO);
  179. m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  180. m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  181. m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  182. m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  183. }
  184. rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count);
  185. rrddim_set_by_pointer(m->st, m->ue.rd, (collected_number)m->ue.count);
  186. rrddim_set_by_pointer(m->st, m->ce_noinfo.rd, (collected_number)m->ce_noinfo.count);
  187. rrddim_set_by_pointer(m->st, m->ue_noinfo.rd, (collected_number)m->ue_noinfo.count);
  188. rrdset_done(m->st);
  189. for(struct edac_dimm *d = m->dimms; d ;d = d->next) {
  190. if(unlikely(!d->ce.updated && !d->ue.updated))
  191. continue;
  192. if(unlikely(!d->st)) {
  193. char id[RRD_ID_LENGTH_MAX + 1];
  194. snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s_%s", m->name, d->name);
  195. d->st = rrdset_create_localhost(
  196. "mem"
  197. , id
  198. , NULL
  199. , "edac"
  200. , "mem.edac_mc_dimm"
  201. , "DIMM Error Detection And Correction (EDAC) Errors"
  202. , "errors/s"
  203. , PLUGIN_PROC_NAME
  204. , "/sys/devices/system/edac/mc"
  205. , NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1
  206. , update_every
  207. , RRDSET_TYPE_LINE
  208. );
  209. rrdlabels_add(d->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO);
  210. rrdlabels_add(d->st->rrdlabels, "dimm", d->name, RRDLABEL_SRC_AUTO);
  211. char buffer[1024 + 1];
  212. if (read_edac_mc_rank_file(m->name, d->name, "dimm_dev_type", buffer, 1024))
  213. rrdlabels_add(d->st->rrdlabels, "dimm_dev_type", buffer, RRDLABEL_SRC_AUTO);
  214. if (read_edac_mc_rank_file(m->name, d->name, "dimm_edac_mode", buffer, 1024))
  215. rrdlabels_add(d->st->rrdlabels, "dimm_edac_mode", buffer, RRDLABEL_SRC_AUTO);
  216. if (read_edac_mc_rank_file(m->name, d->name, "dimm_label", buffer, 1024))
  217. rrdlabels_add(d->st->rrdlabels, "dimm_label", buffer, RRDLABEL_SRC_AUTO);
  218. if (read_edac_mc_rank_file(m->name, d->name, "dimm_location", buffer, 1024))
  219. rrdlabels_add(d->st->rrdlabels, "dimm_location", buffer, RRDLABEL_SRC_AUTO);
  220. if (read_edac_mc_rank_file(m->name, d->name, "dimm_mem_type", buffer, 1024))
  221. rrdlabels_add(d->st->rrdlabels, "dimm_mem_type", buffer, RRDLABEL_SRC_AUTO);
  222. if (read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024))
  223. rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO);
  224. d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  225. d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  226. }
  227. rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count);
  228. rrddim_set_by_pointer(d->st, d->ue.rd, (collected_number)d->ue.count);
  229. rrdset_done(d->st);
  230. }
  231. }
  232. return 0;
  233. }