sys_devices_pci_aer.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "plugin_proc.h"
  3. static char *pci_aer_dirname = NULL;
  4. typedef enum __attribute__((packed)) {
  5. AER_DEV_NONFATAL = (1 << 0),
  6. AER_DEV_CORRECTABLE = (1 << 1),
  7. AER_DEV_FATAL = (1 << 2),
  8. AER_ROOTPORT_TOTAL_ERR_COR = (1 << 3),
  9. AER_ROOTPORT_TOTAL_ERR_FATAL = (1 << 4),
  10. } AER_TYPE;
  11. struct aer_value {
  12. kernel_uint_t count;
  13. RRDDIM *rd;
  14. };
  15. struct aer_entry {
  16. bool updated;
  17. STRING *name;
  18. AER_TYPE type;
  19. procfile *ff;
  20. DICTIONARY *values;
  21. RRDSET *st;
  22. };
  23. DICTIONARY *aer_root = NULL;
  24. static bool aer_value_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) {
  25. struct aer_value *v = old_value;
  26. struct aer_value *nv = new_value;
  27. v->count = nv->count;
  28. return false;
  29. }
  30. static void aer_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
  31. struct aer_entry *a = value;
  32. a->values = dictionary_create(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE);
  33. dictionary_register_conflict_callback(a->values, aer_value_conflict_callback, NULL);
  34. }
  35. static void add_pci_aer(const char *base_dir, const char *d_name, AER_TYPE type) {
  36. char buffer[FILENAME_MAX + 1];
  37. snprintfz(buffer, FILENAME_MAX, "%s/%s", base_dir, d_name);
  38. struct aer_entry *a = dictionary_set(aer_root, buffer, NULL, sizeof(struct aer_entry));
  39. if(!a->name)
  40. a->name = string_strdupz(d_name);
  41. a->type = type;
  42. }
  43. static bool recursively_find_pci_aer(AER_TYPE types, const char *base_dir, const char *d_name, int depth) {
  44. if(depth > 100)
  45. return false;
  46. char buffer[FILENAME_MAX + 1];
  47. snprintfz(buffer, FILENAME_MAX, "%s/%s", base_dir, d_name);
  48. DIR *dir = opendir(buffer);
  49. if(unlikely(!dir)) {
  50. collector_error("Cannot read PCI_AER directory '%s'", buffer);
  51. return true;
  52. }
  53. struct dirent *de = NULL;
  54. while((de = readdir(dir))) {
  55. if(de->d_type == DT_DIR) {
  56. if(de->d_name[0] == '.')
  57. continue;
  58. recursively_find_pci_aer(types, buffer, de->d_name, depth + 1);
  59. }
  60. else if(de->d_type == DT_REG) {
  61. if((types & AER_DEV_NONFATAL) && strcmp(de->d_name, "aer_dev_nonfatal") == 0) {
  62. add_pci_aer(buffer, de->d_name, AER_DEV_NONFATAL);
  63. }
  64. else if((types & AER_DEV_CORRECTABLE) && strcmp(de->d_name, "aer_dev_correctable") == 0) {
  65. add_pci_aer(buffer, de->d_name, AER_DEV_CORRECTABLE);
  66. }
  67. else if((types & AER_DEV_FATAL) && strcmp(de->d_name, "aer_dev_fatal") == 0) {
  68. add_pci_aer(buffer, de->d_name, AER_DEV_FATAL);
  69. }
  70. else if((types & AER_ROOTPORT_TOTAL_ERR_COR) && strcmp(de->d_name, "aer_rootport_total_err_cor") == 0) {
  71. add_pci_aer(buffer, de->d_name, AER_ROOTPORT_TOTAL_ERR_COR);
  72. }
  73. else if((types & AER_ROOTPORT_TOTAL_ERR_FATAL) && strcmp(de->d_name, "aer_rootport_total_err_fatal") == 0) {
  74. add_pci_aer(buffer, de->d_name, AER_ROOTPORT_TOTAL_ERR_FATAL);
  75. }
  76. }
  77. }
  78. closedir(dir);
  79. return true;
  80. }
  81. static void find_all_pci_aer(AER_TYPE types) {
  82. char name[FILENAME_MAX + 1];
  83. snprintfz(name, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices");
  84. pci_aer_dirname = config_get("plugin:proc:/sys/devices/pci/aer", "directory to monitor", name);
  85. DIR *dir = opendir(pci_aer_dirname);
  86. if(unlikely(!dir)) {
  87. collector_error("Cannot read PCI_AER directory '%s'", pci_aer_dirname);
  88. return;
  89. }
  90. struct dirent *de = NULL;
  91. while((de = readdir(dir))) {
  92. if(de->d_type == DT_DIR && de->d_name[0] == 'p' && de->d_name[1] == 'c' && de->d_name[2] == 'i' && isdigit(de->d_name[3]))
  93. recursively_find_pci_aer(types, pci_aer_dirname, de->d_name, 1);
  94. }
  95. closedir(dir);
  96. }
  97. static void read_pci_aer_values(const char *filename, struct aer_entry *t) {
  98. t->updated = false;
  99. if(unlikely(!t->ff)) {
  100. t->ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
  101. if(unlikely(!t->ff))
  102. return;
  103. }
  104. t->ff = procfile_readall(t->ff);
  105. if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1))
  106. return;
  107. size_t lines = procfile_lines(t->ff);
  108. for(size_t l = 0; l < lines ; l++) {
  109. if(procfile_linewords(t->ff, l) != 2)
  110. continue;
  111. struct aer_value v = {
  112. .count = str2ull(procfile_lineword(t->ff, l, 1), NULL)
  113. };
  114. char *key = procfile_lineword(t->ff, l, 0);
  115. if(!key || !*key || (key[0] == 'T' && key[1] == 'O' && key[2] == 'T' && key[3] == 'A' && key[4] == 'L' && key[5] == '_'))
  116. continue;
  117. dictionary_set(t->values, key, &v, sizeof(v));
  118. }
  119. t->updated = true;
  120. }
  121. static void read_pci_aer_count(const char *filename, struct aer_entry *t) {
  122. t->updated = false;
  123. if(unlikely(!t->ff)) {
  124. t->ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
  125. if(unlikely(!t->ff))
  126. return;
  127. }
  128. t->ff = procfile_readall(t->ff);
  129. if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1))
  130. return;
  131. struct aer_value v = {
  132. .count = str2ull(procfile_lineword(t->ff, 0, 0), NULL)
  133. };
  134. dictionary_set(t->values, "count", &v, sizeof(v));
  135. t->updated = true;
  136. }
  137. static void add_label_from_link(struct aer_entry *a, const char *path, const char *link) {
  138. char name[FILENAME_MAX + 1];
  139. strncpyz(name, path, FILENAME_MAX);
  140. char *slash = strrchr(name, '/');
  141. if(slash)
  142. *slash = '\0';
  143. char name2[FILENAME_MAX + 1];
  144. snprintfz(name2, FILENAME_MAX, "%s/%s", name, link);
  145. ssize_t len = readlink(name2, name, FILENAME_MAX);
  146. if(len != -1) {
  147. name[len] = '\0'; // Null-terminate the string
  148. slash = strrchr(name, '/');
  149. if(slash) slash++;
  150. else slash = name;
  151. rrdlabels_add(a->st->rrdlabels, link, slash, RRDLABEL_SRC_AUTO);
  152. }
  153. }
  154. int do_proc_sys_devices_pci_aer(int update_every, usec_t dt __maybe_unused) {
  155. if(unlikely(!aer_root)) {
  156. int do_root_ports = CONFIG_BOOLEAN_AUTO;
  157. int do_pci_slots = CONFIG_BOOLEAN_NO;
  158. char buffer[100 + 1] = "";
  159. rrdlabels_get_value_strcpyz(localhost->rrdlabels, buffer, 100, "_virtualization");
  160. if(strcmp(buffer, "none") != 0) {
  161. // no need to run on virtualized environments
  162. do_root_ports = CONFIG_BOOLEAN_NO;
  163. do_pci_slots = CONFIG_BOOLEAN_NO;
  164. }
  165. do_root_ports = config_get_boolean("plugin:proc:/sys/class/pci/aer", "enable root ports", do_root_ports);
  166. do_pci_slots = config_get_boolean("plugin:proc:/sys/class/pci/aer", "enable pci slots", do_pci_slots);
  167. if(!do_root_ports && !do_pci_slots)
  168. return 1;
  169. aer_root = dictionary_create(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE);
  170. dictionary_register_insert_callback(aer_root, aer_insert_callback, NULL);
  171. AER_TYPE types = ((do_root_ports) ? (AER_ROOTPORT_TOTAL_ERR_COR|AER_ROOTPORT_TOTAL_ERR_FATAL) : 0) |
  172. ((do_pci_slots) ? (AER_DEV_FATAL|AER_DEV_NONFATAL|AER_DEV_CORRECTABLE) : 0);
  173. find_all_pci_aer(types);
  174. if(!dictionary_entries(aer_root))
  175. return 1;
  176. }
  177. struct aer_entry *a;
  178. dfe_start_read(aer_root, a) {
  179. switch(a->type) {
  180. case AER_DEV_NONFATAL:
  181. case AER_DEV_FATAL:
  182. case AER_DEV_CORRECTABLE:
  183. read_pci_aer_values(a_dfe.name, a);
  184. break;
  185. case AER_ROOTPORT_TOTAL_ERR_COR:
  186. case AER_ROOTPORT_TOTAL_ERR_FATAL:
  187. read_pci_aer_count(a_dfe.name, a);
  188. break;
  189. }
  190. if(!a->updated)
  191. continue;
  192. if(!a->st) {
  193. const char *title = "";
  194. const char *context = "";
  195. switch(a->type) {
  196. case AER_DEV_NONFATAL:
  197. title = "PCI Advanced Error Reporting (AER) Non-Fatal Errors";
  198. context = "pci.aer_nonfatal";
  199. break;
  200. case AER_DEV_FATAL:
  201. title = "PCI Advanced Error Reporting (AER) Fatal Errors";
  202. context = "pci.aer_fatal";
  203. break;
  204. case AER_DEV_CORRECTABLE:
  205. title = "PCI Advanced Error Reporting (AER) Correctable Errors";
  206. context = "pci.aer_correctable";
  207. break;
  208. case AER_ROOTPORT_TOTAL_ERR_COR:
  209. title = "PCI Root-Port Advanced Error Reporting (AER) Correctable Errors";
  210. context = "pci.rootport_aer_correctable";
  211. break;
  212. case AER_ROOTPORT_TOTAL_ERR_FATAL:
  213. title = "PCI Root-Port Advanced Error Reporting (AER) Fatal Errors";
  214. context = "pci.rootport_aer_fatal";
  215. break;
  216. default:
  217. title = "Unknown PCI Advanced Error Reporting";
  218. context = "pci.unknown_aer";
  219. break;
  220. }
  221. char id[RRD_ID_LENGTH_MAX + 1];
  222. char nm[RRD_ID_LENGTH_MAX + 1];
  223. size_t len = strlen(pci_aer_dirname);
  224. const char *fname = a_dfe.name;
  225. if(strncmp(a_dfe.name, pci_aer_dirname, len) == 0)
  226. fname = &a_dfe.name[len];
  227. if(*fname == '/')
  228. fname++;
  229. snprintfz(id, RRD_ID_LENGTH_MAX, "%s_%s", &context[4], fname);
  230. char *slash = strrchr(id, '/');
  231. if(slash)
  232. *slash = '\0';
  233. netdata_fix_chart_id(id);
  234. snprintfz(nm, RRD_ID_LENGTH_MAX, "%s", fname);
  235. slash = strrchr(nm, '/');
  236. if(slash)
  237. *slash = '\0';
  238. a->st = rrdset_create_localhost(
  239. "pci"
  240. , id
  241. , NULL
  242. , "aer"
  243. , context
  244. , title
  245. , "errors/s"
  246. , PLUGIN_PROC_NAME
  247. , "/sys/devices/pci/aer"
  248. , NETDATA_CHART_PRIO_PCI_AER
  249. , update_every
  250. , RRDSET_TYPE_LINE
  251. );
  252. rrdlabels_add(a->st->rrdlabels, "device", nm, RRDLABEL_SRC_AUTO);
  253. add_label_from_link(a, a_dfe.name, "driver");
  254. struct aer_value *v;
  255. dfe_start_read(a->values, v) {
  256. v->rd = rrddim_add(a->st, v_dfe.name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  257. }
  258. dfe_done(v);
  259. }
  260. struct aer_value *v;
  261. dfe_start_read(a->values, v) {
  262. if(unlikely(!v->rd))
  263. v->rd = rrddim_add(a->st, v_dfe.name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
  264. rrddim_set_by_pointer(a->st, v->rd, (collected_number)v->count);
  265. }
  266. dfe_done(v);
  267. rrdset_done(a->st);
  268. }
  269. dfe_done(a);
  270. return 0;
  271. }