proc_mdstat.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "plugin_proc.h"
  3. #define PLUGIN_PROC_MODULE_MDSTAT_NAME "/proc/mdstat"
  4. struct raid {
  5. int used;
  6. char *name;
  7. RRDDIM *rd_health;
  8. unsigned long long failed_disks;
  9. RRDSET *st_disks;
  10. RRDDIM *rd_total;
  11. RRDDIM *rd_inuse;
  12. unsigned long long total_disks;
  13. unsigned long long inuse_disks;
  14. RRDSET *st_operation;
  15. RRDDIM *rd_check;
  16. RRDDIM *rd_resync;
  17. RRDDIM *rd_recovery;
  18. RRDDIM *rd_reshape;
  19. unsigned long long check;
  20. unsigned long long resync;
  21. unsigned long long recovery;
  22. unsigned long long reshape;
  23. RRDSET *st_finish;
  24. RRDDIM *rd_finish_in;
  25. unsigned long long finish_in;
  26. RRDSET *st_speed;
  27. RRDDIM *rd_speed;
  28. unsigned long long speed;
  29. char *mismatch_cnt_filename;
  30. RRDSET *st_mismatch_cnt;
  31. RRDDIM *rd_mismatch_cnt;
  32. unsigned long long mismatch_cnt;
  33. };
  34. static inline char *remove_trailing_chars(char *s, char c) {
  35. while(*s) {
  36. if(unlikely(*s == c)) {
  37. *s = '\0';
  38. }
  39. s++;
  40. }
  41. return s;
  42. }
  43. int do_proc_mdstat(int update_every, usec_t dt) {
  44. (void)dt;
  45. static procfile *ff = NULL;
  46. static char *mdstat_filename = NULL, *mismatch_cnt_filename = NULL;
  47. static struct raid *raids = NULL;
  48. static size_t raids_num = 0, raids_allocated = 0;
  49. size_t raid_idx = 0;
  50. if(unlikely(!mismatch_cnt_filename)) {
  51. char filename[FILENAME_MAX + 1];
  52. snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/block/%s/md/mismatch_cnt");
  53. mismatch_cnt_filename = config_get("plugin:proc:/proc/mdstat", "mismatch_cnt filename to monitor", filename);
  54. }
  55. if(unlikely(!ff)) {
  56. char filename[FILENAME_MAX + 1];
  57. snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/mdstat");
  58. mdstat_filename = config_get("plugin:proc:/proc/mdstat", "filename to monitor", filename);
  59. ff = procfile_open(mdstat_filename, " \t:", PROCFILE_FLAG_DEFAULT);
  60. if(unlikely(!ff)) return 1;
  61. }
  62. ff = procfile_readall(ff);
  63. if(unlikely(!ff)) return 0; // we return 0, so that we will retry opening it next time
  64. size_t lines = procfile_lines(ff);
  65. size_t words = 0;
  66. if(unlikely(lines < 2)) {
  67. error("Cannot read /proc/mdstat. Expected 2 or more lines, read %zu.", lines);
  68. return 1;
  69. }
  70. // find how many raids are there
  71. size_t l;
  72. raids_num = 0;
  73. for(l = 1; l < lines - 2 ; l++) {
  74. if(unlikely(procfile_lineword(ff, l, 1)[0] == 'a')) // check if the raid is active
  75. raids_num++;
  76. }
  77. if(unlikely(!raids_num)) return 0; // we return 0, so that we will retry searching for raids next time
  78. // allocate the memory we need;
  79. if(unlikely(raids_num != raids_allocated)) {
  80. for(raid_idx = 0; raid_idx < raids_allocated; raid_idx++) {
  81. struct raid *raid = &raids[raid_idx];
  82. freez(raid->name);
  83. freez(raid->mismatch_cnt_filename);
  84. }
  85. raids = (struct raid *)reallocz(raids, raids_num * sizeof(struct raid));
  86. raids_allocated = raids_num;
  87. memset(raids, 0, raids_num * sizeof(struct raid));
  88. }
  89. // loop through all lines except the first and the last ones
  90. for(l = 1, raid_idx = 0; l < (lines - 2) && raid_idx < raids_num; l++) {
  91. struct raid *raid = &raids[raid_idx];
  92. raid->used = 0;
  93. words = procfile_linewords(ff, l);
  94. if(unlikely(words < 2)) continue;
  95. if(unlikely(procfile_lineword(ff, l, 1)[0] != 'a')) continue;
  96. if(!raid->name) {
  97. raid->name = strdupz(procfile_lineword(ff, l, 0));
  98. }
  99. else if(strcmp(raid->name, procfile_lineword(ff, l, 0))) {
  100. freez(raid->name);
  101. freez(raid->mismatch_cnt_filename);
  102. memset(raid, 0, sizeof(struct raid));
  103. raid->name = strdupz(procfile_lineword(ff, l, 0));
  104. }
  105. if(unlikely(!raid->name || !raid->name[0])) continue;
  106. raid_idx++;
  107. // check if raid has disk status
  108. l++;
  109. words = procfile_linewords(ff, l);
  110. if(words < 2 || procfile_lineword(ff, l, words - 1)[0] != '[') {
  111. continue;
  112. }
  113. // split inuse and total number of disks
  114. char *s = NULL, *str_total = NULL, *str_inuse = NULL;
  115. s = procfile_lineword(ff, l, words - 2);
  116. if(unlikely(s[0] != '[')) {
  117. error("Cannot read /proc/mdstat raid health status. Unexpected format: missing opening bracket.");
  118. continue;
  119. }
  120. str_total = ++s;
  121. while(*s) {
  122. if(unlikely(*s == '/')) {
  123. *s = '\0';
  124. str_inuse = s + 1;
  125. }
  126. else if(unlikely(*s == ']')) {
  127. *s = '\0';
  128. break;
  129. }
  130. s++;
  131. }
  132. if(unlikely(str_total[0] == '\0' || str_inuse[0] == '\0')) {
  133. error("Cannot read /proc/mdstat raid health status. Unexpected format.");
  134. continue;
  135. }
  136. raid->inuse_disks = str2ull(str_inuse);
  137. raid->total_disks = str2ull(str_total);
  138. raid->failed_disks = raid->total_disks - raid->inuse_disks;
  139. raid->used = 1;
  140. raid->check = 0;
  141. raid->resync = 0;
  142. raid->recovery = 0;
  143. raid->reshape = 0;
  144. raid->finish_in = 0;
  145. raid->speed = 0;
  146. // check if any operation is performed on the raid
  147. l++;
  148. words = procfile_linewords(ff, l);
  149. if(likely(words < 2)) continue;
  150. if(unlikely(procfile_lineword(ff, l, 0)[0] != '[')) continue;
  151. if(unlikely(words < 7)) {
  152. error("Cannot read /proc/mdstat line. Expected 7 params, read %zu.", words);
  153. continue;
  154. }
  155. char *word;
  156. word = procfile_lineword(ff, l, 3);
  157. remove_trailing_chars(word, '%');
  158. unsigned long long percentage = (unsigned long long)(str2ld(word, NULL) * 100);
  159. // possible operations: check, resync, recovery, reshape
  160. // 4-th character is unique for each operation so it is checked
  161. switch(procfile_lineword(ff, l, 1)[3]) {
  162. case 'c': // check
  163. raid->check = percentage;
  164. break;
  165. case 'y': // resync
  166. raid->resync = percentage;
  167. break;
  168. case 'o': // recovery
  169. raid->recovery = percentage;
  170. break;
  171. case 'h': // reshape
  172. raid->reshape = percentage;
  173. break;
  174. }
  175. word = procfile_lineword(ff, l, 5);
  176. s = remove_trailing_chars(word, 'm'); // remove trailing "min"
  177. word += 7; // skip leading "finish="
  178. if(likely(s > word))
  179. raid->finish_in = (unsigned long long)(str2ld(word, NULL) * 60);
  180. word = procfile_lineword(ff, l, 6);
  181. s = remove_trailing_chars(word, 'K'); // remove trailing "K/sec"
  182. word += 6; // skip leading "speed="
  183. if(likely(s > word))
  184. raid->speed = str2ull(word);
  185. }
  186. // read mismatch_cnt files
  187. for(raid_idx = 0; raid_idx < raids_num ; raid_idx++) {
  188. char filename[FILENAME_MAX + 1];
  189. struct raid *raid = &raids[raid_idx];
  190. if(likely(raid->used)) {
  191. if(!raid->mismatch_cnt_filename) {
  192. snprintfz(filename, FILENAME_MAX, mismatch_cnt_filename, raid->name);
  193. raid->mismatch_cnt_filename = strdupz(filename);
  194. }
  195. if(unlikely(read_single_number_file(raid->mismatch_cnt_filename, &raid->mismatch_cnt))) {
  196. error("Cannot read file '%s'", raid->mismatch_cnt_filename);
  197. return 1;
  198. }
  199. }
  200. }
  201. // --------------------------------------------------------------------
  202. static RRDSET *st_mdstat_health = NULL;
  203. if(unlikely(!st_mdstat_health))
  204. st_mdstat_health = rrdset_create_localhost(
  205. "mdstat"
  206. , "mdstat_health"
  207. , NULL
  208. , "health"
  209. , "md.health"
  210. , "Faulty Devices In MD"
  211. , "failed disks"
  212. , PLUGIN_PROC_NAME
  213. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  214. , NETDATA_CHART_PRIO_MDSTAT_HEALTH
  215. , update_every
  216. , RRDSET_TYPE_LINE
  217. );
  218. else
  219. rrdset_next(st_mdstat_health);
  220. for(raid_idx = 0; raid_idx < raids_num; raid_idx++) {
  221. struct raid *raid = &raids[raid_idx];
  222. if(likely(raid->used)) {
  223. if(unlikely(!raid->rd_health && !(raid->rd_health = rrddim_find(st_mdstat_health, raid->name))))
  224. raid->rd_health = rrddim_add(st_mdstat_health, raid->name, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  225. rrddim_set_by_pointer(st_mdstat_health, raid->rd_health, raid->failed_disks);
  226. }
  227. }
  228. rrdset_done(st_mdstat_health);
  229. // --------------------------------------------------------------------
  230. for(raid_idx = 0; raid_idx < raids_num ; raid_idx++) {
  231. struct raid *raid = &raids[raid_idx];
  232. char id[50 + 1];
  233. char family[50 + 1];
  234. if(likely(raid->used)) {
  235. snprintfz(id, 50, "%s_disks", raid->name);
  236. if(unlikely(!raid->st_disks && !(raid->st_disks = rrdset_find_byname_localhost(id)))) {
  237. snprintfz(family, 50, "%s", raid->name);
  238. raid->st_disks = rrdset_create_localhost(
  239. "mdstat"
  240. , id
  241. , NULL
  242. , family
  243. , "md.disks"
  244. , "Disks Stats"
  245. , "disks"
  246. , PLUGIN_PROC_NAME
  247. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  248. , NETDATA_CHART_PRIO_MDSTAT_DISKS + raid_idx * 10
  249. , update_every
  250. , RRDSET_TYPE_STACKED
  251. );
  252. }
  253. else
  254. rrdset_next(raid->st_disks);
  255. if(unlikely(!raid->rd_inuse && !(raid->rd_inuse = rrddim_find(raid->st_disks, "inuse"))))
  256. raid->rd_inuse = rrddim_add(raid->st_disks, "inuse", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  257. if(unlikely(!raid->rd_total && !(raid->rd_total = rrddim_find(raid->st_disks, "total"))))
  258. raid->rd_total = rrddim_add(raid->st_disks, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  259. rrddim_set_by_pointer(raid->st_disks, raid->rd_inuse, raid->inuse_disks);
  260. rrddim_set_by_pointer(raid->st_disks, raid->rd_total, raid->total_disks);
  261. rrdset_done(raid->st_disks);
  262. // --------------------------------------------------------------------
  263. snprintfz(id, 50, "%s_mismatch", raid->name);
  264. if(unlikely(!raid->st_mismatch_cnt && !(raid->st_mismatch_cnt = rrdset_find_byname_localhost(id)))) {
  265. snprintfz(family, 50, "%s", raid->name);
  266. raid->st_mismatch_cnt = rrdset_create_localhost(
  267. "mdstat"
  268. , id
  269. , NULL
  270. , family
  271. , "md.mismatch_cnt"
  272. , "Mismatch Count"
  273. , "unsynchronized blocks"
  274. , PLUGIN_PROC_NAME
  275. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  276. , NETDATA_CHART_PRIO_MDSTAT_MISMATCH + raid_idx * 10
  277. , update_every
  278. , RRDSET_TYPE_LINE
  279. );
  280. }
  281. else
  282. rrdset_next(raid->st_mismatch_cnt);
  283. if(unlikely(!raid->rd_mismatch_cnt && !(raid->rd_mismatch_cnt = rrddim_find(raid->st_mismatch_cnt, "count"))))
  284. raid->rd_mismatch_cnt = rrddim_add(raid->st_mismatch_cnt, "count", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  285. rrddim_set_by_pointer(raid->st_mismatch_cnt, raid->rd_mismatch_cnt, raid->mismatch_cnt);
  286. rrdset_done(raid->st_mismatch_cnt);
  287. // --------------------------------------------------------------------
  288. snprintfz(id, 50, "%s_operation", raid->name);
  289. if(unlikely(!raid->st_operation && !(raid->st_operation = rrdset_find_byname_localhost(id)))) {
  290. snprintfz(family, 50, "%s", raid->name);
  291. raid->st_operation = rrdset_create_localhost(
  292. "mdstat"
  293. , id
  294. , NULL
  295. , family
  296. , "md.status"
  297. , "Current Status"
  298. , "percent"
  299. , PLUGIN_PROC_NAME
  300. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  301. , NETDATA_CHART_PRIO_MDSTAT_OPERATION + raid_idx * 10
  302. , update_every
  303. , RRDSET_TYPE_LINE
  304. );
  305. }
  306. else
  307. rrdset_next(raid->st_operation);
  308. if(unlikely(!raid->rd_check && !(raid->rd_check = rrddim_find(raid->st_operation, "check"))))
  309. raid->rd_check = rrddim_add(raid->st_operation, "check", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
  310. if(unlikely(!raid->rd_resync && !(raid->rd_resync = rrddim_find(raid->st_operation, "resync"))))
  311. raid->rd_resync = rrddim_add(raid->st_operation, "resync", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
  312. if(unlikely(!raid->rd_recovery && !(raid->rd_recovery = rrddim_find(raid->st_operation, "recovery"))))
  313. raid->rd_recovery = rrddim_add(raid->st_operation, "recovery", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
  314. if(unlikely(!raid->rd_reshape && !(raid->rd_reshape = rrddim_find(raid->st_operation, "reshape"))))
  315. raid->rd_reshape = rrddim_add(raid->st_operation, "reshape", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
  316. rrddim_set_by_pointer(raid->st_operation, raid->rd_check, raid->check);
  317. rrddim_set_by_pointer(raid->st_operation, raid->rd_resync, raid->resync);
  318. rrddim_set_by_pointer(raid->st_operation, raid->rd_recovery, raid->recovery);
  319. rrddim_set_by_pointer(raid->st_operation, raid->rd_reshape, raid->reshape);
  320. rrdset_done(raid->st_operation);
  321. // --------------------------------------------------------------------
  322. snprintfz(id, 50, "%s_finish", raid->name);
  323. if(unlikely(!raid->st_finish && !(raid->st_finish = rrdset_find_byname_localhost(id)))) {
  324. snprintfz(family, 50, "%s", raid->name);
  325. raid->st_finish = rrdset_create_localhost(
  326. "mdstat"
  327. , id
  328. , NULL
  329. , family
  330. , "md.rate"
  331. , "Approximate Time Unit Finish"
  332. , "seconds"
  333. , PLUGIN_PROC_NAME
  334. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  335. , NETDATA_CHART_PRIO_MDSTAT_FINISH + raid_idx * 10
  336. , update_every
  337. , RRDSET_TYPE_LINE
  338. );
  339. }
  340. else
  341. rrdset_next(raid->st_finish);
  342. if(unlikely(!raid->rd_finish_in && !(raid->rd_finish_in = rrddim_find(raid->st_finish, "finish_in"))))
  343. raid->rd_finish_in = rrddim_add(raid->st_finish, "finish_in", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  344. rrddim_set_by_pointer(raid->st_finish, raid->rd_finish_in, raid->finish_in);
  345. rrdset_done(raid->st_finish);
  346. // --------------------------------------------------------------------
  347. snprintfz(id, 50, "%s_speed", raid->name);
  348. if(unlikely(!raid->st_speed && !(raid->st_speed = rrdset_find_byname_localhost(id)))) {
  349. snprintfz(family, 50, "%s", raid->name);
  350. raid->st_speed = rrdset_create_localhost(
  351. "mdstat"
  352. , id
  353. , NULL
  354. , family
  355. , "md.rate"
  356. , "Operation Speed"
  357. , "KB/s"
  358. , PLUGIN_PROC_NAME
  359. , PLUGIN_PROC_MODULE_MDSTAT_NAME
  360. , NETDATA_CHART_PRIO_MDSTAT_SPEED + raid_idx * 10
  361. , update_every
  362. , RRDSET_TYPE_LINE
  363. );
  364. }
  365. else
  366. rrdset_next(raid->st_speed);
  367. if(unlikely(!raid->rd_speed && !(raid->rd_speed = rrddim_find(raid->st_speed, "speed"))))
  368. raid->rd_speed = rrddim_add(raid->st_speed, "speed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
  369. rrddim_set_by_pointer(raid->st_speed, raid->rd_speed, raid->speed);
  370. rrdset_done(raid->st_speed);
  371. }
  372. }
  373. return 0;
  374. }