# memory.conf
  # You can disable an alarm notification by setting the 'to' line to: silent
  # Hardware-corrupted memory.
  # Evaluated every 10s from the $HardwareCorrupted value of the
  # mem.hwcorrupt chart (reported in MB); any non-zero amount raises a warning.
      alarm: 1hour_memory_hw_corrupted
         on: mem.hwcorrupt
      class: Errors
       type: System
  component: Memory
         os: linux
      hosts: *
       calc: $HardwareCorrupted
      units: MB
      every: 10s
       warn: $this > 0
  # Notifications for a drop in severity are delayed by 1h; the delay is
  # multiplied by 1.5 on further state changes and capped at 1h.
      delay: down 1h multiplier 1.5 max 1h
    summary: System corrupted memory
       info: Amount of memory corrupted due to a hardware failure
         to: sysadmin
  ## ECC Controller
  # Correctable ECC errors per memory controller.
  # Template for mem.edac_mc charts: once a minute, sums the
  # correctable and correctable_noinfo dimensions over the last 10 minutes
  # and warns when the total is above zero.
   template: ecc_memory_mc_correctable
         on: mem.edac_mc
      class: Errors
       type: System
  component: Memory
         os: linux
      hosts: *
     lookup: sum -10m unaligned of correctable, correctable_noinfo
      units: errors
      every: 1m
       warn: $this > 0
  # Notifications for a drop in severity are delayed by 1h; the delay is
  # multiplied by 1.5 on further state changes and capped at 1h.
      delay: down 1h multiplier 1.5 max 1h
  # ${label:controller} is filled in from the chart's "controller" label.
    summary: System ECC memory ${label:controller} correctable errors
       info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes
         to: sysadmin
  # Uncorrectable ECC errors per memory controller.
  # Template for mem.edac_mc charts: once a minute, sums the
  # uncorrectable and uncorrectable_noinfo dimensions over the last 10 minutes
  # and goes critical (not merely warning) when the total is above zero.
   template: ecc_memory_mc_uncorrectable
         on: mem.edac_mc
      class: Errors
       type: System
  component: Memory
         os: linux
      hosts: *
     lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
      units: errors
      every: 1m
       crit: $this > 0
  # Notifications for a drop in severity are delayed by 1h; the delay is
  # multiplied by 1.5 on further state changes and capped at 1h.
      delay: down 1h multiplier 1.5 max 1h
  # ${label:controller} is filled in from the chart's "controller" label.
    summary: System ECC memory ${label:controller} uncorrectable errors
       info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
         to: sysadmin
  ## ECC DIMM
  # Correctable ECC errors per DIMM.
  # Template for mem.edac_mc_dimm charts: once a minute, sums the
  # correctable dimension over the last 10 minutes and warns when the
  # total is above zero.
   template: ecc_memory_dimm_correctable
         on: mem.edac_mc_dimm
      class: Errors
       type: System
  component: Memory
         os: linux
      hosts: *
     lookup: sum -10m unaligned of correctable
      units: errors
      every: 1m
       warn: $this > 0
  # Notifications for a drop in severity are delayed by 1h; the delay is
  # multiplied by 1.5 on further state changes and capped at 1h.
      delay: down 1h multiplier 1.5 max 1h
  # The ${label:...} placeholders are filled in from the chart's
  # "dimm", "controller" and "dimm_location" labels.
    summary: System ECC memory DIMM ${label:dimm} correctable errors
       info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
         to: sysadmin
  # Uncorrectable ECC errors per DIMM.
  # Template for mem.edac_mc_dimm charts: once a minute, sums the
  # uncorrectable dimension over the last 10 minutes and goes critical
  # (not merely warning) when the total is above zero.
   template: ecc_memory_dimm_uncorrectable
         on: mem.edac_mc_dimm
      class: Errors
       type: System
  component: Memory
         os: linux
      hosts: *
     lookup: sum -10m unaligned of uncorrectable
      units: errors
      every: 1m
       crit: $this > 0
  # Notifications for a drop in severity are delayed by 1h; the delay is
  # multiplied by 1.5 on further state changes and capped at 1h.
      delay: down 1h multiplier 1.5 max 1h
  # The ${label:...} placeholders are filled in from the chart's
  # "dimm", "controller" and "dimm_location" labels.
    summary: System ECC memory DIMM ${label:dimm} uncorrectable errors
       info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
         to: sysadmin