memory.conf 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. # you can disable an alarm notification by setting the 'to' line to: silent
  # Alarm on the mem.hwcorrupt chart: takes the chart's HardwareCorrupted
  # value (units: MB) and raises a warning whenever it is above zero.
  # The expression is re-evaluated every 10 seconds.
  # delay: per Netdata's `delay` syntax, `down 1h` holds back the notification
  # for a transition to a lower state by one hour; because `max` equals that
  # base delay, the 1.5 multiplier has no room to extend it
  # (NOTE(review): confirm against the health configuration reference).
  # Notifications are routed to the `sysadmin` recipient.
  2. alarm: 1hour_memory_hw_corrupted
  3. on: mem.hwcorrupt
  4. class: Errors
  5. type: System
  6. component: Memory
  7. os: linux
  8. hosts: *
  9. calc: $HardwareCorrupted
  10. units: MB
  11. every: 10s
  12. warn: $this > 0
  13. delay: down 1h multiplier 1.5 max 1h
  14. info: amount of memory corrupted due to a hardware failure
  15. to: sysadmin
  16. ## ECC Controller
  # Template for mem.edac_mc charts: sums the `correctable` and
  # `correctable_noinfo` dimensions over the last 10 minutes and raises a
  # warning when the total is above zero. Evaluated once per minute.
  # The dimension list is written without a space after the comma: the Netdata
  # health reference states that dimensions are separated by `,` or `|` and
  # that spaces inside the list are kept as part of the dimension name, so
  # `correctable, correctable_noinfo` risks looking up a dimension named
  # " correctable_noinfo" that never matches.
  # The info text interpolates the chart's `controller` label.
  # delay: `down 1h ... max 1h` holds back the notification for a transition
  # to a lower state by one hour (NOTE(review): confirm against the health
  # configuration reference). Notifications go to `sysadmin`.
  17. template: ecc_memory_mc_correctable
  18. on: mem.edac_mc
  19. class: Errors
  20. type: System
  21. component: Memory
  22. os: linux
  23. hosts: *
  24. lookup: sum -10m unaligned of correctable,correctable_noinfo
  25. units: errors
  26. every: 1m
  27. warn: $this > 0
  28. delay: down 1h multiplier 1.5 max 1h
  29. info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes
  30. to: sysadmin
  # Template for mem.edac_mc charts: sums the `uncorrectable` and
  # `uncorrectable_noinfo` dimensions over the last 10 minutes and raises a
  # CRITICAL alarm (there is no warning level) when the total is above zero.
  # Evaluated once per minute.
  # The info text interpolates the chart's `controller` label.
  # delay: `down 1h ... max 1h` holds back the notification for a transition
  # to a lower state by one hour (NOTE(review): confirm against the health
  # configuration reference). Notifications go to `sysadmin`.
  31. template: ecc_memory_mc_uncorrectable
  32. on: mem.edac_mc
  33. class: Errors
  34. type: System
  35. component: Memory
  36. os: linux
  37. hosts: *
  38. lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
  39. units: errors
  40. every: 1m
  41. crit: $this > 0
  42. delay: down 1h multiplier 1.5 max 1h
  43. info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
  44. to: sysadmin
  45. ## ECC DIMM
  # Template for mem.edac_mc_dimm charts: sums the `correctable` dimension
  # over the last 10 minutes and raises a warning when the total is above
  # zero. Evaluated once per minute.
  # The info text interpolates the chart's `dimm`, `controller` and
  # `dimm_location` labels.
  # delay: `down 1h ... max 1h` holds back the notification for a transition
  # to a lower state by one hour (NOTE(review): confirm against the health
  # configuration reference). Notifications go to `sysadmin`.
  46. template: ecc_memory_dimm_correctable
  47. on: mem.edac_mc_dimm
  48. class: Errors
  49. type: System
  50. component: Memory
  51. os: linux
  52. hosts: *
  53. lookup: sum -10m unaligned of correctable
  54. units: errors
  55. every: 1m
  56. warn: $this > 0
  57. delay: down 1h multiplier 1.5 max 1h
  58. info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
  59. to: sysadmin
  # Template for mem.edac_mc_dimm charts: sums the `uncorrectable` dimension
  # over the last 10 minutes and raises a CRITICAL alarm (there is no warning
  # level) when the total is above zero. Evaluated once per minute.
  # The info text interpolates the chart's `dimm`, `controller` and
  # `dimm_location` labels.
  # delay: `down 1h ... max 1h` holds back the notification for a transition
  # to a lower state by one hour (NOTE(review): confirm against the health
  # configuration reference). Notifications go to `sysadmin`.
  60. template: ecc_memory_dimm_uncorrectable
  61. on: mem.edac_mc_dimm
  62. class: Errors
  63. type: System
  64. component: Memory
  65. os: linux
  66. hosts: *
  67. lookup: sum -10m unaligned of uncorrectable
  68. units: errors
  69. every: 1m
  70. crit: $this > 0
  71. delay: down 1h multiplier 1.5 max 1h
  72. info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
  73. to: sysadmin