dbengine.conf 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. # you can disable an alarm notification by setting the 'to' line to: silent
  # Alarm on DB engine filesystem errors: sums the `fs_errors` dimension of
  # chart netdata.dbengine_global_errors over the trailing 10 minutes and
  # goes critical as soon as that sum is above zero.
  2. alarm: 10min_dbengine_global_fs_errors
  3. on: netdata.dbengine_global_errors
  4. class: Errors
  5. type: Netdata
  6. component: DB engine
  7. os: linux freebsd macos
  8. hosts: *
  # -10m = window reaching 10 minutes back from now; `unaligned` is a lookup option.
  9. lookup: sum -10m unaligned of fs_errors
  10. units: errors
  # Re-evaluate the lookup and the crit expression every 10 seconds.
  11. every: 10s
  # $this is the value produced by the lookup line above.
  12. crit: $this > 0
  # NOTE(review): per Netdata's health reference this delays the notification
  # sent when the alarm recovers ("down") by 15m, scaling the delay by 1.5 up
  # to a 1h cap - confirm against the docs for the deployed version.
  13. delay: down 15m multiplier 1.5 max 1h
  14. info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
  # Notification recipient role; set to `silent` to suppress notifications.
  15. to: sysadmin
  # Alarm on DB engine I/O errors: sums the `io_errors` dimension of chart
  # netdata.dbengine_global_errors over the trailing 10 minutes and goes
  # critical as soon as that sum is above zero.
  16. alarm: 10min_dbengine_global_io_errors
  17. on: netdata.dbengine_global_errors
  18. class: Errors
  19. type: Netdata
  20. component: DB engine
  21. os: linux freebsd macos
  22. hosts: *
  # -10m = window reaching 10 minutes back from now; `unaligned` is a lookup option.
  23. lookup: sum -10m unaligned of io_errors
  24. units: errors
  # Re-evaluate the lookup and the crit expression every 10 seconds.
  25. every: 10s
  # $this is the value produced by the lookup line above.
  26. crit: $this > 0
  # NOTE(review): per Netdata's health reference this delays the notification
  # sent when the alarm recovers ("down") by 1h, scaling the delay by 1.5 up
  # to a 3h cap - confirm against the docs for the deployed version.
  27. delay: down 1h multiplier 1.5 max 3h
  28. info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
  # Notification recipient role; set to `silent` to suppress notifications.
  29. to: sysadmin
  # Warning-level alarm on page-cache flushing pressure: sums the
  # `pg_cache_over_half_dirty_events` dimension of chart
  # netdata.dbengine_global_errors over the trailing 10 minutes and raises a
  # warning (there is no crit line) as soon as that sum is above zero.
  30. alarm: 10min_dbengine_global_flushing_warnings
  31. on: netdata.dbengine_global_errors
  32. class: Errors
  33. type: Netdata
  34. component: DB engine
  35. os: linux freebsd macos
  36. hosts: *
  # -10m = window reaching 10 minutes back from now; `unaligned` is a lookup option.
  37. lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
  # NOTE(review): units is `errors` although the dimension name counts events
  # and the alarm only warns - confirm this label is intended.
  38. units: errors
  # Re-evaluate the lookup and the warn expression every 10 seconds.
  39. every: 10s
  # $this is the value produced by the lookup line above.
  40. warn: $this > 0
  # NOTE(review): per Netdata's health reference this delays the notification
  # sent when the alarm recovers ("down") by 1h, scaling the delay by 1.5 up
  # to a 3h cap - confirm against the docs for the deployed version.
  41. delay: down 1h multiplier 1.5 max 3h
  # The info text spans two physical lines joined by the trailing backslash;
  # keep the continuation line directly below it.
  42. info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
  43. Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
  # Notification recipient role; set to `silent` to suppress notifications.
  44. to: sysadmin
  # Alarm on pages dropped under flushing pressure: sums the
  # `flushing_pressure_deletions` dimension of chart
  # netdata.dbengine_long_term_page_stats (a different chart from the other
  # dbengine alarms in this file) over the trailing 10 minutes and goes
  # critical whenever that sum is non-zero.
  45. alarm: 10min_dbengine_global_flushing_errors
  46. on: netdata.dbengine_long_term_page_stats
  47. class: Errors
  48. type: Netdata
  49. component: DB engine
  50. os: linux freebsd macos
  51. hosts: *
  # -10m = window reaching 10 minutes back from now; `unaligned` is a lookup option.
  52. lookup: sum -10m unaligned of flushing_pressure_deletions
  53. units: pages
  # Re-evaluate the lookup and the crit expression every 10 seconds.
  54. every: 10s
  # NOTE(review): this uses `!= 0` where the other alarms in this file use
  # `> 0`; it would also fire on a negative sum - confirm that is intended.
  55. crit: $this != 0
  # NOTE(review): per Netdata's health reference this delays the notification
  # sent when the alarm recovers ("down") by 1h, scaling the delay by 1.5 up
  # to a 3h cap - confirm against the docs for the deployed version.
  56. delay: down 1h multiplier 1.5 max 3h
  # The info text spans two physical lines joined by the trailing backslash;
  # keep the continuation line directly below it.
  57. info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
  58. Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
  # Notification recipient role; set to `silent` to suppress notifications.
  59. to: sysadmin