hdfs.conf 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # Common
  # Alert template: percentage of HDFS capacity in use, evaluated on hdfs.capacity.
  2. template: hdfs_capacity_usage
  3. on: hdfs.capacity
  4. class: Utilization
  5. type: Storage
  6. component: HDFS
  # used / (used + remaining), expressed as a percentage.
  7. calc: ($used) * 100 / ($used + $remaining)
  8. units: %
  9. every: 10s
  # Threshold depends on $status: 80 normally, 70 once $status is already
  # WARNING or higher, so the alert does not clear until usage is at or below 70.
  10. warn: $this > (($status >= $WARNING) ? (70) : (80))
  # Same pattern for critical: 98 normally, 80 while $status is CRITICAL.
  11. crit: $this > (($status == $CRITICAL) ? (80) : (98))
  # Notification delay applied on status decreases: 15m base, x1.5 multiplier,
  # capped at 1h (see Netdata health reference for exact semantics).
  12. delay: down 15m multiplier 1.5 max 1h
  13. summary: HDFS datanodes space utilization
  14. info: summary datanodes space capacity utilization
  15. to: sysadmin
  16. # NameNode
  # Alert template: warns when the $missing value on hdfs.blocks is above zero.
  # Only a warning condition is defined; there is no crit line for this alert.
  17. template: hdfs_missing_blocks
  18. on: hdfs.blocks
  19. class: Errors
  20. type: Storage
  21. component: HDFS
  22. calc: $missing
  23. units: missing blocks
  24. every: 10s
  # Any non-zero count raises the warning.
  25. warn: $this > 0
  # Notification delay applied on status decreases: 15m base, x1.5 multiplier,
  # capped at 1h (see Netdata health reference for exact semantics).
  26. delay: down 15m multiplier 1.5 max 1h
  27. summary: HDFS missing blocks
  28. info: number of missing blocks
  29. to: sysadmin
  # Alert template: warns when the $stale value on hdfs.data_nodes is above zero.
  # Only a warning condition is defined; there is no crit line for this alert.
  30. template: hdfs_stale_nodes
  31. on: hdfs.data_nodes
  32. class: Errors
  33. type: Storage
  34. component: HDFS
  35. calc: $stale
  # Units label matches the value being counted (stale, not dead, datanodes).
  36. units: stale nodes
  37. every: 10s
  # Any non-zero count raises the warning.
  38. warn: $this > 0
  # Notification delay applied on status decreases: 15m base, x1.5 multiplier,
  # capped at 1h (see Netdata health reference for exact semantics).
  39. delay: down 15m multiplier 1.5 max 1h
  40. summary: HDFS stale datanodes
  41. info: number of datanodes marked stale due to delayed heartbeat
  42. to: sysadmin
  # Alert template: goes critical when the $dead value on hdfs.data_nodes is above zero.
  # Only a critical condition is defined; there is no warn line for this alert.
  43. template: hdfs_dead_nodes
  44. on: hdfs.data_nodes
  45. class: Errors
  46. type: Storage
  47. component: HDFS
  48. calc: $dead
  49. units: dead nodes
  50. every: 10s
  # Any non-zero count raises the critical status directly.
  51. crit: $this > 0
  # Notification delay applied on status decreases: 15m base, x1.5 multiplier,
  # capped at 1h (see Netdata health reference for exact semantics).
  52. delay: down 15m multiplier 1.5 max 1h
  53. summary: HDFS dead datanodes
  54. info: number of datanodes which are currently dead
  55. to: sysadmin
  56. # DataNode
  # Alert template: warns when $fsds_num_failed_volumes on hdfs.num_failed_volumes
  # is above zero. Only a warning condition is defined; there is no crit line.
  57. template: hdfs_num_failed_volumes
  58. on: hdfs.num_failed_volumes
  59. class: Errors
  60. type: Storage
  61. component: HDFS
  62. calc: $fsds_num_failed_volumes
  63. units: failed volumes
  64. every: 10s
  # Any non-zero count raises the warning.
  65. warn: $this > 0
  # Notification delay applied on status decreases: 15m base, x1.5 multiplier,
  # capped at 1h (see Netdata health reference for exact semantics).
  66. delay: down 15m multiplier 1.5 max 1h
  67. summary: HDFS failed volumes
  68. info: number of failed volumes
  69. to: sysadmin