# consul.conf
#
# you can disable an alarm notification by setting the 'to' line to: silent
  # Consul Enterprise license expiry, evaluated hourly.
  # $this is the license_expiration dimension in seconds:
  # warning below 14 days, critical below 7 days.
   template: consul_license_expiration_time
         on: consul.license_expiration_time
      class: Errors
       type: ServiceMesh
  component: Consul
       calc: $license_expiration
      every: 60m
      units: seconds
       warn: $this < 14*24*60*60
       crit: $this < 7*24*60*60
       info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Cluster-level autopilot health: warns while the 'unhealthy' dimension is 1.
  # Recovery notifications are held back 5m, growing x1.5 per flap up to 1h.
   template: consul_autopilot_health_status
         on: consul.autopilot_health_status
      class: Errors
       type: ServiceMesh
  component: Consul
       calc: $unhealthy
      every: 10s
      units: status
       warn: $this == 1
      delay: down 5m multiplier 1.5 max 1h
       info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
         to: sysadmin
  # Per-server autopilot health: warns while the 'unhealthy' dimension is 1.
  # Recovery notifications are held back 5m, growing x1.5 per flap up to 1h.
   template: consul_autopilot_server_health_status
         on: consul.autopilot_server_health_status
      class: Errors
       type: ServiceMesh
  component: Consul
       calc: $unhealthy
      every: 10s
      units: status
       warn: $this == 1
      delay: down 5m multiplier 1.5 max 1h
       info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
         to: sysadmin
  # Raft leader last-contact time: 1-minute average of the quantile_0.5 dimension.
  # Hysteresis: warning raises above 200 and clears at 150 or below;
  # critical raises above 500 and clears at 200 or below.
   template: consul_raft_leader_last_contact_time
         on: consul.raft_leader_last_contact_time
      class: Errors
       type: ServiceMesh
  component: Consul
     lookup: average -1m unaligned of quantile_0.5
      every: 10s
      units: milliseconds
       warn: $this > (($status >= $WARNING) ? (150) : (200))
       crit: $this > (($status == $CRITICAL) ? (200) : (500))
      delay: down 5m multiplier 1.5 max 1h
       info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
         to: sysadmin
  # Raft leadership changes: warns when the 1-minute sum of the
  # transitions-rate chart is greater than zero.
   template: consul_raft_leadership_transitions
         on: consul.raft_leadership_transitions_rate
      class: Errors
       type: ServiceMesh
  component: Consul
     lookup: sum -1m unaligned
      every: 10s
      units: transitions
       warn: $this > 0
      delay: down 5m multiplier 1.5 max 1h
       info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
         to: sysadmin
  # Main Raft goroutine saturation: 1-minute average of the quantile_0.9 dimension.
  # Hysteresis: warning raises above 50 and clears at 40 or below.
   template: consul_raft_thread_main_saturation
         on: consul.raft_thread_main_saturation_perc
      class: Utilization
       type: ServiceMesh
  component: Consul
     lookup: average -1m unaligned of quantile_0.9
      every: 10s
      units: percentage
       warn: $this > (($status >= $WARNING) ? (40) : (50))
      delay: down 5m multiplier 1.5 max 1h
       info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # FSM Raft goroutine saturation: 1-minute average of the quantile_0.9 dimension.
  # Hysteresis: warning raises above 50 and clears at 40 or below.
  # The chart (..._saturation_perc) reports a percentage, so units is
  # 'percentage' rather than a time unit.
   template: consul_raft_thread_fsm_saturation
         on: consul.raft_thread_fsm_saturation_perc
      class: Utilization
       type: ServiceMesh
  component: Consul
     lookup: average -1m unaligned of quantile_0.9
      every: 10s
      units: percentage
       warn: $this > (($status >= $WARNING) ? (40) : (50))
      delay: down 5m multiplier 1.5 max 1h
       info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Rate-limited client RPC requests, summed over the last minute.
  # Hysteresis: warning raises above 5 and clears only when the sum is back to 0.
   template: consul_client_rpc_requests_exceeded
         on: consul.client_rpc_requests_exceeded_rate
      class: Errors
       type: ServiceMesh
  component: Consul
     lookup: sum -1m unaligned
      every: 10s
      units: requests
       warn: $this > (($status >= $WARNING) ? (0) : (5))
      delay: down 5m multiplier 1.5 max 1h
       info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Failed client RPC requests, summed over the last minute.
  # Hysteresis: warning raises above 5 and clears only when the sum is back to 0.
   template: consul_client_rpc_requests_failed
         on: consul.client_rpc_requests_failed_rate
      class: Errors
       type: ServiceMesh
  component: Consul
     lookup: sum -1m unaligned
      every: 10s
      units: requests
       warn: $this > (($status >= $WARNING) ? (0) : (5))
      delay: down 5m multiplier 1.5 max 1h
       info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Node health checks: adds the 'warning' and 'critical' dimensions and
  # warns when the result is a number other than 0 (nan is ignored).
   template: consul_node_health_check_status
         on: consul.node_health_check_status
      class: Errors
       type: ServiceMesh
  component: Consul
       calc: $warning + $critical
      every: 10s
      units: status
       warn: $this != nan AND $this != 0
      delay: down 5m multiplier 1.5 max 1h
       info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Service health checks: adds the 'warning' and 'critical' dimensions and
  # warns when the result equals 1.
   template: consul_service_health_check_status
         on: consul.service_health_check_status
      class: Errors
       type: ServiceMesh
  component: Consul
       calc: $warning + $critical
      every: 10s
      units: status
       warn: $this == 1
      delay: down 5m multiplier 1.5 max 1h
       info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin
  # Stop-the-world GC pause time, summed over the last minute.
  # Hysteresis: warning raises above 2 and clears at 1 or below;
  # critical raises above 5 and clears at 2 or below.
  # The crit line keys its lower threshold on $status == $CRITICAL; keying it
  # on $status >= $WARNING would drop the critical threshold to 2 as soon as
  # the warning is active, escalating every warning to critical.
   template: consul_gc_pause_time
         on: consul.gc_pause_time
      class: Errors
       type: ServiceMesh
  component: Consul
     lookup: sum -1m unaligned
      every: 10s
      units: seconds
       warn: $this > (($status >= $WARNING) ? (1) : (2))
       crit: $this > (($status == $CRITICAL) ? (2) : (5))
      delay: down 5m multiplier 1.5 max 1h
       info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
         to: sysadmin