consul.conf 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. # you can disable an alarm notification by setting the 'to' line to: silent
  # Consul Enterprise license expiration alarm.
  # Evaluated every 60m against $license_expiration (units: seconds):
  # warns when less than 14 days (14*24*60*60 s) remain, critical below 7 days.
  2. template: consul_license_expiration_time
  3. on: consul.license_expiration_time
  4. class: Errors
  5. type: ServiceMesh
  6. component: Consul
  7. calc: $license_expiration
  8. every: 60m
  9. units: seconds
  10. warn: $this < 14*24*60*60
  11. crit: $this < 7*24*60*60
  12. summary: Consul license expiration on ${label:node_name}
  13. info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
  14. to: sysadmin
  # Datacenter-level autopilot health alarm.
  # Every 10s reads $unhealthy from consul.autopilot_health_status and warns when it equals 1.
  # No crit level is defined for this alarm.
  15. template: consul_autopilot_health_status
  16. on: consul.autopilot_health_status
  17. class: Errors
  18. type: ServiceMesh
  19. component: Consul
  20. calc: $unhealthy
  21. every: 10s
  22. units: status
  23. warn: $this == 1
  24. delay: down 5m multiplier 1.5 max 1h
  25. summary: Consul datacenter ${label:datacenter} health
  26. info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
  27. to: sysadmin
  # Per-server autopilot health alarm.
  # Every 10s reads $unhealthy from consul.autopilot_server_health_status and warns when it equals 1.
  # No crit level is defined for this alarm.
  28. template: consul_autopilot_server_health_status
  29. on: consul.autopilot_server_health_status
  30. class: Errors
  31. type: ServiceMesh
  32. component: Consul
  33. calc: $unhealthy
  34. every: 10s
  35. units: status
  36. warn: $this == 1
  37. delay: down 5m multiplier 1.5 max 1h
  38. summary: Consul server ${label:node_name} health
  39. info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
  40. to: sysadmin
  # Raft leader last-contact time alarm (milliseconds).
  # Uses the average of the quantile_0.5 dimension over the -1m lookup window, evaluated every 10s.
  # warn: raised above 200; once status is WARNING or higher the threshold is 150.
  # crit: raised above 500; while status is CRITICAL the threshold is 200.
  41. template: consul_raft_leader_last_contact_time
  42. on: consul.raft_leader_last_contact_time
  43. class: Errors
  44. type: ServiceMesh
  45. component: Consul
  46. lookup: average -1m unaligned of quantile_0.5
  47. every: 10s
  48. units: milliseconds
  49. warn: $this > (($status >= $WARNING) ? (150) : (200))
  50. crit: $this > (($status == $CRITICAL) ? (200) : (500))
  51. delay: down 5m multiplier 1.5 max 1h
  52. summary: Consul leader server ${label:node_name} last contact time
  53. info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
  54. to: sysadmin
  # Raft leadership transition alarm.
  # Sums consul.raft_leadership_transitions_rate over the -1m lookup window every 10s
  # and warns when the sum is greater than 0. No crit level is defined.
  55. template: consul_raft_leadership_transitions
  56. on: consul.raft_leadership_transitions_rate
  57. class: Errors
  58. type: ServiceMesh
  59. component: Consul
  60. lookup: sum -1m unaligned
  61. every: 10s
  62. units: transitions
  63. warn: $this > 0
  64. delay: down 5m multiplier 1.5 max 1h
  65. summary: Consul server ${label:node_name} leadership transitions
  66. info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
  67. to: sysadmin
  # Main Raft goroutine saturation alarm (percentage).
  # Uses the average of the quantile_0.9 dimension over the -1m lookup window, evaluated every 10s.
  # warn: raised above 50; once status is WARNING or higher the threshold is 40.
  68. template: consul_raft_thread_main_saturation
  69. on: consul.raft_thread_main_saturation_perc
  70. class: Utilization
  71. type: ServiceMesh
  72. component: Consul
  73. lookup: average -1m unaligned of quantile_0.9
  74. every: 10s
  75. units: percentage
  76. warn: $this > (($status >= $WARNING) ? (40) : (50))
  77. delay: down 5m multiplier 1.5 max 1h
  78. summary: Consul server ${label:node_name} main Raft saturation
  79. info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
  80. to: sysadmin
  # FSM Raft goroutine saturation alarm (percentage).
  # Uses the average of the quantile_0.9 dimension over the -1m lookup window, evaluated every 10s.
  # warn: raised above 50; once status is WARNING or higher the threshold is 40.
  # units is "percentage" to match the *_saturation_perc chart and the sibling
  # main-thread saturation alarm (it was previously mislabelled as milliseconds).
  81. template: consul_raft_thread_fsm_saturation
  82. on: consul.raft_thread_fsm_saturation_perc
  83. class: Utilization
  84. type: ServiceMesh
  85. component: Consul
  86. lookup: average -1m unaligned of quantile_0.9
  87. every: 10s
  88. units: percentage
  89. warn: $this > (($status >= $WARNING) ? (40) : (50))
  90. delay: down 5m multiplier 1.5 max 1h
  91. summary: Consul server ${label:node_name} FSM Raft saturation
  92. info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
  93. to: sysadmin
  # Rate-limited client RPC requests alarm.
  # Sums consul.client_rpc_requests_exceeded_rate over the -1m lookup window every 10s.
  # warn: raised above 5; once status is WARNING or higher it stays raised while the sum is above 0.
  94. template: consul_client_rpc_requests_exceeded
  95. on: consul.client_rpc_requests_exceeded_rate
  96. class: Errors
  97. type: ServiceMesh
  98. component: Consul
  99. lookup: sum -1m unaligned
  100. every: 10s
  101. units: requests
  102. warn: $this > (($status >= $WARNING) ? (0) : (5))
  103. delay: down 5m multiplier 1.5 max 1h
  104. summary: Consul server ${label:node_name} RPC requests rate
  105. info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
  106. to: sysadmin
  # Failed client RPC requests alarm.
  # Sums consul.client_rpc_requests_failed_rate over the -1m lookup window every 10s.
  # warn: raised above 5; once status is WARNING or higher it stays raised while the sum is above 0.
  107. template: consul_client_rpc_requests_failed
  108. on: consul.client_rpc_requests_failed_rate
  109. class: Errors
  110. type: ServiceMesh
  111. component: Consul
  112. lookup: sum -1m unaligned
  113. every: 10s
  114. units: requests
  115. warn: $this > (($status >= $WARNING) ? (0) : (5))
  116. delay: down 5m multiplier 1.5 max 1h
  117. summary: Consul server ${label:node_name} failed RPC requests
  118. info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
  119. to: sysadmin
  # Node health check alarm.
  # Every 10s computes $warning + $critical from consul.node_health_check_status
  # and warns when the result is not nan and not 0. No crit level is defined.
  120. template: consul_node_health_check_status
  121. on: consul.node_health_check_status
  122. class: Errors
  123. type: ServiceMesh
  124. component: Consul
  125. calc: $warning + $critical
  126. every: 10s
  127. units: status
  128. warn: $this != nan AND $this != 0
  129. delay: down 5m multiplier 1.5 max 1h
  130. summary: Consul node health check ${label:check_name} on ${label:node_name}
  131. info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
  132. to: sysadmin
  # Service health check alarm.
  # Every 10s computes $warning + $critical from consul.service_health_check_status
  # and warns when the result equals 1. No crit level is defined.
  # NOTE(review): the node-level alarm uses "!= nan AND != 0" for the same calc; this one
  # only fires on exactly 1 — presumably the two dimensions are mutually exclusive 0/1 flags; confirm.
  133. template: consul_service_health_check_status
  134. on: consul.service_health_check_status
  135. class: Errors
  136. type: ServiceMesh
  137. component: Consul
  138. calc: $warning + $critical
  139. every: 10s
  140. units: status
  141. warn: $this == 1
  142. delay: down 5m multiplier 1.5 max 1h
  143. summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
  144. info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
  145. to: sysadmin
  # Garbage collection pause time alarm (seconds).
  # Sums consul.gc_pause_time over the -1m lookup window, evaluated every 10s.
  # warn: raised above 2; once status is WARNING or higher the threshold is 1.
  # crit: raised above 5; while status is CRITICAL the threshold is 2.
  # The crit hysteresis must test $status == $CRITICAL: testing >= $WARNING would drop the
  # crit threshold to 2 as soon as a warning is raised (value > 2), so every warning would
  # escalate to critical on the next evaluation and the 5s threshold would never apply.
  146. template: consul_gc_pause_time
  147. on: consul.gc_pause_time
  148. class: Errors
  149. type: ServiceMesh
  150. component: Consul
  151. lookup: sum -1m unaligned
  152. every: 10s
  153. units: seconds
  154. warn: $this > (($status >= $WARNING) ? (1) : (2))
  155. crit: $this > (($status == $CRITICAL) ? (2) : (5))
  156. delay: down 5m multiplier 1.5 max 1h
  157. summary: Consul server ${label:node_name} garbage collection pauses
  158. info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
  159. to: sysadmin