kubelet.conf 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. # you can disable an alarm notification by setting the 'to' line to: silent
  2. # -----------------------------------------------------------------------------
  3. # True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
  4. template: kubelet_node_config_error
  5. on: k8s_kubelet.kubelet_node_config_error
  6. class: Errors
  7. type: Kubernetes
  8. component: Kubelet
  9. calc: $experiencing_error
  10. units: bool
  11. every: 10s
  12. warn: $this == 1
  13. delay: down 1m multiplier 1.5 max 2h
  14. summary: Kubelet node config error
  15. info: The node is experiencing a configuration-related error (0: false, 1: true)
  16. to: sysadmin
  17. # Failed Token() requests to the alternate token source
  18. template: kubelet_token_requests
  19. on: k8s_kubelet.kubelet_token_requests
  20. class: Errors
  21. type: Kubernetes
  22. component: Kubelet
  23. lookup: sum -10s of failed
  24. units: requests
  25. every: 10s
  26. warn: $this > 0
  27. delay: down 1m multiplier 1.5 max 2h
  28. summary: Kubelet failed token requests
  29. info: Number of failed Token() requests to the alternate token source
  30. to: sysadmin
  31. # Docker and runtime operation errors
  32. template: kubelet_operations_error
  33. on: k8s_kubelet.kubelet_operations_errors
  34. class: Errors
  35. type: Kubernetes
  36. component: Kubelet
  37. lookup: sum -1m
  38. units: errors
  39. every: 10s
  40. warn: $this > (($status >= $WARNING) ? (0) : (20))
  41. delay: up 30s down 1m multiplier 1.5 max 2h
  42. summary: Kubelet runtime errors
  43. info: Number of Docker or runtime operation errors
  44. to: sysadmin
  45. # -----------------------------------------------------------------------------
  46. # Pod Lifecycle Event Generator Relisting Latency
  47. # 1. calculate the average PLEG relisting latency over the last 1m (quantile 0.5, quantile 0.9, quantile 0.99)
  48. # 2. do the same for the last 10s
  49. # 3. raise an alarm if the latter is:
  50. # - 2x the first for quantile 0.5
  51. # - 4x the first for quantile 0.9
  52. # - 8x the first for quantile 0.99
  53. #
  54. # the 1m baseline is clamped to a minimum of 1000 microseconds before computing the ratio
  55. # quantile 0.5
  56. template: kubelet_1m_pleg_relist_latency_quantile_05
  57. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  58. class: Latency
  59. type: Kubernetes
  60. component: Kubelet
  61. lookup: average -1m unaligned of 0.5
  62. units: microseconds
  63. every: 10s
  64. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
  65. template: kubelet_10s_pleg_relist_latency_quantile_05
  66. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  67. class: Latency
  68. type: Kubernetes
  69. component: Kubelet
  70. lookup: average -10s unaligned of 0.5
  71. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
  72. every: 10s
  73. units: %
  74. warn: $this > (($status >= $WARNING)?(100):(200))
  75. crit: $this > (($status >= $WARNING)?(200):(400))
  76. delay: down 1m multiplier 1.5 max 2h
  77. summary: Kubelet relisting latency (quantile 0.5)
  78. info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  79. compared to the last minute (quantile 0.5)
  80. to: sysadmin
  81. # quantile 0.9
  82. template: kubelet_1m_pleg_relist_latency_quantile_09
  83. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  84. class: Latency
  85. type: Kubernetes
  86. component: Kubelet
  87. lookup: average -1m unaligned of 0.9
  88. units: microseconds
  89. every: 10s
  90. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
  91. template: kubelet_10s_pleg_relist_latency_quantile_09
  92. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  93. class: Latency
  94. type: Kubernetes
  95. component: Kubelet
  96. lookup: average -10s unaligned of 0.9
  97. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
  98. every: 10s
  99. units: %
  100. warn: $this > (($status >= $WARNING)?(200):(400))
  101. crit: $this > (($status >= $WARNING)?(400):(800))
  102. delay: down 1m multiplier 1.5 max 2h
  103. summary: Kubelet relisting latency (quantile 0.9)
  104. info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  105. compared to the last minute (quantile 0.9)
  106. to: sysadmin
  107. # quantile 0.99
  108. template: kubelet_1m_pleg_relist_latency_quantile_099
  109. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  110. class: Latency
  111. type: Kubernetes
  112. component: Kubelet
  113. lookup: average -1m unaligned of 0.99
  114. units: microseconds
  115. every: 10s
  116. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
  117. template: kubelet_10s_pleg_relist_latency_quantile_099
  118. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  119. class: Latency
  120. type: Kubernetes
  121. component: Kubelet
  122. lookup: average -10s unaligned of 0.99
  123. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
  124. every: 10s
  125. units: %
  126. warn: $this > (($status >= $WARNING)?(400):(800))
  127. crit: $this > (($status >= $WARNING)?(800):(1200))
  128. delay: down 1m multiplier 1.5 max 2h
  129. summary: Kubelet relisting latency (quantile 0.99)
  130. info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  131. compared to the last minute (quantile 0.99)
  132. to: sysadmin