kubelet.conf 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. # you can disable an alarm notification by setting the 'to' line to: silent
  2. # -----------------------------------------------------------------------------
  3. # True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
  # Evaluated every 10s: the alert value is taken from $experiencing_error (presumably a
  # dimension of the k8s_kubelet.kubelet_node_config_error chart -- confirm against the collector)
  # and WARNING is raised while that value equals 1.
  # No `crit` line is defined, so this alert never goes beyond WARNING.
  # `delay: down 1m multiplier 1.5 max 2h` postpones notifications for status decreases by 1m,
  # growing the delay by 1.5x when the status changes again while delayed, capped at 2h.
  4. template: kubelet_node_config_error
  5. on: k8s_kubelet.kubelet_node_config_error
  6. class: Errors
  7. type: Kubernetes
  8. component: Kubelet
  9. calc: $experiencing_error
  10. units: bool
  11. every: 10s
  12. warn: $this == 1
  13. delay: down 1m multiplier 1.5 max 2h
  14. info: the node is experiencing a configuration-related error (0: false, 1: true)
  15. to: sysadmin
  16. # Failed Token() requests to the alternate token source
  # Every 10s, sums the `failed` dimension over the last 10 seconds and raises WARNING
  # as soon as the sum is above zero (i.e. any failed request in the window).
  # No `crit` line is defined, so this alert never goes beyond WARNING.
  # Notifications for status decreases are delayed by 1m (x1.5 on repeated changes, max 2h).
  17. template: kubelet_token_requests
  18. on: k8s_kubelet.kubelet_token_requests
  19. class: Errors
  20. type: Kubernetes
  21. component: Kubelet
  22. lookup: sum -10s of failed
  23. units: requests
  24. every: 10s
  25. warn: $this > 0
  26. delay: down 1m multiplier 1.5 max 2h
  27. info: number of failed Token() requests to the alternate token source
  28. to: sysadmin
  29. # Docker and runtime operation errors
  # Every 10s, sums the chart over the last minute (no `of` clause, so no dimension is singled out).
  # The warn expression has hysteresis: WARNING is raised when the sum exceeds 20, and once the
  # alert is already in WARNING (or higher) it stays raised until the sum falls back to 0.
  # No `crit` line is defined, so this alert never goes beyond WARNING.
  # Unlike the other kubelet alerts in this file, notifications for status increases are also
  # delayed (`up 30s`), in addition to the 1m delay for decreases.
  30. template: kubelet_operations_error
  31. on: k8s_kubelet.kubelet_operations_errors
  32. class: Errors
  33. type: Kubernetes
  34. component: Kubelet
  35. lookup: sum -1m
  36. units: errors
  37. every: 10s
  38. warn: $this > (($status >= $WARNING) ? (0) : (20))
  39. delay: up 30s down 1m multiplier 1.5 max 2h
  40. info: number of Docker or runtime operation errors
  41. to: sysadmin
  42. # -----------------------------------------------------------------------------
  43. # Pod Lifecycle Event Generator Relisting Latency
  44. # 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
  45. # 2. do the same for the last 10s
  46. # 3. raise an alarm if the latter is:
  47. # - 2x the first for quantile 0.5
  48. # - 4x the first for quantile 0.9
  49. # - 8x the first for quantile 0.99
  50. #
  51. # we assume the minimum latency is 1000 microseconds
  52. # quantile 0.5
  # Baseline template: every 10s, computes the 1-minute average of the `0.5` dimension.
  # It has no warn/crit/to lines, so it never raises an alert by itself; its value is consumed
  # as $kubelet_1m_pleg_relist_latency_quantile_05 by kubelet_10s_pleg_relist_latency_quantile_05.
  53. template: kubelet_1m_pleg_relist_latency_quantile_05
  54. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  55. class: Latency
  56. type: Kubernetes
  57. component: Kubelet
  58. lookup: average -1m unaligned of 0.5
  59. units: microseconds
  60. every: 10s
  61. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
  # Ratio alert (quantile 0.5): every 10s, the 10-second average of the `0.5` dimension is
  # expressed as a percentage of the 1-minute baseline
  # ($kubelet_1m_pleg_relist_latency_quantile_05), with the baseline floored at 1000 microseconds
  # so that very small baselines do not inflate the ratio.
  62. template: kubelet_10s_pleg_relist_latency_quantile_05
  63. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  64. class: Latency
  65. type: Kubernetes
  66. component: Kubelet
  67. lookup: average -10s unaligned of 0.5
  68. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
  69. every: 10s
  70. units: %
  # WARNING is raised above 200% (2x the baseline) and held until the ratio drops to 100%.
  71. warn: $this > (($status >= $WARNING)?(100):(200))
  # CRITICAL is raised above 400% and held until the ratio drops to 200%.
  # The hysteresis is keyed on CRITICAL, not WARNING: if it were keyed on WARNING, the lower
  # threshold (200) would apply as soon as the alert entered WARNING, and a ratio just above the
  # warn threshold would escalate to CRITICAL on the very next evaluation.
  72. crit: $this > (($status == $CRITICAL)?(200):(400))
  73. delay: down 1m multiplier 1.5 max 2h
  74. info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  75. compared to the last minute (quantile 0.5)
  76. to: sysadmin
  77. # quantile 0.9
  # Baseline template: every 10s, computes the 1-minute average of the `0.9` dimension.
  # It has no warn/crit/to lines, so it never raises an alert by itself; its value is consumed
  # as $kubelet_1m_pleg_relist_latency_quantile_09 by kubelet_10s_pleg_relist_latency_quantile_09.
  78. template: kubelet_1m_pleg_relist_latency_quantile_09
  79. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  80. class: Latency
  81. type: Kubernetes
  82. component: Kubelet
  83. lookup: average -1m unaligned of 0.9
  84. units: microseconds
  85. every: 10s
  86. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
  # Ratio alert (quantile 0.9): every 10s, the 10-second average of the `0.9` dimension is
  # expressed as a percentage of the 1-minute baseline
  # ($kubelet_1m_pleg_relist_latency_quantile_09), with the baseline floored at 1000 microseconds
  # so that very small baselines do not inflate the ratio.
  87. template: kubelet_10s_pleg_relist_latency_quantile_09
  88. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  89. class: Latency
  90. type: Kubernetes
  91. component: Kubelet
  92. lookup: average -10s unaligned of 0.9
  93. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
  94. every: 10s
  95. units: %
  # WARNING is raised above 400% (4x the baseline) and held until the ratio drops to 200%.
  96. warn: $this > (($status >= $WARNING)?(200):(400))
  # CRITICAL is raised above 800% and held until the ratio drops to 400%.
  # The hysteresis is keyed on CRITICAL, not WARNING: if it were keyed on WARNING, the lower
  # threshold (400) would apply as soon as the alert entered WARNING, and a ratio just above the
  # warn threshold would escalate to CRITICAL on the very next evaluation.
  97. crit: $this > (($status == $CRITICAL)?(400):(800))
  98. delay: down 1m multiplier 1.5 max 2h
  99. info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  100. compared to the last minute (quantile 0.9)
  101. to: sysadmin
  102. # quantile 0.99
  # Baseline template: every 10s, computes the 1-minute average of the `0.99` dimension.
  # It has no warn/crit/to lines, so it never raises an alert by itself; its value is consumed
  # as $kubelet_1m_pleg_relist_latency_quantile_099 by kubelet_10s_pleg_relist_latency_quantile_099.
  103. template: kubelet_1m_pleg_relist_latency_quantile_099
  104. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  105. class: Latency
  106. type: Kubernetes
  107. component: Kubelet
  108. lookup: average -1m unaligned of 0.99
  109. units: microseconds
  110. every: 10s
  111. info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
  # Ratio alert (quantile 0.99): every 10s, the 10-second average of the `0.99` dimension is
  # expressed as a percentage of the 1-minute baseline
  # ($kubelet_1m_pleg_relist_latency_quantile_099), with the baseline floored at 1000 microseconds
  # so that very small baselines do not inflate the ratio.
  112. template: kubelet_10s_pleg_relist_latency_quantile_099
  113. on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
  114. class: Latency
  115. type: Kubernetes
  116. component: Kubelet
  117. lookup: average -10s unaligned of 0.99
  118. calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
  119. every: 10s
  120. units: %
  # WARNING is raised above 800% (8x the baseline) and held until the ratio drops to 400%.
  121. warn: $this > (($status >= $WARNING)?(400):(800))
  # CRITICAL is raised above 1200% and held until the ratio drops to 800%.
  # The hysteresis is keyed on CRITICAL, not WARNING: if it were keyed on WARNING, the lower
  # threshold (800) would apply as soon as the alert entered WARNING, and a ratio just above the
  # warn threshold would escalate to CRITICAL on the very next evaluation.
  # NOTE(review): the quantile 0.5 and 0.9 alerts raise CRITICAL at 2x their warn threshold,
  # which would be 1600 here; 1200 is kept unchanged -- confirm it is intentional.
  122. crit: $this > (($status == $CRITICAL)?(800):(1200))
  123. delay: down 1m multiplier 1.5 max 2h
  124. info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
  125. compared to the last minute (quantile 0.99)
  126. to: sysadmin