Browse Source

improvement(go.d/k8sstate): collect pod status reason (#18887)

Ilya Mashchenko 4 months ago
parent
commit
6e17cb0fd4

+ 38 - 27
src/go/plugin/go.d/modules/k8s_state/charts.go

@@ -43,6 +43,7 @@ const (
 	prioPodMemLimitsUsed
 	prioPodCondition
 	prioPodPhase
+	prioPodStatusReason
 	prioPodAge
 	prioPodContainersCount
 	prioPodContainersState
@@ -106,6 +107,7 @@ var podChartsTmpl = module.Charts{
 	podMemLimitsUsedChartTmpl.Copy(),
 	podConditionChartTmpl.Copy(),
 	podPhaseChartTmpl.Copy(),
+	podStatusReasonChartTmpl.Copy(),
 	podAgeChartTmpl.Copy(),
 	podContainersCountChartTmpl.Copy(),
 	podContainersStateChartTmpl.Copy(),
@@ -247,15 +249,24 @@ var (
 		},
 	}
 	// condition
-	nodeConditionsChartTmpl = module.Chart{
-		IDSep:    true,
-		ID:       "node_%s.condition_status",
-		Title:    "Condition status",
-		Units:    "status",
-		Fam:      "node condition",
-		Ctx:      "k8s_state.node_condition",
-		Priority: prioNodeConditions,
-	}
+	nodeConditionsChartTmpl = func() module.Chart {
+		chart := module.Chart{
+			IDSep:    true,
+			ID:       "node_%s.condition_status",
+			Title:    "Condition status",
+			Units:    "status",
+			Fam:      "node condition",
+			Ctx:      "k8s_state.node_condition",
+			Priority: prioNodeConditions,
+		}
+		for _, v := range nodeConditionStatuses {
+			chart.Dims = append(chart.Dims, &module.Dim{
+				ID:   "node_%s_cond_" + v,
+				Name: v,
+			})
+		}
+		return chart
+	}()
 	nodeSchedulabilityChartTmpl = module.Chart{
 		IDSep:    true,
 		ID:       "node_%s.schedulability",
@@ -426,24 +437,6 @@ func (ks *KubeState) removeNodeCharts(ns *nodeState) {
 	}
 }
 
-func (ks *KubeState) addNodeConditionToCharts(ns *nodeState, cond string) {
-	id := fmt.Sprintf(nodeConditionsChartTmpl.ID, replaceDots(ns.id()))
-	c := ks.Charts().Get(id)
-	if c == nil {
-		ks.Warningf("chart '%s' does not exist", id)
-		return
-	}
-	dim := &module.Dim{
-		ID:   fmt.Sprintf("node_%s_cond_%s", ns.id(), strings.ToLower(cond)),
-		Name: cond,
-	}
-	if err := c.AddDim(dim); err != nil {
-		ks.Warning(err)
-		return
-	}
-	c.MarkNotCreated()
-}
-
 var (
 	podCPURequestsUsedChartTmpl = module.Chart{
 		IDSep:    true,
@@ -523,6 +516,24 @@ var (
 			{ID: "pod_%s_phase_pending", Name: "pending"},
 		},
 	}
+	podStatusReasonChartTmpl = func() module.Chart {
+		chart := module.Chart{
+			IDSep:    true,
+			ID:       "pod_%s.status_reason",
+			Title:    "Status reason",
+			Units:    "status",
+			Fam:      "pod status",
+			Ctx:      "k8s_state.pod_status_reason",
+			Priority: prioPodStatusReason,
+		}
+		for _, v := range podStatusReasons {
+			chart.Dims = append(chart.Dims, &module.Dim{
+				ID:   "pod_%s_status_reason_" + v,
+				Name: v,
+			})
+		}
+		return chart
+	}()
 	podAgeChartTmpl = module.Chart{
 		IDSep:    true,
 		ID:       "pod_%s.age",

+ 48 - 14
src/go/plugin/go.d/modules/k8s_state/collect.go

@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"slices"
-	"strings"
 	"time"
 
 	"github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
@@ -17,28 +16,47 @@ import (
 const precision = 1000
 
 var (
+	podStatusReasons = []string{
+		"Evicted",
+		"NodeAffinity",
+		"NodeLost",
+		"Shutdown",
+		"UnexpectedAdmissionError",
+		"Other",
+	}
+
 	containerWaitingStateReasons = []string{
-		"PodInitializing",
 		"ContainerCreating",
 		"CrashLoopBackOff",
 		"CreateContainerConfigError",
+		"CreateContainerError",
 		"ErrImagePull",
 		"ImagePullBackOff",
-		"CreateContainerError",
 		"InvalidImageName",
+		"PodInitializing",
 		"Other",
 	}
 	containerTerminatedStateReasons = []string{
-		"OOMKilled",
 		"Completed",
-		"Error",
 		"ContainerCannotRun",
 		"DeadlineExceeded",
+		"Error",
 		"Evicted",
+		"OOMKilled",
 		"Other",
 	}
 )
 
+var (
+	nodeConditionStatuses = []string{
+		"Ready",
+		"DiskPressure",
+		"MemoryPressure",
+		"NetworkUnavailable",
+		"PIDPressure",
+	}
+)
+
 func (ks *KubeState) collect() (map[string]int64, error) {
 	if ks.discoverer == nil {
 		return nil, errors.New("nil discoverer")
@@ -56,6 +74,7 @@ func (ks *KubeState) collect() (map[string]int64, error) {
 
 		ks.kubeClusterID = ks.getKubeClusterID()
 		ks.kubeClusterName = ks.getKubeClusterName()
+
 		if chart := ks.Charts().Get(discoveryStatusChart.ID); chart != nil {
 			chart.Labels = []module.Label{
 				{Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
@@ -92,7 +111,7 @@ func (ks *KubeState) collectKubeState(mx map[string]int64) {
 func (ks *KubeState) collectPodsState(mx map[string]int64) {
 	now := time.Now()
 	for _, ps := range ks.state.pods {
-		// Skip cronjobs (each of them is a unique container because name contains hash)
 		// Skip cronjobs (each of them is a unique container because the name contains a hash)
 		// to avoid overwhelming Netdata with high cardinality metrics.
 		// Related issue https://github.com/netdata/netdata/issues/16412
 		if ps.controllerKind == "Job" {
@@ -104,6 +123,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 			ks.removePodCharts(ps)
 			continue
 		}
+
 		if ps.new {
 			ps.new = false
 			ks.addPodCharts(ps)
@@ -130,12 +150,14 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 			ns.stats.podsPhaseRunning += boolToInt(ps.phase == corev1.PodRunning)
 			ns.stats.podsPhaseSucceeded += boolToInt(ps.phase == corev1.PodSucceeded)
 			ns.stats.podsPhaseFailed += boolToInt(ps.phase == corev1.PodFailed)
+
 			for _, cs := range ps.initContainers {
 				ns.stats.initContainers++
 				ns.stats.initContStateRunning += boolToInt(cs.stateRunning)
 				ns.stats.initContStateWaiting += boolToInt(cs.stateWaiting)
 				ns.stats.initContStateTerminated += boolToInt(cs.stateTerminated)
 			}
+
 			for _, cs := range ps.containers {
 				ns.stats.containers++
 				ns.stats.contStateRunning += boolToInt(cs.stateRunning)
@@ -155,6 +177,17 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 		mx[px+"phase_succeeded"] = boolToInt(ps.phase == corev1.PodSucceeded)
 		mx[px+"phase_pending"] = boolToInt(ps.phase == corev1.PodPending)
 		mx[px+"age"] = int64(now.Sub(ps.creationTime).Seconds())
+
+		for _, v := range podStatusReasons {
+			mx[px+"status_reason_"+v] = 0
+		}
+		if v := ps.statusReason; v != "" {
+			if !slices.Contains(podStatusReasons, v) {
+				v = "Other"
+			}
+			mx[px+"status_reason_"+v] = 1
+		}
+
 		mx[px+"cpu_requests_used"] = ps.reqCPU
 		mx[px+"cpu_limits_used"] = ps.limitCPU
 		mx[px+"mem_requests_used"] = ps.reqMem
@@ -166,6 +199,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 		mx[px+"init_containers_state_running"] = 0
 		mx[px+"init_containers_state_waiting"] = 0
 		mx[px+"init_containers_state_terminated"] = 0
+
 		for _, cs := range ps.initContainers {
 			mx[px+"init_containers_state_running"] += boolToInt(cs.stateRunning)
 			mx[px+"init_containers_state_waiting"] += boolToInt(cs.stateWaiting)
@@ -174,6 +208,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 		mx[px+"containers_state_running"] = 0
 		mx[px+"containers_state_waiting"] = 0
 		mx[px+"containers_state_terminated"] = 0
+
 		for _, cs := range ps.containers {
 			if cs.new {
 				cs.new = false
@@ -194,7 +229,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 				mx[ppx+"state_waiting_reason_"+v] = 0
 			}
 			if v := cs.waitingReason; v != "" {
-				if !slices.Contains(containerWaitingStateReasons, cs.waitingReason) {
+				if !slices.Contains(containerWaitingStateReasons, v) {
 					v = "Other"
 				}
 				mx[ppx+"state_waiting_reason_"+v] = 1
@@ -204,7 +239,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
 				mx[ppx+"state_terminated_reason_"+v] = 0
 			}
 			if v := cs.terminatedReason; v != "" {
-				if !slices.Contains(containerTerminatedStateReasons, cs.terminatedReason) {
+				if !slices.Contains(containerTerminatedStateReasons, v) {
 					v = "Other"
 				}
 				mx[ppx+"state_terminated_reason_"+v] = 1
@@ -228,12 +263,11 @@ func (ks *KubeState) collectNodesState(mx map[string]int64) {
 
 		px := fmt.Sprintf("node_%s_", ns.id())
 
-		for typ, cond := range ns.conditions {
-			if cond.new {
-				cond.new = false
-				ks.addNodeConditionToCharts(ns, typ)
-			}
-			mx[px+"cond_"+strings.ToLower(typ)] = condStatusToInt(cond.status)
+		for _, v := range nodeConditionStatuses {
+			mx[px+"cond_"+v] = 0
+		}
+		for _, v := range ns.conditions {
+			mx[px+"cond_"+string(v.Type)] = condStatusToInt(v.Status)
 		}
 
 		mx[px+"age"] = int64(now.Sub(ns.creationTime).Seconds())

+ 46 - 20
src/go/plugin/go.d/modules/k8s_state/kube_state_test.go

@@ -213,11 +213,11 @@ func TestKubeState_Collect(t *testing.T) {
 						"node_node01_alloc_pods_allocated":             0,
 						"node_node01_alloc_pods_available":             110,
 						"node_node01_alloc_pods_util":                  0,
-						"node_node01_cond_diskpressure":                0,
-						"node_node01_cond_memorypressure":              0,
-						"node_node01_cond_networkunavailable":          0,
-						"node_node01_cond_pidpressure":                 0,
-						"node_node01_cond_ready":                       1,
+						"node_node01_cond_DiskPressure":                0,
+						"node_node01_cond_MemoryPressure":              0,
+						"node_node01_cond_NetworkUnavailable":          0,
+						"node_node01_cond_PIDPressure":                 0,
+						"node_node01_cond_Ready":                       1,
 						"node_node01_schedulability_schedulable":       1,
 						"node_node01_schedulability_unschedulable":     0,
 						"node_node01_containers":                       0,
@@ -240,6 +240,7 @@ func TestKubeState_Collect(t *testing.T) {
 						"node_node01_pods_readiness_ready":             0,
 						"node_node01_pods_readiness_unready":           0,
 					}
+
 					copyAge(expected, mx)
 
 					assert.Equal(t, expected, mx)
@@ -331,6 +332,12 @@ func TestKubeState_Collect(t *testing.T) {
 						"pod_default_pod01_phase_pending":                                                        0,
 						"pod_default_pod01_phase_running":                                                        1,
 						"pod_default_pod01_phase_succeeded":                                                      0,
+						"pod_default_pod01_status_reason_Evicted":                                                0,
+						"pod_default_pod01_status_reason_NodeAffinity":                                           0,
+						"pod_default_pod01_status_reason_NodeLost":                                               0,
+						"pod_default_pod01_status_reason_Other":                                                  0,
+						"pod_default_pod01_status_reason_Shutdown":                                               0,
+						"pod_default_pod01_status_reason_UnexpectedAdmissionError":                               0,
 					}
 
 					copyAge(expected, mx)
@@ -375,11 +382,11 @@ func TestKubeState_Collect(t *testing.T) {
 						"node_node01_alloc_pods_allocated":                                                       1,
 						"node_node01_alloc_pods_available":                                                       109,
 						"node_node01_alloc_pods_util":                                                            909,
-						"node_node01_cond_diskpressure":                                                          0,
-						"node_node01_cond_memorypressure":                                                        0,
-						"node_node01_cond_networkunavailable":                                                    0,
-						"node_node01_cond_pidpressure":                                                           0,
-						"node_node01_cond_ready":                                                                 1,
+						"node_node01_cond_DiskPressure":                                                          0,
+						"node_node01_cond_MemoryPressure":                                                        0,
+						"node_node01_cond_NetworkUnavailable":                                                    0,
+						"node_node01_cond_PIDPressure":                                                           0,
+						"node_node01_cond_Ready":                                                                 1,
 						"node_node01_containers":                                                                 2,
 						"node_node01_containers_state_running":                                                   2,
 						"node_node01_containers_state_terminated":                                                0,
@@ -464,6 +471,12 @@ func TestKubeState_Collect(t *testing.T) {
 						"pod_default_pod01_phase_pending":                                                        0,
 						"pod_default_pod01_phase_running":                                                        1,
 						"pod_default_pod01_phase_succeeded":                                                      0,
+						"pod_default_pod01_status_reason_Evicted":                                                0,
+						"pod_default_pod01_status_reason_NodeAffinity":                                           0,
+						"pod_default_pod01_status_reason_NodeLost":                                               0,
+						"pod_default_pod01_status_reason_Other":                                                  0,
+						"pod_default_pod01_status_reason_Shutdown":                                               0,
+						"pod_default_pod01_status_reason_UnexpectedAdmissionError":                               0,
 					}
 
 					copyAge(expected, mx)
@@ -513,11 +526,11 @@ func TestKubeState_Collect(t *testing.T) {
 						"node_node01_alloc_pods_allocated":             0,
 						"node_node01_alloc_pods_available":             110,
 						"node_node01_alloc_pods_util":                  0,
-						"node_node01_cond_diskpressure":                0,
-						"node_node01_cond_memorypressure":              0,
-						"node_node01_cond_networkunavailable":          0,
-						"node_node01_cond_pidpressure":                 0,
-						"node_node01_cond_ready":                       1,
+						"node_node01_cond_DiskPressure":                0,
+						"node_node01_cond_MemoryPressure":              0,
+						"node_node01_cond_NetworkUnavailable":          0,
+						"node_node01_cond_PIDPressure":                 0,
+						"node_node01_cond_Ready":                       1,
 						"node_node01_schedulability_schedulable":       1,
 						"node_node01_schedulability_unschedulable":     0,
 						"node_node01_containers":                       0,
@@ -632,11 +645,11 @@ func TestKubeState_Collect(t *testing.T) {
 						"node_node01_alloc_pods_allocated":                                                       2,
 						"node_node01_alloc_pods_available":                                                       108,
 						"node_node01_alloc_pods_util":                                                            1818,
-						"node_node01_cond_diskpressure":                                                          0,
-						"node_node01_cond_memorypressure":                                                        0,
-						"node_node01_cond_networkunavailable":                                                    0,
-						"node_node01_cond_pidpressure":                                                           0,
-						"node_node01_cond_ready":                                                                 1,
+						"node_node01_cond_DiskPressure":                                                          0,
+						"node_node01_cond_MemoryPressure":                                                        0,
+						"node_node01_cond_NetworkUnavailable":                                                    0,
+						"node_node01_cond_PIDPressure":                                                           0,
+						"node_node01_cond_Ready":                                                                 1,
 						"node_node01_containers":                                                                 4,
 						"node_node01_containers_state_running":                                                   4,
 						"node_node01_containers_state_terminated":                                                0,
@@ -721,6 +734,12 @@ func TestKubeState_Collect(t *testing.T) {
 						"pod_default_pod01_phase_pending":                                                        0,
 						"pod_default_pod01_phase_running":                                                        1,
 						"pod_default_pod01_phase_succeeded":                                                      0,
+						"pod_default_pod01_status_reason_Evicted":                                                0,
+						"pod_default_pod01_status_reason_NodeAffinity":                                           0,
+						"pod_default_pod01_status_reason_NodeLost":                                               0,
+						"pod_default_pod01_status_reason_Other":                                                  0,
+						"pod_default_pod01_status_reason_Shutdown":                                               0,
+						"pod_default_pod01_status_reason_UnexpectedAdmissionError":                               0,
 						"pod_default_pod02_age":                                                                  4,
 						"pod_default_pod02_cond_containersready":                                                 1,
 						"pod_default_pod02_cond_podinitialized":                                                  1,
@@ -784,7 +803,14 @@ func TestKubeState_Collect(t *testing.T) {
 						"pod_default_pod02_phase_pending":                                                        0,
 						"pod_default_pod02_phase_running":                                                        1,
 						"pod_default_pod02_phase_succeeded":                                                      0,
+						"pod_default_pod02_status_reason_Evicted":                                                0,
+						"pod_default_pod02_status_reason_NodeAffinity":                                           0,
+						"pod_default_pod02_status_reason_NodeLost":                                               0,
+						"pod_default_pod02_status_reason_Other":                                                  0,
+						"pod_default_pod02_status_reason_Shutdown":                                               0,
+						"pod_default_pod02_status_reason_UnexpectedAdmissionError":                               0,
 					}
+
 					copyAge(expected, mx)
 
 					assert.Equal(t, expected, mx)

+ 20 - 5
src/go/plugin/go.d/modules/k8s_state/metadata.yaml

@@ -141,7 +141,11 @@ modules:
               unit: status
               chart_type: line
               dimensions:
-                - name: a dimension per condition
+                - name: Ready
+                - name: DiskPressure
+                - name: MemoryPressure
+                - name: NetworkUnavailable
+                - name: PIDPressure
             - name: k8s_state.node_schedulability
               description: Schedulability
               unit: state
@@ -271,6 +275,17 @@ modules:
                 - name: failed
                 - name: succeeded
                 - name: pending
+            - name: k8s_state.pod_status_reason
+              description: Status reason
+              unit: status
+              chart_type: line
+              dimensions:
+                - name: Evicted
+                - name: NodeAffinity
+                - name: NodeLost
+                - name: Shutdown
+                - name: UnexpectedAdmissionError
+                - name: Other
             - name: k8s_state.pod_age
               description: Age
               unit: seconds
@@ -347,24 +362,24 @@ modules:
               unit: state
               chart_type: line
               dimensions:
-                - name: PodInitializing
                 - name: ContainerCreating
                 - name: CrashLoopBackOff
                 - name: CreateContainerConfigError
+                - name: CreateContainerError
                 - name: ErrImagePull
                 - name: ImagePullBackOff
-                - name: CreateContainerError
                 - name: InvalidImageName
+                - name: PodInitializing
                 - name: Other
             - name: k8s_state.pod_container_terminated_state_reason
               description: Container terminated state reason
               unit: state
               chart_type: line
               dimensions:
-                - name: OOMKilled
                 - name: Completed
-                - name: Error
                 - name: ContainerCannotRun
                 - name: DeadlineExceeded
+                - name: Error
                 - name: Evicted
+                - name: OOMKilled
                 - name: Other

+ 5 - 11
src/go/plugin/go.d/modules/k8s_state/state.go

@@ -19,9 +19,8 @@ func newKubeState() *kubeState {
 
 func newNodeState() *nodeState {
 	return &nodeState{
-		new:        true,
-		labels:     make(map[string]string),
-		conditions: make(map[string]*nodeStateCondition),
+		new:    true,
+		labels: make(map[string]string),
 	}
 }
 
@@ -58,16 +57,10 @@ type (
 		allocatableCPU  int64
 		allocatableMem  int64
 		allocatablePods int64
-		conditions      map[string]*nodeStateCondition
+		conditions      []corev1.NodeCondition
 
 		stats nodeStateStats
 	}
-	nodeStateCondition struct {
-		new bool
-		// https://kubernetes.io/docs/concepts/architecture/nodes/#condition
-		//typ    corev1.NodeConditionType
-		status corev1.ConditionStatus
-	}
 	nodeStateStats struct {
 		reqCPU   int64
 		limitCPU int64
@@ -127,7 +120,8 @@ type (
 		condPodInitialized  corev1.ConditionStatus
 		condPodReady        corev1.ConditionStatus
 		// https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-		phase corev1.PodPhase
+		phase        corev1.PodPhase
+		statusReason string
 
 		initContainers map[string]*containerState
 		containers     map[string]*containerState

+ 1 - 8
src/go/plugin/go.d/modules/k8s_state/update_node_state.go

@@ -36,12 +36,5 @@ func (ks *KubeState) updateNodeState(r resource) {
 	}
 
 	ns.unSchedulable = node.Spec.Unschedulable
-
-	for _, c := range node.Status.Conditions {
-		if v, ok := ns.conditions[string(c.Type)]; !ok {
-			ns.conditions[string(c.Type)] = &nodeStateCondition{new: true, status: c.Status}
-		} else {
-			v.status = c.Status
-		}
-	}
+	ns.conditions = node.Status.Conditions
 }

+ 1 - 0
src/go/plugin/go.d/modules/k8s_state/update_pod_state.go

@@ -78,6 +78,7 @@ func (ks *KubeState) updatePodState(r resource) {
 	}
 
 	ps.phase = pod.Status.Phase
+	ps.statusReason = pod.Status.Reason
 
 	for _, cntr := range pod.Status.ContainerStatuses {
 		cs, ok := ps.containers[cntr.Name]