Browse Source

improvement(go.d.plugin): add data collection status chart (#18981)

Ilya Mashchenko 4 months ago
parent
commit
5ecedd74c9
1 changed files with 77 additions and 45 deletions
  1. 77 45
      src/go/plugin/go.d/agent/module/job.go

+ 77 - 45
src/go/plugin/go.d/agent/module/job.go

@@ -8,8 +8,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"os"
-	"regexp"
 	"runtime/debug"
 	"strings"
 	"sync"
@@ -34,26 +32,31 @@ func shouldObsoleteCharts() bool {
 	return obsoleteCharts
 }
 
-var reSpace = regexp.MustCompile(`\s+`)
-
-var ndInternalMonitoringDisabled = os.Getenv("NETDATA_INTERNALS_MONITORING") == "NO"
-
-func newRuntimeChart(pluginName string) *Chart {
-	// this is needed to keep the same name as we had before https://github.com/netdata/netdata/go/plugins/plugin/go.d/issues/650
-	ctxName := pluginName
-	if ctxName == "go.d" {
-		ctxName = "go"
+func newCollectStatusChart(pluginName string) *Chart {
+	return &Chart{
+		typ:      "netdata",
+		Title:    "Data Collection Status",
+		Units:    "status",
+		Fam:      pluginName,
+		Ctx:      "netdata.plugin_data_collection_status",
+		Priority: 144000,
+		Dims: Dims{
+			{ID: "success"},
+			{ID: "failed"},
+		},
 	}
-	ctxName = reSpace.ReplaceAllString(ctxName, "_")
+}
+
+func newCollectDurationChart(pluginName string) *Chart {
 	return &Chart{
 		typ:      "netdata",
-		Title:    "Execution time",
+		Title:    "Data Collection Duration",
 		Units:    "ms",
 		Fam:      pluginName,
-		Ctx:      fmt.Sprintf("netdata.%s_plugin_execution_time", ctxName),
+		Ctx:      "netdata.plugin_data_collection_duration",
 		Priority: 145000,
 		Dims: Dims{
-			{ID: "time"},
+			{ID: "duration"},
 		},
 	}
 }
@@ -93,21 +96,22 @@ func NewJob(cfg JobConfig) *Job {
 		AutoDetectEvery: cfg.AutoDetectEvery,
 		AutoDetectTries: infTries,
 
-		pluginName:  cfg.PluginName,
-		name:        cfg.Name,
-		moduleName:  cfg.ModuleName,
-		fullName:    cfg.FullName,
-		updateEvery: cfg.UpdateEvery,
-		priority:    cfg.Priority,
-		isStock:     cfg.IsStock,
-		module:      cfg.Module,
-		labels:      cfg.Labels,
-		out:         cfg.Out,
-		runChart:    newRuntimeChart(cfg.PluginName),
-		stop:        make(chan struct{}),
-		tick:        make(chan int),
-		buf:         &buf,
-		api:         netdataapi.New(&buf),
+		pluginName:           cfg.PluginName,
+		name:                 cfg.Name,
+		moduleName:           cfg.ModuleName,
+		fullName:             cfg.FullName,
+		updateEvery:          cfg.UpdateEvery,
+		priority:             cfg.Priority,
+		isStock:              cfg.IsStock,
+		module:               cfg.Module,
+		labels:               cfg.Labels,
+		out:                  cfg.Out,
+		collectStatusChart:   newCollectStatusChart(cfg.PluginName),
+		collectDurationChart: newCollectDurationChart(cfg.PluginName),
+		stop:                 make(chan struct{}),
+		tick:                 make(chan int),
+		buf:                  &buf,
+		api:                  netdataapi.New(&buf),
 
 		vnodeGUID:     cfg.VnodeGUID,
 		vnodeHostname: cfg.VnodeHostname,
@@ -149,12 +153,13 @@ type Job struct {
 	initialized bool
 	panicked    bool
 
-	runChart *Chart
-	charts   *Charts
-	tick     chan int
-	out      io.Writer
-	buf      *bytes.Buffer
-	api      *netdataapi.API
+	collectStatusChart   *Chart
+	collectDurationChart *Chart
+	charts               *Charts
+	tick                 chan int
+	out                  io.Writer
+	buf                  *bytes.Buffer
+	api                  *netdataapi.API
 
 	retries int
 	prevRun time.Time
@@ -304,10 +309,15 @@ func (j *Job) Cleanup() {
 	}
 	_ = j.api.HOST(j.vnodeGUID)
 
-	if j.runChart.created {
-		j.runChart.MarkRemove()
-		j.createChart(j.runChart)
+	if j.collectStatusChart.created {
+		j.collectStatusChart.MarkRemove()
+		j.createChart(j.collectStatusChart)
 	}
+	if j.collectDurationChart.created {
+		j.collectDurationChart.MarkRemove()
+		j.createChart(j.collectDurationChart)
+	}
+
 	if j.charts != nil {
 		for _, chart := range *j.charts {
 			if chart.created {
@@ -410,9 +420,14 @@ func (j *Job) processMetrics(metrics map[string]int64, startTime time.Time, sinc
 
 	_ = j.api.HOST(j.vnodeGUID)
 
-	if !ndInternalMonitoringDisabled && !j.runChart.created {
-		j.runChart.ID = fmt.Sprintf("execution_time_of_%s", j.FullName())
-		j.createChart(j.runChart)
+	if !j.collectStatusChart.created {
+		j.collectStatusChart.ID = fmt.Sprintf("%s_%s_data_collection_status", cleanPluginName(j.pluginName), j.FullName())
+		j.createChart(j.collectStatusChart)
+	}
+
+	if !j.collectDurationChart.created {
+		j.collectDurationChart.ID = fmt.Sprintf("%s_%s_data_collection_duration", cleanPluginName(j.pluginName), j.FullName())
+		j.createChart(j.collectDurationChart)
 	}
 
 	elapsed := int64(durationTo(time.Since(startTime), time.Millisecond))
@@ -442,12 +457,17 @@ func (j *Job) processMetrics(metrics map[string]int64, startTime time.Time, sinc
 	}
 	*j.charts = (*j.charts)[:i]
 
+	j.updateChart(
+		j.collectStatusChart,
+		map[string]int64{"success": boolToInt(updated > 0), "failed": boolToInt(updated == 0)},
+		sinceLastRun,
+	)
+
 	if updated == 0 {
 		return false
 	}
-	if !ndInternalMonitoringDisabled {
-		j.updateChart(j.runChart, map[string]int64{"time": elapsed}, sinceLastRun)
-	}
+
+	j.updateChart(j.collectDurationChart, map[string]int64{"duration": elapsed}, sinceLastRun)
 
 	return true
 }
@@ -648,4 +668,16 @@ func handleZero(v int) int {
 	return v
 }
 
+func boolToInt(b bool) int64 {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+func cleanPluginName(name string) string {
+	r := strings.NewReplacer(" ", "_", ".", "_")
+	return r.Replace(name)
+}
+
 var lblReplacer = strings.NewReplacer("'", "")