Browse Source

health: add python.d/go.d jobs last_collected_secs alarms (#11168)

Ilya Mashchenko 3 years ago
parent
commit
2555dccae7

+ 2 - 13
health/Makefile.am

@@ -25,9 +25,7 @@ install-exec-local:
 healthconfigdir=$(libconfigdir)/health.d
 dist_healthconfig_DATA = \
     health.d/adaptec_raid.conf \
-    health.d/am2320.conf \
     health.d/anomalies.conf \
-    health.d/apache.conf \
     health.d/apcupsd.conf \
     health.d/backend.conf \
     health.d/bcache.conf \
@@ -39,7 +37,6 @@ dist_healthconfig_DATA = \
     health.d/cgroups.conf \
     health.d/cpu.conf \
     health.d/cockroachdb.conf \
-    health.d/couchdb.conf \
     health.d/disks.conf \
     health.d/dnsmasq_dhcp.conf \
     health.d/dns_query.conf \
@@ -51,6 +48,7 @@ dist_healthconfig_DATA = \
     health.d/ioping.conf \
     health.d/fronius.conf \
     health.d/gearman.conf \
+    health.d/go.d.plugin.conf \
     health.d/haproxy.conf \
     health.d/hdfs.conf \
     health.d/httpcheck.conf \
@@ -59,26 +57,19 @@ dist_healthconfig_DATA = \
     health.d/ipmi.conf \
     health.d/isc_dhcpd.conf \
     health.d/kubelet.conf \
-    health.d/lighttpd.conf \
     health.d/linux_power_supply.conf \
     health.d/load.conf \
     health.d/mdstat.conf \
     health.d/megacli.conf \
     health.d/memcached.conf \
     health.d/memory.conf \
-    health.d/mongodb.conf \
     health.d/mysql.conf \
-    health.d/named.conf \
     health.d/net.conf \
     health.d/netfilter.conf \
-    health.d/nginx.conf \
-    health.d/nginx_plus.conf \
     health.d/pihole.conf \
-    health.d/phpfpm.conf \
     health.d/portcheck.conf \
-    health.d/postgres.conf \
     health.d/processes.conf \
-    health.d/pulsar.conf \
+    health.d/python.d.plugin.conf \
     health.d/qos.conf \
     health.d/ram.conf \
     health.d/redis.conf \
@@ -86,7 +77,6 @@ dist_healthconfig_DATA = \
     health.d/riakkv.conf \
     health.d/scaleio.conf \
     health.d/softnet.conf \
-    health.d/squid.conf \
     health.d/stiebeleltron.conf \
     health.d/synchronization.conf \
     health.d/swap.conf \
@@ -107,6 +97,5 @@ dist_healthconfig_DATA = \
     health.d/wmi.conf \
     health.d/x509check.conf \
     health.d/zfs.conf \
-    health.d/zookeeper.conf \
     health.d/dbengine.conf \
     $(NULL)

+ 0 - 15
health/health.d/am2320.conf

@@ -1,15 +0,0 @@
-# make sure am2320 is sending stats
-
- template: am2320_last_collected_secs
-       on: am2320.temperature
-    class: Other
-component: Sensors
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: webmaster

+ 0 - 16
health/health.d/cockroachdb.conf

@@ -1,20 +1,4 @@
 
-# Availability
-
- template: cockroachdb_last_collected_secs
-       on: cockroachdb.live_nodes
-    class: Database
-component: CockroachDB
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: dba
-
 # Capacity
 
  template: cockroachdb_used_storage_capacity

+ 0 - 16
health/health.d/couchdb.conf

@@ -1,16 +0,0 @@
-
-# make sure couchdb is running
-
- template: couchdb_last_collected_secs
-       on: couchdb.request_methods
-    class: Database
-component: CouchDB
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: dba

+ 0 - 14
health/health.d/gearman.conf

@@ -1,17 +1,3 @@
-# make sure Gearman is running
- template: gearman_last_collected_secs
-       on: gearman.total_jobs
-    class: Computing
-component: Gearman
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: sysadmin
 
  template: gearman_workers_queued
        on: gearman.single_job

+ 7 - 7
health/health.d/nginx_plus.conf → health/health.d/go.d.plugin.conf

@@ -1,11 +1,12 @@
 
-# make sure nginx_plus is running
+# make sure the go.d.plugin data collection job is running
 
- template: nginx_plus_last_collected_secs
-       on: nginx_plus.requests_total
-    class: Web Server
-component: NGINX Plus
-     type: Latency
+ template: go.d_job_last_collected_secs
+       on: netdata.go_plugin_execution_time
+    class: Netdata
+component: go.d.plugin
+     type: Error
+   module: *
      calc: $now - $last_collected_t
     units: seconds ago
     every: 10s
@@ -14,4 +15,3 @@ component: NGINX Plus
     delay: down 5m multiplier 1.5 max 1h
      info: number of seconds since the last successful data collection
        to: webmaster
-

+ 0 - 17
health/health.d/hdfs.conf

@@ -1,21 +1,4 @@
 
-# make sure hdfs is running
-
- template: hdfs_last_collected_secs
-       on: hdfs.heap_memory
-    class: Storage
-component: HDFS
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: webmaster
-
-
 # Common
 
  template: hdfs_capacity_usage

+ 0 - 14
health/health.d/httpcheck.conf

@@ -1,17 +1,3 @@
- template: httpcheck_last_collected_secs
- families: *
-       on: httpcheck.status
-    class: Other
-component: HTTP endpoint
-     type: Latency
-     calc: $now - $last_collected_t
-    every: 10s
-    units: seconds ago
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: sysadmin
 
 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
  template: httpcheck_web_service_up

+ 0 - 17
health/health.d/lighttpd.conf

@@ -1,17 +0,0 @@
-
-# make sure lighttpd is running
-
- template: lighttpd_last_collected_secs
-       on: lighttpd.requests
-    class: Web Server
-component: Lighttpd
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: webmaster
-

+ 0 - 17
health/health.d/memcached.conf

@@ -1,21 +1,4 @@
 
-# make sure memcached is running
-
- template: memcached_last_collected_secs
-       on: memcached.cache
-    class: KV Storage
-component: Memcached
-     type: Latency
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: dba
-
-
 # detect if memcached cache is full
 
  template: memcached_cache_memory_usage

Some files were not shown because too many files changed in this diff