3 years ago · 2555dccae7
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -25,9 +25,7 @@ install-exec-local:
 
				 healthconfigdir=$(libconfigdir)/health.d
			
 
				 dist_healthconfig_DATA = \
			
 
				     health.d/adaptec_raid.conf \
			
 
				-    health.d/am2320.conf \
			
 
				     health.d/anomalies.conf \
			
 
				-    health.d/apache.conf \
			
 
				     health.d/apcupsd.conf \
			
 
				     health.d/backend.conf \
			
 
				     health.d/bcache.conf \
			
@@ -39,7 +37,6 @@ dist_healthconfig_DATA = \
 
				     health.d/cgroups.conf \
			
 
				     health.d/cpu.conf \
			
 
				     health.d/cockroachdb.conf \
			
 
				-    health.d/couchdb.conf \
			
 
				     health.d/disks.conf \
			
 
				     health.d/dnsmasq_dhcp.conf \
			
 
				     health.d/dns_query.conf \
			
@@ -51,6 +48,7 @@ dist_healthconfig_DATA = \
 
				     health.d/ioping.conf \
			
 
				     health.d/fronius.conf \
			
 
				     health.d/gearman.conf \
			
 
				+    health.d/go.d.plugin.conf \
			
 
				     health.d/haproxy.conf \
			
 
				     health.d/hdfs.conf \
			
 
				     health.d/httpcheck.conf \
			
@@ -59,26 +57,19 @@ dist_healthconfig_DATA = \
 
				     health.d/ipmi.conf \
			
 
				     health.d/isc_dhcpd.conf \
			
 
				     health.d/kubelet.conf \
			
 
				-    health.d/lighttpd.conf \
			
 
				     health.d/linux_power_supply.conf \
			
 
				     health.d/load.conf \
			
 
				     health.d/mdstat.conf \
			
 
				     health.d/megacli.conf \
			
 
				     health.d/memcached.conf \
			
 
				     health.d/memory.conf \
			
 
				-    health.d/mongodb.conf \
			
 
				     health.d/mysql.conf \
			
 
				-    health.d/named.conf \
			
 
				     health.d/net.conf \
			
 
				     health.d/netfilter.conf \
			
 
				-    health.d/nginx.conf \
			
 
				-    health.d/nginx_plus.conf \
			
 
				     health.d/pihole.conf \
			
 
				-    health.d/phpfpm.conf \
			
 
				     health.d/portcheck.conf \
			
 
				-    health.d/postgres.conf \
			
 
				     health.d/processes.conf \
			
 
				-    health.d/pulsar.conf \
			
 
				+    health.d/python.d.plugin.conf \
			
 
				     health.d/qos.conf \
			
 
				     health.d/ram.conf \
			
 
				     health.d/redis.conf \
			
@@ -86,7 +77,6 @@ dist_healthconfig_DATA = \
 
				     health.d/riakkv.conf \
			
 
				     health.d/scaleio.conf \
			
 
				     health.d/softnet.conf \
			
 
				-    health.d/squid.conf \
			
 
				     health.d/stiebeleltron.conf \
			
 
				     health.d/synchronization.conf \
			
 
				     health.d/swap.conf \
			
@@ -107,6 +97,5 @@ dist_healthconfig_DATA = \
 
				     health.d/wmi.conf \
			
 
				     health.d/x509check.conf \
			
 
				     health.d/zfs.conf \
			
 
				-    health.d/zookeeper.conf \
			
 
				     health.d/dbengine.conf \
			
 
				     $(NULL)
			
--- a/health/health.d/am2320.conf
+++ b/health/health.d/am2320.conf
@@ -1,15 +0,0 @@
 
				-# make sure am2320 is sending stats
			
 
				-
			
 
				- template: am2320_last_collected_secs
			
 
				-       on: am2320.temperature
			
 
				-    class: Other
			
 
				-component: Sensors
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: webmaster
			
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -1,20 +1,4 @@
 
				 
			
 
				-# Availability
			
 
				-
			
 
				- template: cockroachdb_last_collected_secs
			
 
				-       on: cockroachdb.live_nodes
			
 
				-    class: Database
			
 
				-component: CockroachDB
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: dba
			
 
				-
			
 
				 # Capacity
			
 
				 
			
 
				  template: cockroachdb_used_storage_capacity
			
--- a/health/health.d/couchdb.conf
+++ b/health/health.d/couchdb.conf
@@ -1,16 +0,0 @@
 
				-
			
 
				-# make sure couchdb is running
			
 
				-
			
 
				- template: couchdb_last_collected_secs
			
 
				-       on: couchdb.request_methods
			
 
				-    class: Database
			
 
				-component: CouchDB
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: dba
			
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -1,17 +1,3 @@
 
				-# make sure Gearman is running
			
 
				- template: gearman_last_collected_secs
			
 
				-       on: gearman.total_jobs
			
 
				-    class: Computing
			
 
				-component: Gearman
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: sysadmin
			
 
				 
			
 
				  template: gearman_workers_queued
			
 
				        on: gearman.single_job
			
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -1,11 +1,12 @@
 
				 
			
 
				-# make sure nginx_plus is running
			
 
				+# make sure go.d.plugin data collection job is running
			
 
				 
			
 
				- template: nginx_plus_last_collected_secs
			
 
				-       on: nginx_plus.requests_total
			
 
				-    class: Web Server
			
 
				-component: NGINX Plus
			
 
				-     type: Latency
			
 
				+ template: go.d_job_last_collected_secs
			
 
				+       on: netdata.go_plugin_execution_time
			
 
				+    class: Netdata
			
 
				+component: go.d.plugin
			
 
				+     type: Error
			
 
				+   module: *
			
 
				      calc: $now - $last_collected_t
			
 
				     units: seconds ago
			
 
				     every: 10s
			
@@ -14,4 +15,3 @@ component: NGINX Plus
 
				     delay: down 5m multiplier 1.5 max 1h
			
 
				      info: number of seconds since the last successful data collection
			
 
				        to: webmaster
			
 
				-
			
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -1,21 +1,4 @@
 
				 
			
 
				-# make sure hdfs is running
			
 
				-
			
 
				- template: hdfs_last_collected_secs
			
 
				-       on: hdfs.heap_memory
			
 
				-    class: Storage
			
 
				-component: HDFS
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: webmaster
			
 
				-
			
 
				-
			
 
				 # Common
			
 
				 
			
 
				  template: hdfs_capacity_usage
			
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -1,17 +1,3 @@
 
				- template: httpcheck_last_collected_secs
			
 
				- families: *
			
 
				-       on: httpcheck.status
			
 
				-    class: Other
			
 
				-component: HTTP endpoint
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    every: 10s
			
 
				-    units: seconds ago
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: sysadmin
			
 
				 
			
 
				 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
			
 
				  template: httpcheck_web_service_up
			
--- a/health/health.d/lighttpd.conf
+++ b/health/health.d/lighttpd.conf
@@ -1,17 +0,0 @@
 
				-
			
 
				-# make sure lighttpd is running
			
 
				-
			
 
				- template: lighttpd_last_collected_secs
			
 
				-       on: lighttpd.requests
			
 
				-    class: Web Server
			
 
				-component: Lighttpd
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: webmaster
			
 
				-
			
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -1,21 +1,4 @@
 
				 
			
 
				-# make sure memcached is running
			
 
				-
			
 
				- template: memcached_last_collected_secs
			
 
				-       on: memcached.cache
			
 
				-    class: KV Storage
			
 
				-component: Memcached
			
 
				-     type: Latency
			
 
				-     calc: $now - $last_collected_t
			
 
				-    units: seconds ago
			
 
				-    every: 10s
			
 
				-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
			
 
				-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
			
 
				-    delay: down 5m multiplier 1.5 max 1h
			
 
				-     info: number of seconds since the last successful data collection
			
 
				-       to: dba
			
 
				-
			
 
				-
			
 
				 # detect if memcached cache is full
			
 
				 
			
 
				  template: memcached_cache_memory_usage