Browse Source

health: add systemdunits alarms (#10906)

Ilya Mashchenko 3 years ago
parent
commit
78b540bb75
2 changed files with 113 additions and 0 deletions
  1. 1 0
      health/Makefile.am
  2. 112 0
      health/health.d/systemdunits.conf

+ 1 - 0
health/Makefile.am

@@ -90,6 +90,7 @@ dist_healthconfig_DATA = \
     health.d/stiebeleltron.conf \
     health.d/synchronization.conf \
     health.d/swap.conf \
+    health.d/systemdunits.conf \
     health.d/tcp_conn.conf \
     health.d/tcp_listen.conf \
     health.d/tcp_mem.conf \

+ 112 - 0
health/health.d/systemdunits.conf

@@ -0,0 +1,112 @@
+## Check if the are any systemd units in the failed state (crashed).
+## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.
+
+## Service units
+template: systemd_service_units_state
+      on: systemd.service_units_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd service units are in the failed state
+      to: sysadmin
+
+## Socket units
+template: systemd_socket_units_state
+      on: systemd.socket_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd socket units are in the failed state
+      to: sysadmin
+
+## Target units
+template: systemd_target_units_state
+      on: systemd.target_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd target units are in the failed state
+      to: sysadmin
+
+## Path units
+template: systemd_path_units_state
+      on: systemd.path_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd path units are in the failed state
+      to: sysadmin
+
+## Device units
+template: systemd_device_units_state
+      on: systemd.device_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more the systemd device units are in the failed state
+      to: sysadmin
+
+## Mount units
+template: systemd_mount_units_state
+      on: systemd.mount_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more the systemd mount units are in the failed state
+      to: sysadmin
+
+## Automount units
+template: systemd_automount_units_state
+      on: systemd.automount_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd automount units are in the failed state
+      to: sysadmin
+
+## Swap units
+template: systemd_swap_units_state
+      on: systemd.swap_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd swap units are in the failed state
+      to: sysadmin
+
+## Scope units
+template: systemd_scope_units_state
+      on: systemd.scope_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd scope units are in the failed state
+      to: sysadmin
+
+## Slice units
+template: systemd_slice_units_state
+      on: systemd.slice_unit_state
+  lookup: max -1s min2max
+   units: ok/failed
+   every: 10s
+    warn: $this != nan AND $this == 5
+   delay: down 5m multiplier 1.5 max 1h
+    info: one or more systemd slice units are in the failed state
+      to: sysadmin