Browse Source

adaptec_raid python module (#4429)

* adaptec_raid module init version

* adaptec_raid minor

* adaptec_raid minor

* adaptec_raid minor

* adaptec_raid arcconf command fix

* adaptec_raid minor fixes

* adaptec_raid add alarms

* adaptec_raid add link to screenshot to the readme
Ilya Mashchenko 6 years ago
parent
commit
b85833f081

+ 1 - 0
collectors/python.d.plugin/Makefile.am

@@ -37,6 +37,7 @@ dist_pythonconfig_DATA = \
     $(top_srcdir)/installer/.keep \
     $(NULL)
 
+include adaptec_raid/Makefile.inc
 include apache/Makefile.inc
 include beanstalk/Makefile.inc
 include bind_rndc/Makefile.inc

+ 13 - 0
collectors/python.d.plugin/adaptec_raid/Makefile.inc

@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA       += adaptec_raid/adaptec_raid.chart.py
+dist_pythonconfig_DATA += adaptec_raid/adaptec_raid.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA       += adaptec_raid/README.md adaptec_raid/Makefile.inc
+

+ 27 - 0
collectors/python.d.plugin/adaptec_raid/README.md

@@ -0,0 +1,27 @@
+# adaptec raid
+
+This module collects health metrics for the logical and physical devices of an Adaptec RAID controller.
+
+**Requirements:**
+ * The `netdata` user must be able to run the `arcconf` program via `sudo` without a password
+
+To grab stats it executes:
+ * `sudo -n arcconf GETCONFIG 1 LD`
+ * `sudo -n arcconf GETCONFIG 1 PD`
+
+
+It produces:
+
+1. **Logical Device Status**
+
+2. **Physical Device State**
+
+3. **Physical Device S.M.A.R.T warnings**
+
+4. **Physical Device Temperature**
+
+Screenshot:
+
+![image](https://user-images.githubusercontent.com/22274335/47278133-6d306680-d601-11e8-87c2-cc9c0f42d686.png)
+
+---

+ 245 - 0
collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py

@@ -0,0 +1,245 @@
+# -*- coding: utf-8 -*-
+# Description: adaptec_raid netdata python.d module
+# Author: Ilya Mashchenko (l2isbad)
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+
+import re
+
+from copy import deepcopy
+
+from bases.FrameworkServices.ExecutableService import ExecutableService
+from bases.collection import find_binary
+
+
+# Module-level default for the data collection frequency
+# (presumably seconds, read by python.d.plugin - confirm against the plugin docs).
+update_every = 5
+
+# Chart ids, in the order they are assigned to Service.order.
+ORDER = [
+    'ld_status',
+    'pd_state',
+    'pd_smart_warnings',
+    'pd_temperature',
+]
+
+# Chart templates keyed by chart id.
+# 'lines' start empty: Service.update_charts() appends one dimension per
+# discovered device to a deep copy of this dict.
+# NOTE(review): 'options' looks like the usual python.d layout
+# [name, title, units, family, context, chart type] - confirm.
+# NOTE(review): the contexts are spelled 'adapter_raid.*' while the module is
+# named 'adaptec_raid'. health.d/adaptec_raid.conf attaches its templates to
+# 'adapter_raid.ld_status' and 'adapter_raid.pd_state', so renaming a context
+# here requires the same change there.
+CHARTS = {
+    'ld_status': {
+        'options': [None, 'Status Is Not OK', 'bool', 'logical devices', 'adapter_raid.ld_status', 'line'],
+        'lines': []
+    },
+    'pd_state': {
+        'options': [None, 'State Is Not OK', 'bool', 'physical devices', 'adapter_raid.pd_state', 'line'],
+        'lines': []
+    },
+    'pd_smart_warnings': {
+        'options': [None, 'S.M.A.R.T warnings', 'count', 'physical devices',
+                    'adapter_raid.smart_warnings', 'line'],
+        'lines': []
+    },
+    'pd_temperature': {
+        'options': [None, 'Temperature', 'celsius', 'physical devices', 'adapter_raid.temperature', 'line'],
+        'lines': []
+    },
+}
+
+# Names of the binaries that Service.check() resolves with find_binary().
+SUDO = 'sudo'
+ARCCONF = 'arcconf'
+
+# Logical device statuses reported as 1 ("not OK") by LD.data();
+# every other status is reported as 0.
+BAD_LD_STATUS = (
+    'Degraded',
+    'Failed',
+)
+
+# Physical device states reported as 0 ("OK") by PD.data();
+# every other state is reported as 1.
+GOOD_PD_STATUS = (
+    'Online',
+)
+
+# Captures (device number, status) pairs. There is no DOTALL flag, so '.*?'
+# does not cross newlines: find_lds() joins all output rows into one line
+# before matching. Only the first run of ASCII letters of the status is captured.
+RE_LD = re.compile(
+    r'Logical device number\s+([0-9]+).*?'
+    r'Status of logical device\s+: ([a-zA-Z]+)'
+)
+
+
+def find_lds(d):
+    """
+    Build LD objects from the rows of the logical device report.
+
+    :param d: iterable of output rows (strings)
+    :return: list of LD, one per (number, status) pair matched by RE_LD;
+             empty when nothing matches
+    """
+    # RE_LD cannot match across line breaks, so flatten the rows first.
+    flat = ' '.join(row.strip() for row in d)
+    found = list()
+    for ld_id, status in RE_LD.findall(flat):
+        found.append(LD(ld_id, status))
+    return found
+
+
+def find_pds(d):
+    """
+    Build PD objects from the rows of the physical device report.
+
+    A device section starts at a 'Device #<id>' row. The section is closed by
+    an 'NCQ status' row, by the next 'Device #' row, or by the end of input.
+    Closing on the last two keeps devices whose section has no 'NCQ status'
+    row; previously such devices were silently dropped.
+
+    :param d: iterable of output rows (strings)
+    :return: list of PD for which id, state and S.M.A.R.T. warnings were all
+             found; sections missing any of them are skipped
+    """
+    pds = list()
+    pd = PD()
+
+    def flush(dev):
+        # Only report devices that are complete enough to chart.
+        if dev.id and dev.state and dev.smart_warnings:
+            pds.append(dev)
+
+    for row in d:
+        row = row.strip()
+        if row.startswith('Device #'):
+            # A new section begins: keep the previous device if it was never
+            # closed by an 'NCQ status' row.
+            flush(pd)
+            pd = PD()
+            pd.id = row.split('#')[-1]
+            continue
+        if not pd.id:
+            # Not inside a device section (header rows, or rows after a closed section).
+            continue
+
+        if row.startswith('State'):
+            pd.state = row.split()[-1]
+        elif row.startswith('S.M.A.R.T. warnings'):
+            pd.smart_warnings = row.split()[-1]
+        elif row.startswith('Temperature'):
+            # First token after the colon, e.g. the '28' in '... : 28 C/ 82 F'.
+            pd.temperature = row.split(':')[-1].split()[0]
+        elif row.startswith('NCQ status'):
+            flush(pd)
+            # Reset so the remaining rows are ignored until the next 'Device #'.
+            pd = PD()
+
+    # The last section may end without an 'NCQ status' row.
+    flush(pd)
+
+    return pds
+
+
+class LD:
+    """Logical device: its number and the status word parsed from the report."""
+
+    def __init__(self, ld_id, status):
+        self.id, self.status = ld_id, status
+
+    def data(self):
+        """
+        :return: single-item dict mapping this device's dimension id to
+                 1 when the status is listed in BAD_LD_STATUS, else 0
+        """
+        key = 'ld_{0}_status'.format(self.id)
+        is_bad = self.status in BAD_LD_STATUS
+        return {key: int(is_bad)}
+
+
+class PD:
+    """Physical device record; fields stay None until the parser fills them in."""
+
+    def __init__(self):
+        self.id = None
+        self.state = None
+        self.smart_warnings = None
+        self.temperature = None
+
+    def data(self):
+        """
+        :return: dict of dimension id -> value. State is 1 when it is not
+                 listed in GOOD_PD_STATUS, else 0. S.M.A.R.T. warnings are
+                 passed through as parsed. Temperature is included only when
+                 it is a non-empty string of digits.
+        """
+        result = dict()
+        result['pd_{0}_state'.format(self.id)] = int(self.state not in GOOD_PD_STATUS)
+        result['pd_{0}_smart_warnings'.format(self.id)] = self.smart_warnings
+
+        temp = self.temperature
+        if not temp or not temp.isdigit():
+            return result
+
+        result['pd_{0}_temperature'.format(self.id)] = temp
+        return result
+
+
+class Arcconf:
+    """Builds argv lists for `arcconf GETCONFIG 1 <section>` without sudo."""
+
+    def __init__(self, arcconf):
+        # path (or name) of the arcconf binary, used as argv[0]
+        self.arcconf = arcconf
+
+    def _getconfig(self, section):
+        # A fresh list on every call so callers may extend it freely.
+        return [self.arcconf, 'GETCONFIG', '1', section]
+
+    def ld_info(self):
+        """:return: command listing logical devices"""
+        return self._getconfig('LD')
+
+    def pd_info(self):
+        """:return: command listing physical devices"""
+        return self._getconfig('PD')
+
+
+# TODO: hardcoded sudo...
+class SudoArcconf:
+    """Same commands as Arcconf, prefixed with `<sudo> -n`."""
+
+    def __init__(self, arcconf, sudo):
+        self.sudo = sudo
+        self.arcconf = Arcconf(arcconf)
+
+    def ld_info(self):
+        """:return: sudo-wrapped command listing logical devices"""
+        cmd = [self.sudo, '-n']
+        cmd.extend(self.arcconf.ld_info())
+        return cmd
+
+    def pd_info(self):
+        """:return: sudo-wrapped command listing physical devices"""
+        cmd = [self.sudo, '-n']
+        cmd.extend(self.arcconf.pd_info())
+        return cmd
+
+
+class Service(ExecutableService):
+    """
+    Collects logical and physical device health by running `arcconf`.
+
+    Job options read from the configuration:
+      use_sudo  run arcconf through `sudo -n` (default: True)
+    """
+    def __init__(self, configuration=None, name=None):
+        ExecutableService.__init__(self, configuration=configuration, name=name)
+        self.order = ORDER
+        # Deep copy: update_charts() appends dimensions to the definitions and
+        # must not modify the module-level CHARTS template.
+        self.definitions = deepcopy(CHARTS)
+        self.use_sudo = self.configuration.get('use_sudo', True)
+        # Command builder (Arcconf or SudoArcconf); set by check().
+        self.arcconf = None
+
+    def execute(self, command, stderr=False):
+        """
+        Run a command through the base class.
+
+        :param command: argv list
+        :param stderr: passed through to `_get_raw_data`
+        :return: whatever `_get_raw_data` returns; callers treat it as a list
+                 of output rows and a falsy value as "no output"
+        """
+        return self._get_raw_data(command=command, stderr=stderr)
+
+    def check(self):
+        """
+        Locate the binaries, verify sudo (when enabled), discover the devices
+        and create one chart dimension per device.
+
+        :return: True when at least one logical and one physical device were
+                 found, False otherwise
+        """
+        sudo = find_binary(SUDO)
+        if self.use_sudo:
+            if not sudo:
+                self.error('can\'t locate "{0}" binary'.format(SUDO))
+                return False
+            # Non-interactive sudo probe; any output on stderr is treated as
+            # "sudo is not usable" and reported verbatim.
+            err = self.execute([sudo, '-n', '-v'], True)
+            if err:
+                self.error(' '.join(err))
+                return False
+
+        arcconf = find_binary(ARCCONF)
+        if not arcconf:
+            self.error('can\'t locate "{0}" binary'.format(ARCCONF))
+            return False
+
+        if self.use_sudo:
+            self.arcconf = SudoArcconf(arcconf, sudo)
+        else:
+            self.arcconf = Arcconf(arcconf)
+
+        lds = self.get_lds()
+        if not lds:
+            return False
+
+        self.debug('discovered logical devices ids: {0}'.format([ld.id for ld in lds]))
+
+        pds = self.get_pds()
+        if not pds:
+            return False
+
+        self.debug('discovered physical devices ids: {0}'.format([pd.id for pd in pds]))
+
+        self.update_charts(lds, pds)
+        return True
+
+    def get_data(self):
+        """
+        Collect one sample for every logical and physical device.
+
+        :return: dict of dimension id -> value, or None when either arcconf
+                 query failed or could not be parsed
+        """
+        # get_lds()/get_pds() return None on failure; iterating over that
+        # directly would raise TypeError, so fail the whole sample explicitly.
+        lds = self.get_lds()
+        if not lds:
+            return None
+
+        pds = self.get_pds()
+        if not pds:
+            return None
+
+        data = dict()
+
+        for ld in lds:
+            data.update(ld.data())
+
+        for pd in pds:
+            data.update(pd.data())
+
+        return data
+
+    def get_lds(self):
+        """
+        Query and parse the logical devices.
+
+        :return: non-empty list of LD, or None when the command produced no
+                 output or no device could be parsed from it
+        """
+        raw_lds = self.execute(self.arcconf.ld_info())
+        if not raw_lds:
+            return None
+
+        lds = find_lds(raw_lds)
+        if not lds:
+            self.error('failed to parse "{0}" output'.format(' '.join(self.arcconf.ld_info())))
+            self.debug('output: {0}'.format(raw_lds))
+            return None
+        return lds
+
+    def get_pds(self):
+        """
+        Query and parse the physical devices.
+
+        :return: non-empty list of PD, or None when the command produced no
+                 output or no device could be parsed from it
+        """
+        raw_pds = self.execute(self.arcconf.pd_info())
+        if not raw_pds:
+            return None
+
+        pds = find_pds(raw_pds)
+        if not pds:
+            self.error('failed to parse "{0}" output'.format(' '.join(self.arcconf.pd_info())))
+            self.debug('output: {0}'.format(raw_pds))
+            return None
+        return pds
+
+    def update_charts(self, lds, pds):
+        """
+        Append one dimension per device to the chart definitions.
+
+        Each dimension is [id, name]; the ids match the keys produced by
+        LD.data() and PD.data(). A temperature dimension is added for every
+        physical device, even if PD.data() later omits its value.
+
+        :param lds: iterable of LD
+        :param pds: iterable of PD
+        """
+        charts = self.definitions
+        for ld in lds:
+            dim = ['ld_{0}_status'.format(ld.id), 'ld {0}'.format(ld.id)]
+            charts['ld_status']['lines'].append(dim)
+
+        for pd in pds:
+            dim = ['pd_{0}_state'.format(pd.id), 'pd {0}'.format(pd.id)]
+            charts['pd_state']['lines'].append(dim)
+
+            dim = ['pd_{0}_smart_warnings'.format(pd.id), 'pd {0}'.format(pd.id)]
+            charts['pd_smart_warnings']['lines'].append(dim)
+
+            dim = ['pd_{0}_temperature'.format(pd.id), 'pd {0}'.format(pd.id)]
+            charts['pd_temperature']['lines'].append(dim)

+ 59 - 0
collectors/python.d.plugin/adaptec_raid/adaptec_raid.conf

@@ -0,0 +1,59 @@
+# netdata python.d.plugin configuration for adaptec raid
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 60
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+#     name: myname            # the JOB's name as it will appear at the
+#                             # dashboard (by default is the job_name)
+#                             # JOBs sharing a name are mutually exclusive
+#     update_every: 1         # the JOB's data collection frequency
+#     priority: 60000         # the JOB's order on the dashboard
+#     retries: 60             # the JOB's number of restoration attempts
+#     autodetection_retry: 0  # the JOB's re-check interval in seconds
+# ----------------------------------------------------------------------
+
+# IMPORTANT
+# The netdata user must be able to run the arcconf program via sudo without a password:
+# netdata ALL=(root)       NOPASSWD: /path/to/arcconf

+ 1 - 0
health/Makefile.am

@@ -22,6 +22,7 @@ dist_userhealthconfig_DATA = \
 healthconfigdir=$(libconfigdir)/health.d
 dist_healthconfig_DATA = \
     $(top_srcdir)/installer/.keep \
+    health.d/adaptec_raid.conf \
     health.d/apache.conf \
     health.d/apcupsd.conf \
     health.d/backend.conf \

+ 24 - 0
health/health.d/adaptec_raid.conf

@@ -0,0 +1,24 @@
+
+# logical device status check
+
+template: adapter_raid_ld_status
+      on: adapter_raid.ld_status
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 logical device is failed or degraded
+      to: sysadmin
+
+# physical device state check
+
+template: adapter_raid_pd_state
+      on: adapter_raid.pd_state
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 physical device is not in online state
+      to: sysadmin