|
- # -*- coding: utf-8 -*-
- # Description: smart netdata python.d module
- # Author: ilyam8, vorph1
- # SPDX-License-Identifier: GPL-3.0-or-later
- import os
- import re
- from copy import deepcopy
- from time import time
- from bases.FrameworkServices.SimpleService import SimpleService
- from bases.collection import read_last_line
# Dimension algorithms: stored as the 'algo' of every chart definition and
# passed as the third element of each dimension added to a chart.
INCREMENTAL = 'incremental'
ABSOLUTE = 'absolute'

# Disk type markers looked for in the log file name.
ATA = 'ata'
SCSI = 'scsi'

# Only files whose name ends with this suffix are considered.
CSV = '.csv'

# Number of get_data() runs between cleanup/rescan passes.
DEF_RESCAN_INTERVAL = 60
# Default for the 'age' option: maximum time (in minutes) since a log file
# was last modified for its disk to be collected.
DEF_AGE = 30
# Default for the 'log_path' option.
DEF_PATH = '/var/log/smartd'

# ATA attribute ids, kept as strings because that is how RE_ATA captures them.
ATTR1 = '1'
ATTR2 = '2'
ATTR3 = '3'
ATTR4 = '4'
ATTR5 = '5'
ATTR7 = '7'
ATTR8 = '8'
ATTR9 = '9'
ATTR10 = '10'
ATTR11 = '11'
ATTR12 = '12'
ATTR13 = '13'
ATTR170 = '170'
ATTR171 = '171'
ATTR172 = '172'
ATTR173 = '173'
ATTR174 = '174'
ATTR180 = '180'
ATTR183 = '183'
ATTR190 = '190'
ATTR194 = '194'
ATTR196 = '196'
ATTR197 = '197'
ATTR198 = '198'
ATTR199 = '199'
ATTR202 = '202'
ATTR206 = '206'
ATTR233 = '233'
ATTR249 = '249'

# SCSI attribute names (the shape matched by RE_SCSI: lowercase letters and dashes).
ATTR_READ_ERR_COR = 'read-total-err-corrected'
ATTR_READ_ERR_UNC = 'read-total-unc-errors'
ATTR_WRITE_ERR_COR = 'write-total-err-corrected'
ATTR_WRITE_ERR_UNC = 'write-total-unc-errors'
ATTR_VERIFY_ERR_COR = 'verify-total-err-corrected'
ATTR_VERIFY_ERR_UNC = 'verify-total-unc-errors'
ATTR_TEMPERATURE = 'temperature'
# One ATA record of a log line: "<attribute id>;<normalized value>;<raw value>".
# Raw string literals are required so that '\d' reaches the regex engine as-is
# instead of being parsed as an (invalid) string escape sequence.
RE_ATA = re.compile(
    r'(\d+);'  # attribute
    r'(\d+);'  # normalized value
    r'(\d+)',  # raw value
    re.X
)

# One SCSI record of a log line: "<attribute name>;<raw value>".
RE_SCSI = re.compile(
    r'([a-z-]+);'  # attribute
    r'([0-9.]+)',  # raw value
    re.X
)
# Chart ids in display order, grouped by family. Every chart defined in CHARTS
# has to be listed here, including 'nand_writes_1gib', which used to be missing.
ORDER = [
    # errors
    'read_error_rate',
    'seek_error_rate',
    'soft_read_error_rate',
    'write_error_rate',
    'read_total_err_corrected',
    'read_total_unc_errors',
    'write_total_err_corrected',
    'write_total_unc_errors',
    'verify_total_err_corrected',
    'verify_total_unc_errors',
    # external failure
    'sata_interface_downshift',
    'udma_crc_error_count',
    # performance
    'throughput_performance',
    'seek_time_performance',
    # power
    'start_stop_count',
    'power_on_hours_count',
    'power_cycle_count',
    'unexpected_power_loss',
    # spin
    'spin_up_time',
    'spin_up_retries',
    'calibration_retries',
    # temperature
    'airflow_temperature_celsius',
    'temperature_celsius',
    # wear
    'reallocated_sectors_count',
    'reserved_block_count',
    'program_fail_count',
    'erase_fail_count',
    'wear_leveller_worst_case_erase_count',
    'unused_reserved_nand_blocks',
    'reallocation_event_count',
    'current_pending_sector_count',
    'offline_uncorrectable_sector_count',
    'percent_lifetime_used',
    'media_wearout_indicator',
    'nand_writes_1gib',
]
def _chart(chart_id, title, units, family, attrs, algo):
    """Return the (chart id, chart definition) pair for one chart.

    The chart context is always 'smartd_log.<chart id>' and the chart type is
    always 'line'; 'lines' starts empty.
    """
    definition = {
        'options': [None, title, units, family, 'smartd_log.' + chart_id, 'line'],
        'lines': [],
        'attrs': attrs,
        'algo': algo,
    }
    return chart_id, definition


# Chart definitions keyed by chart id. 'attrs' lists the attributes displayed
# on the chart, 'algo' is the algorithm of the dimensions added to it.
CHARTS = dict([
    # errors
    _chart('read_error_rate', 'Read Error Rate', 'value', 'errors', [ATTR1], ABSOLUTE),
    _chart('seek_error_rate', 'Seek Error Rate', 'value', 'errors', [ATTR7], ABSOLUTE),
    _chart('soft_read_error_rate', 'Soft Read Error Rate', 'errors', 'errors', [ATTR13], INCREMENTAL),
    _chart('write_error_rate', 'Write Error Rate', 'value', 'errors', [ATTR206], ABSOLUTE),
    _chart('read_total_err_corrected', 'Read Error Corrected', 'errors', 'errors',
           [ATTR_READ_ERR_COR], INCREMENTAL),
    _chart('read_total_unc_errors', 'Read Error Uncorrected', 'errors', 'errors',
           [ATTR_READ_ERR_UNC], INCREMENTAL),
    _chart('write_total_err_corrected', 'Write Error Corrected', 'errors', 'errors',
           [ATTR_WRITE_ERR_COR], INCREMENTAL),
    _chart('write_total_unc_errors', 'Write Error Uncorrected', 'errors', 'errors',
           [ATTR_WRITE_ERR_UNC], INCREMENTAL),
    _chart('verify_total_err_corrected', 'Verify Error Corrected', 'errors', 'errors',
           [ATTR_VERIFY_ERR_COR], INCREMENTAL),
    _chart('verify_total_unc_errors', 'Verify Error Uncorrected', 'errors', 'errors',
           [ATTR_VERIFY_ERR_UNC], INCREMENTAL),
    # external failure
    _chart('sata_interface_downshift', 'SATA Interface Downshift', 'events', 'external failure',
           [ATTR183], INCREMENTAL),
    _chart('udma_crc_error_count', 'UDMA CRC Error Count', 'errors', 'external failure',
           [ATTR199], INCREMENTAL),
    # performance
    _chart('throughput_performance', 'Throughput Performance', 'value', 'performance', [ATTR2], ABSOLUTE),
    _chart('seek_time_performance', 'Seek Time Performance', 'value', 'performance', [ATTR8], ABSOLUTE),
    # power
    _chart('start_stop_count', 'Start/Stop Count', 'events', 'power', [ATTR4], ABSOLUTE),
    _chart('power_on_hours_count', 'Power-On Hours Count', 'hours', 'power', [ATTR9], ABSOLUTE),
    _chart('power_cycle_count', 'Power Cycle Count', 'events', 'power', [ATTR12], ABSOLUTE),
    _chart('unexpected_power_loss', 'Unexpected Power Loss', 'events', 'power', [ATTR174], ABSOLUTE),
    # spin
    _chart('spin_up_time', 'Spin-Up Time', 'ms', 'spin', [ATTR3], ABSOLUTE),
    _chart('spin_up_retries', 'Spin-up Retries', 'retries', 'spin', [ATTR10], INCREMENTAL),
    _chart('calibration_retries', 'Calibration Retries', 'retries', 'spin', [ATTR11], INCREMENTAL),
    # temperature
    _chart('airflow_temperature_celsius', 'Airflow Temperature Celsius', 'celsius', 'temperature',
           [ATTR190], ABSOLUTE),
    _chart('temperature_celsius', 'Temperature', 'celsius', 'temperature',
           [ATTR194, ATTR_TEMPERATURE], ABSOLUTE),
    # wear
    _chart('reallocated_sectors_count', 'Reallocated Sectors Count', 'sectors', 'wear', [ATTR5], ABSOLUTE),
    _chart('reserved_block_count', 'Reserved Block Count', 'percentage', 'wear', [ATTR170], ABSOLUTE),
    _chart('program_fail_count', 'Program Fail Count', 'errors', 'wear', [ATTR171], INCREMENTAL),
    _chart('erase_fail_count', 'Erase Fail Count', 'failures', 'wear', [ATTR172], INCREMENTAL),
    _chart('wear_leveller_worst_case_erase_count', 'Wear Leveller Worst Case Erase Count', 'erases', 'wear',
           [ATTR173], ABSOLUTE),
    _chart('unused_reserved_nand_blocks', 'Unused Reserved NAND Blocks', 'blocks', 'wear',
           [ATTR180], ABSOLUTE),
    _chart('reallocation_event_count', 'Reallocation Event Count', 'events', 'wear',
           [ATTR196], INCREMENTAL),
    _chart('current_pending_sector_count', 'Current Pending Sector Count', 'sectors', 'wear',
           [ATTR197], ABSOLUTE),
    _chart('offline_uncorrectable_sector_count', 'Offline Uncorrectable Sector Count', 'sectors', 'wear',
           [ATTR198], ABSOLUTE),
    _chart('percent_lifetime_used', 'Percent Lifetime Used', 'percentage', 'wear', [ATTR202], ABSOLUTE),
    _chart('media_wearout_indicator', 'Media Wearout Indicator', 'percentage', 'wear', [ATTR233], ABSOLUTE),
    _chart('nand_writes_1gib', 'NAND Writes', 'GiB', 'wear', [ATTR249], ABSOLUTE),
])
- # NOTE: 'parse_temp' decodes ATA 194 raw value. Not heavily tested. Written by @Ferroin
- # C code:
- # https://github.com/smartmontools/smartmontools/blob/master/smartmontools/atacmds.cpp#L2051
- #
- # Calling 'parse_temp' on the raw value will return a 4-tuple, containing
- # * temperature
- # * minimum
- # * maximum
- # * over-temperature count
- # substituting None for values it can't decode.
- #
# Example:
# >>> parse_temp(42952491042)
# (34, 10, 43, None)
- #
- #
- # def check_temp_word(i):
- # if i <= 0x7F:
- # return 0x11
- # elif i <= 0xFF:
- # return 0x01
- # elif 0xFF80 <= i:
- # return 0x10
- # return 0x00
- #
- #
- # def check_temp_range(t, b0, b1):
- # if b0 > b1:
- # t0, t1 = b1, b0
- # else:
- # t0, t1 = b0, b1
- #
- # if all([
- # -60 <= t0,
- # t0 <= t,
- # t <= t1,
- # t1 <= 120,
- # not (t0 == -1 and t1 <= 0)
- # ]):
- # return t0, t1
- # return None, None
- #
- #
- # def parse_temp(raw):
- # byte = list()
- # word = list()
- # for i in range(0, 6):
- # byte.append(0xFF & (raw >> (i * 8)))
- # for i in range(0, 3):
- # word.append(0xFFFF & (raw >> (i * 16)))
- #
- # ctwd = check_temp_word(word[0])
- #
- # if not word[2]:
- # if ctwd and not word[1]:
- # # byte[0] is temp, no other data
- # return byte[0], None, None, None
- #
- # if ctwd and all(check_temp_range(byte[0], byte[2], byte[3])):
- # # byte[0] is temp, byte[2] is max or min, byte[3] is min or max
- # trange = check_temp_range(byte[0], byte[2], byte[3])
- # return byte[0], trange[0], trange[1], None
- #
- # if ctwd and all(check_temp_range(byte[0], byte[1], byte[2])):
- # # byte[0] is temp, byte[1] is max or min, byte[2] is min or max
- # trange = check_temp_range(byte[0], byte[1], byte[2])
- # return byte[0], trange[0], trange[1], None
- #
- # return None, None, None, None
- #
- # if ctwd:
- # if all(
- # [
- # ctwd & check_temp_word(word[1]) & check_temp_word(word[2]) != 0x00,
- # all(check_temp_range(byte[0], byte[2], byte[4])),
- # ]
- # ):
- # # byte[0] is temp, byte[2] is max or min, byte[4] is min or max
- # trange = check_temp_range(byte[0], byte[2], byte[4])
- # return byte[0], trange[0], trange[1], None
- # else:
- # trange = check_temp_range(byte[0], byte[2], byte[3])
- # if word[2] < 0x7FFF and all(trange) and trange[1] >= 40:
- # # byte[0] is temp, byte[2] is max or min, byte[3] is min or max, word[2] is overtemp count
- # return byte[0], trange[0], trange[1], word[2]
- # # no data
- # return None, None, None, None
# Reverse lookup: attribute name -> id of the chart that displays it.
CHARTED_ATTRS = {
    attr_name: chart_id
    for chart_id, definition in CHARTS.items()
    for attr_name in definition['attrs']
}
class BaseAtaSmartAttribute:
    """An ATA attribute: its id plus the normalized and raw values as parsed."""

    def __init__(self, name, normalized_value, raw_value):
        self.name, self.normalized_value, self.raw_value = name, normalized_value, raw_value

    def value(self):
        """Return the value to collect; concrete attributes must override this."""
        raise NotImplementedError


class AtaRaw(BaseAtaSmartAttribute):
    """Attribute reported by its raw value, returned exactly as stored."""

    def value(self):
        return self.raw_value


class AtaNormalized(BaseAtaSmartAttribute):
    """Attribute reported by its normalized value, returned exactly as stored."""

    def value(self):
        return self.normalized_value


class Ata3(BaseAtaSmartAttribute):
    """Attribute 3: large raw values are reduced to their low 12 bits."""

    def value(self):
        # https://github.com/netdata/netdata/issues/5919
        #
        # 3;151;38684000679;
        # 423 (Average 447)
        # 38684000679 & 0xFFF -> 423
        # (38684000679 & 0xFFF0000) >> 16 -> 447
        raw = int(self.raw_value)
        return (raw & 0xFFF) if raw > 1e6 else raw


class Ata9(BaseAtaSmartAttribute):
    """Attribute 9: large raw values are reduced to their low 16 bits."""

    def value(self):
        raw = int(self.raw_value)
        return (raw & 0xFFFF) if raw > 1e6 else raw


class Ata190(BaseAtaSmartAttribute):
    """Attribute 190: reported as 100 minus the normalized value."""

    def value(self):
        normalized = int(self.normalized_value)
        return 100 - normalized


class Ata194(BaseAtaSmartAttribute):
    """Attribute 194: temperature taken from a packed or a plain raw value."""

    # https://github.com/netdata/netdata/issues/3041
    # https://github.com/netdata/netdata/issues/5919
    #
    # The low byte is the current temperature, the third lowest is the maximum, and the fifth lowest is the minimum
    def value(self):
        raw = int(self.raw_value)
        if raw > 1e6:
            # packed value: only the low byte is reported
            return raw & 0xFF
        # plain value: report the smaller of the normalized and the raw value
        return min(int(self.normalized_value), raw)
class BaseSCSISmartAttribute:
    """A SCSI attribute: its name plus the raw value as parsed."""

    def __init__(self, name, raw_value):
        self.name, self.raw_value = name, raw_value

    def value(self):
        """Return the value to collect; concrete attributes must override this."""
        raise NotImplementedError


class SCSIRaw(BaseSCSISmartAttribute):
    """Attribute reported by its raw value, returned exactly as stored."""

    def value(self):
        return self.raw_value
def ata_attribute_factory(value):
    """Create the ATA attribute object matching the attribute id in value[0].

    :param value: sequence unpacked into the attribute constructor
        (id, normalized value, raw value).
    :return: Ata3/Ata9/Ata190/Ata194 for those ids, AtaNormalized for the ids
        reported by normalized value, AtaRaw for everything else.
    """
    attr_id = value[0]

    # attributes that need their own decoding
    dedicated = (
        (ATTR3, Ata3),
        (ATTR9, Ata9),
        (ATTR190, Ata190),
        (ATTR194, Ata194),
    )
    for known_id, attribute_class in dedicated:
        if attr_id == known_id:
            return attribute_class(*value)

    if attr_id in [ATTR1, ATTR7, ATTR202, ATTR206, ATTR233]:
        return AtaNormalized(*value)
    return AtaRaw(*value)
def scsi_attribute_factory(value):
    """Create a SCSIRaw attribute from `value`, unpacked as (name, raw value)."""
    attribute = SCSIRaw(*value)
    return attribute
def attribute_factory(value):
    """Create an attribute object: ATA when value[0] is all digits, SCSI otherwise."""
    build = ata_attribute_factory if value[0].isdigit() else scsi_attribute_factory
    return build(value)
def handle_error(*errors):
    """Build a decorator that converts the given exceptions into a None result.

    :param errors: exception classes to swallow.
    :return: a decorator; the decorated callable returns what the original
        returns, or None when one of `errors` is raised. Any other exception
        propagates unchanged.
    """
    from functools import wraps

    def on_method(method):
        # wraps() keeps the name and docstring of the decorated callable
        @wraps(method)
        def on_call(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except errors:
                return None

        return on_call

    return on_method
class DiskLogFile:
    """A log file on disk together with the size it had when last read.

    Methods decorated with handle_error(OSError) return None instead of
    raising when the file cannot be accessed.
    """

    def __init__(self, full_path):
        self.path = full_path
        self.size = os.path.getsize(full_path)

    @handle_error(OSError)
    def is_changed(self):
        """Return True when the file size differs from the remembered size."""
        current_size = os.path.getsize(self.path)
        return self.size != current_size

    @handle_error(OSError)
    def is_active(self, current_time, limit):
        """Return True when the file was modified less than `limit` minutes ago."""
        idle_minutes = (current_time - os.path.getmtime(self.path)) / 60
        return idle_minutes < limit

    @handle_error(OSError)
    def read(self):
        """Remember the current file size and return the last line of the file."""
        self.size = os.path.getsize(self.path)
        last_line = read_last_line(self.path)
        return last_line
class BaseDisk:
    """A disk whose attributes are read from the last line of its log file.

    Disks are identified by `raw_name`: a disk compares equal to another disk
    with the same raw name and to the raw name string itself, and hashes like
    that string.
    """

    def __init__(self, name, log_file):
        self.raw_name = name
        # runs of underscores are collapsed in the name used for dimensions
        self.name = re.sub(r'_+', '_', name)
        self.log_file = log_file
        self.attrs = list()
        self.alive = True
        self.charted = False

    def __eq__(self, other):
        if isinstance(other, BaseDisk):
            return self.raw_name == other.raw_name
        return self.raw_name == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Objects that compare equal must hash equal, so hash the same key
        # __eq__ compares instead of the identity-based repr().
        return hash(self.raw_name)

    def parser(self, data):
        """Extract attribute tuples from a log line; implemented by subclasses."""
        raise NotImplementedError

    @handle_error(TypeError)
    def populate_attrs(self):
        """Re-read the log file and rebuild `attrs` from its last line.

        :return: number of attributes parsed, or None when a TypeError is
            raised while parsing (for instance when the log file could not be
            read and there is no line to parse).
        """
        self.attrs = list()
        line = self.log_file.read()
        for value in self.parser(line):
            self.attrs.append(attribute_factory(value))
        return len(self.attrs)

    def data(self):
        """Return {'<disk name>_<attribute name>': value} for all attributes."""
        return dict(
            ('{0}_{1}'.format(self.name, attr.name), attr.value())
            for attr in self.attrs
        )
class ATADisk(BaseDisk):
    """Disk whose log lines hold ATA records."""

    def parser(self, data):
        """Return every (attribute, normalized value, raw value) tuple found in `data`."""
        records = RE_ATA.findall(data)
        return records
class SCSIDisk(BaseDisk):
    """Disk whose log lines hold SCSI records."""

    def parser(self, data):
        """Return every (attribute, raw value) tuple found in `data`."""
        records = RE_SCSI.findall(data)
        return records
class Service(SimpleService):
    """Collect SMART attributes from the per-disk csv log files in `log_path`.

    Configuration options read here:
      * log_path      - directory with the log files (default DEF_PATH)
      * age           - maximum minutes since a log file was modified (default DEF_AGE)
      * exclude_disks - space separated substrings; disks whose name contains
                        any of them are skipped
    """

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        self.definitions = deepcopy(CHARTS)
        self.log_path = configuration.get('log_path', DEF_PATH)
        self.age = configuration.get('age', DEF_AGE)
        self.exclude = configuration.get('exclude_disks', str()).split()
        self.disks = list()
        self.runs = 0

    def check(self):
        """Return True when the initial scan found at least one usable disk."""
        return self.scan() > 0

    def get_data(self):
        """Return the current values of all alive disks as a flat dict.

        Every DEF_RESCAN_INTERVAL runs stale disks are dropped and the log
        directory is rescanned. A disk whose log file can no longer be
        accessed or parsed is marked dead and skipped from then on.
        """
        self.runs += 1

        if self.runs % DEF_RESCAN_INTERVAL == 0:
            self.cleanup()
            self.scan()

        data = dict()

        for disk in self.disks:
            if not disk.alive:
                continue

            if not disk.charted:
                self.add_disk_to_charts(disk)

            changed = disk.log_file.is_changed()

            # None means the log file could not be accessed
            if changed is None:
                disk.alive = False
                continue

            # attributes are only re-read when the file changed
            if changed and disk.populate_attrs() is None:
                disk.alive = False
                continue

            data.update(disk.data())

        return data

    def cleanup(self):
        """Drop dead disks and disks whose log file is older than `age` minutes."""
        current_time = time()
        # iterate over a copy: disks are removed from the list inside the loop
        for disk in self.disks[:]:
            if not disk.alive or not disk.log_file.is_active(current_time, self.age):
                self.disks.remove(disk)
                self.remove_disk_from_charts(disk)

    def scan(self):
        """Add a disk for every new suitable log file in `log_path`.

        :return: total number of known disks after the scan.
        """
        self.debug('scanning {0}'.format(self.log_path))
        current_time = time()

        try:
            file_names = os.listdir(self.log_path)
        except OSError as error:
            # missing or unreadable directory: keep what we have
            self.error('scanning {0} failed: {1}'.format(self.log_path, error))
            return len(self.disks)

        for full_name in file_names:
            disk = self.create_disk_from_file(full_name, current_time)
            if not disk:
                continue
            self.disks.append(disk)

        return len(self.disks)

    def create_disk_from_file(self, full_name, current_time):
        """Create a disk for the log file `full_name`, or return None to skip it.

        :param full_name: file name as listed in `log_path`.
        :param current_time: timestamp used to compute the age of the file.
        :return: ATADisk or SCSIDisk with attributes populated, or None.
        """
        if not full_name.endswith(CSV):
            self.debug('skipping {0}: not a csv file'.format(full_name))
            return None

        # expected file name: '<anything>.<disk name>.<type>.csv' or '<disk name>.<type>.csv'
        parts = os.path.basename(full_name).split('.')
        if len(parts) < 3:
            self.debug('skipping {0}: unexpected file name format'.format(full_name))
            return None

        name, disk_type = parts[-3], parts[-2]
        path = os.path.join(self.log_path, full_name)

        if name in self.disks:
            self.debug('skipping {0}: already in disks'.format(full_name))
            return None

        if [p for p in self.exclude if p in name]:
            self.debug('skipping {0}: filtered by `exclude` option'.format(full_name))
            return None

        if not os.access(path, os.R_OK):
            self.debug('skipping {0}: not readable'.format(full_name))
            return None

        if os.path.getsize(path) == 0:
            self.debug('skipping {0}: zero size'.format(full_name))
            return None

        if (current_time - os.path.getmtime(path)) / 60 > self.age:
            self.debug('skipping {0}: haven\'t been updated for last {1} minutes'.format(full_name, self.age))
            return None

        # match the type component exactly, so that a disk name that happens
        # to contain 'ata' or 'scsi' cannot select the wrong parser
        if disk_type == ATA:
            disk = ATADisk(name, DiskLogFile(path))
        elif disk_type == SCSI:
            disk = SCSIDisk(name, DiskLogFile(path))
        else:
            self.debug('skipping {0}: unknown type'.format(full_name))
            return None

        disk.populate_attrs()
        if not disk.attrs:
            self.error('skipping {0}: parsing failed'.format(full_name))
            return None

        self.debug('added {0}'.format(full_name))
        return disk

    def add_disk_to_charts(self, disk):
        """Add one dimension per charted attribute of `disk` to the matching charts."""
        if len(self.charts) == 0 or disk.charted:
            return
        disk.charted = True

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            chart = self.charts[chart_id]
            dim = [
                '{0}_{1}'.format(disk.name, attr.name),
                disk.name,
                CHARTS[chart_id]['algo'],
            ]

            # a dimension that already exists is un-hidden instead of re-added
            if dim[0] in chart.dimensions:
                chart.hide_dimension(dim[0], reverse=True)
            else:
                chart.add_dimension(dim)

    def remove_disk_from_charts(self, disk):
        """Delete the dimensions of `disk` from every chart that has them."""
        if len(self.charts) == 0 or not disk.charted:
            return

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            self.charts[chart_id].del_dimension('{0}_{1}'.format(disk.name, attr.name))
|