ceph.chart.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. # -*- coding: utf-8 -*-
  2. # Description: ceph netdata python.d module
  3. # Author: Luis Eduardo (lets00)
  4. # SPDX-License-Identifier: GPL-3.0-or-later
  5. try:
  6. import rados
  7. CEPH = True
  8. except ImportError:
  9. CEPH = False
  10. import json
  11. import os
  12. from bases.FrameworkServices.SimpleService import SimpleService
  13. # default module values (can be overridden per job in `config`)
  14. update_every = 10
  15. ORDER = [
  16. 'general_usage',
  17. 'general_objects',
  18. 'general_bytes',
  19. 'general_operations',
  20. 'general_latency',
  21. 'pool_usage',
  22. 'pool_objects',
  23. 'pool_read_bytes',
  24. 'pool_write_bytes',
  25. 'pool_read_operations',
  26. 'pool_write_operations',
  27. 'osd_usage',
  28. 'osd_size',
  29. 'osd_apply_latency',
  30. 'osd_commit_latency'
  31. ]
  32. CHARTS = {
  33. 'general_usage': {
  34. 'options': [None, 'Ceph General Space', 'KiB', 'general', 'ceph.general_usage', 'stacked'],
  35. 'lines': [
  36. ['general_available', 'avail', 'absolute'],
  37. ['general_usage', 'used', 'absolute']
  38. ]
  39. },
  40. 'general_objects': {
  41. 'options': [None, 'Ceph General Objects', 'objects', 'general', 'ceph.general_objects', 'area'],
  42. 'lines': [
  43. ['general_objects', 'cluster', 'absolute']
  44. ]
  45. },
  46. 'general_bytes': {
  47. 'options': [None, 'Ceph General Read/Write Data/s', 'KiB/s', 'general', 'ceph.general_bytes',
  48. 'area'],
  49. 'lines': [
  50. ['general_read_bytes', 'read', 'absolute', 1, 1024],
  51. ['general_write_bytes', 'write', 'absolute', -1, 1024]
  52. ]
  53. },
  54. 'general_operations': {
  55. 'options': [None, 'Ceph General Read/Write Operations/s', 'operations', 'general', 'ceph.general_operations',
  56. 'area'],
  57. 'lines': [
  58. ['general_read_operations', 'read', 'absolute', 1],
  59. ['general_write_operations', 'write', 'absolute', -1]
  60. ]
  61. },
  62. 'general_latency': {
  63. 'options': [None, 'Ceph General Apply/Commit latency', 'milliseconds', 'general', 'ceph.general_latency',
  64. 'area'],
  65. 'lines': [
  66. ['general_apply_latency', 'apply', 'absolute'],
  67. ['general_commit_latency', 'commit', 'absolute']
  68. ]
  69. },
  70. 'pool_usage': {
  71. 'options': [None, 'Ceph Pools', 'KiB', 'pool', 'ceph.pool_usage', 'line'],
  72. 'lines': []
  73. },
  74. 'pool_objects': {
  75. 'options': [None, 'Ceph Pools', 'objects', 'pool', 'ceph.pool_objects', 'line'],
  76. 'lines': []
  77. },
  78. 'pool_read_bytes': {
  79. 'options': [None, 'Ceph Read Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_read_bytes', 'area'],
  80. 'lines': []
  81. },
  82. 'pool_write_bytes': {
  83. 'options': [None, 'Ceph Write Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_write_bytes', 'area'],
  84. 'lines': []
  85. },
  86. 'pool_read_operations': {
  87. 'options': [None, 'Ceph Read Pool Operations/s', 'operations', 'pool', 'ceph.pool_read_operations', 'area'],
  88. 'lines': []
  89. },
  90. 'pool_write_operations': {
  91. 'options': [None, 'Ceph Write Pool Operations/s', 'operations', 'pool', 'ceph.pool_write_operations', 'area'],
  92. 'lines': []
  93. },
  94. 'osd_usage': {
  95. 'options': [None, 'Ceph OSDs', 'KiB', 'osd', 'ceph.osd_usage', 'line'],
  96. 'lines': []
  97. },
  98. 'osd_size': {
  99. 'options': [None, 'Ceph OSDs size', 'KiB', 'osd', 'ceph.osd_size', 'line'],
  100. 'lines': []
  101. },
  102. 'osd_apply_latency': {
  103. 'options': [None, 'Ceph OSDs apply latency', 'milliseconds', 'osd', 'ceph.apply_latency', 'line'],
  104. 'lines': []
  105. },
  106. 'osd_commit_latency': {
  107. 'options': [None, 'Ceph OSDs commit latency', 'milliseconds', 'osd', 'ceph.commit_latency', 'line'],
  108. 'lines': []
  109. }
  110. }
  111. class Service(SimpleService):
  112. def __init__(self, configuration=None, name=None):
  113. SimpleService.__init__(self, configuration=configuration, name=name)
  114. self.order = ORDER
  115. self.definitions = CHARTS
  116. self.config_file = self.configuration.get('config_file')
  117. self.keyring_file = self.configuration.get('keyring_file')
  118. self.rados_id = self.configuration.get('rados_id', 'admin')
  119. def check(self):
  120. """
  121. Checks module
  122. :return:
  123. """
  124. if not CEPH:
  125. self.error('rados module is needed to use ceph.chart.py')
  126. return False
  127. if not (self.config_file and self.keyring_file):
  128. self.error('config_file and/or keyring_file is not defined')
  129. return False
  130. # Verify files and permissions
  131. if not (os.access(self.config_file, os.F_OK)):
  132. self.error('{0} does not exist'.format(self.config_file))
  133. return False
  134. if not (os.access(self.keyring_file, os.F_OK)):
  135. self.error('{0} does not exist'.format(self.keyring_file))
  136. return False
  137. if not (os.access(self.config_file, os.R_OK)):
  138. self.error('Ceph plugin does not read {0}, define read permission.'.format(self.config_file))
  139. return False
  140. if not (os.access(self.keyring_file, os.R_OK)):
  141. self.error('Ceph plugin does not read {0}, define read permission.'.format(self.keyring_file))
  142. return False
  143. try:
  144. self.cluster = rados.Rados(conffile=self.config_file,
  145. conf=dict(keyring=self.keyring_file),
  146. rados_id=self.rados_id)
  147. self.cluster.connect()
  148. except rados.Error as error:
  149. self.error(error)
  150. return False
  151. self.create_definitions()
  152. return True
  153. def create_definitions(self):
  154. """
  155. Create dynamically charts options
  156. :return: None
  157. """
  158. # Pool lines
  159. for pool in sorted(self._get_df()['pools'], key=lambda x: sorted(x.keys())):
  160. self.definitions['pool_usage']['lines'].append([pool['name'],
  161. pool['name'],
  162. 'absolute'])
  163. self.definitions['pool_objects']['lines'].append(["obj_{0}".format(pool['name']),
  164. pool['name'],
  165. 'absolute'])
  166. self.definitions['pool_read_bytes']['lines'].append(['read_{0}'.format(pool['name']),
  167. pool['name'],
  168. 'absolute', 1, 1024])
  169. self.definitions['pool_write_bytes']['lines'].append(['write_{0}'.format(pool['name']),
  170. pool['name'],
  171. 'absolute', 1, 1024])
  172. self.definitions['pool_read_operations']['lines'].append(['read_operations_{0}'.format(pool['name']),
  173. pool['name'],
  174. 'absolute'])
  175. self.definitions['pool_write_operations']['lines'].append(['write_operations_{0}'.format(pool['name']),
  176. pool['name'],
  177. 'absolute'])
  178. # OSD lines
  179. for osd in sorted(self._get_osd_df()['nodes'], key=lambda x: sorted(x.keys())):
  180. self.definitions['osd_usage']['lines'].append([osd['name'],
  181. osd['name'],
  182. 'absolute'])
  183. self.definitions['osd_size']['lines'].append(['size_{0}'.format(osd['name']),
  184. osd['name'],
  185. 'absolute'])
  186. self.definitions['osd_apply_latency']['lines'].append(['apply_latency_{0}'.format(osd['name']),
  187. osd['name'],
  188. 'absolute'])
  189. self.definitions['osd_commit_latency']['lines'].append(['commit_latency_{0}'.format(osd['name']),
  190. osd['name'],
  191. 'absolute'])
  192. def get_data(self):
  193. """
  194. Catch all ceph data
  195. :return: dict
  196. """
  197. try:
  198. data = {}
  199. df = self._get_df()
  200. osd_df = self._get_osd_df()
  201. osd_perf = self._get_osd_perf()
  202. osd_perf_infos = get_osd_perf_infos(osd_perf)
  203. pool_stats = self._get_osd_pool_stats()
  204. data.update(self._get_general(osd_perf_infos, pool_stats))
  205. for pool in df['pools']:
  206. data.update(self._get_pool_usage(pool))
  207. data.update(self._get_pool_objects(pool))
  208. for pool_io in pool_stats:
  209. data.update(self._get_pool_rw(pool_io))
  210. for osd in osd_df['nodes']:
  211. data.update(self._get_osd_usage(osd))
  212. data.update(self._get_osd_size(osd))
  213. for osd_apply_commit in osd_perf_infos:
  214. data.update(self._get_osd_latency(osd_apply_commit))
  215. return data
  216. except (ValueError, AttributeError) as error:
  217. self.error(error)
  218. return None
  219. def _get_general(self, osd_perf_infos, pool_stats):
  220. """
  221. Get ceph's general usage
  222. :return: dict
  223. """
  224. status = self.cluster.get_cluster_stats()
  225. read_bytes_sec = 0
  226. write_bytes_sec = 0
  227. read_op_per_sec = 0
  228. write_op_per_sec = 0
  229. apply_latency = 0
  230. commit_latency = 0
  231. for pool_rw_io_b in pool_stats:
  232. read_bytes_sec += pool_rw_io_b['client_io_rate'].get('read_bytes_sec', 0)
  233. write_bytes_sec += pool_rw_io_b['client_io_rate'].get('write_bytes_sec', 0)
  234. read_op_per_sec += pool_rw_io_b['client_io_rate'].get('read_op_per_sec', 0)
  235. write_op_per_sec += pool_rw_io_b['client_io_rate'].get('write_op_per_sec', 0)
  236. for perf in osd_perf_infos:
  237. apply_latency += perf['perf_stats']['apply_latency_ms']
  238. commit_latency += perf['perf_stats']['commit_latency_ms']
  239. return {
  240. 'general_usage': int(status['kb_used']),
  241. 'general_available': int(status['kb_avail']),
  242. 'general_objects': int(status['num_objects']),
  243. 'general_read_bytes': read_bytes_sec,
  244. 'general_write_bytes': write_bytes_sec,
  245. 'general_read_operations': read_op_per_sec,
  246. 'general_write_operations': write_op_per_sec,
  247. 'general_apply_latency': apply_latency,
  248. 'general_commit_latency': commit_latency
  249. }
  250. @staticmethod
  251. def _get_pool_usage(pool):
  252. """
  253. Process raw data into pool usage dict information
  254. :return: A pool dict with pool name's key and usage bytes' value
  255. """
  256. return {pool['name']: pool['stats']['kb_used']}
  257. @staticmethod
  258. def _get_pool_objects(pool):
  259. """
  260. Process raw data into pool usage dict information
  261. :return: A pool dict with pool name's key and object numbers
  262. """
  263. return {'obj_{0}'.format(pool['name']): pool['stats']['objects']}
  264. @staticmethod
  265. def _get_pool_rw(pool):
  266. """
  267. Get read/write kb and operations in a pool
  268. :return: A pool dict with both read/write bytes and operations.
  269. """
  270. return {
  271. 'read_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_bytes_sec', 0)),
  272. 'write_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_bytes_sec', 0)),
  273. 'read_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_op_per_sec', 0)),
  274. 'write_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_op_per_sec', 0))
  275. }
  276. @staticmethod
  277. def _get_osd_usage(osd):
  278. """
  279. Process raw data into osd dict information to get osd usage
  280. :return: A osd dict with osd name's key and usage bytes' value
  281. """
  282. return {osd['name']: float(osd['kb_used'])}
  283. @staticmethod
  284. def _get_osd_size(osd):
  285. """
  286. Process raw data into osd dict information to get osd size (kb)
  287. :return: A osd dict with osd name's key and size bytes' value
  288. """
  289. return {'size_{0}'.format(osd['name']): float(osd['kb'])}
  290. @staticmethod
  291. def _get_osd_latency(osd):
  292. """
  293. Get ceph osd apply and commit latency
  294. :return: A osd dict with osd name's key with both apply and commit latency values
  295. """
  296. return {
  297. 'apply_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['apply_latency_ms'],
  298. 'commit_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['commit_latency_ms']
  299. }
  300. def _get_df(self):
  301. """
  302. Get ceph df output
  303. :return: ceph df --format json
  304. """
  305. return json.loads(self.cluster.mon_command(json.dumps({
  306. 'prefix': 'df',
  307. 'format': 'json'
  308. }), b'')[1].decode('utf-8'))
  309. def _get_osd_df(self):
  310. """
  311. Get ceph osd df output
  312. :return: ceph osd df --format json
  313. """
  314. return json.loads(self.cluster.mon_command(json.dumps({
  315. 'prefix': 'osd df',
  316. 'format': 'json'
  317. }), b'')[1].decode('utf-8').replace('-nan', '"-nan"'))
  318. def _get_osd_perf(self):
  319. """
  320. Get ceph osd performance
  321. :return: ceph osd perf --format json
  322. """
  323. return json.loads(self.cluster.mon_command(json.dumps({
  324. 'prefix': 'osd perf',
  325. 'format': 'json'
  326. }), b'')[1].decode('utf-8'))
  327. def _get_osd_pool_stats(self):
  328. """
  329. Get ceph osd pool status.
  330. This command is used to get information about both
  331. read/write operation and bytes per second on each pool
  332. :return: ceph osd pool stats --format json
  333. """
  334. return json.loads(self.cluster.mon_command(json.dumps({
  335. 'prefix': 'osd pool stats',
  336. 'format': 'json'
  337. }), b'')[1].decode('utf-8'))
  338. def get_osd_perf_infos(osd_perf):
  339. # https://github.com/netdata/netdata/issues/8247
  340. # module uses 'osd_perf_infos' data, its been moved under 'osdstats` since Ceph v14.2
  341. if 'osd_perf_infos' in osd_perf:
  342. return osd_perf['osd_perf_infos']
  343. return osd_perf['osdstats']['osd_perf_infos']