# -*- coding: utf-8 -*-
# Description: ceph netdata python.d module
# Author: Luis Eduardo (lets00)
# SPDX-License-Identifier: GPL-3.0-or-later
import copy
import json
import os

try:
    import rados
    CEPH = True
except ImportError:
    CEPH = False

from bases.FrameworkServices.SimpleService import SimpleService
  13. # default module values (can be overridden per job in `config`)
  14. update_every = 10
  15. ORDER = [
  16. 'general_usage',
  17. 'general_objects',
  18. 'general_bytes',
  19. 'general_operations',
  20. 'general_latency',
  21. 'pool_usage',
  22. 'pool_objects',
  23. 'pool_read_bytes',
  24. 'pool_write_bytes',
  25. 'pool_read_operations',
  26. 'pool_write_operations',
  27. 'osd_usage',
  28. 'osd_apply_latency',
  29. 'osd_commit_latency'
  30. ]
  31. CHARTS = {
  32. 'general_usage': {
  33. 'options': [None, 'Ceph General Space', 'KiB', 'general', 'ceph.general_usage', 'stacked'],
  34. 'lines': [
  35. ['general_available', 'avail', 'absolute'],
  36. ['general_usage', 'used', 'absolute']
  37. ]
  38. },
  39. 'general_objects': {
  40. 'options': [None, 'Ceph General Objects', 'objects', 'general', 'ceph.general_objects', 'area'],
  41. 'lines': [
  42. ['general_objects', 'cluster', 'absolute']
  43. ]
  44. },
  45. 'general_bytes': {
  46. 'options': [None, 'Ceph General Read/Write Data/s', 'KiB/s', 'general', 'ceph.general_bytes',
  47. 'area'],
  48. 'lines': [
  49. ['general_read_bytes', 'read', 'absolute', 1, 1024],
  50. ['general_write_bytes', 'write', 'absolute', -1, 1024]
  51. ]
  52. },
  53. 'general_operations': {
  54. 'options': [None, 'Ceph General Read/Write Operations/s', 'operations', 'general', 'ceph.general_operations',
  55. 'area'],
  56. 'lines': [
  57. ['general_read_operations', 'read', 'absolute', 1],
  58. ['general_write_operations', 'write', 'absolute', -1]
  59. ]
  60. },
  61. 'general_latency': {
  62. 'options': [None, 'Ceph General Apply/Commit latency', 'milliseconds', 'general', 'ceph.general_latency',
  63. 'area'],
  64. 'lines': [
  65. ['general_apply_latency', 'apply', 'absolute'],
  66. ['general_commit_latency', 'commit', 'absolute']
  67. ]
  68. },
  69. 'pool_usage': {
  70. 'options': [None, 'Ceph Pools', 'KiB', 'pool', 'ceph.pool_usage', 'line'],
  71. 'lines': []
  72. },
  73. 'pool_objects': {
  74. 'options': [None, 'Ceph Pools', 'objects', 'pool', 'ceph.pool_objects', 'line'],
  75. 'lines': []
  76. },
  77. 'pool_read_bytes': {
  78. 'options': [None, 'Ceph Read Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_read_bytes', 'area'],
  79. 'lines': []
  80. },
  81. 'pool_write_bytes': {
  82. 'options': [None, 'Ceph Write Pool Data/s', 'KiB/s', 'pool', 'ceph.pool_write_bytes', 'area'],
  83. 'lines': []
  84. },
  85. 'pool_read_operations': {
  86. 'options': [None, 'Ceph Read Pool Operations/s', 'operations', 'pool', 'ceph.pool_read_operations', 'area'],
  87. 'lines': []
  88. },
  89. 'pool_write_operations': {
  90. 'options': [None, 'Ceph Write Pool Operations/s', 'operations', 'pool', 'ceph.pool_write_operations', 'area'],
  91. 'lines': []
  92. },
  93. 'osd_usage': {
  94. 'options': [None, 'Ceph OSDs', 'KiB', 'osd', 'ceph.osd_usage', 'line'],
  95. 'lines': []
  96. },
  97. 'osd_apply_latency': {
  98. 'options': [None, 'Ceph OSDs apply latency', 'milliseconds', 'osd', 'ceph.apply_latency', 'line'],
  99. 'lines': []
  100. },
  101. 'osd_commit_latency': {
  102. 'options': [None, 'Ceph OSDs commit latency', 'milliseconds', 'osd', 'ceph.commit_latency', 'line'],
  103. 'lines': []
  104. }
  105. }
  106. class Service(SimpleService):
  107. def __init__(self, configuration=None, name=None):
  108. SimpleService.__init__(self, configuration=configuration, name=name)
  109. self.order = ORDER
  110. self.definitions = CHARTS
  111. self.config_file = self.configuration.get('config_file')
  112. self.keyring_file = self.configuration.get('keyring_file')
  113. def check(self):
  114. """
  115. Checks module
  116. :return:
  117. """
  118. if not CEPH:
  119. self.error('rados module is needed to use ceph.chart.py')
  120. return False
  121. if not (self.config_file and self.keyring_file):
  122. self.error('config_file and/or keyring_file is not defined')
  123. return False
  124. # Verify files and permissions
  125. if not (os.access(self.config_file, os.F_OK)):
  126. self.error('{0} does not exist'.format(self.config_file))
  127. return False
  128. if not (os.access(self.keyring_file, os.F_OK)):
  129. self.error('{0} does not exist'.format(self.keyring_file))
  130. return False
  131. if not (os.access(self.config_file, os.R_OK)):
  132. self.error('Ceph plugin does not read {0}, define read permission.'.format(self.config_file))
  133. return False
  134. if not (os.access(self.keyring_file, os.R_OK)):
  135. self.error('Ceph plugin does not read {0}, define read permission.'.format(self.keyring_file))
  136. return False
  137. try:
  138. self.cluster = rados.Rados(conffile=self.config_file,
  139. conf=dict(keyring=self.keyring_file))
  140. self.cluster.connect()
  141. except rados.Error as error:
  142. self.error(error)
  143. return False
  144. self.create_definitions()
  145. return True
  146. def create_definitions(self):
  147. """
  148. Create dynamically charts options
  149. :return: None
  150. """
  151. # Pool lines
  152. for pool in sorted(self._get_df()['pools'], key=lambda x:sorted(x.keys())):
  153. self.definitions['pool_usage']['lines'].append([pool['name'],
  154. pool['name'],
  155. 'absolute'])
  156. self.definitions['pool_objects']['lines'].append(["obj_{0}".format(pool['name']),
  157. pool['name'],
  158. 'absolute'])
  159. self.definitions['pool_read_bytes']['lines'].append(['read_{0}'.format(pool['name']),
  160. pool['name'],
  161. 'absolute', 1, 1024])
  162. self.definitions['pool_write_bytes']['lines'].append(['write_{0}'.format(pool['name']),
  163. pool['name'],
  164. 'absolute', 1, 1024])
  165. self.definitions['pool_read_operations']['lines'].append(['read_operations_{0}'.format(pool['name']),
  166. pool['name'],
  167. 'absolute'])
  168. self.definitions['pool_write_operations']['lines'].append(['write_operations_{0}'.format(pool['name']),
  169. pool['name'],
  170. 'absolute'])
  171. # OSD lines
  172. for osd in sorted(self._get_osd_df()['nodes'], key=lambda x:sorted(x.keys())):
  173. self.definitions['osd_usage']['lines'].append([osd['name'],
  174. osd['name'],
  175. 'absolute'])
  176. self.definitions['osd_apply_latency']['lines'].append(['apply_latency_{0}'.format(osd['name']),
  177. osd['name'],
  178. 'absolute'])
  179. self.definitions['osd_commit_latency']['lines'].append(['commit_latency_{0}'.format(osd['name']),
  180. osd['name'],
  181. 'absolute'])
  182. def get_data(self):
  183. """
  184. Catch all ceph data
  185. :return: dict
  186. """
  187. try:
  188. data = {}
  189. df = self._get_df()
  190. osd_df = self._get_osd_df()
  191. osd_perf = self._get_osd_perf()
  192. pool_stats = self._get_osd_pool_stats()
  193. data.update(self._get_general(osd_perf, pool_stats))
  194. for pool in df['pools']:
  195. data.update(self._get_pool_usage(pool))
  196. data.update(self._get_pool_objects(pool))
  197. for pool_io in pool_stats:
  198. data.update(self._get_pool_rw(pool_io))
  199. for osd in osd_df['nodes']:
  200. data.update(self._get_osd_usage(osd))
  201. for osd_apply_commit in osd_perf['osd_perf_infos']:
  202. data.update(self._get_osd_latency(osd_apply_commit))
  203. return data
  204. except (ValueError, AttributeError) as error:
  205. self.error(error)
  206. return None
  207. def _get_general(self, osd_perf, pool_stats):
  208. """
  209. Get ceph's general usage
  210. :return: dict
  211. """
  212. status = self.cluster.get_cluster_stats()
  213. read_bytes_sec = 0
  214. write_bytes_sec = 0
  215. read_op_per_sec = 0
  216. write_op_per_sec = 0
  217. apply_latency = 0
  218. commit_latency = 0
  219. for pool_rw_io_b in pool_stats:
  220. read_bytes_sec += pool_rw_io_b['client_io_rate'].get('read_bytes_sec', 0)
  221. write_bytes_sec += pool_rw_io_b['client_io_rate'].get('write_bytes_sec', 0)
  222. read_op_per_sec += pool_rw_io_b['client_io_rate'].get('read_op_per_sec', 0)
  223. write_op_per_sec += pool_rw_io_b['client_io_rate'].get('write_op_per_sec', 0)
  224. for perf in osd_perf['osd_perf_infos']:
  225. apply_latency += perf['perf_stats']['apply_latency_ms']
  226. commit_latency += perf['perf_stats']['commit_latency_ms']
  227. return {
  228. 'general_usage': int(status['kb_used']),
  229. 'general_available': int(status['kb_avail']),
  230. 'general_objects': int(status['num_objects']),
  231. 'general_read_bytes': read_bytes_sec,
  232. 'general_write_bytes': write_bytes_sec,
  233. 'general_read_operations': read_op_per_sec,
  234. 'general_write_operations': write_op_per_sec,
  235. 'general_apply_latency': apply_latency,
  236. 'general_commit_latency': commit_latency
  237. }
  238. @staticmethod
  239. def _get_pool_usage(pool):
  240. """
  241. Process raw data into pool usage dict information
  242. :return: A pool dict with pool name's key and usage bytes' value
  243. """
  244. return {pool['name']: pool['stats']['kb_used']}
  245. @staticmethod
  246. def _get_pool_objects(pool):
  247. """
  248. Process raw data into pool usage dict information
  249. :return: A pool dict with pool name's key and object numbers
  250. """
  251. return {'obj_{0}'.format(pool['name']): pool['stats']['objects']}
  252. @staticmethod
  253. def _get_pool_rw(pool):
  254. """
  255. Get read/write kb and operations in a pool
  256. :return: A pool dict with both read/write bytes and operations.
  257. """
  258. return {
  259. 'read_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_bytes_sec', 0)),
  260. 'write_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_bytes_sec', 0)),
  261. 'read_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('read_op_per_sec', 0)),
  262. 'write_operations_{0}'.format(pool['pool_name']): int(pool['client_io_rate'].get('write_op_per_sec', 0))
  263. }
  264. @staticmethod
  265. def _get_osd_usage(osd):
  266. """
  267. Process raw data into osd dict information to get osd usage
  268. :return: A osd dict with osd name's key and usage bytes' value
  269. """
  270. return {osd['name']: float(osd['kb_used'])}
  271. @staticmethod
  272. def _get_osd_latency(osd):
  273. """
  274. Get ceph osd apply and commit latency
  275. :return: A osd dict with osd name's key with both apply and commit latency values
  276. """
  277. return {
  278. 'apply_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['apply_latency_ms'],
  279. 'commit_latency_osd.{0}'.format(osd['id']): osd['perf_stats']['commit_latency_ms']
  280. }
  281. def _get_df(self):
  282. """
  283. Get ceph df output
  284. :return: ceph df --format json
  285. """
  286. return json.loads(self.cluster.mon_command(json.dumps({
  287. 'prefix': 'df',
  288. 'format': 'json'
  289. }), '')[1].decode('utf-8'))
  290. def _get_osd_df(self):
  291. """
  292. Get ceph osd df output
  293. :return: ceph osd df --format json
  294. """
  295. return json.loads(self.cluster.mon_command(json.dumps({
  296. 'prefix': 'osd df',
  297. 'format': 'json'
  298. }), '')[1].decode('utf-8').replace('-nan', '"-nan"'))
  299. def _get_osd_perf(self):
  300. """
  301. Get ceph osd performance
  302. :return: ceph osd perf --format json
  303. """
  304. return json.loads(self.cluster.mon_command(json.dumps({
  305. 'prefix': 'osd perf',
  306. 'format': 'json'
  307. }), '')[1].decode('utf-8'))
  308. def _get_osd_pool_stats(self):
  309. """
  310. Get ceph osd pool status.
  311. This command is used to get information about both
  312. read/write operation and bytes per second on each pool
  313. :return: ceph osd pool stats --format json
  314. """
  315. return json.loads(self.cluster.mon_command(json.dumps({
  316. 'prefix': 'osd pool stats',
  317. 'format': 'json'
  318. }), '')[1].decode('utf-8'))