nvidia_smi.chart.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. # -*- coding: utf-8 -*-
  2. # Description: nvidia-smi netdata python.d module
  3. # Original Author: Steven Noonan (tycho)
  4. # Author: Ilya Mashchenko (ilyam8)
  5. # User Memory Stat Author: Guido Scatena (scatenag)
  6. import os
  7. import pwd
  8. import subprocess
  9. import threading
  10. import xml.etree.ElementTree as et
  11. from bases.FrameworkServices.SimpleService import SimpleService
  12. from bases.collection import find_binary
  13. disabled_by_default = True
  14. NVIDIA_SMI = 'nvidia-smi'
  15. NOT_AVAILABLE = 'N/A'
  16. EMPTY_ROW = ''
  17. EMPTY_ROW_LIMIT = 500
  18. POLLER_BREAK_ROW = '</nvidia_smi_log>'
  19. PCI_BANDWIDTH = 'pci_bandwidth'
  20. PCI_BANDWIDTH_PERCENT = 'pci_bandwidth_percent'
  21. FAN_SPEED = 'fan_speed'
  22. GPU_UTIL = 'gpu_utilization'
  23. MEM_UTIL = 'mem_utilization'
  24. ENCODER_UTIL = 'encoder_utilization'
  25. MEM_USAGE = 'mem_usage'
  26. BAR_USAGE = 'bar1_mem_usage'
  27. TEMPERATURE = 'temperature'
  28. CLOCKS = 'clocks'
  29. POWER = 'power'
  30. POWER_STATE = 'power_state'
  31. PROCESSES_MEM = 'processes_mem'
  32. USER_MEM = 'user_mem'
  33. USER_NUM = 'user_num'
  34. ORDER = [
  35. PCI_BANDWIDTH,
  36. PCI_BANDWIDTH_PERCENT,
  37. FAN_SPEED,
  38. GPU_UTIL,
  39. MEM_UTIL,
  40. ENCODER_UTIL,
  41. MEM_USAGE,
  42. BAR_USAGE,
  43. TEMPERATURE,
  44. CLOCKS,
  45. POWER,
  46. POWER_STATE,
  47. PROCESSES_MEM,
  48. USER_MEM,
  49. USER_NUM,
  50. ]
  51. # https://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/group__gpupstate.html
  52. POWER_STATES = ['P' + str(i) for i in range(0, 16)]
  53. # PCI Transfer data rate in gigabits per second (Gb/s) per generation
  54. PCI_SPEED = {
  55. "1": 2.5,
  56. "2": 5,
  57. "3": 8,
  58. "4": 16,
  59. "5": 32
  60. }
  61. # PCI encoding per generation
  62. PCI_ENCODING = {
  63. "1": 2/10,
  64. "2": 2/10,
  65. "3": 2/130,
  66. "4": 2/130,
  67. "5": 2/130
  68. }
  69. def gpu_charts(gpu):
  70. fam = gpu.full_name()
  71. charts = {
  72. PCI_BANDWIDTH: {
  73. 'options': [None, 'PCI Express Bandwidth Utilization', 'KiB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
  74. 'lines': [
  75. ['rx_util', 'rx', 'absolute', 1, 1],
  76. ['tx_util', 'tx', 'absolute', 1, -1],
  77. ]
  78. },
  79. PCI_BANDWIDTH_PERCENT: {
  80. 'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
  81. 'lines': [
  82. ['rx_util_percent', 'rx_percent'],
  83. ['tx_util_percent', 'tx_percent'],
  84. ]
  85. },
  86. FAN_SPEED: {
  87. 'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
  88. 'lines': [
  89. ['fan_speed', 'speed'],
  90. ]
  91. },
  92. GPU_UTIL: {
  93. 'options': [None, 'GPU Utilization', 'percentage', fam, 'nvidia_smi.gpu_utilization', 'line'],
  94. 'lines': [
  95. ['gpu_util', 'utilization'],
  96. ]
  97. },
  98. MEM_UTIL: {
  99. 'options': [None, 'Memory Bandwidth Utilization', 'percentage', fam, 'nvidia_smi.mem_utilization', 'line'],
  100. 'lines': [
  101. ['memory_util', 'utilization'],
  102. ]
  103. },
  104. ENCODER_UTIL: {
  105. 'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization',
  106. 'line'],
  107. 'lines': [
  108. ['encoder_util', 'encoder'],
  109. ['decoder_util', 'decoder'],
  110. ]
  111. },
  112. MEM_USAGE: {
  113. 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
  114. 'lines': [
  115. ['fb_memory_free', 'free'],
  116. ['fb_memory_used', 'used'],
  117. ]
  118. },
  119. BAR_USAGE: {
  120. 'options': [None, 'Bar1 Memory Usage', 'MiB', fam, 'nvidia_smi.bar1_memory_usage', 'stacked'],
  121. 'lines': [
  122. ['bar1_memory_free', 'free'],
  123. ['bar1_memory_used', 'used'],
  124. ]
  125. },
  126. TEMPERATURE: {
  127. 'options': [None, 'Temperature', 'celsius', fam, 'nvidia_smi.temperature', 'line'],
  128. 'lines': [
  129. ['gpu_temp', 'temp'],
  130. ]
  131. },
  132. CLOCKS: {
  133. 'options': [None, 'Clock Frequencies', 'MHz', fam, 'nvidia_smi.clocks', 'line'],
  134. 'lines': [
  135. ['graphics_clock', 'graphics'],
  136. ['video_clock', 'video'],
  137. ['sm_clock', 'sm'],
  138. ['mem_clock', 'mem'],
  139. ]
  140. },
  141. POWER: {
  142. 'options': [None, 'Power Utilization', 'Watts', fam, 'nvidia_smi.power', 'line'],
  143. 'lines': [
  144. ['power_draw', 'power', 'absolute', 1, 100],
  145. ]
  146. },
  147. POWER_STATE: {
  148. 'options': [None, 'Power State', 'state', fam, 'nvidia_smi.power_state', 'line'],
  149. 'lines': [['power_state_' + v.lower(), v, 'absolute'] for v in POWER_STATES]
  150. },
  151. PROCESSES_MEM: {
  152. 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
  153. 'lines': []
  154. },
  155. USER_MEM: {
  156. 'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
  157. 'lines': []
  158. },
  159. USER_NUM: {
  160. 'options': [None, 'Number of User on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
  161. 'lines': [
  162. ['user_num', 'users'],
  163. ]
  164. },
  165. }
  166. idx = gpu.num
  167. order = ['gpu{0}_{1}'.format(idx, v) for v in ORDER]
  168. charts = dict(('gpu{0}_{1}'.format(idx, k), v) for k, v in charts.items())
  169. for chart in charts.values():
  170. for line in chart['lines']:
  171. line[0] = 'gpu{0}_{1}'.format(idx, line[0])
  172. return order, charts
  173. class NvidiaSMI:
  174. def __init__(self):
  175. self.command = find_binary(NVIDIA_SMI)
  176. self.active_proc = None
  177. def run_once(self):
  178. proc = subprocess.Popen([self.command, '-x', '-q'], stdout=subprocess.PIPE)
  179. stdout, _ = proc.communicate()
  180. return stdout
  181. def run_loop(self, interval):
  182. if self.active_proc:
  183. self.kill()
  184. proc = subprocess.Popen([self.command, '-x', '-q', '-l', str(interval)], stdout=subprocess.PIPE)
  185. self.active_proc = proc
  186. return proc.stdout
  187. def kill(self):
  188. if self.active_proc:
  189. self.active_proc.kill()
  190. self.active_proc = None
  191. class NvidiaSMIPoller(threading.Thread):
  192. def __init__(self, poll_interval):
  193. threading.Thread.__init__(self)
  194. self.daemon = True
  195. self.smi = NvidiaSMI()
  196. self.interval = poll_interval
  197. self.lock = threading.RLock()
  198. self.last_data = str()
  199. self.exit = False
  200. self.empty_rows = 0
  201. self.rows = list()
  202. def has_smi(self):
  203. return bool(self.smi.command)
  204. def run_once(self):
  205. return self.smi.run_once()
  206. def run(self):
  207. out = self.smi.run_loop(self.interval)
  208. for row in out:
  209. if self.exit or self.empty_rows > EMPTY_ROW_LIMIT:
  210. break
  211. self.process_row(row)
  212. self.smi.kill()
  213. def process_row(self, row):
  214. row = row.decode()
  215. self.empty_rows += (row == EMPTY_ROW)
  216. self.rows.append(row)
  217. if POLLER_BREAK_ROW in row:
  218. self.lock.acquire()
  219. self.last_data = '\n'.join(self.rows)
  220. self.lock.release()
  221. self.rows = list()
  222. self.empty_rows = 0
  223. def is_started(self):
  224. return self.ident is not None
  225. def shutdown(self):
  226. self.exit = True
  227. def data(self):
  228. self.lock.acquire()
  229. data = self.last_data
  230. self.lock.release()
  231. return data
  232. def handle_attr_error(method):
  233. def on_call(*args, **kwargs):
  234. try:
  235. return method(*args, **kwargs)
  236. except AttributeError:
  237. return None
  238. return on_call
  239. def handle_value_error(method):
  240. def on_call(*args, **kwargs):
  241. try:
  242. return method(*args, **kwargs)
  243. except ValueError:
  244. return None
  245. return on_call
  246. HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
  247. ETC_PASSWD_PATH = '/etc/passwd'
  248. PROC_PATH = '/proc'
  249. IS_INSIDE_DOCKER = False
  250. if HOST_PREFIX:
  251. ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
  252. PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
  253. IS_INSIDE_DOCKER = True
  254. def read_passwd_file():
  255. data = dict()
  256. with open(ETC_PASSWD_PATH, 'r') as f:
  257. for line in f:
  258. line = line.strip()
  259. if line.startswith("#"):
  260. continue
  261. fields = line.split(":")
  262. # name, passwd, uid, gid, comment, home_dir, shell
  263. if len(fields) != 7:
  264. continue
  265. # uid, guid
  266. fields[2], fields[3] = int(fields[2]), int(fields[3])
  267. data[fields[2]] = fields
  268. return data
  269. def read_passwd_file_safe():
  270. try:
  271. if IS_INSIDE_DOCKER:
  272. return read_passwd_file()
  273. return dict((k[2], k) for k in pwd.getpwall())
  274. except (OSError, IOError):
  275. return dict()
  276. def get_username_by_pid_safe(pid, passwd_file):
  277. path = os.path.join(PROC_PATH, pid)
  278. try:
  279. uid = os.stat(path).st_uid
  280. except (OSError, IOError):
  281. return ''
  282. try:
  283. if IS_INSIDE_DOCKER:
  284. return passwd_file[uid][0]
  285. return pwd.getpwuid(uid)[0]
  286. except KeyError:
  287. return str(uid)
  288. class GPU:
  289. def __init__(self, num, root, exclude_zero_memory_users=False):
  290. self.num = num
  291. self.root = root
  292. self.exclude_zero_memory_users = exclude_zero_memory_users
  293. def id(self):
  294. return self.root.get('id')
  295. def name(self):
  296. return self.root.find('product_name').text
  297. def full_name(self):
  298. return 'gpu{0} {1}'.format(self.num, self.name())
  299. @handle_attr_error
  300. def pci_link_gen(self):
  301. return self.root.find('pci').find('pci_gpu_link_info').find('pcie_gen').find('max_link_gen').text
  302. @handle_attr_error
  303. def pci_link_width(self):
  304. return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
  305. def pci_bw_max(self):
  306. link_gen = self.pci_link_gen()
  307. link_width = int(self.pci_link_width())
  308. if link_gen not in PCI_SPEED or link_gen not in PCI_ENCODING or not link_width:
  309. return None
  310. # Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
  311. # see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
  312. # return max bandwidth in kilobytes per second (kB/s)
  313. return (PCI_SPEED[link_gen] * link_width * (1- PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
  314. @handle_attr_error
  315. def rx_util(self):
  316. return self.root.find('pci').find('rx_util').text.split()[0]
  317. @handle_attr_error
  318. def tx_util(self):
  319. return self.root.find('pci').find('tx_util').text.split()[0]
  320. @handle_attr_error
  321. def fan_speed(self):
  322. return self.root.find('fan_speed').text.split()[0]
  323. @handle_attr_error
  324. def gpu_util(self):
  325. return self.root.find('utilization').find('gpu_util').text.split()[0]
  326. @handle_attr_error
  327. def memory_util(self):
  328. return self.root.find('utilization').find('memory_util').text.split()[0]
  329. @handle_attr_error
  330. def encoder_util(self):
  331. return self.root.find('utilization').find('encoder_util').text.split()[0]
  332. @handle_attr_error
  333. def decoder_util(self):
  334. return self.root.find('utilization').find('decoder_util').text.split()[0]
  335. @handle_attr_error
  336. def fb_memory_used(self):
  337. return self.root.find('fb_memory_usage').find('used').text.split()[0]
  338. @handle_attr_error
  339. def fb_memory_free(self):
  340. return self.root.find('fb_memory_usage').find('free').text.split()[0]
  341. @handle_attr_error
  342. def bar1_memory_used(self):
  343. return self.root.find('bar1_memory_usage').find('used').text.split()[0]
  344. @handle_attr_error
  345. def bar1_memory_free(self):
  346. return self.root.find('bar1_memory_usage').find('free').text.split()[0]
  347. @handle_attr_error
  348. def temperature(self):
  349. return self.root.find('temperature').find('gpu_temp').text.split()[0]
  350. @handle_attr_error
  351. def graphics_clock(self):
  352. return self.root.find('clocks').find('graphics_clock').text.split()[0]
  353. @handle_attr_error
  354. def video_clock(self):
  355. return self.root.find('clocks').find('video_clock').text.split()[0]
  356. @handle_attr_error
  357. def sm_clock(self):
  358. return self.root.find('clocks').find('sm_clock').text.split()[0]
  359. @handle_attr_error
  360. def mem_clock(self):
  361. return self.root.find('clocks').find('mem_clock').text.split()[0]
  362. @handle_attr_error
  363. def power_state(self):
  364. return str(self.root.find('power_readings').find('power_state').text.split()[0])
  365. @handle_value_error
  366. @handle_attr_error
  367. def power_draw(self):
  368. return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
  369. @handle_attr_error
  370. def processes(self):
  371. processes_info = self.root.find('processes').findall('process_info')
  372. if not processes_info:
  373. return list()
  374. passwd_file = read_passwd_file_safe()
  375. processes = list()
  376. for info in processes_info:
  377. pid = info.find('pid').text
  378. processes.append({
  379. 'pid': int(pid),
  380. 'process_name': info.find('process_name').text,
  381. 'used_memory': int(info.find('used_memory').text.split()[0]),
  382. 'username': get_username_by_pid_safe(pid, passwd_file),
  383. })
  384. return processes
  385. def data(self):
  386. data = {
  387. 'rx_util': self.rx_util(),
  388. 'tx_util': self.tx_util(),
  389. 'fan_speed': self.fan_speed(),
  390. 'gpu_util': self.gpu_util(),
  391. 'memory_util': self.memory_util(),
  392. 'encoder_util': self.encoder_util(),
  393. 'decoder_util': self.decoder_util(),
  394. 'fb_memory_used': self.fb_memory_used(),
  395. 'fb_memory_free': self.fb_memory_free(),
  396. 'bar1_memory_used': self.bar1_memory_used(),
  397. 'bar1_memory_free': self.bar1_memory_free(),
  398. 'gpu_temp': self.temperature(),
  399. 'graphics_clock': self.graphics_clock(),
  400. 'video_clock': self.video_clock(),
  401. 'sm_clock': self.sm_clock(),
  402. 'mem_clock': self.mem_clock(),
  403. 'power_draw': self.power_draw(),
  404. }
  405. if self.rx_util() != NOT_AVAILABLE and self.tx_util() != NOT_AVAILABLE:
  406. pci_bw_max = self.pci_bw_max()
  407. if not pci_bw_max:
  408. data['rx_util_percent'] = 0
  409. data['tx_util_percent'] = 0
  410. else:
  411. data['rx_util_percent'] = str(int(int(self.rx_util()) * 100 / self.pci_bw_max()))
  412. data['tx_util_percent'] = str(int(int(self.tx_util()) * 100 / self.pci_bw_max()))
  413. for v in POWER_STATES:
  414. data['power_state_' + v.lower()] = 0
  415. p_state = self.power_state()
  416. if p_state:
  417. data['power_state_' + p_state.lower()] = 1
  418. processes = self.processes() or []
  419. users = set()
  420. for p in processes:
  421. data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
  422. if p['username']:
  423. if self.exclude_zero_memory_users and p['used_memory'] == 0:
  424. continue
  425. users.add(p['username'])
  426. key = 'user_mem_{0}'.format(p['username'])
  427. if key in data:
  428. data[key] += p['used_memory']
  429. else:
  430. data[key] = p['used_memory']
  431. data['user_num'] = len(users)
  432. return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items())
  433. class Service(SimpleService):
  434. def __init__(self, configuration=None, name=None):
  435. super(Service, self).__init__(configuration=configuration, name=name)
  436. self.order = list()
  437. self.definitions = dict()
  438. self.loop_mode = configuration.get('loop_mode', True)
  439. poll = int(configuration.get('poll_seconds', self.get_update_every()))
  440. self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False)
  441. self.poller = NvidiaSMIPoller(poll)
  442. def get_data_loop_mode(self):
  443. if not self.poller.is_started():
  444. self.poller.start()
  445. if not self.poller.is_alive():
  446. self.debug('poller is off')
  447. return None
  448. return self.poller.data()
  449. def get_data_normal_mode(self):
  450. return self.poller.run_once()
  451. def get_data(self):
  452. if self.loop_mode:
  453. last_data = self.get_data_loop_mode()
  454. else:
  455. last_data = self.get_data_normal_mode()
  456. if not last_data:
  457. return None
  458. parsed = self.parse_xml(last_data)
  459. if parsed is None:
  460. return None
  461. data = dict()
  462. for idx, root in enumerate(parsed.findall('gpu')):
  463. gpu = GPU(idx, root, self.exclude_zero_memory_users)
  464. gpu_data = gpu.data()
  465. # self.debug(gpu_data)
  466. gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v))
  467. data.update(gpu_data)
  468. self.update_processes_mem_chart(gpu)
  469. self.update_processes_user_mem_chart(gpu)
  470. return data or None
  471. def update_processes_mem_chart(self, gpu):
  472. ps = gpu.processes()
  473. if not ps:
  474. return
  475. chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
  476. active_dim_ids = []
  477. for p in ps:
  478. dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
  479. active_dim_ids.append(dim_id)
  480. if dim_id not in chart:
  481. chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
  482. for dim in chart:
  483. if dim.id not in active_dim_ids:
  484. chart.del_dimension(dim.id, hide=False)
  485. def update_processes_user_mem_chart(self, gpu):
  486. ps = gpu.processes()
  487. if not ps:
  488. return
  489. chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
  490. active_dim_ids = []
  491. for p in ps:
  492. if not p.get('username'):
  493. continue
  494. dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
  495. active_dim_ids.append(dim_id)
  496. if dim_id not in chart:
  497. chart.add_dimension([dim_id, '{0}'.format(p['username'])])
  498. for dim in chart:
  499. if dim.id not in active_dim_ids:
  500. chart.del_dimension(dim.id, hide=False)
  501. def check(self):
  502. if not self.poller.has_smi():
  503. self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
  504. return False
  505. raw_data = self.poller.run_once()
  506. if not raw_data:
  507. self.error("failed to invoke '{0}' binary".format(NVIDIA_SMI))
  508. return False
  509. parsed = self.parse_xml(raw_data)
  510. if parsed is None:
  511. return False
  512. gpus = parsed.findall('gpu')
  513. if not gpus:
  514. return False
  515. self.create_charts(gpus)
  516. return True
  517. def parse_xml(self, data):
  518. try:
  519. return et.fromstring(data)
  520. except et.ParseError as error:
  521. self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
  522. return None
  523. def create_charts(self, gpus):
  524. for idx, root in enumerate(gpus):
  525. order, charts = gpu_charts(GPU(idx, root))
  526. self.order.extend(order)
  527. self.definitions.update(charts)
  528. def is_gpu_data_value_valid(value):
  529. try:
  530. int(value)
  531. except (TypeError, ValueError):
  532. return False
  533. return True