nvidia_smi.chart.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. # -*- coding: utf-8 -*-
  2. # Description: nvidia-smi netdata python.d module
  3. # Original Author: Steven Noonan (tycho)
  4. # Author: Ilya Mashchenko (ilyam8)
  5. # User Memory Stat Author: Guido Scatena (scatenag)
  6. import os
  7. import pwd
  8. import subprocess
  9. import threading
  10. import xml.etree.ElementTree as et
  11. from bases.FrameworkServices.SimpleService import SimpleService
  12. from bases.collection import find_binary
  13. disabled_by_default = True
  14. NVIDIA_SMI = 'nvidia-smi'
  15. EMPTY_ROW = ''
  16. EMPTY_ROW_LIMIT = 500
  17. POLLER_BREAK_ROW = '</nvidia_smi_log>'
  18. PCI_BANDWIDTH = 'pci_bandwidth'
  19. PCI_BANDWIDTH_PERCENT = 'pci_bandwidth_percent'
  20. FAN_SPEED = 'fan_speed'
  21. GPU_UTIL = 'gpu_utilization'
  22. MEM_UTIL = 'mem_utilization'
  23. ENCODER_UTIL = 'encoder_utilization'
  24. MEM_USAGE = 'mem_usage'
  25. BAR_USAGE = 'bar1_mem_usage'
  26. TEMPERATURE = 'temperature'
  27. CLOCKS = 'clocks'
  28. POWER = 'power'
  29. POWER_STATE = 'power_state'
  30. PROCESSES_MEM = 'processes_mem'
  31. USER_MEM = 'user_mem'
  32. USER_NUM = 'user_num'
  33. ORDER = [
  34. PCI_BANDWIDTH,
  35. PCI_BANDWIDTH_PERCENT,
  36. FAN_SPEED,
  37. GPU_UTIL,
  38. MEM_UTIL,
  39. ENCODER_UTIL,
  40. MEM_USAGE,
  41. BAR_USAGE,
  42. TEMPERATURE,
  43. CLOCKS,
  44. POWER,
  45. POWER_STATE,
  46. PROCESSES_MEM,
  47. USER_MEM,
  48. USER_NUM,
  49. ]
  50. # https://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/group__gpupstate.html
  51. POWER_STATES = ['P' + str(i) for i in range(0, 16)]
  52. # PCI Transfer data rate in gigabits per second (Gb/s) per generation
  53. PCI_SPEED = {
  54. "1": 2.5,
  55. "2": 5,
  56. "3": 8,
  57. "4": 16,
  58. "5": 32
  59. }
  60. # PCI encoding per generation
  61. PCI_ENCODING = {
  62. "1": 2/10,
  63. "2": 2/10,
  64. "3": 2/130,
  65. "4": 2/130,
  66. "5": 2/130
  67. }
  68. def gpu_charts(gpu):
  69. fam = gpu.full_name()
  70. charts = {
  71. PCI_BANDWIDTH: {
  72. 'options': [None, 'PCI Express Bandwidth Utilization', 'KiB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
  73. 'lines': [
  74. ['rx_util', 'rx', 'absolute', 1, 1],
  75. ['tx_util', 'tx', 'absolute', 1, -1],
  76. ]
  77. },
  78. PCI_BANDWIDTH_PERCENT: {
  79. 'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
  80. 'lines': [
  81. ['rx_util_percent', 'rx_percent'],
  82. ['tx_util_percent', 'tx_percent'],
  83. ]
  84. },
  85. FAN_SPEED: {
  86. 'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
  87. 'lines': [
  88. ['fan_speed', 'speed'],
  89. ]
  90. },
  91. GPU_UTIL: {
  92. 'options': [None, 'GPU Utilization', 'percentage', fam, 'nvidia_smi.gpu_utilization', 'line'],
  93. 'lines': [
  94. ['gpu_util', 'utilization'],
  95. ]
  96. },
  97. MEM_UTIL: {
  98. 'options': [None, 'Memory Bandwidth Utilization', 'percentage', fam, 'nvidia_smi.mem_utilization', 'line'],
  99. 'lines': [
  100. ['memory_util', 'utilization'],
  101. ]
  102. },
  103. ENCODER_UTIL: {
  104. 'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization',
  105. 'line'],
  106. 'lines': [
  107. ['encoder_util', 'encoder'],
  108. ['decoder_util', 'decoder'],
  109. ]
  110. },
  111. MEM_USAGE: {
  112. 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
  113. 'lines': [
  114. ['fb_memory_free', 'free'],
  115. ['fb_memory_used', 'used'],
  116. ]
  117. },
  118. BAR_USAGE: {
  119. 'options': [None, 'Bar1 Memory Usage', 'MiB', fam, 'nvidia_smi.bar1_memory_usage', 'stacked'],
  120. 'lines': [
  121. ['bar1_memory_free', 'free'],
  122. ['bar1_memory_used', 'used'],
  123. ]
  124. },
  125. TEMPERATURE: {
  126. 'options': [None, 'Temperature', 'celsius', fam, 'nvidia_smi.temperature', 'line'],
  127. 'lines': [
  128. ['gpu_temp', 'temp'],
  129. ]
  130. },
  131. CLOCKS: {
  132. 'options': [None, 'Clock Frequencies', 'MHz', fam, 'nvidia_smi.clocks', 'line'],
  133. 'lines': [
  134. ['graphics_clock', 'graphics'],
  135. ['video_clock', 'video'],
  136. ['sm_clock', 'sm'],
  137. ['mem_clock', 'mem'],
  138. ]
  139. },
  140. POWER: {
  141. 'options': [None, 'Power Utilization', 'Watts', fam, 'nvidia_smi.power', 'line'],
  142. 'lines': [
  143. ['power_draw', 'power', 'absolute', 1, 100],
  144. ]
  145. },
  146. POWER_STATE: {
  147. 'options': [None, 'Power State', 'state', fam, 'nvidia_smi.power_state', 'line'],
  148. 'lines': [['power_state_' + v.lower(), v, 'absolute'] for v in POWER_STATES]
  149. },
  150. PROCESSES_MEM: {
  151. 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
  152. 'lines': []
  153. },
  154. USER_MEM: {
  155. 'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
  156. 'lines': []
  157. },
  158. USER_NUM: {
  159. 'options': [None, 'Number of User on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
  160. 'lines': [
  161. ['user_num', 'users'],
  162. ]
  163. },
  164. }
  165. idx = gpu.num
  166. order = ['gpu{0}_{1}'.format(idx, v) for v in ORDER]
  167. charts = dict(('gpu{0}_{1}'.format(idx, k), v) for k, v in charts.items())
  168. for chart in charts.values():
  169. for line in chart['lines']:
  170. line[0] = 'gpu{0}_{1}'.format(idx, line[0])
  171. return order, charts
  172. class NvidiaSMI:
  173. def __init__(self):
  174. self.command = find_binary(NVIDIA_SMI)
  175. self.active_proc = None
  176. def run_once(self):
  177. proc = subprocess.Popen([self.command, '-x', '-q'], stdout=subprocess.PIPE)
  178. stdout, _ = proc.communicate()
  179. return stdout
  180. def run_loop(self, interval):
  181. if self.active_proc:
  182. self.kill()
  183. proc = subprocess.Popen([self.command, '-x', '-q', '-l', str(interval)], stdout=subprocess.PIPE)
  184. self.active_proc = proc
  185. return proc.stdout
  186. def kill(self):
  187. if self.active_proc:
  188. self.active_proc.kill()
  189. self.active_proc = None
  190. class NvidiaSMIPoller(threading.Thread):
  191. def __init__(self, poll_interval):
  192. threading.Thread.__init__(self)
  193. self.daemon = True
  194. self.smi = NvidiaSMI()
  195. self.interval = poll_interval
  196. self.lock = threading.RLock()
  197. self.last_data = str()
  198. self.exit = False
  199. self.empty_rows = 0
  200. self.rows = list()
  201. def has_smi(self):
  202. return bool(self.smi.command)
  203. def run_once(self):
  204. return self.smi.run_once()
  205. def run(self):
  206. out = self.smi.run_loop(self.interval)
  207. for row in out:
  208. if self.exit or self.empty_rows > EMPTY_ROW_LIMIT:
  209. break
  210. self.process_row(row)
  211. self.smi.kill()
  212. def process_row(self, row):
  213. row = row.decode()
  214. self.empty_rows += (row == EMPTY_ROW)
  215. self.rows.append(row)
  216. if POLLER_BREAK_ROW in row:
  217. self.lock.acquire()
  218. self.last_data = '\n'.join(self.rows)
  219. self.lock.release()
  220. self.rows = list()
  221. self.empty_rows = 0
  222. def is_started(self):
  223. return self.ident is not None
  224. def shutdown(self):
  225. self.exit = True
  226. def data(self):
  227. self.lock.acquire()
  228. data = self.last_data
  229. self.lock.release()
  230. return data
  231. def handle_attr_error(method):
  232. def on_call(*args, **kwargs):
  233. try:
  234. return method(*args, **kwargs)
  235. except AttributeError:
  236. return None
  237. return on_call
  238. def handle_value_error(method):
  239. def on_call(*args, **kwargs):
  240. try:
  241. return method(*args, **kwargs)
  242. except ValueError:
  243. return None
  244. return on_call
  245. HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
  246. ETC_PASSWD_PATH = '/etc/passwd'
  247. PROC_PATH = '/proc'
  248. IS_INSIDE_DOCKER = False
  249. if HOST_PREFIX:
  250. ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
  251. PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
  252. IS_INSIDE_DOCKER = True
  253. def read_passwd_file():
  254. data = dict()
  255. with open(ETC_PASSWD_PATH, 'r') as f:
  256. for line in f:
  257. line = line.strip()
  258. if line.startswith("#"):
  259. continue
  260. fields = line.split(":")
  261. # name, passwd, uid, gid, comment, home_dir, shell
  262. if len(fields) != 7:
  263. continue
  264. # uid, guid
  265. fields[2], fields[3] = int(fields[2]), int(fields[3])
  266. data[fields[2]] = fields
  267. return data
  268. def read_passwd_file_safe():
  269. try:
  270. if IS_INSIDE_DOCKER:
  271. return read_passwd_file()
  272. return dict((k[2], k) for k in pwd.getpwall())
  273. except (OSError, IOError):
  274. return dict()
  275. def get_username_by_pid_safe(pid, passwd_file):
  276. path = os.path.join(PROC_PATH, pid)
  277. try:
  278. uid = os.stat(path).st_uid
  279. except (OSError, IOError):
  280. return ''
  281. try:
  282. if IS_INSIDE_DOCKER:
  283. return passwd_file[uid][0]
  284. return pwd.getpwuid(uid)[0]
  285. except KeyError:
  286. return str(uid)
  287. class GPU:
  288. def __init__(self, num, root, exclude_zero_memory_users=False):
  289. self.num = num
  290. self.root = root
  291. self.exclude_zero_memory_users = exclude_zero_memory_users
  292. def id(self):
  293. return self.root.get('id')
  294. def name(self):
  295. return self.root.find('product_name').text
  296. def full_name(self):
  297. return 'gpu{0} {1}'.format(self.num, self.name())
  298. @handle_attr_error
  299. def pci_link_gen(self):
  300. return self.root.find('pci').find('pci_gpu_link_info').find('pcie_gen').find('max_link_gen').text
  301. @handle_attr_error
  302. def pci_link_width(self):
  303. return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
  304. def pci_bw_max(self):
  305. link_gen = self.pci_link_gen()
  306. link_width = int(self.pci_link_width())
  307. if link_gen not in PCI_SPEED or link_gen not in PCI_ENCODING or not link_width:
  308. return None
  309. # Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
  310. # see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
  311. # return max bandwidth in kilobytes per second (kB/s)
  312. return (PCI_SPEED[link_gen] * link_width * (1- PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
  313. @handle_attr_error
  314. def rx_util(self):
  315. return self.root.find('pci').find('rx_util').text.split()[0]
  316. @handle_attr_error
  317. def tx_util(self):
  318. return self.root.find('pci').find('tx_util').text.split()[0]
  319. @handle_attr_error
  320. def fan_speed(self):
  321. return self.root.find('fan_speed').text.split()[0]
  322. @handle_attr_error
  323. def gpu_util(self):
  324. return self.root.find('utilization').find('gpu_util').text.split()[0]
  325. @handle_attr_error
  326. def memory_util(self):
  327. return self.root.find('utilization').find('memory_util').text.split()[0]
  328. @handle_attr_error
  329. def encoder_util(self):
  330. return self.root.find('utilization').find('encoder_util').text.split()[0]
  331. @handle_attr_error
  332. def decoder_util(self):
  333. return self.root.find('utilization').find('decoder_util').text.split()[0]
  334. @handle_attr_error
  335. def fb_memory_used(self):
  336. return self.root.find('fb_memory_usage').find('used').text.split()[0]
  337. @handle_attr_error
  338. def fb_memory_free(self):
  339. return self.root.find('fb_memory_usage').find('free').text.split()[0]
  340. @handle_attr_error
  341. def bar1_memory_used(self):
  342. return self.root.find('bar1_memory_usage').find('used').text.split()[0]
  343. @handle_attr_error
  344. def bar1_memory_free(self):
  345. return self.root.find('bar1_memory_usage').find('free').text.split()[0]
  346. @handle_attr_error
  347. def temperature(self):
  348. return self.root.find('temperature').find('gpu_temp').text.split()[0]
  349. @handle_attr_error
  350. def graphics_clock(self):
  351. return self.root.find('clocks').find('graphics_clock').text.split()[0]
  352. @handle_attr_error
  353. def video_clock(self):
  354. return self.root.find('clocks').find('video_clock').text.split()[0]
  355. @handle_attr_error
  356. def sm_clock(self):
  357. return self.root.find('clocks').find('sm_clock').text.split()[0]
  358. @handle_attr_error
  359. def mem_clock(self):
  360. return self.root.find('clocks').find('mem_clock').text.split()[0]
  361. @handle_attr_error
  362. def power_state(self):
  363. return str(self.root.find('power_readings').find('power_state').text.split()[0])
  364. @handle_value_error
  365. @handle_attr_error
  366. def power_draw(self):
  367. return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
  368. @handle_attr_error
  369. def processes(self):
  370. processes_info = self.root.find('processes').findall('process_info')
  371. if not processes_info:
  372. return list()
  373. passwd_file = read_passwd_file_safe()
  374. processes = list()
  375. for info in processes_info:
  376. pid = info.find('pid').text
  377. processes.append({
  378. 'pid': int(pid),
  379. 'process_name': info.find('process_name').text,
  380. 'used_memory': int(info.find('used_memory').text.split()[0]),
  381. 'username': get_username_by_pid_safe(pid, passwd_file),
  382. })
  383. return processes
  384. def data(self):
  385. data = {
  386. 'rx_util': self.rx_util(),
  387. 'tx_util': self.tx_util(),
  388. 'fan_speed': self.fan_speed(),
  389. 'gpu_util': self.gpu_util(),
  390. 'memory_util': self.memory_util(),
  391. 'encoder_util': self.encoder_util(),
  392. 'decoder_util': self.decoder_util(),
  393. 'fb_memory_used': self.fb_memory_used(),
  394. 'fb_memory_free': self.fb_memory_free(),
  395. 'bar1_memory_used': self.bar1_memory_used(),
  396. 'bar1_memory_free': self.bar1_memory_free(),
  397. 'gpu_temp': self.temperature(),
  398. 'graphics_clock': self.graphics_clock(),
  399. 'video_clock': self.video_clock(),
  400. 'sm_clock': self.sm_clock(),
  401. 'mem_clock': self.mem_clock(),
  402. 'power_draw': self.power_draw(),
  403. }
  404. pci_bw_max = self.pci_bw_max()
  405. if not pci_bw_max:
  406. data['rx_util_percent'] = 0
  407. data['tx_util_percent'] = 0
  408. else :
  409. data['rx_util_percent'] = str(int(int(self.rx_util())*100/self.pci_bw_max()))
  410. data['tx_util_percent'] = str(int(int(self.tx_util())*100/self.pci_bw_max()))
  411. for v in POWER_STATES:
  412. data['power_state_' + v.lower()] = 0
  413. p_state = self.power_state()
  414. if p_state:
  415. data['power_state_' + p_state.lower()] = 1
  416. processes = self.processes() or []
  417. users = set()
  418. for p in processes:
  419. data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
  420. if p['username']:
  421. if self.exclude_zero_memory_users and p['used_memory'] == 0:
  422. continue
  423. users.add(p['username'])
  424. key = 'user_mem_{0}'.format(p['username'])
  425. if key in data:
  426. data[key] += p['used_memory']
  427. else:
  428. data[key] = p['used_memory']
  429. data['user_num'] = len(users)
  430. return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items())
  431. class Service(SimpleService):
  432. def __init__(self, configuration=None, name=None):
  433. super(Service, self).__init__(configuration=configuration, name=name)
  434. self.order = list()
  435. self.definitions = dict()
  436. self.loop_mode = configuration.get('loop_mode', True)
  437. poll = int(configuration.get('poll_seconds', self.get_update_every()))
  438. self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False)
  439. self.poller = NvidiaSMIPoller(poll)
  440. def get_data_loop_mode(self):
  441. if not self.poller.is_started():
  442. self.poller.start()
  443. if not self.poller.is_alive():
  444. self.debug('poller is off')
  445. return None
  446. return self.poller.data()
  447. def get_data_normal_mode(self):
  448. return self.poller.run_once()
  449. def get_data(self):
  450. if self.loop_mode:
  451. last_data = self.get_data_loop_mode()
  452. else:
  453. last_data = self.get_data_normal_mode()
  454. if not last_data:
  455. return None
  456. parsed = self.parse_xml(last_data)
  457. if parsed is None:
  458. return None
  459. data = dict()
  460. for idx, root in enumerate(parsed.findall('gpu')):
  461. gpu = GPU(idx, root, self.exclude_zero_memory_users)
  462. gpu_data = gpu.data()
  463. # self.debug(gpu_data)
  464. gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v))
  465. data.update(gpu_data)
  466. self.update_processes_mem_chart(gpu)
  467. self.update_processes_user_mem_chart(gpu)
  468. return data or None
  469. def update_processes_mem_chart(self, gpu):
  470. ps = gpu.processes()
  471. if not ps:
  472. return
  473. chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
  474. active_dim_ids = []
  475. for p in ps:
  476. dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
  477. active_dim_ids.append(dim_id)
  478. if dim_id not in chart:
  479. chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
  480. for dim in chart:
  481. if dim.id not in active_dim_ids:
  482. chart.del_dimension(dim.id, hide=False)
  483. def update_processes_user_mem_chart(self, gpu):
  484. ps = gpu.processes()
  485. if not ps:
  486. return
  487. chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
  488. active_dim_ids = []
  489. for p in ps:
  490. if not p.get('username'):
  491. continue
  492. dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
  493. active_dim_ids.append(dim_id)
  494. if dim_id not in chart:
  495. chart.add_dimension([dim_id, '{0}'.format(p['username'])])
  496. for dim in chart:
  497. if dim.id not in active_dim_ids:
  498. chart.del_dimension(dim.id, hide=False)
  499. def check(self):
  500. if not self.poller.has_smi():
  501. self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
  502. return False
  503. raw_data = self.poller.run_once()
  504. if not raw_data:
  505. self.error("failed to invoke '{0}' binary".format(NVIDIA_SMI))
  506. return False
  507. parsed = self.parse_xml(raw_data)
  508. if parsed is None:
  509. return False
  510. gpus = parsed.findall('gpu')
  511. if not gpus:
  512. return False
  513. self.create_charts(gpus)
  514. return True
  515. def parse_xml(self, data):
  516. try:
  517. return et.fromstring(data)
  518. except et.ParseError as error:
  519. self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
  520. return None
  521. def create_charts(self, gpus):
  522. for idx, root in enumerate(gpus):
  523. order, charts = gpu_charts(GPU(idx, root))
  524. self.order.extend(order)
  525. self.definitions.update(charts)
  526. def is_gpu_data_value_valid(value):
  527. try:
  528. int(value)
  529. except (TypeError, ValueError):
  530. return False
  531. return True