nvidia_smi.chart.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. # -*- coding: utf-8 -*-
  2. # Description: nvidia-smi netdata python.d module
  3. # Original Author: Steven Noonan (tycho)
  4. # Author: Ilya Mashchenko (ilyam8)
  5. # User Memory Stat Author: Guido Scatena (scatenag)
  6. import subprocess
  7. import threading
  8. import os
  9. import pwd
  10. import xml.etree.ElementTree as et
  11. from bases.FrameworkServices.SimpleService import SimpleService
  12. from bases.collection import find_binary
# Module-level flag; its consumer lives outside this file.
# NOTE(review): presumably tells the python.d plugin to keep this module off
# unless it is enabled explicitly - confirm against the framework.
disabled_by_default = True

# Name of the binary that NvidiaSMI resolves through find_binary().
NVIDIA_SMI = 'nvidia-smi'

# Poller bookkeeping: a decoded row equal to EMPTY_ROW is counted as empty and
# the poller loop stops once more than EMPTY_ROW_LIMIT of them were counted.
EMPTY_ROW = ''
EMPTY_ROW_LIMIT = 500

# A row containing this closing tag makes the poller publish the rows
# collected so far as one complete report.
POLLER_BREAK_ROW = '</nvidia_smi_log>'

# Chart name suffixes; gpu_charts() prefixes each of them with 'gpu<N>_'.
PCI_BANDWIDTH = 'pci_bandwidth'
FAN_SPEED = 'fan_speed'
GPU_UTIL = 'gpu_utilization'
MEM_UTIL = 'mem_utilization'
ENCODER_UTIL = 'encoder_utilization'
MEM_USAGE = 'mem_usage'
BAR_USAGE = 'bar1_mem_usage'
TEMPERATURE = 'temperature'
CLOCKS = 'clocks'
POWER = 'power'
PROCESSES_MEM = 'processes_mem'
USER_MEM = 'user_mem'
USER_NUM = 'user_num'

# Sequence in which gpu_charts() lists the per-GPU chart names.
ORDER = [
    PCI_BANDWIDTH,
    FAN_SPEED,
    GPU_UTIL,
    MEM_UTIL,
    ENCODER_UTIL,
    MEM_USAGE,
    BAR_USAGE,
    TEMPERATURE,
    CLOCKS,
    POWER,
    PROCESSES_MEM,
    USER_MEM,
    USER_NUM,
]
  46. def gpu_charts(gpu):
  47. fam = gpu.full_name()
  48. charts = {
  49. PCI_BANDWIDTH: {
  50. 'options': [None, 'PCI Express Bandwidth Utilization', 'KiB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
  51. 'lines': [
  52. ['rx_util', 'rx', 'absolute', 1, 1],
  53. ['tx_util', 'tx', 'absolute', 1, -1],
  54. ]
  55. },
  56. FAN_SPEED: {
  57. 'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
  58. 'lines': [
  59. ['fan_speed', 'speed'],
  60. ]
  61. },
  62. GPU_UTIL: {
  63. 'options': [None, 'GPU Utilization', 'percentage', fam, 'nvidia_smi.gpu_utilization', 'line'],
  64. 'lines': [
  65. ['gpu_util', 'utilization'],
  66. ]
  67. },
  68. MEM_UTIL: {
  69. 'options': [None, 'Memory Bandwidth Utilization', 'percentage', fam, 'nvidia_smi.mem_utilization', 'line'],
  70. 'lines': [
  71. ['memory_util', 'utilization'],
  72. ]
  73. },
  74. ENCODER_UTIL: {
  75. 'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization',
  76. 'line'],
  77. 'lines': [
  78. ['encoder_util', 'encoder'],
  79. ['decoder_util', 'decoder'],
  80. ]
  81. },
  82. MEM_USAGE: {
  83. 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
  84. 'lines': [
  85. ['fb_memory_free', 'free'],
  86. ['fb_memory_used', 'used'],
  87. ]
  88. },
  89. BAR_USAGE: {
  90. 'options': [None, 'Bar1 Memory Usage', 'MiB', fam, 'nvidia_smi.bar1_memory_usage', 'stacked'],
  91. 'lines': [
  92. ['bar1_memory_free', 'free'],
  93. ['bar1_memory_used', 'used'],
  94. ]
  95. },
  96. TEMPERATURE: {
  97. 'options': [None, 'Temperature', 'celsius', fam, 'nvidia_smi.temperature', 'line'],
  98. 'lines': [
  99. ['gpu_temp', 'temp'],
  100. ]
  101. },
  102. CLOCKS: {
  103. 'options': [None, 'Clock Frequencies', 'MHz', fam, 'nvidia_smi.clocks', 'line'],
  104. 'lines': [
  105. ['graphics_clock', 'graphics'],
  106. ['video_clock', 'video'],
  107. ['sm_clock', 'sm'],
  108. ['mem_clock', 'mem'],
  109. ]
  110. },
  111. POWER: {
  112. 'options': [None, 'Power Utilization', 'Watts', fam, 'nvidia_smi.power', 'line'],
  113. 'lines': [
  114. ['power_draw', 'power', 'absolute', 1, 100],
  115. ]
  116. },
  117. PROCESSES_MEM: {
  118. 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
  119. 'lines': []
  120. },
  121. USER_MEM: {
  122. 'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
  123. 'lines': []
  124. },
  125. USER_NUM: {
  126. 'options': [None, 'Number of User on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
  127. 'lines': [
  128. ['user_num', 'users'],
  129. ]
  130. },
  131. }
  132. idx = gpu.num
  133. order = ['gpu{0}_{1}'.format(idx, v) for v in ORDER]
  134. charts = dict(('gpu{0}_{1}'.format(idx, k), v) for k, v in charts.items())
  135. for chart in charts.values():
  136. for line in chart['lines']:
  137. line[0] = 'gpu{0}_{1}'.format(idx, line[0])
  138. return order, charts
  139. class NvidiaSMI:
  140. def __init__(self):
  141. self.command = find_binary(NVIDIA_SMI)
  142. self.active_proc = None
  143. def run_once(self):
  144. proc = subprocess.Popen([self.command, '-x', '-q'], stdout=subprocess.PIPE)
  145. stdout, _ = proc.communicate()
  146. return stdout
  147. def run_loop(self, interval):
  148. if self.active_proc:
  149. self.kill()
  150. proc = subprocess.Popen([self.command, '-x', '-q', '-l', str(interval)], stdout=subprocess.PIPE)
  151. self.active_proc = proc
  152. return proc.stdout
  153. def kill(self):
  154. if self.active_proc:
  155. self.active_proc.kill()
  156. self.active_proc = None
  157. class NvidiaSMIPoller(threading.Thread):
  158. def __init__(self, poll_interval):
  159. threading.Thread.__init__(self)
  160. self.daemon = True
  161. self.smi = NvidiaSMI()
  162. self.interval = poll_interval
  163. self.lock = threading.RLock()
  164. self.last_data = str()
  165. self.exit = False
  166. self.empty_rows = 0
  167. self.rows = list()
  168. def has_smi(self):
  169. return bool(self.smi.command)
  170. def run_once(self):
  171. return self.smi.run_once()
  172. def run(self):
  173. out = self.smi.run_loop(self.interval)
  174. for row in out:
  175. if self.exit or self.empty_rows > EMPTY_ROW_LIMIT:
  176. break
  177. self.process_row(row)
  178. self.smi.kill()
  179. def process_row(self, row):
  180. row = row.decode()
  181. self.empty_rows += (row == EMPTY_ROW)
  182. self.rows.append(row)
  183. if POLLER_BREAK_ROW in row:
  184. self.lock.acquire()
  185. self.last_data = '\n'.join(self.rows)
  186. self.lock.release()
  187. self.rows = list()
  188. self.empty_rows = 0
  189. def is_started(self):
  190. return self.ident is not None
  191. def shutdown(self):
  192. self.exit = True
  193. def data(self):
  194. self.lock.acquire()
  195. data = self.last_data
  196. self.lock.release()
  197. return data
  198. def handle_attr_error(method):
  199. def on_call(*args, **kwargs):
  200. try:
  201. return method(*args, **kwargs)
  202. except AttributeError:
  203. return None
  204. return on_call
  205. def handle_value_error(method):
  206. def on_call(*args, **kwargs):
  207. try:
  208. return method(*args, **kwargs)
  209. except ValueError:
  210. return None
  211. return on_call
  212. HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
  213. ETC_PASSWD_PATH = '/etc/passwd'
  214. PROC_PATH = '/proc'
  215. IS_INSIDE_DOCKER = False
  216. if HOST_PREFIX:
  217. ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
  218. PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
  219. IS_INSIDE_DOCKER = True
  220. def read_passwd_file():
  221. data = dict()
  222. with open(ETC_PASSWD_PATH, 'r') as f:
  223. for line in f:
  224. line = line.strip()
  225. if line.startswith("#"):
  226. continue
  227. fields = line.split(":")
  228. # name, passwd, uid, gid, comment, home_dir, shell
  229. if len(fields) != 7:
  230. continue
  231. # uid, guid
  232. fields[2], fields[3] = int(fields[2]), int(fields[3])
  233. data[fields[2]] = fields
  234. return data
  235. def read_passwd_file_safe():
  236. try:
  237. if IS_INSIDE_DOCKER:
  238. return read_passwd_file()
  239. return dict((k[2], k) for k in pwd.getpwall())
  240. except (OSError, IOError):
  241. return dict()
  242. def get_username_by_pid_safe(pid, passwd_file):
  243. path = os.path.join(PROC_PATH, pid)
  244. try:
  245. uid = os.stat(path).st_uid
  246. except (OSError, IOError):
  247. return ''
  248. try:
  249. if IS_INSIDE_DOCKER:
  250. return passwd_file[uid][0]
  251. return pwd.getpwuid(uid)[0]
  252. except KeyError:
  253. return str(uid)
  254. class GPU:
  255. def __init__(self, num, root, exclude_zero_memory_users=False):
  256. self.num = num
  257. self.root = root
  258. self.exclude_zero_memory_users = exclude_zero_memory_users
  259. def id(self):
  260. return self.root.get('id')
  261. def name(self):
  262. return self.root.find('product_name').text
  263. def full_name(self):
  264. return 'gpu{0} {1}'.format(self.num, self.name())
  265. @handle_attr_error
  266. def rx_util(self):
  267. return self.root.find('pci').find('rx_util').text.split()[0]
  268. @handle_attr_error
  269. def tx_util(self):
  270. return self.root.find('pci').find('tx_util').text.split()[0]
  271. @handle_attr_error
  272. def fan_speed(self):
  273. return self.root.find('fan_speed').text.split()[0]
  274. @handle_attr_error
  275. def gpu_util(self):
  276. return self.root.find('utilization').find('gpu_util').text.split()[0]
  277. @handle_attr_error
  278. def memory_util(self):
  279. return self.root.find('utilization').find('memory_util').text.split()[0]
  280. @handle_attr_error
  281. def encoder_util(self):
  282. return self.root.find('utilization').find('encoder_util').text.split()[0]
  283. @handle_attr_error
  284. def decoder_util(self):
  285. return self.root.find('utilization').find('decoder_util').text.split()[0]
  286. @handle_attr_error
  287. def fb_memory_used(self):
  288. return self.root.find('fb_memory_usage').find('used').text.split()[0]
  289. @handle_attr_error
  290. def fb_memory_free(self):
  291. return self.root.find('fb_memory_usage').find('free').text.split()[0]
  292. @handle_attr_error
  293. def bar1_memory_used(self):
  294. return self.root.find('bar1_memory_usage').find('used').text.split()[0]
  295. @handle_attr_error
  296. def bar1_memory_free(self):
  297. return self.root.find('bar1_memory_usage').find('free').text.split()[0]
  298. @handle_attr_error
  299. def temperature(self):
  300. return self.root.find('temperature').find('gpu_temp').text.split()[0]
  301. @handle_attr_error
  302. def graphics_clock(self):
  303. return self.root.find('clocks').find('graphics_clock').text.split()[0]
  304. @handle_attr_error
  305. def video_clock(self):
  306. return self.root.find('clocks').find('video_clock').text.split()[0]
  307. @handle_attr_error
  308. def sm_clock(self):
  309. return self.root.find('clocks').find('sm_clock').text.split()[0]
  310. @handle_attr_error
  311. def mem_clock(self):
  312. return self.root.find('clocks').find('mem_clock').text.split()[0]
  313. @handle_value_error
  314. @handle_attr_error
  315. def power_draw(self):
  316. return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
  317. @handle_attr_error
  318. def processes(self):
  319. processes_info = self.root.find('processes').findall('process_info')
  320. if not processes_info:
  321. return list()
  322. passwd_file = read_passwd_file_safe()
  323. processes = list()
  324. for info in processes_info:
  325. pid = info.find('pid').text
  326. processes.append({
  327. 'pid': int(pid),
  328. 'process_name': info.find('process_name').text,
  329. 'used_memory': int(info.find('used_memory').text.split()[0]),
  330. 'username': get_username_by_pid_safe(pid, passwd_file),
  331. })
  332. return processes
  333. def data(self):
  334. data = {
  335. 'rx_util': self.rx_util(),
  336. 'tx_util': self.tx_util(),
  337. 'fan_speed': self.fan_speed(),
  338. 'gpu_util': self.gpu_util(),
  339. 'memory_util': self.memory_util(),
  340. 'encoder_util': self.encoder_util(),
  341. 'decoder_util': self.decoder_util(),
  342. 'fb_memory_used': self.fb_memory_used(),
  343. 'fb_memory_free': self.fb_memory_free(),
  344. 'bar1_memory_used': self.bar1_memory_used(),
  345. 'bar1_memory_free': self.bar1_memory_free(),
  346. 'gpu_temp': self.temperature(),
  347. 'graphics_clock': self.graphics_clock(),
  348. 'video_clock': self.video_clock(),
  349. 'sm_clock': self.sm_clock(),
  350. 'mem_clock': self.mem_clock(),
  351. 'power_draw': self.power_draw(),
  352. }
  353. processes = self.processes() or []
  354. users = set()
  355. for p in processes:
  356. data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
  357. if p['username']:
  358. if self.exclude_zero_memory_users and p['used_memory'] == 0:
  359. continue
  360. users.add(p['username'])
  361. key = 'user_mem_{0}'.format(p['username'])
  362. if key in data:
  363. data[key] += p['used_memory']
  364. else:
  365. data[key] = p['used_memory']
  366. data['user_num'] = len(users)
  367. return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items())
  368. class Service(SimpleService):
  369. def __init__(self, configuration=None, name=None):
  370. super(Service, self).__init__(configuration=configuration, name=name)
  371. self.order = list()
  372. self.definitions = dict()
  373. self.loop_mode = configuration.get('loop_mode', True)
  374. poll = int(configuration.get('poll_seconds', 1))
  375. self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False)
  376. self.poller = NvidiaSMIPoller(poll)
  377. def get_data_loop_mode(self):
  378. if not self.poller.is_started():
  379. self.poller.start()
  380. if not self.poller.is_alive():
  381. self.debug('poller is off')
  382. return None
  383. return self.poller.data()
  384. def get_data_normal_mode(self):
  385. return self.poller.run_once()
  386. def get_data(self):
  387. if self.loop_mode:
  388. last_data = self.get_data_loop_mode()
  389. else:
  390. last_data = self.get_data_normal_mode()
  391. if not last_data:
  392. return None
  393. parsed = self.parse_xml(last_data)
  394. if parsed is None:
  395. return None
  396. data = dict()
  397. for idx, root in enumerate(parsed.findall('gpu')):
  398. gpu = GPU(idx, root, self.exclude_zero_memory_users)
  399. gpu_data = gpu.data()
  400. # self.debug(gpu_data)
  401. gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v))
  402. data.update(gpu_data)
  403. self.update_processes_mem_chart(gpu)
  404. self.update_processes_user_mem_chart(gpu)
  405. return data or None
  406. def update_processes_mem_chart(self, gpu):
  407. ps = gpu.processes()
  408. if not ps:
  409. return
  410. chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
  411. active_dim_ids = []
  412. for p in ps:
  413. dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
  414. active_dim_ids.append(dim_id)
  415. if dim_id not in chart:
  416. chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
  417. for dim in chart:
  418. if dim.id not in active_dim_ids:
  419. chart.del_dimension(dim.id, hide=False)
  420. def update_processes_user_mem_chart(self, gpu):
  421. ps = gpu.processes()
  422. if not ps:
  423. return
  424. chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
  425. active_dim_ids = []
  426. for p in ps:
  427. if not p.get('username'):
  428. continue
  429. dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
  430. active_dim_ids.append(dim_id)
  431. if dim_id not in chart:
  432. chart.add_dimension([dim_id, '{0}'.format(p['username'])])
  433. for dim in chart:
  434. if dim.id not in active_dim_ids:
  435. chart.del_dimension(dim.id, hide=False)
  436. def check(self):
  437. if not self.poller.has_smi():
  438. self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
  439. return False
  440. raw_data = self.poller.run_once()
  441. if not raw_data:
  442. self.error("failed to invoke '{0}' binary".format(NVIDIA_SMI))
  443. return False
  444. parsed = self.parse_xml(raw_data)
  445. if parsed is None:
  446. return False
  447. gpus = parsed.findall('gpu')
  448. if not gpus:
  449. return False
  450. self.create_charts(gpus)
  451. return True
  452. def parse_xml(self, data):
  453. try:
  454. return et.fromstring(data)
  455. except et.ParseError as error:
  456. self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
  457. return None
  458. def create_charts(self, gpus):
  459. for idx, root in enumerate(gpus):
  460. order, charts = gpu_charts(GPU(idx, root))
  461. self.order.extend(order)
  462. self.definitions.update(charts)
  463. def is_gpu_data_value_valid(value):
  464. try:
  465. int(value)
  466. except (TypeError, ValueError):
  467. return False
  468. return True