link_exe.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. import itertools
  2. import os
  3. import os.path
  4. import sys
  5. import json
  6. import subprocess
  7. import optparse
  8. import textwrap
  9. # Explicitly enable local imports
  10. # Don't forget to add imported scripts to inputs of the calling command!
  11. sys.path.append(os.path.dirname(os.path.abspath(__file__)))
  12. import process_command_files as pcf
  13. import thinlto_cache
  14. from process_whole_archive_option import ProcessWholeArchiveOption
  15. def get_leaks_suppressions(cmd):
  16. supp, newcmd = [], []
  17. for arg in cmd:
  18. if arg.endswith(".supp"):
  19. supp.append(arg)
  20. else:
  21. newcmd.append(arg)
  22. return supp, newcmd
# Static CUDA library link flags mapped to the flag used in their place when
# CUDA is linked dynamically (consumed by fix_cmd_for_dynamic_cuda).
# The key set also serves as the list of known static CUDA libraries for
# CUDAManager and process_cuda_libraries_by_objcopy.
# An empty value means no dynamic counterpart is listed for that library.
# Several cuDNN component libraries intentionally share the single '-lcudnn' value.
CUDA_LIBRARIES = {
    '-lcublas_static': '-lcublas',
    '-lcublasLt_static': '-lcublasLt',
    '-lcudart_static': '-lcudart',
    '-lcudnn_static': '-lcudnn',
    '-lcudnn_adv_infer_static': '-lcudnn',
    '-lcudnn_adv_train_static': '-lcudnn',
    '-lcudnn_cnn_infer_static': '-lcudnn',
    '-lcudnn_cnn_train_static': '-lcudnn',
    '-lcudnn_ops_infer_static': '-lcudnn',
    '-lcudnn_ops_train_static': '-lcudnn',
    '-lcufft_static_nocallback': '-lcufft',
    '-lcupti_static': '-lcupti',
    '-lcurand_static': '-lcurand',
    '-lcusolver_static': '-lcusolver',
    '-lcusparse_static': '-lcusparse',
    '-lmyelin_compiler_static': '-lmyelin',
    # NOTE(review): executor -> nvcaffe_parser looks unusual next to the other entries; confirm it is intended.
    '-lmyelin_executor_static': '-lnvcaffe_parser',
    '-lmyelin_pattern_library_static': '',
    '-lmyelin_pattern_runtime_static': '',
    '-lnvinfer_static': '-lnvinfer',
    '-lnvinfer_plugin_static': '-lnvinfer_plugin',
    '-lnvonnxparser_static': '-lnvonnxparser',
    '-lnvparsers_static': '-lnvparsers',
    '-lnvrtc_static': '-lnvrtc',
    '-lnvrtc-builtins_static': '-lnvrtc-builtins',
    '-lnvptxcompiler_static': '',
    '-lnppc_static': '-lnppc',
    '-lnppial_static': '-lnppial',
    '-lnppicc_static': '-lnppicc',
    '-lnppicom_static': '-lnppicom',
    '-lnppidei_static': '-lnppidei',
    '-lnppif_static': '-lnppif',
    '-lnppig_static': '-lnppig',
    '-lnppim_static': '-lnppim',
    '-lnppist_static': '-lnppist',
    '-lnppisu_static': '-lnppisu',
    '-lnppitc_static': '-lnppitc',
    '-lnpps_static': '-lnpps',
}
  63. class CUDAManager:
  64. def __init__(self, known_arches, nvprune_exe):
  65. self.fatbin_libs = self._known_fatbin_libs(set(CUDA_LIBRARIES))
  66. self.prune_args = []
  67. if known_arches:
  68. for arch in known_arches.split(':'):
  69. self.prune_args.append('-gencode')
  70. self.prune_args.append(self._arch_flag(arch))
  71. self.nvprune_exe = nvprune_exe
  72. def has_cuda_fatbins(self, cmd):
  73. return bool(set(cmd) & self.fatbin_libs)
  74. @property
  75. def can_prune_libs(self):
  76. return self.prune_args and self.nvprune_exe
  77. def _known_fatbin_libs(self, libs):
  78. libs_wo_device_code = {
  79. '-lcudart_static',
  80. '-lcupti_static',
  81. '-lnppc_static',
  82. }
  83. return set(libs) - libs_wo_device_code
  84. def _arch_flag(self, arch):
  85. _, ver = arch.split('_', 1)
  86. return 'arch=compute_{},code={}'.format(ver, arch)
  87. def prune_lib(self, inp_fname, out_fname):
  88. if self.prune_args:
  89. prune_command = [self.nvprune_exe] + self.prune_args + ['--output-file', out_fname, inp_fname]
  90. subprocess.check_call(prune_command)
  91. def write_linker_script(self, f):
  92. # This script simply says:
  93. # * Place all `.nv_fatbin` input sections from all input files into one `.nv_fatbin` output section of output file
  94. # * Place it after `.bss` section
  95. #
  96. # Motivation can be found here: https://maskray.me/blog/2021-07-04-sections-and-overwrite-sections#insert-before-and-insert-after
  97. # TL;DR - we put section with a lot of GPU code directly after the last meaningful section in the binary
  98. # (which turns out to be .bss)
  99. # In that case, we decrease chances of relocation overflows from .text to .bss,
  100. # because now these sections are close to each other
  101. script = textwrap.dedent("""
  102. SECTIONS {
  103. .nv_fatbin : { *(.nv_fatbin) }
  104. } INSERT AFTER .bss
  105. """).strip()
  106. f.write(script)
  107. def tmpdir_generator(base_path, prefix):
  108. for idx in itertools.count():
  109. path = os.path.abspath(os.path.join(base_path, prefix + '_' + str(idx)))
  110. os.makedirs(path)
  111. yield path
  112. def process_cuda_library_by_external_tool(cmd, build_root, tool_name, callable_tool_executor, allowed_cuda_libs):
  113. tmpdir_gen = tmpdir_generator(build_root, 'cuda_' + tool_name + '_libs')
  114. new_flags = []
  115. cuda_deps = set()
  116. # Because each directory flag only affects flags that follow it,
  117. # for correct pruning we need to process that in reversed order
  118. for flag in reversed(cmd):
  119. if flag in allowed_cuda_libs:
  120. cuda_deps.add('lib' + flag[2:] + '.a')
  121. flag += '_' + tool_name
  122. elif flag.startswith('-L') and os.path.exists(flag[2:]) and os.path.isdir(flag[2:]) and any(f in cuda_deps for f in os.listdir(flag[2:])):
  123. from_dirpath = flag[2:]
  124. from_deps = list(cuda_deps & set(os.listdir(from_dirpath)))
  125. if from_deps:
  126. to_dirpath = next(tmpdir_gen)
  127. for f in from_deps:
  128. from_path = os.path.join(from_dirpath, f)
  129. to_path = os.path.join(to_dirpath, f[:-2] + '_' + tool_name +'.a')
  130. callable_tool_executor(from_path, to_path)
  131. cuda_deps.remove(f)
  132. # do not remove current directory
  133. # because it can contain other libraries we want link to
  134. # instead we just add new directory with processed by tool libs
  135. new_flags.append('-L' + to_dirpath)
  136. new_flags.append(flag)
  137. assert not cuda_deps, ('Unresolved CUDA deps: ' + ','.join(cuda_deps))
  138. return reversed(new_flags)
  139. def process_cuda_libraries_by_objcopy(cmd, build_root, objcopy_exe):
  140. if not objcopy_exe:
  141. return cmd
  142. def run_objcopy(from_path, to_path):
  143. rename_section_command = [objcopy_exe, "--rename-section", ".ctors=.init_array", from_path, to_path]
  144. subprocess.check_call(rename_section_command)
  145. possible_libraries = set(CUDA_LIBRARIES.keys())
  146. possible_libraries.update([
  147. '-lcudadevrt',
  148. '-lcufilt',
  149. '-lculibos',
  150. ])
  151. possible_libraries.update([
  152. lib_name + "_pruner" for lib_name in possible_libraries
  153. ])
  154. return process_cuda_library_by_external_tool(list(cmd), build_root, 'objcopy', run_objcopy, possible_libraries)
  155. def process_cuda_libraries_by_nvprune(cmd, cuda_manager, build_root):
  156. if not cuda_manager.has_cuda_fatbins(cmd):
  157. return cmd
  158. # add custom linker script
  159. to_dirpath = next(tmpdir_generator(build_root, 'cuda_linker_script'))
  160. script_path = os.path.join(to_dirpath, 'script')
  161. with open(script_path, 'w') as f:
  162. cuda_manager.write_linker_script(f)
  163. flags_with_linker = list(cmd) + ['-Wl,--script={}'.format(script_path)]
  164. if not cuda_manager.can_prune_libs:
  165. return flags_with_linker
  166. return process_cuda_library_by_external_tool(flags_with_linker, build_root, 'pruner', cuda_manager.prune_lib, cuda_manager.fatbin_libs)
  167. def remove_excessive_flags(cmd):
  168. flags = []
  169. for flag in cmd:
  170. if not flag.endswith('.ios.interface') and not flag.endswith('.pkg.fake'):
  171. flags.append(flag)
  172. return flags
  173. def fix_sanitize_flag(cmd, opts):
  174. """
  175. Remove -fsanitize=address flag if sanitazers are linked explicitly for linux target.
  176. """
  177. for flag in cmd:
  178. if flag.startswith('--target') and 'linux' not in flag.lower():
  179. # use toolchained sanitize libraries
  180. return cmd
  181. assert opts.clang_ver
  182. CLANG_RT = 'contrib/libs/clang' + opts.clang_ver + '-rt/lib/'
  183. sanitize_flags = {
  184. '-fsanitize=address': CLANG_RT + 'asan',
  185. '-fsanitize=memory': CLANG_RT + 'msan',
  186. '-fsanitize=leak': CLANG_RT + 'lsan',
  187. '-fsanitize=undefined': CLANG_RT + 'ubsan',
  188. '-fsanitize=thread': CLANG_RT + 'tsan',
  189. }
  190. used_sanitize_libs = []
  191. aux = []
  192. for flag in cmd:
  193. if flag.startswith('-fsanitize-coverage='):
  194. # do not link sanitizer libraries from clang
  195. aux.append('-fno-sanitize-link-runtime')
  196. if flag in sanitize_flags and any(s.startswith(sanitize_flags[flag]) for s in cmd):
  197. # exclude '-fsanitize=' if appropriate library is linked explicitly
  198. continue
  199. if any(flag.startswith(lib) for lib in sanitize_flags.values()):
  200. used_sanitize_libs.append(flag)
  201. continue
  202. aux.append(flag)
  203. # move sanitize libraries out of the repeatedly searched group of archives
  204. flags = []
  205. for flag in aux:
  206. if flag == '-Wl,--start-group':
  207. flags += ['-Wl,--whole-archive'] + used_sanitize_libs + ['-Wl,--no-whole-archive']
  208. flags.append(flag)
  209. return flags
  210. def fix_cmd_for_dynamic_cuda(cmd):
  211. flags = []
  212. for flag in cmd:
  213. if flag in CUDA_LIBRARIES:
  214. flags.append(CUDA_LIBRARIES[flag])
  215. else:
  216. flags.append(flag)
  217. return flags
  218. def remove_libs(cmd, libs):
  219. excluded_flags = ['-l{}'.format(lib) for lib in libs]
  220. flags = []
  221. for flag in cmd:
  222. if flag in excluded_flags:
  223. continue
  224. flags.append(flag)
  225. return flags
  226. def gen_default_suppressions(inputs, output, source_root):
  227. import collections
  228. import os
  229. supp_map = collections.defaultdict(set)
  230. for filename in inputs:
  231. sanitizer = os.path.basename(filename).split('.', 1)[0]
  232. with open(os.path.join(source_root, filename)) as src:
  233. for line in src:
  234. line = line.strip()
  235. if not line or line.startswith('#'):
  236. continue
  237. supp_map[sanitizer].add(line)
  238. with open(output, "wb") as dst:
  239. for supp_type, supps in supp_map.items():
  240. dst.write('extern "C" const char *__%s_default_suppressions() {\n' % supp_type)
  241. dst.write(' return "{}";\n'.format('\\n'.join(sorted(supps))))
  242. dst.write('}\n')
  243. def parse_args(args):
  244. parser = optparse.OptionParser()
  245. parser.disable_interspersed_args()
  246. parser.add_option('--custom-step')
  247. parser.add_option('--python')
  248. parser.add_option('--source-root')
  249. parser.add_option('--build-root')
  250. parser.add_option('--clang-ver')
  251. parser.add_option('--dynamic-cuda', action='store_true')
  252. parser.add_option('--cuda-architectures',
  253. help='List of supported CUDA architectures, separated by ":" (e.g. "sm_52:compute_70:lto_90a"')
  254. parser.add_option('--nvprune-exe')
  255. parser.add_option('--objcopy-exe')
  256. parser.add_option('--arch')
  257. parser.add_option('--linker-output')
  258. parser.add_option('--whole-archive-peers', action='append')
  259. parser.add_option('--whole-archive-libs', action='append')
  260. parser.add_option('--exclude-libs', action='append')
  261. thinlto_cache.add_options(parser)
  262. return parser.parse_args(args)
  263. if __name__ == '__main__':
  264. args = sys.argv[1:]
  265. plugins = []
  266. if '--start-plugins' in args:
  267. ib = args.index('--start-plugins')
  268. ie = args.index('--end-plugins')
  269. plugins = args[ib + 1:ie]
  270. args = args[:ib] + args[ie + 1:]
  271. for p in plugins:
  272. res = subprocess.check_output([sys.executable, p] + args).decode().strip()
  273. if res:
  274. args = json.loads(res)
  275. opts, args = parse_args(args)
  276. args = pcf.skip_markers(args)
  277. cmd = args
  278. cmd = remove_excessive_flags(cmd)
  279. cmd = fix_sanitize_flag(cmd, opts)
  280. if opts.dynamic_cuda:
  281. cmd = fix_cmd_for_dynamic_cuda(cmd)
  282. else:
  283. cuda_manager = CUDAManager(opts.cuda_architectures, opts.nvprune_exe)
  284. cmd = process_cuda_libraries_by_nvprune(cmd, cuda_manager, opts.build_root)
  285. cmd = process_cuda_libraries_by_objcopy(cmd, opts.build_root, opts.objcopy_exe)
  286. if opts.exclude_libs:
  287. cmd = remove_libs(cmd, opts.exclude_libs)
  288. cmd = ProcessWholeArchiveOption(opts.arch, opts.whole_archive_peers, opts.whole_archive_libs).construct_cmd(cmd)
  289. if opts.custom_step:
  290. assert opts.python
  291. subprocess.check_call([opts.python] + [opts.custom_step] + args)
  292. supp, cmd = get_leaks_suppressions(cmd)
  293. if supp:
  294. src_file = "default_suppressions.cpp"
  295. gen_default_suppressions(supp, src_file, opts.source_root)
  296. cmd += [src_file]
  297. if opts.linker_output:
  298. stdout = open(opts.linker_output, 'w')
  299. else:
  300. stdout = sys.stdout
  301. thinlto_cache.preprocess(opts, cmd)
  302. rc = subprocess.call(cmd, shell=False, stderr=sys.stderr, stdout=stdout)
  303. thinlto_cache.postprocess(opts)
  304. sys.exit(rc)