# link_exe.py
import errno
import itertools
import json
import optparse
import os
import os.path
import subprocess
import sys
import textwrap

# Explicitly enable local imports
# Don't forget to add imported scripts to inputs of the calling command!
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import process_command_files as pcf
import thinlto_cache
from process_whole_archive_option import ProcessWholeArchiveOption
  15. CUDA_LIBRARIES = {
  16. '-lcublas_static': '-lcublas',
  17. '-lcublasLt_static': '-lcublasLt',
  18. '-lcudart_static': '-lcudart',
  19. '-lcudnn_static': '-lcudnn',
  20. '-lcudnn_adv_infer_static': '-lcudnn',
  21. '-lcudnn_adv_train_static': '-lcudnn',
  22. '-lcudnn_cnn_infer_static': '-lcudnn',
  23. '-lcudnn_cnn_train_static': '-lcudnn',
  24. '-lcudnn_ops_infer_static': '-lcudnn',
  25. '-lcudnn_ops_train_static': '-lcudnn',
  26. '-lcufft_static_nocallback': '-lcufft',
  27. '-lcupti_static': '-lcupti',
  28. '-lcurand_static': '-lcurand',
  29. '-lcusolver_static': '-lcusolver',
  30. '-lcusparse_static': '-lcusparse',
  31. '-lmyelin_compiler_static': '-lmyelin',
  32. '-lmyelin_executor_static': '-lnvcaffe_parser',
  33. '-lmyelin_pattern_library_static': '',
  34. '-lmyelin_pattern_runtime_static': '',
  35. '-lnvinfer_static': '-lnvinfer',
  36. '-lnvinfer_plugin_static': '-lnvinfer_plugin',
  37. '-lnvonnxparser_static': '-lnvonnxparser',
  38. '-lnvparsers_static': '-lnvparsers',
  39. '-lnvrtc_static': '-lnvrtc',
  40. '-lnvrtc-builtins_static': '-lnvrtc-builtins',
  41. '-lnvptxcompiler_static': '',
  42. '-lnppc_static': '-lnppc',
  43. '-lnppial_static': '-lnppial',
  44. '-lnppicc_static': '-lnppicc',
  45. '-lnppicom_static': '-lnppicom',
  46. '-lnppidei_static': '-lnppidei',
  47. '-lnppif_static': '-lnppif',
  48. '-lnppig_static': '-lnppig',
  49. '-lnppim_static': '-lnppim',
  50. '-lnppist_static': '-lnppist',
  51. '-lnppisu_static': '-lnppisu',
  52. '-lnppitc_static': '-lnppitc',
  53. '-lnpps_static': '-lnpps',
  54. }
  55. class CUDAManager:
  56. def __init__(self, known_arches, nvprune_exe):
  57. self.fatbin_libs = self._known_fatbin_libs(set(CUDA_LIBRARIES))
  58. self.prune_args = []
  59. if known_arches:
  60. for arch in known_arches.split(':'):
  61. self.prune_args.append('-gencode')
  62. self.prune_args.append(self._arch_flag(arch))
  63. self.nvprune_exe = nvprune_exe
  64. def has_cuda_fatbins(self, cmd):
  65. return bool(set(cmd) & self.fatbin_libs)
  66. @property
  67. def can_prune_libs(self):
  68. return self.prune_args and self.nvprune_exe
  69. def _known_fatbin_libs(self, libs):
  70. libs_wo_device_code = {
  71. '-lcudart_static',
  72. '-lcupti_static',
  73. '-lnppc_static',
  74. }
  75. return set(libs) - libs_wo_device_code
  76. def _arch_flag(self, arch):
  77. _, ver = arch.split('_', 1)
  78. return 'arch=compute_{},code={}'.format(ver, arch)
  79. def prune_lib(self, inp_fname, out_fname):
  80. if self.prune_args:
  81. prune_command = [self.nvprune_exe] + self.prune_args + ['--output-file', out_fname, inp_fname]
  82. subprocess.check_call(prune_command)
  83. def write_linker_script(self, f):
  84. # This script simply says:
  85. # * Place all `.nv_fatbin` input sections from all input files into one `.nv_fatbin` output section of output file
  86. # * Place it after `.bss` section
  87. #
  88. # Motivation can be found here: https://maskray.me/blog/2021-07-04-sections-and-overwrite-sections#insert-before-and-insert-after
  89. # TL;DR - we put section with a lot of GPU code directly after the last meaningful section in the binary
  90. # (which turns out to be .bss)
  91. # In that case, we decrease chances of relocation overflows from .text to .bss,
  92. # because now these sections are close to each other
  93. script = textwrap.dedent("""
  94. SECTIONS {
  95. .nv_fatbin : { *(.nv_fatbin) }
  96. } INSERT AFTER .bss
  97. """).strip()
  98. f.write(script)
  99. def tmpdir_generator(base_path, prefix):
  100. for idx in itertools.count():
  101. path = os.path.abspath(os.path.join(base_path, prefix + '_' + str(idx)))
  102. os.makedirs(path)
  103. yield path
  104. def process_cuda_library_by_external_tool(cmd, build_root, tool_name, callable_tool_executor, allowed_cuda_libs):
  105. tmpdir_gen = tmpdir_generator(build_root, 'cuda_' + tool_name + '_libs')
  106. new_flags = []
  107. cuda_deps = set()
  108. # Because each directory flag only affects flags that follow it,
  109. # for correct pruning we need to process that in reversed order
  110. for flag in reversed(cmd):
  111. if flag in allowed_cuda_libs:
  112. cuda_deps.add('lib' + flag[2:] + '.a')
  113. flag += '_' + tool_name
  114. elif flag.startswith('-L') and os.path.exists(flag[2:]) and os.path.isdir(flag[2:]) and any(f in cuda_deps for f in os.listdir(flag[2:])):
  115. from_dirpath = flag[2:]
  116. from_deps = list(cuda_deps & set(os.listdir(from_dirpath)))
  117. if from_deps:
  118. to_dirpath = next(tmpdir_gen)
  119. for f in from_deps:
  120. from_path = os.path.join(from_dirpath, f)
  121. to_path = os.path.join(to_dirpath, f[:-2] + '_' + tool_name +'.a')
  122. callable_tool_executor(from_path, to_path)
  123. cuda_deps.remove(f)
  124. # do not remove current directory
  125. # because it can contain other libraries we want link to
  126. # instead we just add new directory with processed by tool libs
  127. new_flags.append('-L' + to_dirpath)
  128. new_flags.append(flag)
  129. assert not cuda_deps, ('Unresolved CUDA deps: ' + ','.join(cuda_deps))
  130. return reversed(new_flags)
  131. def process_cuda_libraries_by_objcopy(cmd, build_root, objcopy_exe):
  132. if not objcopy_exe:
  133. return cmd
  134. def run_objcopy(from_path, to_path):
  135. rename_section_command = [objcopy_exe, "--rename-section", ".ctors=.init_array", from_path, to_path]
  136. subprocess.check_call(rename_section_command)
  137. possible_libraries = set(CUDA_LIBRARIES.keys())
  138. possible_libraries.update([
  139. '-lcudadevrt',
  140. '-lcufilt',
  141. '-lculibos',
  142. ])
  143. possible_libraries.update([
  144. lib_name + "_pruner" for lib_name in possible_libraries
  145. ])
  146. return process_cuda_library_by_external_tool(list(cmd), build_root, 'objcopy', run_objcopy, possible_libraries)
  147. def process_cuda_libraries_by_nvprune(cmd, cuda_manager, build_root):
  148. if not cuda_manager.has_cuda_fatbins(cmd):
  149. return cmd
  150. # add custom linker script
  151. to_dirpath = next(tmpdir_generator(build_root, 'cuda_linker_script'))
  152. script_path = os.path.join(to_dirpath, 'script')
  153. with open(script_path, 'w') as f:
  154. cuda_manager.write_linker_script(f)
  155. flags_with_linker = list(cmd) + ['-Wl,--script={}'.format(script_path)]
  156. if not cuda_manager.can_prune_libs:
  157. return flags_with_linker
  158. return process_cuda_library_by_external_tool(flags_with_linker, build_root, 'pruner', cuda_manager.prune_lib, cuda_manager.fatbin_libs)
  159. def remove_excessive_flags(cmd):
  160. flags = []
  161. for flag in cmd:
  162. if not flag.endswith('.ios.interface') and not flag.endswith('.pkg.fake'):
  163. flags.append(flag)
  164. return flags
  165. def fix_cmd_for_dynamic_cuda(cmd):
  166. flags = []
  167. for flag in cmd:
  168. if flag in CUDA_LIBRARIES:
  169. flags.append(CUDA_LIBRARIES[flag])
  170. else:
  171. flags.append(flag)
  172. return flags
  173. def remove_libs(cmd, libs):
  174. excluded_flags = ['-l{}'.format(lib) for lib in libs]
  175. flags = []
  176. for flag in cmd:
  177. if flag in excluded_flags:
  178. continue
  179. flags.append(flag)
  180. return flags
  181. def parse_args(args):
  182. parser = optparse.OptionParser()
  183. parser.disable_interspersed_args()
  184. parser.add_option('--custom-step')
  185. parser.add_option('--python')
  186. parser.add_option('--source-root')
  187. parser.add_option('--build-root')
  188. parser.add_option('--clang-ver')
  189. parser.add_option('--dynamic-cuda', action='store_true')
  190. parser.add_option('--cuda-architectures',
  191. help='List of supported CUDA architectures, separated by ":" (e.g. "sm_52:compute_70:lto_90a"')
  192. parser.add_option('--nvprune-exe')
  193. parser.add_option('--objcopy-exe')
  194. parser.add_option('--arch')
  195. parser.add_option('--linker-output')
  196. parser.add_option('--whole-archive-peers', action='append')
  197. parser.add_option('--whole-archive-libs', action='append')
  198. parser.add_option('--exclude-libs', action='append')
  199. thinlto_cache.add_options(parser)
  200. return parser.parse_args(args)
  201. if __name__ == '__main__':
  202. args = sys.argv[1:]
  203. plugins = []
  204. if '--start-plugins' in args:
  205. ib = args.index('--start-plugins')
  206. ie = args.index('--end-plugins')
  207. plugins = args[ib + 1:ie]
  208. args = args[:ib] + args[ie + 1:]
  209. for p in plugins:
  210. res = subprocess.check_output([sys.executable, p, sys.argv[0]] + args).decode().strip()
  211. if res:
  212. args = json.loads(res)[1:]
  213. opts, args = parse_args(args)
  214. args = pcf.skip_markers(args)
  215. cmd = args
  216. cmd = remove_excessive_flags(cmd)
  217. if opts.dynamic_cuda:
  218. cmd = fix_cmd_for_dynamic_cuda(cmd)
  219. else:
  220. cuda_manager = CUDAManager(opts.cuda_architectures, opts.nvprune_exe)
  221. cmd = process_cuda_libraries_by_nvprune(cmd, cuda_manager, opts.build_root)
  222. cmd = process_cuda_libraries_by_objcopy(cmd, opts.build_root, opts.objcopy_exe)
  223. if opts.exclude_libs:
  224. cmd = remove_libs(cmd, opts.exclude_libs)
  225. cmd = ProcessWholeArchiveOption(opts.arch, opts.whole_archive_peers, opts.whole_archive_libs).construct_cmd(cmd)
  226. if opts.custom_step:
  227. assert opts.python
  228. subprocess.check_call([opts.python] + [opts.custom_step] + args)
  229. if opts.linker_output:
  230. stdout = open(opts.linker_output, 'w')
  231. else:
  232. stdout = sys.stdout
  233. thinlto_cache.preprocess(opts, cmd)
  234. rc = subprocess.call(cmd, shell=False, stderr=sys.stderr, stdout=stdout)
  235. thinlto_cache.postprocess(opts)
  236. sys.exit(rc)