SMusatov
/
ydb
mirror of https://github.com/ydb-platform/ydb.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
							import itertools
import os
import os.path
import sys
import json
import subprocess
import optparse
import textwrap

# Explicitly enable local imports
# Don't forget to add imported scripts to inputs of the calling command!
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import process_command_files as pcf
import thinlto_cache

from process_whole_archive_option import ProcessWholeArchiveOption


# Maps the linker flag of each known static CUDA library to the flag that
# fix_cmd_for_dynamic_cuda() substitutes for it when --dynamic-cuda is given.
# Several static flags collapse onto the same dynamic flag, and a few map to
# an empty string, i.e. no replacement flag is listed for them.
# The keys are also used on their own as the set of known static CUDA
# libraries by the nvprune and objcopy preprocessing steps.
CUDA_LIBRARIES = {
    '-lcublas_static': '-lcublas',
    '-lcublasLt_static': '-lcublasLt',
    '-lcudart_static': '-lcudart',
    '-lcudnn_static': '-lcudnn',
    '-lcudnn_adv_infer_static': '-lcudnn',
    '-lcudnn_adv_train_static': '-lcudnn',
    '-lcudnn_cnn_infer_static': '-lcudnn',
    '-lcudnn_cnn_train_static': '-lcudnn',
    '-lcudnn_ops_infer_static': '-lcudnn',
    '-lcudnn_ops_train_static': '-lcudnn',
    '-lcufft_static_nocallback': '-lcufft',
    '-lcupti_static': '-lcupti',
    '-lcurand_static': '-lcurand',
    '-lcusolver_static': '-lcusolver',
    '-lcusparse_static': '-lcusparse',
    '-lmyelin_compiler_static': '-lmyelin',
    '-lmyelin_executor_static': '-lnvcaffe_parser',
    '-lmyelin_pattern_library_static': '',
    '-lmyelin_pattern_runtime_static': '',
    '-lnvinfer_static': '-lnvinfer',
    '-lnvinfer_plugin_static': '-lnvinfer_plugin',
    '-lnvonnxparser_static': '-lnvonnxparser',
    '-lnvparsers_static': '-lnvparsers',
    '-lnvrtc_static': '-lnvrtc',
    '-lnvrtc-builtins_static': '-lnvrtc-builtins',
    '-lnvptxcompiler_static': '',
    '-lnppc_static': '-lnppc',
    '-lnppial_static': '-lnppial',
    '-lnppicc_static': '-lnppicc',
    '-lnppicom_static': '-lnppicom',
    '-lnppidei_static': '-lnppidei',
    '-lnppif_static': '-lnppif',
    '-lnppig_static': '-lnppig',
    '-lnppim_static': '-lnppim',
    '-lnppist_static': '-lnppist',
    '-lnppisu_static': '-lnppisu',
    '-lnppitc_static': '-lnppitc',
    '-lnpps_static': '-lnpps',
}


class CUDAManager:
    """Bundle the CUDA-related link settings derived from the command line.

    Attributes:
        fatbin_libs: static CUDA library flags that are treated as carrying
            device code (all known flags minus a short exclusion list).
        prune_args: `-gencode <spec>` argument pairs passed to nvprune, one
            pair per requested architecture; empty when none were given.
        nvprune_exe: path of the nvprune executable, or a falsy value.
    """

    def __init__(self, known_arches, nvprune_exe):
        """Build the manager.

        Args:
            known_arches: ':'-separated architecture names (e.g. "sm_52"),
                or a falsy value when no architectures are specified.
            nvprune_exe: path to nvprune, or a falsy value.
        """
        self.fatbin_libs = self._known_fatbin_libs(set(CUDA_LIBRARIES))

        gencode_args = []
        for arch_name in (known_arches.split(':') if known_arches else ()):
            gencode_args += ['-gencode', self._arch_flag(arch_name)]
        self.prune_args = gencode_args

        self.nvprune_exe = nvprune_exe

    def has_cuda_fatbins(self, cmd):
        """Return True if `cmd` contains at least one flag from `fatbin_libs`."""
        shared_flags = set(cmd).intersection(self.fatbin_libs)
        return len(shared_flags) > 0

    @property
    def can_prune_libs(self):
        """Truthy only when both prune arguments and an nvprune path are set.

        The value is the falsy operand, or `nvprune_exe` when both are set;
        it is not coerced to bool.
        """
        if not self.prune_args:
            return self.prune_args
        return self.nvprune_exe

    def _known_fatbin_libs(self, libs):
        """Return `libs` as a set without the libraries listed as having no device code."""
        host_only_libs = ('-lcudart_static', '-lcupti_static', '-lnppc_static')
        return {lib for lib in libs if lib not in host_only_libs}

    def _arch_flag(self, arch):
        """Turn an architecture name such as 'sm_52' into a `-gencode` spec.

        Raises:
            ValueError: if `arch` contains no underscore.
        """
        _, version = arch.split('_', 1)
        return 'arch=compute_%s,code=%s' % (version, arch)

    def prune_lib(self, inp_fname, out_fname):
        """Run nvprune on `inp_fname`, writing the result to `out_fname`.

        Does nothing when there are no prune arguments.

        Raises:
            subprocess.CalledProcessError: if nvprune exits with a non-zero code.
        """
        if not self.prune_args:
            return

        command = [self.nvprune_exe]
        command.extend(self.prune_args)
        command.extend(['--output-file', out_fname, inp_fname])
        subprocess.check_call(command)

    def write_linker_script(self, f):
        """Write the `.nv_fatbin` placement linker script to the file object `f`."""
        # The script tells the linker to:
        # * gather every `.nv_fatbin` input section into a single `.nv_fatbin` output section
        # * insert that output section right after `.bss`
        #
        # Background: https://maskray.me/blog/2021-07-04-sections-and-overwrite-sections#insert-before-and-insert-after
        # In short, the bulky GPU code section goes directly after the last meaningful
        # section of the binary (which turns out to be .bss), so .text and .bss stay close
        # to each other and relocation overflows between them become less likely.
        script_lines = (
            'SECTIONS {',
            '    .nv_fatbin : { *(.nv_fatbin) }',
            '} INSERT AFTER .bss',
        )
        f.write('\n'.join(script_lines))


def tmpdir_generator(base_path, prefix):
    """Lazily create and yield numbered directories under `base_path`.

    Each step creates `<base_path>/<prefix>_<n>` (n = 0, 1, 2, ...) and yields
    its absolute path. The generator never terminates on its own.

    Raises:
        OSError: from os.makedirs, e.g. when the directory already exists.
    """
    index = 0
    while True:
        dirname = '{}_{}'.format(prefix, index)
        created = os.path.abspath(os.path.join(base_path, dirname))
        os.makedirs(created)
        yield created
        index += 1


def process_cuda_library_by_external_tool(cmd, build_root, tool_name, callable_tool_executor, allowed_cuda_libs):
    """Rewrite a link command so selected CUDA static libraries are tool-processed copies.

    Every flag of `cmd` found in `allowed_cuda_libs` gets the suffix
    `_<tool_name>`. For each such library the preceding `-L<dir>` that
    contains the matching `lib<name>.a` is located, the archive is passed
    through `callable_tool_executor(from_path, to_path)` into a freshly
    created directory under `build_root`, and a `-L` flag for that directory
    is added right after the original `-L<dir>`.

    Args:
        cmd: link command as a sequence of strings (must support reversed()).
        build_root: directory under which the output directories are created.
        tool_name: short tool name used in directory and library suffixes.
        callable_tool_executor: callable taking (from_path, to_path).
        allowed_cuda_libs: container of `-l...` flags eligible for processing.

    Returns:
        A new list with the rewritten command, in the original order.

    Raises:
        AssertionError: if some renamed library was not found in any
            preceding `-L` directory.
    """
    tmpdir_gen = tmpdir_generator(build_root, 'cuda_' + tool_name + '_libs')
    suffix = '_' + tool_name

    new_flags = []
    cuda_deps = set()

    # Because each directory flag only affects flags that follow it,
    # for correct processing we need to walk the command in reversed order
    for flag in reversed(cmd):
        if flag in allowed_cuda_libs:
            cuda_deps.add('lib' + flag[2:] + '.a')
            flag += suffix
        elif cuda_deps and flag.startswith('-L') and os.path.isdir(flag[2:]):
            # With no pending libraries there is nothing to look up, so the
            # directory is listed only when needed, and only once.
            from_dirpath = flag[2:]
            from_deps = sorted(cuda_deps.intersection(os.listdir(from_dirpath)))

            if from_deps:
                to_dirpath = next(tmpdir_gen)

                for lib_file in from_deps:
                    from_path = os.path.join(from_dirpath, lib_file)
                    # lib_file[:-2] drops the trailing '.a'
                    to_path = os.path.join(to_dirpath, lib_file[:-2] + suffix + '.a')
                    callable_tool_executor(from_path, to_path)
                    cuda_deps.remove(lib_file)

                # do not remove current directory
                # because it can contain other libraries we want link to
                # instead we just add new directory with processed by tool libs
                new_flags.append('-L' + to_dirpath)

        new_flags.append(flag)

    assert not cuda_deps, ('Unresolved CUDA deps: ' + ','.join(sorted(cuda_deps)))

    # Return a real list rather than a one-shot `reversed` iterator so callers
    # can iterate, index or measure the command more than once.
    new_flags.reverse()
    return new_flags


def process_cuda_libraries_by_objcopy(cmd, build_root, objcopy_exe):
    """Route CUDA static libraries of `cmd` through objcopy, renaming `.ctors` to `.init_array`.

    Args:
        cmd: link command (iterable of strings).
        build_root: directory under which processed copies are placed.
        objcopy_exe: path to objcopy; when falsy, `cmd` is returned unchanged.

    Returns:
        `cmd` itself when objcopy is not configured, otherwise the result of
        process_cuda_library_by_external_tool for the 'objcopy' tool.
    """
    if objcopy_exe:
        def rename_ctors_section(src_path, dst_path):
            subprocess.check_call(
                [objcopy_exe, "--rename-section", ".ctors=.init_array", src_path, dst_path]
            )

        # All known static CUDA libraries plus a few extra ones ...
        base_libs = set(CUDA_LIBRARIES) | {'-lcudadevrt', '-lcufilt', '-lculibos'}
        # ... and the same names carrying the "_pruner" suffix.
        known_libs = base_libs | {name + "_pruner" for name in base_libs}

        return process_cuda_library_by_external_tool(
            list(cmd), build_root, 'objcopy', rename_ctors_section, known_libs
        )

    return cmd


def process_cuda_libraries_by_nvprune(cmd, cuda_manager, build_root):
    """Add the fatbin linker script and, when possible, prune CUDA libraries.

    Args:
        cmd: link command (iterable of strings).
        cuda_manager: object providing has_cuda_fatbins(), write_linker_script(),
            can_prune_libs, prune_lib and fatbin_libs.
        build_root: directory under which helper directories are created.

    Returns:
        `cmd` itself when it references no fatbin libraries; otherwise the
        command extended with a `-Wl,--script=` flag, further rewritten by
        process_cuda_library_by_external_tool when pruning is available.
    """
    if not cuda_manager.has_cuda_fatbins(cmd):
        return cmd

    # Generate the custom linker script in its own fresh directory.
    script_dir = next(tmpdir_generator(build_root, 'cuda_linker_script'))
    script_path = os.path.join(script_dir, 'script')
    with open(script_path, 'w') as script_file:
        cuda_manager.write_linker_script(script_file)

    flags_with_linker = list(cmd)
    flags_with_linker.append('-Wl,--script={}'.format(script_path))

    if cuda_manager.can_prune_libs:
        return process_cuda_library_by_external_tool(
            flags_with_linker, build_root, 'pruner', cuda_manager.prune_lib, cuda_manager.fatbin_libs
        )

    return flags_with_linker


def remove_excessive_flags(cmd):
    """Return the flags of `cmd` that do not end with '.ios.interface' or '.pkg.fake'."""
    excessive_suffixes = ('.ios.interface', '.pkg.fake')
    return [flag for flag in cmd if not flag.endswith(excessive_suffixes)]


def fix_cmd_for_dynamic_cuda(cmd):
    """Replace static CUDA library flags with their dynamic counterparts.

    Flags that are keys of CUDA_LIBRARIES are replaced by the mapped value;
    all other flags are passed through untouched. A static library mapped to
    an empty string is dropped from the command instead of being replaced by
    an empty-string argument, which the linker driver would otherwise
    receive as a separate (empty) argument.

    Args:
        cmd: link command (iterable of strings).

    Returns:
        A new list of flags.
    """
    flags = []
    for flag in cmd:
        if flag not in CUDA_LIBRARIES:
            flags.append(flag)
            continue

        dynamic_flag = CUDA_LIBRARIES[flag]
        if dynamic_flag:
            flags.append(dynamic_flag)
    return flags


def remove_libs(cmd, libs):
    """Return `cmd` without the `-l<lib>` flags of the libraries named in `libs`.

    Args:
        cmd: link command (iterable of strings).
        libs: library names without the `-l` prefix.
    """
    excluded_flags = frozenset('-l{}'.format(name) for name in libs)
    return [flag for flag in cmd if flag not in excluded_flags]


def parse_args(args):
    """Parse the wrapper's own options from `args`.

    Interspersed arguments are disabled, so option parsing stops at the first
    positional argument and everything from there on is returned as-is.

    Returns:
        The (options, remaining_args) pair produced by optparse.
    """
    parser = optparse.OptionParser()
    parser.disable_interspersed_args()

    # (flag, add_option keyword arguments), registered in this order.
    option_specs = (
        ('--custom-step', {}),
        ('--python', {}),
        ('--source-root', {}),
        ('--build-root', {}),
        ('--clang-ver', {}),
        ('--dynamic-cuda', {'action': 'store_true'}),
        (
            '--cuda-architectures',
            {'help': 'List of supported CUDA architectures, separated by ":" (e.g. "sm_52:compute_70:lto_90a"'},
        ),
        ('--nvprune-exe', {}),
        ('--objcopy-exe', {}),
        ('--arch', {}),
        ('--linker-output', {}),
        ('--whole-archive-peers', {'action': 'append'}),
        ('--whole-archive-libs', {'action': 'append'}),
        ('--exclude-libs', {'action': 'append'}),
    )
    for option_flag, option_kwargs in option_specs:
        parser.add_option(option_flag, **option_kwargs)

    thinlto_cache.add_options(parser)
    return parser.parse_args(args)


# Script entry point: preprocess the link command passed on the command line
# and run it, exiting with the linker's return code.
if __name__ == '__main__':
    args = sys.argv[1:]
    plugins = []

    # Arguments between --start-plugins and --end-plugins are plugin script
    # paths; both markers and the paths are cut out of `args`.
    # NOTE(review): a missing --end-plugins makes args.index() raise ValueError.
    if '--start-plugins' in args:
        ib = args.index('--start-plugins')
        ie = args.index('--end-plugins')
        plugins = args[ib + 1:ie]
        args = args[:ib] + args[ie + 1:]

    # Each plugin is run with the current interpreter and receives this
    # script's path followed by the current args. Non-empty output is parsed
    # as a JSON list that replaces `args`, with its first element dropped
    # (presumably the script path echoed back - confirm against the plugins).
    for p in plugins:
        res = subprocess.check_output([sys.executable, p, sys.argv[0]] + args).decode().strip()

        if res:
            args = json.loads(res)[1:]

    # After this call `args` holds only what follows the wrapper's own options.
    opts, args = parse_args(args)
    # NOTE(review): presumably strips marker arguments from the command; see process_command_files.
    args = pcf.skip_markers(args)

    # `cmd` is the command that is eventually executed; `args` is kept as the
    # unmodified (marker-free) version.
    cmd = args
    cmd = remove_excessive_flags(cmd)

    if opts.dynamic_cuda:
        # Swap static CUDA library flags for the dynamic ones.
        cmd = fix_cmd_for_dynamic_cuda(cmd)
    else:
        # Static CUDA: add the fatbin linker script / prune device code, then
        # run the libraries through objcopy if an objcopy path was given.
        cuda_manager = CUDAManager(opts.cuda_architectures, opts.nvprune_exe)
        cmd = process_cuda_libraries_by_nvprune(cmd, cuda_manager, opts.build_root)
        cmd = process_cuda_libraries_by_objcopy(cmd, opts.build_root, opts.objcopy_exe)

    if opts.exclude_libs:
        cmd = remove_libs(cmd, opts.exclude_libs)

    cmd = ProcessWholeArchiveOption(opts.arch, opts.whole_archive_peers, opts.whole_archive_libs).construct_cmd(cmd)

    # The custom step is given `args`, not the rewritten `cmd`; a non-zero
    # exit status raises CalledProcessError and aborts the script.
    if opts.custom_step:
        assert opts.python
        subprocess.check_call([opts.python] + [opts.custom_step] + args)

    # Send the command's stdout to a file when --linker-output is given.
    # The file object is not closed explicitly; the script exits right after the call.
    if opts.linker_output:
        stdout = open(opts.linker_output, 'w')
    else:
        stdout = sys.stdout

    # NOTE(review): thinlto_cache hooks run around the link step; their effect is defined in thinlto_cache.
    thinlto_cache.preprocess(opts, cmd)
    rc = subprocess.call(cmd, shell=False, stderr=sys.stderr, stdout=stdout)
    thinlto_cache.postprocess(opts)

    # Propagate the executed command's return code.
    sys.exit(rc)