// SPDX-License-Identifier: GPL-3.0-or-later

#include "libnetdata/libnetdata.h"
#include "libnetdata/required_dummies.h"

#ifdef HAVE_SETNS
#ifndef _GNU_SOURCE
#define _GNU_SOURCE             /* See feature_test_macros(7) */
#endif
#include <sched.h>
#endif

// Environment handed to the helper script spawned by call_the_helper().
// The first entry is a fixed PATH; the second one is a buffer that main()
// fills with "NETDATA_HOST_PREFIX=<prefix>" before the helper is started.
char environment_variable2[FILENAME_MAX + 50] = "";
char *environment[] = {
        "PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin",
        environment_variable2,
        NULL    // terminator of the environment array
};

// One network interface, as found by read_proc_net_dev().
// Nodes form a singly-linked list and are released with free_iface().
struct iface {
    const char *device;     // interface name (heap copy, freed by free_iface())
    uint32_t hash;          // simple_hash() of the device name

    unsigned int ifindex;   // value read from /sys/class/net/<device>/ifindex
    unsigned int iflink;    // value read from /sys/class/net/<device>/iflink

    struct iface *next;     // next interface in the list, NULL at the end
};

// Count the nodes of the interface list starting at root (0 for an empty list).
unsigned int calc_num_ifaces(struct iface *root) {
    unsigned int count = 0;
    struct iface *node = root;

    while (node) {
        count++;
        node = node->next;
    }

    return count;
}

// Read the number stored in <prefix>/sys/class/net/<iface>/iflink.
// A NULL prefix is treated as an empty one. When the file cannot be read
// an error is logged and the value left in the local (initialised to 0)
// is returned.
unsigned int read_iface_iflink(const char *prefix, const char *iface) {
    char filename[FILENAME_MAX + 1];
    unsigned long long iflink = 0;

    snprintfz(filename, FILENAME_MAX, "%s/sys/class/net/%s/iflink", prefix ? prefix : "", iface);

    if(read_single_number_file(filename, &iflink) != 0)
        collector_error("Cannot read '%s'.", filename);

    return (unsigned int)iflink;
}

// Read the number stored in <prefix>/sys/class/net/<iface>/ifindex.
// A NULL prefix is treated as an empty one. When the file cannot be read
// an error is logged and the value left in the local (initialised to 0)
// is returned.
unsigned int read_iface_ifindex(const char *prefix, const char *iface) {
    char filename[FILENAME_MAX + 1];
    unsigned long long ifindex = 0;

    snprintfz(filename, FILENAME_MAX, "%s/sys/class/net/%s/ifindex", prefix ? prefix : "", iface);

    if(read_single_number_file(filename, &ifindex) != 0)
        collector_error("Cannot read '%s'.", filename);

    return (unsigned int)ifindex;
}

// Build the list of network interfaces listed in the kernel's net/dev file.
//
// scope  - label used only in debug log messages
// prefix - host prefix; when non-empty <prefix>/proc/1/net/dev is parsed,
//          otherwise /proc/net/dev (NULL is treated as empty)
//
// Returns the head of a newly allocated list (release it with
// free_host_ifaces()), or NULL when the file cannot be opened/read or
// contains no interface lines.
struct iface *read_proc_net_dev(const char *scope __maybe_unused, const char *prefix) {
    if(!prefix) prefix = "";

    const char *proc_path = (*prefix) ? "/proc/1/net/dev" : "/proc/net/dev";

    char filename[FILENAME_MAX + 1];
    snprintfz(filename, FILENAME_MAX, "%s%s", prefix, proc_path);

#ifdef NETDATA_INTERNAL_CHECKS
    collector_info("parsing '%s'", filename);
#endif

    procfile *ff = procfile_open(filename, " \t,:|", PROCFILE_FLAG_DEFAULT);
    if(unlikely(!ff)) {
        collector_error("Cannot open file '%s'", filename);
        return NULL;
    }

    ff = procfile_readall(ff);
    if(unlikely(!ff)) {
        collector_error("Cannot read file '%s'", filename);
        return NULL;
    }

    struct iface *head = NULL;
    size_t total_lines = procfile_lines(ff);

    // the first two lines of the file are skipped
    for(size_t line = 2; line < total_lines; line++) {
        if(unlikely(procfile_linewords(ff, line) < 1))
            continue;

        struct iface *node = callocz(1, sizeof(struct iface));
        node->device  = strdupz(procfile_lineword(ff, line, 0));
        node->hash    = simple_hash(node->device);
        node->ifindex = read_iface_ifindex(prefix, node->device);
        node->iflink  = read_iface_iflink(prefix, node->device);

        // prepend: the resulting list is in reverse file order
        node->next = head;
        head = node;

#ifdef NETDATA_INTERNAL_CHECKS
        collector_info("added %s interface '%s', ifindex %u, iflink %u", scope, node->device, node->ifindex, node->iflink);
#endif
    }

    procfile_close(ff);

    return head;
}

// Release a single interface node together with its device name.
void free_iface(struct iface *iface) {
    void *device_name = (void *)iface->device;

    freez(device_name);
    freez(iface);
}

// Release every node of an interface list; a NULL list is a no-op.
void free_host_ifaces(struct iface *iface) {
    struct iface *following;

    for( ; iface ; iface = following) {
        // remember the successor before the node is released
        following = iface->next;
        free_iface(iface);
    }
}

// An interface is eligible when its iflink differs from its own ifindex.
// Returns 1 when eligible, 0 otherwise.
int iface_is_eligible(struct iface *iface) {
    return (iface->iflink != iface->ifindex) ? 1 : 0;
}

// Count how many interfaces of the list pass iface_is_eligible().
int eligible_ifaces(struct iface *root) {
    int count = 0;
    struct iface *node = root;

    while(node) {
        if(iface_is_eligible(node))
            count++;

        node = node->next;
    }

    return count;
}

// Fork and let only the child return to the caller.
//
// The parent never returns: it waits for the child, mirrors its stop/continue
// state, and finally terminates with the child's exit code (or re-raises the
// signal that killed the child).
//
// If fork() or waitpid() fails the process exits with EXIT_FAILURE, instead
// of inspecting a wait status that was never written.
static void continue_as_child(void) {
    pid_t child = fork();

    if (child < 0) {
        collector_error("fork() failed");
        exit(EXIT_FAILURE);
    }

    /* Only the child returns */
    if (child == 0)
        return;

    int status = 0;
    pid_t ret;

    for (;;) {
        ret = waitpid(child, &status, WUNTRACED);
        if ((ret == child) && (WIFSTOPPED(status))) {
            /* The child suspended so suspend us as well */
            kill(getpid(), SIGSTOP);
            kill(child, SIGCONT);
        } else {
            break;
        }
    }

    /* waitpid() failed: status does not describe the child */
    if (ret != child) {
        collector_error("waitpid() failed");
        exit(EXIT_FAILURE);
    }

    /* Return the child's exit code if possible */
    if (WIFEXITED(status)) {
        exit(WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        kill(getpid(), WTERMSIG(status));
    }

    exit(EXIT_FAILURE);
}

// Open <prefix>/proc/<pid>/<ns> read-only.
// A NULL prefix is treated as an empty one.
// Returns the file descriptor, or -1 (after logging an error) on failure.
int proc_pid_fd(const char *prefix, const char *ns, pid_t pid) {
    char filename[FILENAME_MAX + 1];
    snprintfz(filename, FILENAME_MAX, "%s/proc/%d/%s", prefix ? prefix : "", (int)pid, ns);

    int fd = open(filename, O_RDONLY);
    if(fd != -1)
        return fd;

    collector_error("Cannot open proc_pid_fd() file '%s'", filename);
    return fd;
}

// Table of the namespaces switch_namespace() tries to enter.
// The array ends at the first entry whose name is NULL.
static struct ns {
    int nstype;         // CLONE_NEW* constant, passed as the second argument of setns()
    int fd;             // descriptor returned by proc_pid_fd() for path, -1 when not open
    int status;         // -1 = not switched yet, 0 = switch failed, 1 = switched
    const char *name;   // name used in log messages; NULL marks the terminator
    const char *path;   // file under /proc/<pid>/ that is opened for this namespace
} all_ns[] = {
        // { .nstype = CLONE_NEWUSER,   .fd = -1, .status = -1, .name = "user",    .path = "ns/user"   },
        // { .nstype = CLONE_NEWCGROUP, .fd = -1, .status = -1, .name = "cgroup",  .path = "ns/cgroup" },
        // { .nstype = CLONE_NEWIPC,    .fd = -1, .status = -1, .name = "ipc",     .path = "ns/ipc"    },
        // { .nstype = CLONE_NEWUTS,    .fd = -1, .status = -1, .name = "uts",     .path = "ns/uts"    },
        { .nstype = CLONE_NEWNET,    .fd = -1, .status = -1, .name = "network", .path = "ns/net"    },
        { .nstype = CLONE_NEWPID,    .fd = -1, .status = -1, .name = "pid",     .path = "ns/pid"    },
        { .nstype = CLONE_NEWNS,     .fd = -1, .status = -1, .name = "mount",   .path = "ns/mnt"    },

        // terminator
        { .nstype = 0,               .fd = -1, .status = -1, .name = NULL,      .path = NULL        }
};

// Move the current process into the namespaces of process pid.
//
// prefix - host prefix prepended to the /proc paths that are opened
// pid    - the process whose namespaces, root and cwd are entered
//
// With HAVE_SETNS: opens the namespace files listed in all_ns plus the
// root and cwd of pid, calls setns() on each namespace, chroot()s into the
// root of pid, changes to its working directory and, when the pid namespace
// was switched, forks so that only the child continues.
// Individual failures are logged but do not change the return value (0).
//
// Without HAVE_SETNS: sets errno to ENOSYS, logs an error and returns 1.
int switch_namespace(const char *prefix, pid_t pid) {

#ifdef HAVE_SETNS

    // open everything first: after switching namespaces these paths
    // are resolved differently
    int i;
    for(i = 0; all_ns[i].name ; i++)
        all_ns[i].fd = proc_pid_fd(prefix, all_ns[i].path, pid);

    int root_fd = proc_pid_fd(prefix, "root", pid);
    int cwd_fd  = proc_pid_fd(prefix, "cwd", pid);

    // clear the supplementary group list (result is not checked)
    setgroups(0, NULL);

    // 2 passes - found it at nsenter source code
    // this is related to CLONE_NEWUSER functionality

    // This code cannot switch user namespace (it can all the other namespaces)
    // Fortunately, we don't need to switch user namespaces.

    int pass;
    for(pass = 0; pass < 2 ;pass++) {
        for(i = 0; all_ns[i].name ; i++) {
            // only namespaces that were opened and not switched yet
            if (all_ns[i].fd != -1 && all_ns[i].status == -1) {
                if(setns(all_ns[i].fd, all_ns[i].nstype) == -1) {
                    // a failure is final (and logged) only on the second pass
                    if(pass == 1) {
                        all_ns[i].status = 0;
                        collector_error("Cannot switch to %s namespace of pid %d", all_ns[i].name, (int) pid);
                    }
                }
                else
                    all_ns[i].status = 1;
            }
        }
    }

    setgroups(0, NULL);

    // enter the root directory of pid
    if(root_fd != -1) {
        if(fchdir(root_fd) < 0)
            collector_error("Cannot fchdir() to pid %d root directory", (int)pid);

        if(chroot(".") < 0)
            collector_error("Cannot chroot() to pid %d root directory", (int)pid);

        close(root_fd);
    }

    // enter the working directory of pid
    if(cwd_fd != -1) {
        if(fchdir(cwd_fd) < 0)
            collector_error("Cannot fchdir() to pid %d current working directory", (int)pid);

        close(cwd_fd);
    }

    // close the namespace descriptors and find out if a fork is needed
    int do_fork = 0;
    for(i = 0; all_ns[i].name ; i++)
        if(all_ns[i].fd != -1) {

            // CLONE_NEWPID requires a fork() to become effective
            if(all_ns[i].nstype == CLONE_NEWPID && all_ns[i].status)
                do_fork = 1;

            close(all_ns[i].fd);
        }

    // from here on only the forked child continues
    if(do_fork)
        continue_as_child();

    return 0;

#else

    errno = ENOSYS;
    collector_error("setns() is missing on this system.");
    return 1;

#endif
}

// Return the first positive pid found in a cgroup pid-list file.
//
// filename - file to read; opened with procfile_open_flags
//
// The file is read line by line and each line is converted with atoi();
// the first value greater than zero is returned. Returns 0 when the file
// cannot be opened; otherwise, when no positive value is found, the result
// of the last conversion (0 for an empty file) is returned.
pid_t read_pid_from_cgroup_file(const char *filename) {
    int fd = open(filename, procfile_open_flags);
    if(fd == -1) {
        collector_error("Cannot open pid_from_cgroup() file '%s'.", filename);
        return 0;
    }

    FILE *fp = fdopen(fd, "r");
    if(!fp) {
        collector_error("Cannot upgrade fd to fp for file '%s'.", filename);
        // fdopen() does not take ownership of fd when it fails
        close(fd);
        return 0;
    }

    char buffer[100 + 1];
    pid_t pid = 0;
    char *s;
    // fgets() always NUL-terminates what it stores in buffer
    while((s = fgets(buffer, 100, fp))) {
        pid = atoi(s);
        if(pid > 0) break;
    }

    fclose(fp);

#ifdef NETDATA_INTERNAL_CHECKS
    if(pid > 0) collector_info("found pid %d on file '%s'", pid, filename);
#endif

    return pid;
}

// Look for a pid in the pid-list files of one cgroup directory:
// first <path>/cgroup.procs and, if that gives no positive pid, <path>/tasks.
// Returns whatever read_pid_from_cgroup_file() returned for the last file tried.
pid_t read_pid_from_cgroup_files(const char *path) {
    char filename[FILENAME_MAX + 1];
    pid_t pid;

    snprintfz(filename, FILENAME_MAX, "%s/cgroup.procs", path);
    pid = read_pid_from_cgroup_file(filename);

    if(pid <= 0) {
        snprintfz(filename, FILENAME_MAX, "%s/tasks", path);
        pid = read_pid_from_cgroup_file(filename);
    }

    return pid;
}

// Find a pid that belongs to the cgroup at path.
//
// The pid-list files of path are tried first; when they give no positive
// pid, every sub-directory of path is searched recursively and the first
// positive pid found is returned.
// Returns 0 when path cannot be opened as a directory; otherwise the last
// non-positive lookup result when nothing is found.
pid_t read_pid_from_cgroup(const char *path) {
    pid_t pid = read_pid_from_cgroup_files(path);
    if (pid > 0) return pid;

    DIR *dir = opendir(path);
    if (!dir) {
        collector_error("cannot read directory '%s'", path);
        return 0;
    }

    for (struct dirent *entry = readdir(dir); entry; entry = readdir(dir)) {
        // only directories are of interest
        if (entry->d_type != DT_DIR)
            continue;

        // never descend into "." or ".."
        const char *name = entry->d_name;
        int is_dot    = (name[0] == '.' && name[1] == '\0');
        int is_dotdot = (name[0] == '.' && name[1] == '.' && name[2] == '\0');
        if (is_dot || is_dotdot)
            continue;

        char filename[FILENAME_MAX + 1];
        snprintfz(filename, FILENAME_MAX, "%s/%s", path, name);

        pid = read_pid_from_cgroup(filename);
        if (pid > 0) break;
    }

    closedir(dir);
    return pid;
}

// ----------------------------------------------------------------------------
// send the result to netdata

// One detected pair of interfaces, as recorded by add_device().
// detected_devices is the head of the singly-linked list printed by send_devices().
struct found_device {
    const char *host_device;    // interface name on the host side (heap copy)
    const char *guest_device;   // interface name on the guest side, NULL when not known

    uint32_t host_device_hash;  // simple_hash() of host_device, used to speed up lookups

    struct found_device *next;  // next entry in the list, NULL at the end
} *detected_devices = NULL;

// Record a host/guest interface pair in detected_devices.
//
// host  - host-side interface name (must not be NULL)
// guest - guest-side interface name; NULL, empty or equal to host means
//         "no guest name known"
//
// When the host interface is already recorded, only a missing guest name
// is filled in; otherwise a new entry is prepended to the list.
void add_device(const char *host, const char *guest) {
#ifdef NETDATA_INTERNAL_CHECKS
    collector_info("adding device with host '%s', guest '%s'", host, guest);
#endif

    uint32_t host_hash = simple_hash(host);

    // a guest name that is empty or identical to the host name carries no information
    if(guest && (*guest == '\0' || strcmp(host, guest) == 0))
        guest = NULL;

    struct found_device *dev = detected_devices;
    while(dev) {
        if(dev->host_device_hash == host_hash && strcmp(host, dev->host_device) == 0) {
            // already known: update the guest name only if there is none worth keeping
            int replace_guest = guest && (!dev->guest_device || strcmp(dev->host_device, dev->guest_device) == 0);

            if(replace_guest) {
                if(dev->guest_device) freez((void *)dev->guest_device);
                dev->guest_device = strdupz(guest);
            }

            return;
        }

        dev = dev->next;
    }

    // not found: create a new entry at the head of the list
    dev = mallocz(sizeof(struct found_device));
    dev->host_device_hash = host_hash;
    dev->host_device = strdupz(host);
    dev->guest_device = guest ? strdupz(guest) : NULL;
    dev->next = detected_devices;
    detected_devices = dev;
}

// Print every detected device to stdout as "<host> <guest>", one per line.
// The host name is printed twice when no guest name is known.
// Returns the number of devices printed.
int send_devices(void) {
    int count = 0;

    struct found_device *dev = detected_devices;
    while(dev) {
        const char *guest_name = dev->guest_device ? dev->guest_device : dev->host_device;
        printf("%s %s\n", dev->host_device, guest_name);

        count++;
        dev = dev->next;
    }

    return count;
}

// ----------------------------------------------------------------------------
// this function should be called only **ONCE**
// also it has to be the **LAST** to be called
// since it switches namespaces, so after this call, everything is different!

// Pair host interfaces with the interfaces seen inside the namespaces of pid.
//
// The host list is read first, then the process switches to the namespaces
// of pid and reads the list again. A host interface and a guest interface
// are paired (via add_device()) when both are eligible and each one's
// ifindex equals the other's iflink.
void detect_veth_interfaces(pid_t pid) {
    struct iface *host_list = NULL;
    struct iface *guest_list = NULL;

    host_list = read_proc_net_dev("host", netdata_configured_host_prefix);
    if(!host_list) {
        errno = 0;
        collector_error("cannot read host interface list.");
        goto cleanup;
    }

    if(eligible_ifaces(host_list) == 0) {
        errno = 0;
        collector_info("there are no double-linked host interfaces available.");
        goto cleanup;
    }

    if(switch_namespace(netdata_configured_host_prefix, pid) != 0) {
        errno = 0;
        collector_error("cannot switch to the namespace of pid %u", (unsigned int) pid);
        goto cleanup;
    }

#ifdef NETDATA_INTERNAL_CHECKS
    collector_info("switched to namespaces of pid %d", pid);
#endif

    guest_list = read_proc_net_dev("cgroup", NULL);
    if(!guest_list) {
        errno = 0;
        collector_error("cannot read cgroup interface list.");
        goto cleanup;
    }

    if(eligible_ifaces(guest_list) == 0) {
        errno = 0;
        collector_error("there are not double-linked cgroup interfaces available.");
        goto cleanup;
    }

    unsigned int host_count = calc_num_ifaces(host_list);
    unsigned int guest_count = calc_num_ifaces(guest_list);

    // host ifaces == guest ifaces => we are still in the host namespace
    // and we can't really identify which ifaces belong to the cgroup (e.g. Proxmox VM).
    if(host_count == guest_count) {
        unsigned int matched = 0;

        for(struct iface *hi = host_list; hi; hi = hi->next) {
            // look for a guest interface with identical ifindex and iflink
            struct iface *gi = guest_list;
            while(gi && !(hi->ifindex == gi->ifindex && hi->iflink == gi->iflink))
                gi = gi->next;

            if(gi)
                matched++;
        }

        if(matched == host_count)
            goto cleanup;
    }

    for(struct iface *hi = host_list; hi; hi = hi->next) {
        if(!iface_is_eligible(hi))
            continue;

        for(struct iface *gi = guest_list; gi; gi = gi->next) {
            if(!iface_is_eligible(gi))
                continue;

            // the two ends point at each other
            if(hi->ifindex == gi->iflink && hi->iflink == gi->ifindex)
                add_device(hi->device, gi->device);
        }
    }

cleanup:
    free_host_ifaces(guest_list);
    free_host_ifaces(host_list);
}

// ----------------------------------------------------------------------------
// call the external helper

#define CGROUP_NETWORK_INTERFACE_MAX_LINE 2048

// Run cgroup-network-helper.sh and record the device pairs it prints.
//
// pid    - passed to the helper as "--pid <pid>" when cgroup is NULL
// cgroup - when not NULL, passed to the helper as "--cgroup <cgroup>"
//
// The helper is started with the fixed `environment` array. Every output
// line of the form "<host> <guest>" is handed to add_device(); lines without
// both fields are ignored.
void call_the_helper(pid_t pid, const char *cgroup) {
    if(setresuid(0, 0, 0) == -1)
        collector_error("setresuid(0, 0, 0) failed.");

    // this string is used only for logging; the helper is spawned
    // with an explicit argument list below
    char command[CGROUP_NETWORK_INTERFACE_MAX_LINE + 1];
    if(cgroup)
        snprintfz(command, CGROUP_NETWORK_INTERFACE_MAX_LINE, "exec " PLUGINS_DIR "/cgroup-network-helper.sh --cgroup '%s'", cgroup);
    else
        snprintfz(command, CGROUP_NETWORK_INTERFACE_MAX_LINE, "exec " PLUGINS_DIR "/cgroup-network-helper.sh --pid %d", pid);

    collector_info("running: %s", command);

    // initialised so that a failed spawn cannot leave them indeterminate:
    // the result of the spawn is detected below through fp_child_output
    pid_t cgroup_pid = 0;
    FILE *fp_child_input = NULL, *fp_child_output = NULL;

    if(cgroup) {
        (void)netdata_popen_raw_default_flags(&cgroup_pid, environment, &fp_child_input, &fp_child_output, PLUGINS_DIR "/cgroup-network-helper.sh", "--cgroup", cgroup);
    }
    else {
        char buffer[100];
        snprintfz(buffer, sizeof(buffer) - 1, "%d", pid);
        (void)netdata_popen_raw_default_flags(&cgroup_pid, environment, &fp_child_input, &fp_child_output, PLUGINS_DIR "/cgroup-network-helper.sh", "--pid", buffer);
    }

    if(fp_child_output) {
        char buffer[CGROUP_NETWORK_INTERFACE_MAX_LINE + 1];
        char *s;
        while((s = fgets(buffer, CGROUP_NETWORK_INTERFACE_MAX_LINE, fp_child_output))) {
            trim(s);

            if(*s && *s != '\n') {
                // split the line at the first space: s = host, t = guest
                char *t = s;
                while(*t && *t != ' ') t++;
                if(*t == ' ') {
                    *t = '\0';
                    t++;
                }

                if(!*s || !*t) continue;
                add_device(s, t);
            }
        }

        netdata_pclose(fp_child_input, fp_child_output, cgroup_pid);
    }
    else
        collector_error("cannot execute cgroup-network helper script: %s", command);
}

// Tell whether c is one of the non-alphanumeric characters allowed in a path.
// Returns 1 for an allowed symbol, 0 for anything else.
int is_valid_path_symbol(char c) {
    return c == '/'     // path separators
        || c == '\\'    // needed for virsh domains \x2d1\x2dname
        || c == ' '     // space
        || c == '-'     // hyphen
        || c == '_'     // underscore
        || c == '.'     // dot
        || c == ',';    // comma
}

// we will pass this path to a shell script running as root,
// so we need to make sure the path is valid
// and does not include anything that could allow
// the caller to use shell expansion for gaining escalated
// privileges.
// Check that path is safe to hand over to a script running as root.
//
// A path is accepted only when all of the following hold:
//  - every character is alphanumeric or accepted by is_valid_path_symbol()
//  - if it contains a backslash, it also contains a "\x" sequence
//  - it has no "/../" component and does not end in "/.."
//  - it is absolute
//  - it exists and is a directory
//
// Returns 0 when the path is accepted and -1 (after logging the reason)
// for every kind of rejection, so that callers comparing against -1
// cannot miss a failure.
int verify_path(const char *path) {
    struct stat sb;

    for(const char *s = path; *s; s++) {
        // <ctype.h> functions need a value representable as unsigned char
        if(!( isalnum((unsigned char)*s) || is_valid_path_symbol(*s) )) {
            collector_error("invalid character in path '%s'", path);
            return -1;
        }
    }

    if(strstr(path, "\\") && !strstr(path, "\\x")) {
        collector_error("invalid escape sequence in path '%s'", path);
        return -1;
    }

    // a parent reference may also be the last component of the path
    size_t len = strlen(path);
    if(strstr(path, "/../") || (len >= 3 && strcmp(path + len - 3, "/..") == 0)) {
        collector_error("invalid parent path sequence detected in '%s'", path);
        return -1;
    }

    if(path[0] != '/') {
        collector_error("only absolute path names are supported - invalid path '%s'", path);
        return -1;
    }

    if (stat(path, &sb) == -1) {
        collector_error("cannot stat() path '%s'", path);
        return -1;
    }

    if((sb.st_mode & S_IFMT) != S_IFDIR) {
        collector_error("path '%s' is not a directory", path);
        return -1;
    }

    return 0;
}

/*
char *fix_path_variable(void) {
    const char *path = getenv("PATH");
    if(!path || !*path) return 0;

    char *p = strdupz(path);
    char *safe_path = callocz(1, strlen(p) + strlen("PATH=") + 1);
    strcpy(safe_path, "PATH=");

    int added = 0;
    char *ptr = p;
    while(ptr && *ptr) {
        char *s = strsep(&ptr, ":");
        if(s && *s) {
            if(verify_path(s) == -1) {
                collector_error("the PATH variable includes an invalid path '%s' - removed it.", s);
            }
            else {
                collector_info("the PATH variable includes a valid path '%s'.", s);
                if(added) strcat(safe_path, ":");
                strcat(safe_path, s);
                added++;
            }
        }
    }

    collector_info("unsafe PATH:      '%s'.", path);
    collector_info("  safe PATH: '%s'.", safe_path);

    freez(p);
    return safe_path;
}
*/

// ----------------------------------------------------------------------------
// main

// Print the command line synopsis to stderr and terminate with exit code 1.
void usage(void) {
    const char *self = program_name;

    fprintf(stderr, "%s [ -p PID | --pid PID | --cgroup /path/to/cgroup ]\n", self);
    exit(1);
}

// Entry point.
//
// Usage: cgroup-network [ -p PID | --pid PID | --cgroup /path/to/cgroup ]
//
// Validates NETDATA_HOST_PREFIX, optionally runs the helper script (skipped
// when both KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT are set),
// then detects the interface pairs through the namespaces of the target pid
// and prints them.
//
// Returns 0 when at least one device was printed, 1 when none was found or
// the cgroup path is not valid, 2 when the given pid is not valid.
int main(int argc, char **argv) {
    stderror = stderr;
    pid_t pid = 0;

    program_name = argv[0];
    program_version = VERSION;
    error_log_syslog = 0;

    // since cgroup-network runs as root, prevent it from opening symbolic links
    procfile_open_flags = O_RDONLY|O_NOFOLLOW;

    // ------------------------------------------------------------------------
    // make sure NETDATA_HOST_PREFIX is safe

    netdata_configured_host_prefix = getenv("NETDATA_HOST_PREFIX");
    if(verify_netdata_host_prefix() == -1) exit(1);

    // any non-zero result of verify_path() is a rejection
    if(netdata_configured_host_prefix[0] != '\0' && verify_path(netdata_configured_host_prefix) != 0)
        fatal("invalid NETDATA_HOST_PREFIX '%s'", netdata_configured_host_prefix);

    // ------------------------------------------------------------------------
    // build a safe environment for our script

    // the first environment variable is a fixed PATH=
    snprintfz(environment_variable2, sizeof(environment_variable2) - 1, "NETDATA_HOST_PREFIX=%s", netdata_configured_host_prefix);

    // ------------------------------------------------------------------------

    if(argc == 2 && (!strcmp(argv[1], "version") || !strcmp(argv[1], "-version") || !strcmp(argv[1], "--version") || !strcmp(argv[1], "-v") || !strcmp(argv[1], "-V"))) {
        fprintf(stderr, "cgroup-network %s\n", VERSION);
        exit(0);
    }

    if(argc != 3)
        usage();

    int arg = 1;

    // the helper script is not used when running inside Kubernetes
    int helper = 1;
    if (getenv("KUBERNETES_SERVICE_HOST") != NULL && getenv("KUBERNETES_SERVICE_PORT") != NULL)
        helper = 0;

    if(!strcmp(argv[arg], "-p") || !strcmp(argv[arg], "--pid")) {
        pid = atoi(argv[arg+1]);

        if(pid <= 0) {
            errno = 0;
            collector_error("Invalid pid %d given", (int) pid);
            return 2;
        }

        if(helper) call_the_helper(pid, NULL);
    }
    else if(!strcmp(argv[arg], "--cgroup")) {
        char *cgroup = argv[arg+1];

        // the cgroup path is passed to a script running as root:
        // refuse it on any verification failure, not just on -1
        if(verify_path(cgroup) != 0) {
            collector_error("cgroup '%s' does not exist or is not valid.", cgroup);
            return 1;
        }

        pid = read_pid_from_cgroup(cgroup);
        if(helper) call_the_helper(pid, cgroup);

        if(pid <= 0 && !detected_devices) {
            errno = 0;
            collector_error("Cannot find a cgroup PID from cgroup '%s'", cgroup);
        }
    }
    else
        usage();

    // this has to be last: it switches namespaces
    if(pid > 0)
        detect_veth_interfaces(pid);

    int found = send_devices();
    if(found <= 0) return 1;
    return 0;
}