Browse Source

spawn-server-nofork: invalid magic (#18831)

* debug code to trace the issue

* detect the libsystemd socket and keep it open when forking

* fix the path

* evaluate 2k fds

* increase to 1k pad buffer

* detect it in a different way

* cleanup

* use the same journal direct socket detection

* removed debugging paddding
Costa Tsaousis 4 months ago
parent
commit
fc1fb32851

+ 11 - 17
src/libnetdata/log/nd_log-init.c

@@ -44,7 +44,7 @@ void nd_log_initialize_for_external_plugins(const char *name) {
     program_name = name;
 
     for(size_t i = 0; i < _NDLS_MAX ;i++) {
-        nd_log.sources[i].method = STDERR_FILENO;
+        nd_log.sources[i].method = NDLM_DEFAULT;
         nd_log.sources[i].fd = -1;
         nd_log.sources[i].fp = NULL;
     }
@@ -123,12 +123,10 @@ void nd_log_initialize_for_external_plugins(const char *name) {
             break;
     }
 
-    for(size_t i = 0; i < _NDLS_MAX ;i++) {
-        nd_log.sources[i].method = method;
-        nd_log.sources[i].format = format;
-        nd_log.sources[i].fd = -1;
-        nd_log.sources[i].fp = NULL;
-    }
+    nd_log.sources[NDLS_COLLECTORS].method = method;
+    nd_log.sources[NDLS_COLLECTORS].format = format;
+    nd_log.sources[NDLS_COLLECTORS].fd = -1;
+    nd_log.sources[NDLS_COLLECTORS].fp = NULL;
 
     //    nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "FINAL_LOG_METHOD: %s", nd_log_id2method(method));
 }
@@ -271,6 +269,10 @@ void nd_log_reopen_log_files(bool log) {
         netdata_log_info("Log files re-opened.");
 }
 
+int nd_log_systemd_journal_fd(void) {
+    return nd_log.journal.fd;
+}
+
 void nd_log_reopen_log_files_for_spawn_server(const char *name) {
     gettid_uncached();
 
@@ -284,13 +286,11 @@ void nd_log_reopen_log_files_for_spawn_server(const char *name) {
         close(nd_log.journal_direct.fd);
         nd_log.journal_direct.fd = -1;
         nd_log.journal_direct.initialized = false;
-        nd_log_journal_direct_init(NULL);
     }
 
     for(size_t i = 0; i < _NDLS_MAX ;i++) {
-        if(i != NDLS_COLLECTORS && i != NDLS_DAEMON) continue;
-
         spinlock_init(&nd_log.sources[i].spinlock);
+        nd_log.sources[i].method = NDLM_DEFAULT;
         nd_log.sources[i].fd = -1;
         nd_log.sources[i].fp = NULL;
         nd_log.sources[i].pending_msg = NULL;
@@ -299,16 +299,10 @@ void nd_log_reopen_log_files_for_spawn_server(const char *name) {
 #endif
     }
 
-    for(size_t i = 0; i < _NDLS_MAX ;i++) {
-        if(i == NDLS_COLLECTORS || i == NDLS_DAEMON) continue;
-        nd_log.sources[i].method = NDLM_DISABLED;
-    }
-
+    // initialize spinlocks
     spinlock_init(&nd_log.std_output.spinlock);
     spinlock_init(&nd_log.std_error.spinlock);
 
-    nd_log.journal.initialized = false;
-    nd_log.journal_direct.initialized = false;
     nd_log.syslog.initialized = false;
     nd_log.eventlog.initialized = false;
     nd_log.std_output.initialized = false;

+ 2 - 0
src/libnetdata/log/nd_log-internals.c

@@ -284,6 +284,8 @@ struct nd_log nd_log = {
     .overwrite_process_source = 0,
     .journal = {
         .initialized = false,
+        .first_msg = false,
+        .fd = -1,
     },
     .journal_direct = {
         .initialized = false,

+ 3 - 1
src/libnetdata/log/nd_log-internals.h

@@ -133,12 +133,14 @@ struct nd_log {
 
     struct {
         bool initialized;
+        bool first_msg;
+        int fd;             // we don't control this, we just detect it to keep it open
     } journal;
 
     struct {
         bool initialized;
         int fd;
-        char filename[FILENAME_MAX + 1];
+        char filename[FILENAME_MAX];
     } journal_direct;
 
     struct {

+ 51 - 27
src/libnetdata/log/nd_log-to-systemd-journal.c

@@ -12,18 +12,36 @@ bool nd_log_journal_systemd_init(void) {
     return nd_log.journal.initialized;
 }
 
-bool nd_log_journal_socket_available(void) {
-    if(netdata_configured_host_prefix && *netdata_configured_host_prefix) {
-        char filename[FILENAME_MAX + 1];
+static int nd_log_journal_direct_fd_find_and_open(char *filename, size_t size) {
+    int fd;
 
-        snprintfz(filename, sizeof(filename), "%s%s",
-                  netdata_configured_host_prefix, "/run/systemd/journal/socket");
+    if(netdata_configured_host_prefix && *netdata_configured_host_prefix) {
+        journal_construct_path(filename, size, netdata_configured_host_prefix, "netdata");
+        if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1)
+            return fd;
 
-        if(is_path_unix_socket(filename))
-            return true;
+        journal_construct_path(filename, size, netdata_configured_host_prefix, NULL);
+        if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1)
+            return fd;
     }
 
-    return is_path_unix_socket("/run/systemd/journal/socket");
+    journal_construct_path(filename, size, NULL, "netdata");
+    if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1)
+        return fd;
+
+    journal_construct_path(filename, size, NULL, NULL);
+    if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1)
+        return fd;
+
+    return -1;
+}
+
+bool nd_log_journal_socket_available(void) {
+    char filename[FILENAME_MAX];
+    int fd = nd_log_journal_direct_fd_find_and_open(filename, sizeof(filename));
+    if(fd == -1) return false;
+    close(fd);
+    return true;
 }
 
 static void nd_log_journal_direct_set_env(void) {
@@ -38,25 +56,9 @@ bool nd_log_journal_direct_init(const char *path) {
     }
 
     int fd;
-    char filename[FILENAME_MAX + 1];
-    if(!is_path_unix_socket(path)) {
-
-        journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, "netdata");
-        if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
-
-            journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, NULL);
-            if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
-
-                journal_construct_path(filename, sizeof(filename), NULL, "netdata");
-                if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
-
-                    journal_construct_path(filename, sizeof(filename), NULL, NULL);
-                    if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1)
-                        return false;
-                }
-            }
-        }
-    }
+    char filename[FILENAME_MAX];
+    if(!is_path_unix_socket(path))
+        fd = nd_log_journal_direct_fd_find_and_open(filename, sizeof(filename));
     else {
         snprintfz(filename, sizeof(filename), "%s", path);
         fd = journal_direct_fd(filename);
@@ -74,6 +76,8 @@ bool nd_log_journal_direct_init(const char *path) {
     return true;
 }
 
+static bool sockets_before[1024];
+
 bool nd_logger_journal_libsystemd(struct log_field *fields __maybe_unused, size_t fields_max __maybe_unused) {
 #ifdef HAVE_SYSTEMD
 
@@ -154,8 +158,28 @@ bool nd_logger_journal_libsystemd(struct log_field *fields __maybe_unused, size_
         }
     }
 
+    bool detect_systemd_socket = __atomic_load_n(&nd_log.journal.first_msg, __ATOMIC_RELAXED) == false;
+    if(detect_systemd_socket) {
+        for(int i = 3 ; (size_t)i < _countof(sockets_before); i++)
+            sockets_before[i] = fd_is_socket(i);
+    }
+
     int r = sd_journal_sendv(iov, iov_count);
 
+    if(r == 0 && detect_systemd_socket) {
+        __atomic_store_n(&nd_log.journal.first_msg, true, __ATOMIC_RELAXED);
+
+        // this is the first successful libsystemd log
+        // let's detect its fd number (we need it for the spawn server)
+
+        for(int i = 3 ; (size_t)i < _countof(sockets_before); i++) {
+            if (!sockets_before[i] && fd_is_socket(i)) {
+                nd_log.journal.fd = i;
+                break;
+            }
+        }
+    }
+
     // Clean up allocated memory
     for (int i = 0; i < iov_count; i++) {
         if (iov[i].iov_base != NULL) {

+ 1 - 0
src/libnetdata/log/nd_log.h

@@ -14,6 +14,7 @@ extern "C" {
 #define ND_LOG_DEFAULT_THROTTLE_PERIOD 60
 
 void errno_clear(void);
+int nd_log_systemd_journal_fd(void);
 void nd_log_set_user_settings(ND_LOG_SOURCES source, const char *setting);
 void nd_log_set_facility(const char *facility);
 void nd_log_set_priority_level(const char *setting);

+ 54 - 21
src/libnetdata/spawn_server/spawn_server_nofork.c

@@ -274,7 +274,13 @@ static bool spawn_external_command(SPAWN_SERVER *server __maybe_unused, SPAWN_RE
         return false;
     }
 
-    os_close_all_non_std_open_fds_except(rq->fds, 3, CLOSE_RANGE_CLOEXEC);
+    int fds_to_keep[] = {
+        rq->fds[0],
+        rq->fds[1],
+        rq->fds[2],
+        nd_log_systemd_journal_fd(),
+    };
+    os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), CLOSE_RANGE_CLOEXEC);
 
     errno_clear();
     if (posix_spawn(&rq->pid, rq->argv[0], &file_actions, &attr, (char * const *)rq->argv, (char * const *)rq->envp) != 0) {
@@ -325,7 +331,14 @@ static bool spawn_server_run_callback(SPAWN_SERVER *server __maybe_unused, SPAWN
         os_setproctitle("spawn-callback", server->argc, server->argv);
 
         // close all open file descriptors of the parent, but keep ours
-        os_close_all_non_std_open_fds_except(rq->fds, 4, 0);
+        int fds_to_keep[] = {
+            rq->fds[0],
+            rq->fds[1],
+            rq->fds[2],
+            rq->fds[3],
+            nd_log_systemd_journal_fd(),
+        };
+        os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), 0);
         nd_log_reopen_log_files_for_spawn_server("spawn-callback");
 
         // get the fds from the request
@@ -1076,7 +1089,12 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name
         os_setproctitle(buf, server->argc, server->argv);
 
         replace_stdio_with_dev_null();
-        os_close_all_non_std_open_fds_except((int[]){ server->sock, server->pipe[1] }, 2, 0);
+        int fds_to_keep[] = {
+            server->sock,
+            server->pipe[1],
+            nd_log_systemd_journal_fd(),
+        };
+        os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), 0);
         nd_log_reopen_log_files_for_spawn_server(buf);
         exit(spawn_server_event_loop(server));
     }
@@ -1125,6 +1143,21 @@ void spawn_server_exec_destroy(SPAWN_INSTANCE *instance) {
     freez(instance);
 }
 
+static void log_invalid_magic(SPAWN_INSTANCE *instance, struct status_report *sr) {
+    unsigned char buf[sizeof(*sr) + 1];
+    memcpy(buf, sr, sizeof(*sr));
+    buf[sizeof(buf) - 1] = '\0';
+
+    for(size_t i = 0; i < sizeof(buf) - 1; i++) {
+        if (iscntrl(buf[i]) || !isprint(buf[i]))
+            buf[i] = '_';
+    }
+
+    nd_log(NDLS_COLLECTORS, NDLP_ERR,
+           "SPAWN PARENT: invalid final status report for child %d, request %zu (invalid magic %#x in response, reads like '%s')",
+           instance->child_pid, instance->request_id, sr->magic, buf);
+}
+
 int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *instance) {
     int rc = -1;
 
@@ -1139,24 +1172,24 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *
             "SPAWN PARENT: failed to read final status report for child %d, request %zu",
             instance->child_pid, instance->request_id);
 
-    else if(sr.magic != STATUS_REPORT_MAGIC) {
-        nd_log(NDLS_COLLECTORS, NDLP_ERR,
-            "SPAWN PARENT: invalid final status report for child %d, request %zu (invalid magic %#x in response)",
-            instance->child_pid, instance->request_id, sr.magic);
-    }
-    else switch(sr.status) {
-        case STATUS_REPORT_EXITED:
-            rc = sr.exited.waitpid_status;
-            break;
-
-        case STATUS_REPORT_STARTED:
-        case STATUS_REPORT_FAILED:
-        default:
-            errno = 0;
-            nd_log(NDLS_COLLECTORS, NDLP_ERR,
-                "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)",
-                instance->request_id, instance->child_pid, sr.status);
-            break;
+    else if(sr.magic != STATUS_REPORT_MAGIC)
+        log_invalid_magic(instance, &sr);
+    else {
+        switch (sr.status) {
+            case STATUS_REPORT_EXITED:
+                rc = sr.exited.waitpid_status;
+                break;
+
+            case STATUS_REPORT_STARTED:
+            case STATUS_REPORT_FAILED:
+            default:
+                errno = 0;
+                nd_log(
+                    NDLS_COLLECTORS, NDLP_ERR,
+                    "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)",
+                    instance->request_id, instance->child_pid, sr.status);
+                break;
+        }
     }
 
     instance->child_pid = 0;