Browse Source

Adds metrics for ACLK performance and status (#9269)

Adds ACLK charts
Timotej S 4 years ago
parent
commit
ba350b7554
8 changed files with 350 additions and 2 deletions
  1. 2 0
      CMakeLists.txt
  2. 2 0
      Makefile.am
  3. 25 1
      aclk/aclk_lws_wss_client.c
  4. 195 0
      aclk/aclk_stats.c
  5. 51 0
      aclk/aclk_stats.h
  6. 47 0
      aclk/agent_cloud_link.c
  7. 13 1
      aclk/mqtt.c
  8. 15 0
      web/gui/dashboard_info.js

+ 2 - 0
CMakeLists.txt

@@ -651,6 +651,8 @@ set(ACLK_PLUGIN_FILES
         aclk/aclk_lws_https_client.h
         aclk/mqtt.c
         aclk/mqtt.h
+        aclk/aclk_stats.c
+        aclk/aclk_stats.h
         )
 
 set(SPAWN_PLUGIN_FILES

+ 2 - 0
Makefile.am

@@ -494,6 +494,8 @@ PARSER_FILES = \
 ACLK_FILES = \
     aclk/aclk_common.c \
     aclk/aclk_common.h \
+    aclk/aclk_stats.c \
+    aclk/aclk_stats.h \
     $(NULL)
 
 if ENABLE_ACLK

+ 25 - 1
aclk/aclk_lws_wss_client.c

@@ -5,6 +5,7 @@
 #include "libnetdata/libnetdata.h"
 #include "../daemon/common.h"
 #include "aclk_common.h"
+#include "aclk_stats.h"
 
 extern int aclk_shutting_down;
 
@@ -436,8 +437,14 @@ static int aclk_lws_wss_callback(struct lws *wsi, enum lws_callback_reasons reas
                 if ( bytes_left > FRAGMENT_SIZE)
                     bytes_left = FRAGMENT_SIZE;
                 int n = lws_write(wsi, data->data + LWS_PRE + data->written, bytes_left, LWS_WRITE_BINARY);
-                if (n>=0)
+                if (n>=0) {
                     data->written += n;
+                    if (aclk_stats_enabled) {
+                        ACLK_STATS_LOCK;
+                        aclk_metrics_per_sample.write_q_consumed += n;
+                        ACLK_STATS_UNLOCK;
+                    }
+                }
                 //error("lws_write(req=%u,written=%u) %zu of %zu",bytes_left, rc, data->written,data->data_size,rc);
                 if (data->written == data->data_size)
                 {
@@ -455,6 +462,11 @@ static int aclk_lws_wss_callback(struct lws *wsi, enum lws_callback_reasons reas
             if (!received_data_to_ringbuff(engine_instance->read_ringbuffer, in, len))
                 retval = 1;
             aclk_lws_mutex_unlock(&engine_instance->read_buf_mutex);
+            if (aclk_stats_enabled) {
+                ACLK_STATS_LOCK;
+                aclk_metrics_per_sample.read_q_added += len;
+                ACLK_STATS_UNLOCK;
+            }
 
             // to future myself -> do not call this while read lock is active as it will eventually
             // want to acquire same lock later in aclk_lws_wss_client_read() function
@@ -524,6 +536,12 @@ int aclk_lws_wss_client_write(void *buf, size_t count)
         lws_wss_packet_buffer_append(&engine_instance->write_buffer_head, lws_wss_packet_buffer_new(buf, count));
         aclk_lws_mutex_unlock(&engine_instance->write_buf_mutex);
 
+        if (aclk_stats_enabled) {
+            ACLK_STATS_LOCK;
+            aclk_metrics_per_sample.write_q_added += count;
+            ACLK_STATS_UNLOCK;
+        }
+
         lws_callback_on_writable(engine_instance->lws_wsi);
         return count;
     }
@@ -549,6 +567,12 @@ int aclk_lws_wss_client_read(void *buf, size_t count)
     if (data_to_be_read == readable_byte_count)
         engine_instance->data_to_read = 0;
 
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+        aclk_metrics_per_sample.read_q_consumed += data_to_be_read;
+        ACLK_STATS_UNLOCK;
+    }
+
 abort:
     aclk_lws_mutex_unlock(&engine_instance->read_buf_mutex);
     return data_to_be_read;

+ 195 - 0
aclk/aclk_stats.c

@@ -0,0 +1,195 @@
+#include "aclk_stats.h"
+
+netdata_mutex_t aclk_stats_mutex = NETDATA_MUTEX_INITIALIZER;
+
+int aclk_stats_enabled;
+
+struct aclk_metrics aclk_metrics = {
+    .online = 0,
+};
+
+struct aclk_metrics_per_sample aclk_metrics_per_sample;
+
+/*
+ * Updates the "netdata.aclk_status" chart with the ACLK/Cloud connection state.
+ *
+ * per_sample - snapshot of the per-sample counters; only offline_during_sample is read
+ * permanent  - snapshot of the state kept between samples; only online is read
+ *
+ * The chart and its single dimension are created on the first call and kept
+ * in function-local statics for all later calls.
+ */
+static void aclk_stats_collect(struct aclk_metrics_per_sample *per_sample, struct aclk_metrics *permanent)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_online = NULL;
+
+    if (likely(chart)) {
+        rrdset_next(chart);
+    } else {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_status", NULL, "aclk_stats", NULL, "ACLK/Cloud connection status",
+            "connected", "netdata", "stats", 200000, localhost->rrd_update_every, RRDSET_TYPE_LINE);
+
+        dim_online = rrddim_add(chart, "online", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    // a disconnect recorded at any point of the sample forces the value to 0,
+    // even when the link is reported as online again by now
+    int online_now = permanent->online;
+    if (per_sample->offline_during_sample)
+        online_now = 0;
+
+    rrddim_set_by_pointer(chart, dim_online, online_now);
+    rrdset_done(chart);
+}
+
+/*
+ * Updates the "netdata.aclk_query_per_second" chart.
+ *
+ * per_sample - snapshot of the per-sample counters; queries_queued and
+ *              queries_dispatched are read
+ *
+ * "added" is stored as a positive value and "dispatched" as a negative one.
+ * Both dimensions use rrd_update_every as divisor.
+ * The chart and its dimensions are created on the first call and kept in
+ * function-local statics.
+ */
+static void aclk_stats_query_thread(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *st_query_thread = NULL;
+    static RRDDIM *rd_queued = NULL;
+    static RRDDIM *rd_dispatched = NULL;
+
+    if (unlikely(!st_query_thread)) {
+        st_query_thread = rrdset_create_localhost(
+            "netdata", "aclk_query_per_second", NULL, "aclk_stats", NULL, "ACLK Queries per second", "queries/s",
+            "netdata", "stats", 200001, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        rd_queued = rrddim_add(st_query_thread, "added", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        rd_dispatched = rrddim_add(st_query_thread, "dispatched", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    } else
+        rrdset_next(st_query_thread);
+
+    rrddim_set_by_pointer(st_query_thread, rd_queued, per_sample->queries_queued);
+
+    // Negate in a signed 64-bit type. Negating the unsigned counter directly
+    // only gives a negative number while the field is narrower than int; with
+    // a 32-bit (or wider) unsigned field it would wrap to a huge positive value.
+    rrddim_set_by_pointer(st_query_thread, rd_dispatched, -(long long)per_sample->queries_dispatched);
+
+    rrdset_done(st_query_thread);
+}
+
+#ifdef NETDATA_INTERNAL_CHECKS
+/*
+ * Updates the "netdata.aclk_latency_mqtt" chart (only built with
+ * NETDATA_INTERNAL_CHECKS).
+ *
+ * per_sample - snapshot of the per-sample counters; latency_total,
+ *              latency_count and latency_max are read
+ *
+ * "avg" is latency_total / latency_count rounded to the nearest integer, or 0
+ * when nothing was measured during the sample; "max" is latency_max as is.
+ */
+static void aclk_stats_latency(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_avg = NULL;
+    static RRDDIM *dim_max = NULL;
+
+    if (likely(chart)) {
+        rrdset_next(chart);
+    } else {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_latency_mqtt", NULL, "aclk_stats", NULL, "ACLK Message Publish Latency", "ms",
+            "netdata", "stats", 200002, localhost->rrd_update_every, RRDSET_TYPE_LINE);
+
+        dim_avg = rrddim_add(chart, "avg", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        dim_max = rrddim_add(chart, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    // guard the division: no samples means an average of 0
+    rrddim_set_by_pointer(
+        chart, dim_avg,
+        per_sample->latency_count ? roundf((float)per_sample->latency_total / per_sample->latency_count) : 0);
+    rrddim_set_by_pointer(chart, dim_max, per_sample->latency_max);
+
+    rrdset_done(chart);
+}
+#endif
+
+/*
+ * Updates the "netdata.aclk_write_q" chart (write queue, Mosq->Libwebsockets).
+ *
+ * per_sample - snapshot of the per-sample counters; write_q_added and
+ *              write_q_consumed (byte counts) are read
+ *
+ * Both dimensions divide by 1024 * rrd_update_every; "consumed" uses the
+ * negated divisor, so it is reported with the opposite sign of "added".
+ */
+static void aclk_stats_write_q(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_added = NULL;
+    static RRDDIM *dim_consumed = NULL;
+
+    if (likely(chart)) {
+        rrdset_next(chart);
+    } else {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_write_q", NULL, "aclk_stats", NULL, "Write Queue Mosq->Libwebsockets", "kB/s",
+            "netdata", "stats", 200003, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        dim_added = rrddim_add(chart, "added", NULL, 1, 1024 * localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        dim_consumed = rrddim_add(chart, "consumed", NULL, 1, -1024 * localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    rrddim_set_by_pointer(chart, dim_added, per_sample->write_q_added);
+    rrddim_set_by_pointer(chart, dim_consumed, per_sample->write_q_consumed);
+
+    rrdset_done(chart);
+}
+
+/*
+ * Updates the "netdata.aclk_read_q" chart (read queue, Libwebsockets->Mosq).
+ *
+ * per_sample - snapshot of the per-sample counters; read_q_added and
+ *              read_q_consumed (byte counts) are read
+ *
+ * Both dimensions divide by 1024 * rrd_update_every; "consumed" uses the
+ * negated divisor, so it is reported with the opposite sign of "added".
+ */
+static void aclk_stats_read_q(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_added = NULL;
+    static RRDDIM *dim_consumed = NULL;
+
+    if (likely(chart)) {
+        rrdset_next(chart);
+    } else {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_read_q", NULL, "aclk_stats", NULL, "Read Queue Libwebsockets->Mosq", "kB/s",
+            "netdata", "stats", 200004, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        dim_added = rrddim_add(chart, "added", NULL, 1, 1024 * localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        dim_consumed = rrddim_add(chart, "consumed", NULL, 1, -1024 * localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    rrddim_set_by_pointer(chart, dim_added, per_sample->read_q_added);
+    rrddim_set_by_pointer(chart, dim_consumed, per_sample->read_q_consumed);
+
+    rrdset_done(chart);
+}
+
+/*
+ * Updates the "netdata.aclk_cloud_req" chart (requests received from cloud).
+ *
+ * per_sample - snapshot of the per-sample counters; cloud_req_recvd and
+ *              cloud_req_err are read
+ *
+ * "malformed" is cloud_req_err; "received" is the remainder
+ * (cloud_req_recvd - cloud_req_err), clamped at 0.
+ */
+static void aclk_stats_cloud_req(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *st = NULL;
+    static RRDDIM *rd_rq_rcvd = NULL;
+    static RRDDIM *rd_rq_err = NULL;
+
+    if (unlikely(!st)) {
+        st = rrdset_create_localhost(
+            "netdata", "aclk_cloud_req", NULL, "aclk_stats", NULL, "Requests received from cloud", "req/s",
+            "netdata", "stats", 200005, localhost->rrd_update_every, RRDSET_TYPE_STACKED);
+
+        rd_rq_rcvd = rrddim_add(st, "received", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        rd_rq_err = rrddim_add(st, "malformed", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    } else
+        rrdset_next(st);
+
+    uint32_t received = per_sample->cloud_req_recvd;
+    uint32_t malformed = per_sample->cloud_req_err;
+
+    // The two counters are incremented under separate lock acquisitions, so a
+    // sample boundary can fall between them: the request is then counted as
+    // received in one sample and as malformed in the next. Clamp instead of
+    // letting the unsigned subtraction wrap around to a huge value.
+    uint32_t well_formed = (received > malformed) ? (received - malformed) : 0;
+
+    rrddim_set_by_pointer(st, rd_rq_rcvd, well_formed);
+    rrddim_set_by_pointer(st, rd_rq_err, malformed);
+
+    rrdset_done(st);
+}
+
+/*
+ * Entry point of the ACLK statistics thread.
+ *
+ * Once per rrd_update_every seconds it takes a snapshot of the shared
+ * counters under the stats lock, resets the per-sample counters and then
+ * feeds the snapshot to the individual chart updaters. Runs until
+ * netdata_exit is set.
+ *
+ * ptr - unused
+ *
+ * Returns NULL.
+ */
+void *aclk_stats_main_thread(void *ptr)
+{
+    UNUSED(ptr);
+    heartbeat_t hb;
+    heartbeat_init(&hb);
+    usec_t step_ut = localhost->rrd_update_every * USEC_PER_SEC;
+    struct aclk_metrics_per_sample per_sample;
+    struct aclk_metrics permanent;
+
+    // aclk_stats_enabled is already set when this thread is started, so other
+    // threads may be updating the counters right now: reset them under the lock
+    ACLK_STATS_LOCK;
+    memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample));
+    ACLK_STATS_UNLOCK;
+
+    while (!netdata_exit) {
+        netdata_thread_testcancel();
+        // ------------------------------------------------------------------------
+        // Wait for the next iteration point.
+
+        heartbeat_next(&hb, step_ut);
+
+        ACLK_STATS_LOCK;
+        // to not hold lock longer than necessary, especially not to hold it
+        // during database rrd* operations
+        memcpy(&per_sample, &aclk_metrics_per_sample, sizeof(struct aclk_metrics_per_sample));
+        memcpy(&permanent, &aclk_metrics, sizeof(struct aclk_metrics));
+        memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample));
+        ACLK_STATS_UNLOCK;
+
+        aclk_stats_collect(&per_sample, &permanent);
+        aclk_stats_query_thread(&per_sample);
+#ifdef NETDATA_INTERNAL_CHECKS
+        aclk_stats_latency(&per_sample);
+#endif
+        aclk_stats_write_q(&per_sample);
+        aclk_stats_read_q(&per_sample);
+
+        aclk_stats_cloud_req(&per_sample);
+    }
+    return NULL;
+}
+
+/*
+ * Records the current ACLK connection state for the statistics thread.
+ *
+ * online - non-zero when the link is up, 0 when it is down
+ *
+ * Does nothing when ACLK statistics are disabled. Going offline additionally
+ * sets offline_during_sample, so the disconnect is kept in the per-sample
+ * counters even if a reconnect follows before they are next read.
+ */
+void aclk_stats_upd_online(int online) {
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+
+        aclk_metrics.online = online;
+        if (online == 0)
+            aclk_metrics_per_sample.offline_during_sample = 1;
+
+        ACLK_STATS_UNLOCK;
+    }
+}

+ 51 - 0
aclk/aclk_stats.h

@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_ACLK_STATS_H
+#define NETDATA_ACLK_STATS_H
+
+#include "../daemon/common.h"
+#include "libnetdata/libnetdata.h"
+
+#define ACLK_STATS_THREAD_NAME "ACLK_Stats"
+
+extern netdata_mutex_t aclk_stats_mutex;
+
+#define ACLK_STATS_LOCK netdata_mutex_lock(&aclk_stats_mutex)
+#define ACLK_STATS_UNLOCK netdata_mutex_unlock(&aclk_stats_mutex)
+
+extern int aclk_stats_enabled;
+
+// preserve between samples
+struct aclk_metrics {
+    volatile uint8_t online;
+};
+
+// reset to 0 on every sample
+// all fields are read and written with ACLK_STATS_LOCK held
+extern struct aclk_metrics_per_sample {
+    /* in the unlikely event of ACLK disconnecting
+       and reconnecting under 1 sampling rate
+       we want to make sure we record the disconnection
+       despite it being then seemingly longer in graph */
+    volatile uint8_t offline_during_sample;
+
+    /* 16 bit instead of 8 bit: an 8 bit counter silently wraps
+       after 255 events within a single sample.
+       NOTE(review): aclk_stats_query_thread() negates
+       queries_dispatched; widening these to a 32 bit (or wider)
+       unsigned type requires that negation to be done in a
+       signed type */
+    volatile uint16_t queries_queued;
+    volatile uint16_t queries_dispatched;
+#ifdef NETDATA_INTERNAL_CHECKS
+    /* MQTT publish latency, filled in by publish_callback() */
+    volatile uint32_t latency_max;
+    volatile uint32_t latency_total;
+    volatile uint32_t latency_count;
+#endif
+    /* byte counts of the write queue */
+    volatile uint32_t write_q_added;
+    volatile uint32_t write_q_consumed;
+
+    /* byte counts of the read queue */
+    volatile uint32_t read_q_added;
+    volatile uint32_t read_q_consumed;
+
+    /* cloud requests seen / rejected by aclk_handle_cloud_request() */
+    volatile uint32_t cloud_req_recvd;
+    volatile uint32_t cloud_req_err;
+} aclk_metrics_per_sample;
+
+void *aclk_stats_main_thread(void *ptr);
+void aclk_stats_upd_online(int online);
+
+#endif /* NETDATA_ACLK_STATS_H */

+ 47 - 0
aclk/agent_cloud_link.c

@@ -4,6 +4,7 @@
 #include "agent_cloud_link.h"
 #include "aclk_lws_https_client.h"
 #include "aclk_common.h"
+#include "aclk_stats.h"
 
 int aclk_shutting_down = 0;
 // State-machine for the on-connect metadata transmission.
@@ -324,6 +325,12 @@ int aclk_queue_query(char *topic, char *data, char *msg_id, char *query, int run
         aclk_queue.count--;
     }
 
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+        aclk_metrics_per_sample.queries_queued++;
+        ACLK_STATS_UNLOCK;
+    }
+
     new_query = callocz(1, sizeof(struct aclk_query));
     new_query->cmd = aclk_cmd;
     if (internal) {
@@ -894,6 +901,12 @@ int aclk_process_query()
 
     aclk_query_free(this_query);
 
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+        aclk_metrics_per_sample.queries_dispatched++;
+        ACLK_STATS_UNLOCK;
+    }
+
     return 1;
 }
 
@@ -1358,6 +1371,7 @@ void *aclk_main(void *ptr)
 {
     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
     struct netdata_static_thread *query_thread;
+    struct netdata_static_thread *stats_thread = NULL;
 
     // This thread is unusual in that it cannot be cancelled by cancel_main_threads()
     // as it must notify the far end that it shutdown gracefully and avoid the LWT.
@@ -1383,6 +1397,15 @@ void *aclk_main(void *ptr)
         }
     }
 
+    aclk_stats_enabled = appconfig_get_boolean(&cloud_config, CONFIG_SECTION_GLOBAL, "statistics", CONFIG_BOOLEAN_YES);
+    if (aclk_stats_enabled) {
+        stats_thread = callocz(1, sizeof(struct netdata_static_thread));
+        stats_thread->thread = mallocz(sizeof(netdata_thread_t));
+        netdata_thread_create(
+            stats_thread->thread, ACLK_STATS_THREAD_NAME, NETDATA_THREAD_OPTION_JOINABLE, aclk_stats_main_thread,
+            stats_thread);
+    }
+
     last_init_sequence = now_realtime_sec();
     query_thread = NULL;
 
@@ -1502,6 +1525,13 @@ exited:
         RSA_free(aclk_private_key);
 
     aclk_main_cleanup(ptr);
+
+    if(aclk_stats_enabled) {
+        netdata_thread_join(*stats_thread->thread, NULL);
+        freez(stats_thread->thread);
+        freez(stats_thread);
+    }
+
     return NULL;
 }
 
@@ -1587,6 +1617,9 @@ int aclk_subscribe(char *sub_topic, int qos)
 void aclk_connect()
 {
     info("Connection detected (%"PRIu64" queued queries)", aclk_queue.count);
+
+    aclk_stats_upd_online(1);
+
     aclk_connected = 1;
     waiting_init = 0;
     aclk_reconnect_delay(0);
@@ -1599,6 +1632,9 @@ void aclk_disconnect()
 {
     if (likely(aclk_connected))
         info("Disconnect detected (%"PRIu64" queued queries)", aclk_queue.count);
+
+    aclk_stats_upd_online(0);
+
     aclk_subscribed = 0;
     aclk_metadata_submitted = ACLK_METADATA_REQUIRED;
     waiting_init = 1;
@@ -1901,6 +1937,11 @@ int aclk_handle_cloud_request(char *payload)
         .type_id = NULL, .msg_id = NULL, .callback_topic = NULL, .payload = NULL, .version = 0
     };
 
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+        aclk_metrics_per_sample.cloud_req_recvd++;
+        ACLK_STATS_UNLOCK;
+    }
 
     if (unlikely(agent_state == AGENT_INITIALIZING)) {
         debug(D_ACLK, "Ignoring cloud request; agent not in stable state");
@@ -1938,6 +1979,12 @@ int aclk_handle_cloud_request(char *payload)
         if (cloud_to_agent.callback_topic)
             freez(cloud_to_agent.callback_topic);
 
+        if (aclk_stats_enabled) {
+            ACLK_STATS_LOCK;
+            aclk_metrics_per_sample.cloud_req_err++;
+            ACLK_STATS_UNLOCK;
+        }
+
         return 1;
     }
 

+ 13 - 1
aclk/mqtt.c

@@ -4,6 +4,7 @@
 #include "../daemon/common.h"
 #include "mqtt.h"
 #include "aclk_lws_wss_client.h"
+#include "aclk_stats.h"
 
 extern usec_t aclk_session_us;
 extern time_t aclk_session_sec;
@@ -38,8 +39,19 @@ void publish_callback(struct mosquitto *mosq, void *obj, int rc)
     now_realtime_timeval(&now);
     orig = &sendTimes[ rc & 0x3ff ];
     int64_t diff = (now.tv_sec - orig->tv_sec) * USEC_PER_SEC + (now.tv_usec - orig->tv_usec);
+    diff /= 1000;
 
-    info("Publish_callback: mid=%d latency=%" PRId64 "ms", rc, diff / 1000);
+    info("Publish_callback: mid=%d latency=%" PRId64 "ms", rc, diff);
+
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+        if (aclk_metrics_per_sample.latency_max < diff)
+            aclk_metrics_per_sample.latency_max = diff;
+
+        aclk_metrics_per_sample.latency_total += diff;
+        aclk_metrics_per_sample.latency_count++;
+        ACLK_STATS_UNLOCK;
+    }
 #endif
     return;
 }

+ 15 - 0
web/gui/dashboard_info.js

@@ -3132,6 +3132,21 @@ netdataDashboard.context = {
         info: 'Difference between the number of process created and the number of threads created per period(<code>process</code> dimension), it also shows the number of possible zombie process running on system.'
     },
 
+    // ------------------------------------------------------------------------
+    // ACLK Internal Stats
+    'netdata.aclk_status': {
+        valueRange: "[0, 1]",
+        info: 'This chart shows if ACLK was online during entirety of the sample duration.'
+    },
+
+    'netdata.aclk_query_per_second': {
+        info: 'This chart shows how many queries were added for ACLK_query thread to process and how many it was actually able to process.'
+    },
+
+    'netdata.aclk_latency_mqtt': {
+        info: 'Measures latency between MQTT publish of the message and it\'s PUB_ACK being received'
+    },
+
     // ------------------------------------------------------------------------
     // VerneMQ