Browse Source

eBPF mdflush (#11681)

* mdflush: initial agent-side collector.

* mdflush: fix x-axis name.

* mdflush: fix issue where iteration doesn't occur.

* mdflush: update flush prio to match health.

health seems to be global just as flush is, so
keep them together.

* mdflush: vsn bump.

* mdflush: add reference to bcc tool.

* mdflush: add more docs.

* mdflush: change family name to indicate ebpf.

* mdflush: remove "count" word in chart.

* mdflush: change mdstat prios so flush comes after health.
Uman Shahzad 3 years ago
parent
commit
2cbced292a

+ 2 - 0
CMakeLists.txt

@@ -496,6 +496,8 @@ set(EBPF_PROCESS_PLUGIN_FILES
         collectors/ebpf.plugin/ebpf_fd.h
         collectors/ebpf.plugin/ebpf_hardirq.c
         collectors/ebpf.plugin/ebpf_hardirq.h
+        collectors/ebpf.plugin/ebpf_mdflush.c
+        collectors/ebpf.plugin/ebpf_mdflush.h
         collectors/ebpf.plugin/ebpf_mount.c
         collectors/ebpf.plugin/ebpf_mount.h
         collectors/ebpf.plugin/ebpf_filesystem.c

+ 2 - 0
Makefile.am

@@ -302,6 +302,8 @@ EBPF_PLUGIN_FILES = \
     collectors/ebpf.plugin/ebpf_filesystem.h \
     collectors/ebpf.plugin/ebpf_hardirq.c \
     collectors/ebpf.plugin/ebpf_hardirq.h \
+    collectors/ebpf.plugin/ebpf_mdflush.c \
+    collectors/ebpf.plugin/ebpf_mdflush.h \
     collectors/ebpf.plugin/ebpf_mount.c \
     collectors/ebpf.plugin/ebpf_mount.h \
     collectors/ebpf.plugin/ebpf_oomkill.c \

+ 7 - 6
collectors/all.h

@@ -133,12 +133,13 @@
 // MDSTAT
 
 #define NETDATA_CHART_PRIO_MDSTAT_HEALTH              2100
-#define NETDATA_CHART_PRIO_MDSTAT_NONREDUNDANT        2101
-#define NETDATA_CHART_PRIO_MDSTAT_DISKS               2102 // 5 charts per raid
-#define NETDATA_CHART_PRIO_MDSTAT_MISMATCH            2103
-#define NETDATA_CHART_PRIO_MDSTAT_OPERATION           2104
-#define NETDATA_CHART_PRIO_MDSTAT_FINISH              2105
-#define NETDATA_CHART_PRIO_MDSTAT_SPEED               2106
+#define NETDATA_CHART_PRIO_MDSTAT_FLUSH               2101
+#define NETDATA_CHART_PRIO_MDSTAT_NONREDUNDANT        2105
+#define NETDATA_CHART_PRIO_MDSTAT_DISKS               2106 // 5 charts per raid
+#define NETDATA_CHART_PRIO_MDSTAT_MISMATCH            2107
+#define NETDATA_CHART_PRIO_MDSTAT_OPERATION           2108
+#define NETDATA_CHART_PRIO_MDSTAT_FINISH              2109
+#define NETDATA_CHART_PRIO_MDSTAT_SPEED               2110
 
 // Filesystem
 #define NETDATA_CHART_PRIO_FILESYSTEM_VFS_CLEAN       2150

+ 1 - 0
collectors/ebpf.plugin/Makefile.am

@@ -29,6 +29,7 @@ dist_ebpfconfig_DATA = \
     ebpf.d/fd.conf \
     ebpf.d/filesystem.conf \
     ebpf.d/hardirq.conf \
+    ebpf.d/mdflush.conf \
     ebpf.d/mount.conf \
     ebpf.d/network.conf \
     ebpf.d/oomkill.conf \

+ 16 - 0
collectors/ebpf.plugin/README.md

@@ -116,6 +116,20 @@ This chart monitors calls demonstrating commits from filesystem caches to disk.
 This chart shows calls to `sync_file_range(2)` which synchronizes file segments with disk. This is the most dangerous
 syscall to synchronize data according to its manual.
 
+### MD flush
+
+The eBPF plugin shows multi-device flushes happening in real time. This can be
+used to explain some spikes happening in
+[disk latency](docs/agent/collectors/ebpf.plugin#disk) charts.
+
+By default, MD flush is disabled. To enable it, configure your
+`/etc/netdata/ebpf.d.conf` file as:
+
+```conf
+[ebpf programs]
+    mdflush = yes
+```
+
 ### Disk
 
 The eBPF plugin also shows a chart in the Disk section when the `disk` thread is enabled. This will create the
@@ -399,6 +413,8 @@ You can also enable the following eBPF programs:
 - `disk` : This eBPF program creates charts that show information about disk latency independent of filesystem.
 - `filesystem` : This eBPF program creates charts that show information about some filesystem latency.
 - `swap` : This eBPF program creates charts that show information about swap access.
+- `mdflush`: This eBPF program creates charts that show information about
+  multi-device software flushes.
 
 ## Thread configuration
 

+ 24 - 0
collectors/ebpf.plugin/ebpf.c

@@ -158,6 +158,11 @@ ebpf_module_t ebpf_modules[] = {
       .apps_routine = ebpf_shm_create_apps_charts, .maps = NULL,
       .pid_map_size = ND_EBPF_DEFAULT_PID_SIZE, .names = NULL, .cfg = &shm_config,
       .config_file = NETDATA_DIRECTORY_SHM_CONFIG_FILE},
+    { .thread_name = "mdflush", .config_name = "mdflush", .enabled = 0, .start_routine = ebpf_mdflush_thread,
+      .update_every = EBPF_DEFAULT_UPDATE_EVERY, .global_charts = 1, .apps_charts = CONFIG_BOOLEAN_NO,
+      .cgroup_charts = CONFIG_BOOLEAN_NO, .mode = MODE_ENTRY, .optional = 0, .apps_routine = NULL, .maps = NULL,
+      .pid_map_size = ND_EBPF_DEFAULT_PID_SIZE, .names = NULL, .cfg = &mdflush_config,
+      .config_file = NETDATA_DIRECTORY_MDFLUSH_CONFIG_FILE},
       { .thread_name = NULL, .enabled = 0, .start_routine = NULL, .update_every = EBPF_DEFAULT_UPDATE_EVERY,
       .global_charts = 0, .apps_charts = CONFIG_BOOLEAN_NO, .cgroup_charts = CONFIG_BOOLEAN_NO,
       .mode = MODE_ENTRY, .optional = 0, .apps_routine = NULL, .maps = NULL, .pid_map_size = 0, .names = NULL,
@@ -803,6 +808,8 @@ void ebpf_print_help()
             "\n"
             " [-]-hardirq           Enable chart related to hard IRQ latency.\n"
             "\n"
+            " [-]-mdflush           Enable charts related to multi-device flush.\n"
+            "\n"
             " [-]-mount             Enable charts related to mount monitoring.\n"
             "\n"
             " [-]-net               Enable network viewer charts.\n"
@@ -1259,6 +1266,13 @@ static void read_collector_values(int *disable_apps, int *disable_cgroups, int u
         started++;
     }
 
+    enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, "mdflush",
+                                    CONFIG_BOOLEAN_NO);
+    if (enabled) {
+        ebpf_enable_chart(EBPF_MODULE_MDFLUSH_IDX, *disable_apps, *disable_cgroups);
+        started++;
+    }
+
     if (!started){
         ebpf_enable_all_charts(*disable_apps, *disable_cgroups);
         // Read network viewer section
@@ -1370,6 +1384,7 @@ static void ebpf_parse_args(int argc, char **argv)
         {"softirq",        no_argument,    0,  0 },
         {"oomkill",        no_argument,    0,  0 },
         {"shm",            no_argument,    0,  0 },
+        {"mdflush",        no_argument,    0,  0 },
         /* INSERT NEW THREADS BEFORE THIS COMMENT TO KEEP COMPATIBILITY WITH enum ebpf_module_indexes */
         {"all",            no_argument,    0,  0 },
         {"version",        no_argument,    0,  0 },
@@ -1511,6 +1526,13 @@ static void ebpf_parse_args(int argc, char **argv)
                 select_threads |= 1<<EBPF_MODULE_SHM_IDX;
 #ifdef NETDATA_INTERNAL_CHECKS
                 info("EBPF enabling \"SHM\" chart, because it was started with the option \"[-]-shm\".");
+#endif
+                break;
+            }
+            case EBPF_MODULE_MDFLUSH_IDX: {
+                select_threads |= 1<<EBPF_MODULE_MDFLUSH_IDX;
+#ifdef NETDATA_INTERNAL_CHECKS
+                info("EBPF enabling \"MDFLUSH\" chart, because it was started with the option \"[-]-mdflush\".");
 #endif
                 break;
             }
@@ -1813,6 +1835,8 @@ int main(int argc, char **argv)
             NULL, NULL, ebpf_modules[EBPF_MODULE_OOMKILL_IDX].start_routine},
         {"EBPF SHM" , NULL, NULL, 1,
             NULL, NULL, ebpf_modules[EBPF_MODULE_SHM_IDX].start_routine},
+        {"EBPF MDFLUSH" , NULL, NULL, 1,
+            NULL, NULL, ebpf_modules[EBPF_MODULE_MDFLUSH_IDX].start_routine},
         {NULL          , NULL, NULL, 0,
           NULL, NULL, NULL}
     };

+ 2 - 0
collectors/ebpf.plugin/ebpf.d.conf

@@ -31,6 +31,7 @@
 #  `dcstat`    : Make charts for kernel functions related to directory cache.
 #  `disk`      : Monitor I/O latencies for disks
 #  `fd`        : This eBPF program creates charts that show information about file manipulation.
+#  `mdflush`   : Monitors flush counts for multi-devices.
 #  `mount`     : Monitor calls for syscalls mount and umount
 #  `filesystem`: Monitor calls for functions used to manipulate specific filesystems
 #  `hardirq`   : Monitor latency of serving hardware interrupt requests (hard IRQs).
@@ -51,6 +52,7 @@
     fd = yes
     filesystem = no
     hardirq = yes
+    mdflush = no
     mount = yes
     oomkill = yes
     process = yes

+ 7 - 0
collectors/ebpf.plugin/ebpf.d/mdflush.conf

@@ -0,0 +1,7 @@
+# The `ebpf load mode` option accepts the following values :
+#  `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors.
+#  `return`: In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates
+#            new charts for the return of these functions, such as errors.
+#[global]
+#    ebpf load mode = entry
+#    update every = 1

+ 1 - 0
collectors/ebpf.plugin/ebpf.h

@@ -90,6 +90,7 @@ enum ebpf_main_index {
     EBPF_MODULE_SOFTIRQ_IDX,
     EBPF_MODULE_OOMKILL_IDX,
     EBPF_MODULE_SHM_IDX,
+    EBPF_MODULE_MDFLUSH_IDX,
     /* THREADS MUST BE INCLUDED BEFORE THIS COMMENT */
     EBPF_OPTION_ALL_CHARTS,
     EBPF_OPTION_VERSION,

+ 1 - 0
collectors/ebpf.plugin/ebpf_apps.h

@@ -24,6 +24,7 @@
 #include "ebpf_filesystem.h"
 #include "ebpf_hardirq.h"
 #include "ebpf_cachestat.h"
+#include "ebpf_mdflush.h"
 #include "ebpf_mount.h"
 #include "ebpf_oomkill.h"
 #include "ebpf_shm.h"

Some files were not shown because too many files changed in this diff