Browse Source

Move cpufreq python module to proc plugin (#4562)

* Add time_in_state file to processing

* Add time_in_state calculation logic and TODOs

* Add time_in_state calculation and switching

* Final cleanup

* Add info messages

* Remove excessive array

* Rename chart and disable python module
Vladimir Kobal 6 years ago
parent
commit
9b56c2474d
2 changed files with 179 additions and 27 deletions
  1. 178 26
      collectors/proc.plugin/proc_stat.c
  2. 1 1
      collectors/python.d.plugin/python.d.plugin.in

+ 178 - 26
collectors/proc.plugin/proc_stat.c

@@ -12,9 +12,23 @@ struct per_core_single_number_file {
     RRDDIM *rd;
 };
 
+struct last_ticks { // previous sample of one time_in_state line, kept to compute the delta on the next read
+    collected_number frequency; // frequency parsed from the first word of the line
+    collected_number ticks;     // cumulative tick counter parsed from the second word of the line
+};
+
+// This is an extension of struct per_core_single_number_file at CPU_FREQ_INDEX.
+// Only one of the scaling_cur_freq and time_in_state files is used at a time.
+struct per_core_time_in_state_file {
+    const char *filename;          // path of the time_in_state file of this core; the core is skipped when NULL
+    procfile *ff;                  // procfile handle, opened on the first read; NULL while the file is closed
+    size_t last_ticks_len;         // number of elements allocated in last_ticks
+    struct last_ticks *last_ticks; // previous sample for each line of the file, indexed by line number
+};
+
 #define CORE_THROTTLE_COUNT_INDEX    0
 #define PACKAGE_THROTTLE_COUNT_INDEX 1
-#define SCALING_CUR_FREQ_INDEX       2
+#define CPU_FREQ_INDEX               2
 #define PER_CORE_FILES               3
 
 struct cpu_chart {
@@ -33,6 +47,8 @@ struct cpu_chart {
     RRDDIM *rd_guest_nice;
 
     struct per_core_single_number_file files[PER_CORE_FILES];
+
+    struct per_core_time_in_state_file time_in_state_files;
 };
 
 static int keep_per_core_fds_open = CONFIG_BOOLEAN_YES;
@@ -87,7 +103,6 @@ static int read_per_core_files(struct cpu_chart *all_cpu_charts, size_t len, siz
         f->found = 1;
 
         f->value = str2ll(buf, NULL);
-        // info("read '%s', parsed as " COLLECTED_NUMBER_FORMAT, buf, f->value);
         if(likely(f->value != 0))
             files_nonzero++;
     }
@@ -101,6 +116,112 @@ static int read_per_core_files(struct cpu_chart *all_cpu_charts, size_t len, siz
     return (int)files_nonzero;
 }
 
+// Compute the average CPU frequency of every core from its time_in_state file
+// and store it in all_cpu_charts[x].files[index].value.
+//
+// Each line of the file is expected to hold "<frequency> <ticks>". The ticks
+// accumulated on a line since the previous call weight its frequency, and the
+// weighted mean over all lines becomes the value of the core:
+//
+//     value = sum(frequency * ticks_since_last) / sum(ticks_since_last)
+//
+// all_cpu_charts - array of per core charts
+// len            - number of entries in all_cpu_charts
+// index          - slot of cpu_chart.files[] that receives 'found' and 'value'
+//
+// Returns -1 when no file could be read, 0 when files were read but every
+// value is zero, otherwise the number of cores with a non-zero value.
+static int read_per_core_time_in_state_files(struct cpu_chart *all_cpu_charts, size_t len, size_t index) {
+    size_t x, files_read = 0, files_nonzero = 0;
+
+    for(x = 0; x < len ; x++) {
+        struct per_core_single_number_file *f = &all_cpu_charts[x].files[index];
+        struct per_core_time_in_state_file *tsf = &all_cpu_charts[x].time_in_state_files;
+
+        f->found = 0;
+
+        if(unlikely(!tsf->filename))
+            continue;
+
+        if(unlikely(!tsf->ff)) {
+            tsf->ff = procfile_open(tsf->filename, " \t:", PROCFILE_FLAG_DEFAULT);
+            if(unlikely(!tsf->ff)) {
+                error("Cannot open file '%s'", tsf->filename);
+                continue;
+            }
+        }
+
+        tsf->ff = procfile_readall(tsf->ff);
+        if(unlikely(!tsf->ff)) {
+            // tsf->ff is already NULL at this point, so there is no handle
+            // left to pass to procfile_close()
+            error("Cannot read file '%s'", tsf->filename);
+            continue;
+        }
+
+        // successful read
+        size_t lines = procfile_lines(tsf->ff), l;
+        unsigned long long total_ticks_since_last = 0, avg_freq = 0;
+
+        // Check if there is at least one frequency in time_in_state.
+        // The explicit lines check keeps 'lines - 1' below from wrapping around.
+        if(unlikely(lines == 0 || procfile_word(tsf->ff, 0)[0] == '\0')) {
+            if(unlikely(keep_per_core_fds_open != CONFIG_BOOLEAN_YES)) {
+                procfile_close(tsf->ff);
+                tsf->ff = NULL;
+            }
+            // TODO: Is there a better way to avoid spikes than calculating the average over
+            // the whole period under schedutil governor? The previous samples in
+            // tsf->last_ticks are deliberately kept (not freed) here.
+            continue;
+        }
+
+        // (re)allocate the previous samples; they restart from zero, so the
+        // first average after this covers everything the counters hold
+        if(unlikely(tsf->last_ticks_len < lines || tsf->last_ticks == NULL)) {
+            tsf->last_ticks = reallocz(tsf->last_ticks, sizeof(struct last_ticks) * lines);
+            memset(tsf->last_ticks, 0, sizeof(struct last_ticks) * lines);
+            tsf->last_ticks_len = lines;
+        }
+
+        f->value = 0;
+
+        // NOTE(review): the last line reported by procfile is skipped, as in
+        // the original code - presumably it is an empty trailing line; confirm.
+        for(l = 0; l < lines - 1 ;l++) {
+            size_t words = procfile_linewords(tsf->ff, l);
+            if(unlikely(words < 2)) {
+                error("Cannot read time_in_state line. Expected 2 params, read %zu.", words);
+                continue;
+            }
+
+            unsigned long long frequency = str2ull(procfile_lineword(tsf->ff, l, 0));
+            unsigned long long ticks     = str2ull(procfile_lineword(tsf->ff, l, 1));
+            unsigned long long previous  = (unsigned long long)tsf->last_ticks[l].ticks;
+
+            // It is assumed that frequencies are static and sorted.
+            // If a counter went backwards (presumably the statistics were
+            // reset), count from zero instead of letting the unsigned
+            // subtraction wrap around to a huge delta.
+            unsigned long long ticks_since_last = (ticks >= previous) ? (ticks - previous) : ticks;
+
+            tsf->last_ticks[l].frequency = frequency;
+            tsf->last_ticks[l].ticks = ticks;
+
+            total_ticks_since_last += ticks_since_last;
+            avg_freq += frequency * ticks_since_last;
+        }
+
+        // no ticks since the last read leaves the value at zero
+        if(likely(total_ticks_since_last)) {
+            avg_freq /= total_ticks_since_last;
+            f->value = avg_freq;
+        }
+
+        if(unlikely(keep_per_core_fds_open != CONFIG_BOOLEAN_YES)) {
+            procfile_close(tsf->ff);
+            tsf->ff = NULL;
+        }
+
+        files_read++;
+
+        f->found = 1;
+
+        if(likely(f->value != 0))
+            files_nonzero++;
+    }
+
+    if(unlikely(files_read == 0))
+        return -1;
+
+    // zero when every file that was read produced a zero value
+    return (int)files_nonzero;
+}
+
 static void chart_per_core_files(struct cpu_chart *all_cpu_charts, size_t len, size_t index, RRDSET *st, collected_number multiplier, collected_number divisor, RRD_ALGORITHM algorithm) {
     size_t x;
     for(x = 0; x < len ; x++) {
@@ -122,10 +243,11 @@ int do_proc_stat(int update_every, usec_t dt) {
     static struct cpu_chart *all_cpu_charts = NULL;
     static size_t all_cpu_charts_size = 0;
     static procfile *ff = NULL;
-    static int do_cpu = -1, do_cpu_cores = -1, do_interrupts = -1, do_context = -1, do_forks = -1, do_processes = -1, do_core_throttle_count = -1, do_package_throttle_count = -1, do_scaling_cur_freq = -1;
+    static int do_cpu = -1, do_cpu_cores = -1, do_interrupts = -1, do_context = -1, do_forks = -1, do_processes = -1, do_core_throttle_count = -1, do_package_throttle_count = -1, do_cpu_freq = -1;
     static uint32_t hash_intr, hash_ctxt, hash_processes, hash_procs_running, hash_procs_blocked;
-    static char *core_throttle_count_filename = NULL, *package_throttle_count_filename = NULL, *scaling_cur_freq_filename = NULL;
+    static char *core_throttle_count_filename = NULL, *package_throttle_count_filename = NULL, *scaling_cur_freq_filename = NULL, *time_in_state_filename = NULL;
     static RRDVAR *cpus_var = NULL;
+    static int accurate_freq_avail = 0, accurate_freq_is_used = 0;
     size_t cores_found = (size_t)processors;
 
     if(unlikely(do_cpu == -1)) {
@@ -137,25 +259,25 @@ int do_proc_stat(int update_every, usec_t dt) {
         do_processes              = config_get_boolean("plugin:proc:/proc/stat", "processes running", CONFIG_BOOLEAN_YES);
 
         // give sane defaults based on the number of processors
-        if(processors > 50) {
+        if(unlikely(processors > 50)) {
             // the system has too many processors
             keep_per_core_fds_open = CONFIG_BOOLEAN_NO;
             do_core_throttle_count = CONFIG_BOOLEAN_NO;
             do_package_throttle_count = CONFIG_BOOLEAN_NO;
-            do_scaling_cur_freq = CONFIG_BOOLEAN_NO;
+            do_cpu_freq = CONFIG_BOOLEAN_NO;
         }
         else {
             // the system has a reasonable number of processors
             keep_per_core_fds_open = CONFIG_BOOLEAN_YES;
             do_core_throttle_count = CONFIG_BOOLEAN_AUTO;
             do_package_throttle_count = CONFIG_BOOLEAN_NO;
-            do_scaling_cur_freq = CONFIG_BOOLEAN_NO;
+            do_cpu_freq = CONFIG_BOOLEAN_YES;
         }
 
         keep_per_core_fds_open    = config_get_boolean("plugin:proc:/proc/stat", "keep per core files open", keep_per_core_fds_open);
         do_core_throttle_count    = config_get_boolean_ondemand("plugin:proc:/proc/stat", "core_throttle_count", do_core_throttle_count);
         do_package_throttle_count = config_get_boolean_ondemand("plugin:proc:/proc/stat", "package_throttle_count", do_package_throttle_count);
-        do_scaling_cur_freq       = config_get_boolean_ondemand("plugin:proc:/proc/stat", "scaling_cur_freq", do_scaling_cur_freq);
+        do_cpu_freq               = config_get_boolean_ondemand("plugin:proc:/proc/stat", "cpu frequency", do_cpu_freq);
 
         hash_intr = simple_hash("intr");
         hash_ctxt = simple_hash("ctxt");
@@ -172,6 +294,9 @@ int do_proc_stat(int update_every, usec_t dt) {
 
         snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices/system/cpu/%s/cpufreq/scaling_cur_freq");
         scaling_cur_freq_filename = config_get("plugin:proc:/proc/stat", "scaling_cur_freq filename to monitor", filename);
+
+        snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/devices/system/cpu/%s/cpufreq/stats/time_in_state");
+        time_in_state_filename = config_get("plugin:proc:/proc/stat", "time_in_state filename to monitor", filename);
     }
 
     if(unlikely(!ff)) {
@@ -202,7 +327,7 @@ int do_proc_stat(int update_every, usec_t dt) {
             }
 
             size_t core    = (row_key[3] == '\0') ? 0 : str2ul(&row_key[3]) + 1;
-            if(core > 0) cores_found = core;
+            if(likely(core > 0)) cores_found = core;
 
             if(likely((core == 0 && do_cpu) || (core > 0 && do_cpu_cores))) {
                 char *id;
@@ -227,7 +352,7 @@ int do_proc_stat(int update_every, usec_t dt) {
                 char *title, *type, *context, *family;
                 long priority;
 
-                if(core >= all_cpu_charts_size) {
+                if(unlikely(core >= all_cpu_charts_size)) {
                     size_t old_cpu_charts_size = all_cpu_charts_size;
                     all_cpu_charts_size = core + 1;
                     all_cpu_charts = reallocz(all_cpu_charts, sizeof(struct cpu_chart) * all_cpu_charts_size);
@@ -238,7 +363,7 @@ int do_proc_stat(int update_every, usec_t dt) {
                 if(unlikely(!cpu_chart->st)) {
                     cpu_chart->id = strdupz(id);
 
-                    if(core == 0) {
+                    if(unlikely(core == 0)) {
                         title = "Total CPU utilization";
                         type = "system";
                         context = "system.cpu";
@@ -252,9 +377,6 @@ int do_proc_stat(int update_every, usec_t dt) {
                         family = "utilization";
                         priority = NETDATA_CHART_PRIO_CPU_PER_CORE;
 
-                        // TODO: check for /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq
-                        // TODO: check for /sys/devices/system/cpu/cpu*/cpufreq/stats/time_in_state
-
                         char filename[FILENAME_MAX + 1];
                         struct stat stbuf;
 
@@ -276,12 +398,23 @@ int do_proc_stat(int update_every, usec_t dt) {
                             }
                         }
 
-                        if(do_scaling_cur_freq != CONFIG_BOOLEAN_NO) {
+                        if(do_cpu_freq != CONFIG_BOOLEAN_NO) {
+
                             snprintfz(filename, FILENAME_MAX, scaling_cur_freq_filename, id);
+
+                            if (stat(filename, &stbuf) == 0) {
+                                cpu_chart->files[CPU_FREQ_INDEX].filename = strdupz(filename);
+                                cpu_chart->files[CPU_FREQ_INDEX].fd = -1;
+                                do_cpu_freq = CONFIG_BOOLEAN_YES;
+                            }
+                            
+                            snprintfz(filename, FILENAME_MAX, time_in_state_filename, id);
+
                             if (stat(filename, &stbuf) == 0) {
-                                cpu_chart->files[SCALING_CUR_FREQ_INDEX].filename = strdupz(filename);
-                                cpu_chart->files[SCALING_CUR_FREQ_INDEX].fd = -1;
-                                do_scaling_cur_freq = CONFIG_BOOLEAN_YES;
+                                cpu_chart->time_in_state_files.filename = strdupz(filename);
+                                cpu_chart->time_in_state_files.ff = NULL;
+                                do_cpu_freq = CONFIG_BOOLEAN_YES;
+                                accurate_freq_avail = 1;
                             }
                         }
                     }
@@ -532,21 +665,40 @@ int do_proc_stat(int update_every, usec_t dt) {
             }
         }
 
-        if(likely(do_scaling_cur_freq != CONFIG_BOOLEAN_NO)) {
-            int r = read_per_core_files(&all_cpu_charts[1], all_cpu_charts_size - 1, SCALING_CUR_FREQ_INDEX);
-            if(likely(r != -1 && (do_scaling_cur_freq == CONFIG_BOOLEAN_YES || r > 0))) {
-                do_scaling_cur_freq = CONFIG_BOOLEAN_YES;
+        if(likely(do_cpu_freq != CONFIG_BOOLEAN_NO)) {
+            char filename[FILENAME_MAX + 1];
+            int r = 0;
+
+            if (accurate_freq_avail) {
+                r = read_per_core_time_in_state_files(&all_cpu_charts[1], all_cpu_charts_size - 1, CPU_FREQ_INDEX);
+                if(r > 0 && !accurate_freq_is_used) {
+                    accurate_freq_is_used = 1;
+                    snprintfz(filename, FILENAME_MAX, time_in_state_filename, "cpu*");
+                    info("cpufreq is using %s", filename);
+                }
+            }
+            if (r < 1) {
+                r = read_per_core_files(&all_cpu_charts[1], all_cpu_charts_size - 1, CPU_FREQ_INDEX);
+                if(accurate_freq_is_used) {
+                    accurate_freq_is_used = 0;
+                    snprintfz(filename, FILENAME_MAX, scaling_cur_freq_filename, "cpu*");
+                    info("cpufreq fell back to %s", filename);
+                }
+            }
+
+            if(likely(r != -1 && (do_cpu_freq == CONFIG_BOOLEAN_YES || r > 0))) {
+                do_cpu_freq = CONFIG_BOOLEAN_YES;
 
                 static RRDSET *st_scaling_cur_freq = NULL;
 
                 if(unlikely(!st_scaling_cur_freq))
                     st_scaling_cur_freq = rrdset_create_localhost(
                             "cpu"
-                            , "scaling_cur_freq"
+                            , "cpufreq"
                             , NULL
                             , "cpufreq"
-                            , "cpu.scaling_cur_freq"
-                            , "Per CPU Core, Current CPU Scaling Frequency"
+                            , "cpufreq.cpufreq"
+                            , "Current CPU Frequency"
                             , "MHz"
                             , PLUGIN_PROC_NAME
                             , PLUGIN_PROC_MODULE_STAT_NAME
@@ -557,7 +709,7 @@ int do_proc_stat(int update_every, usec_t dt) {
                 else
                     rrdset_next(st_scaling_cur_freq);
 
-                chart_per_core_files(&all_cpu_charts[1], all_cpu_charts_size - 1, SCALING_CUR_FREQ_INDEX, st_scaling_cur_freq, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+                chart_per_core_files(&all_cpu_charts[1], all_cpu_charts_size - 1, CPU_FREQ_INDEX, st_scaling_cur_freq, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
                 rrdset_done(st_scaling_cur_freq);
             }
         }

+ 1 - 1
collectors/python.d.plugin/python.d.plugin.in

@@ -56,7 +56,7 @@ BASE_CONFIG = {'update_every': os.getenv('NETDATA_UPDATE_EVERY', 1),
 
 
 MODULE_EXTENSION = '.chart.py'
-OBSOLETE_MODULES = ['apache_cache', 'gunicorn_log', 'nginx_log']
+OBSOLETE_MODULES = ['apache_cache', 'gunicorn_log', 'nginx_log', 'cpufreq']
 
 
 def module_ok(m):