12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018 |
- #ifndef KMP_STATS_H
- #define KMP_STATS_H
- /** @file kmp_stats.h
- * Functions for collecting statistics.
- */
- //===----------------------------------------------------------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- #include "kmp_config.h"
- #include "kmp_debug.h"
- #if KMP_STATS_ENABLED
- /* Statistics accumulator.
- Accumulates number of samples and computes min, max, mean, standard deviation
- on the fly.
- Online variance calculation algorithm from
- http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
- */
#include "kmp_stats_timing.h"
#include <limits>
#include <math.h>
#include <new> // placement new
#include <stdint.h>
#include <string>
#include <vector>
- /* Enable developer statistics here if you want them. They are more detailed
- than is useful for application characterisation and are intended for the
- runtime library developer. When this is non-zero,
- KMP_FOREACH_DEVELOPER_TIMER expands to the developer timer table; when it
- is zero that macro expands to nothing. */
- #define KMP_DEVELOPER_STATS 0
- /* Enable/Disable histogram output. This value is the default for the doHist
- argument of the statistic constructor. */
- #define KMP_STATS_HIST 0
/*!
 * @ingroup STATS_GATHERING
 * \brief Bit flags that describe a statistic (timer or counter).
 *
 * Every enumerator occupies a distinct bit, so several flags can be OR-ed
 * together into a single flags word.
 */
enum stats_flags_e {
  noTotal = 0x01, //!< do not show a TOTAL_aggregation for this statistic
  onlyInMaster = 0x02, //!< statistic is valid only for primary thread
  noUnits = 0x04, //!< statistic doesn't need units printed next to it
  notInMaster = 0x08, //!< statistic is valid only for non-primary threads
  logEvent = 0x10 //!< statistic can be logged on the event timeline when
                  //! KMP_STATS_EVENTS is on (valid only for timers)
};
/*!
 * @ingroup STATS_GATHERING
 * \brief The states a thread can be in.
 *
 * The numeric values are spelled out; they are the same consecutive values
 * (starting at zero) that implicit numbering would assign.
 */
enum stats_state_e {
  IDLE = 0,
  SERIAL_REGION = 1,
  FORK_JOIN_BARRIER = 2,
  PLAIN_BARRIER = 3,
  TASKWAIT = 4,
  TASKYIELD = 5,
  TASKGROUP = 6,
  IMPLICIT_TASK = 7,
  EXPLICIT_TASK = 8,
  TEAMS_REGION = 9
};
/*!
 * \brief Table of counters; add new counters to KMP_FOREACH_COUNTER() here in
 * kmp_stats.h.
 *
 * @param macro a user defined macro that takes three arguments -
 *              macro(COUNTER_NAME, flags, arg)
 * @param arg   a user defined argument that is forwarded unchanged to every
 *              invocation of the user defined macro
 *
 * \details A counter counts the occurrence of some event. Each thread
 * accumulates its own count; at the end of execution the counts are
 * aggregated, treating each thread as a separate measurement (unless
 * onlyInMaster is set, in which case there is only a single measurement).
 * The min, mean and max are therefore the values for the threads. Adding the
 * counter here and then putting a KMP_BLOCK_COUNTER(name) at the point you
 * want to count is all you need to do. All of the tables and printing are
 * generated from this macro. The order of the entries matters: it determines
 * the enumerator values of counter_e.
 * Format is "macro(name, flags, arg)"
 *
 * @ingroup STATS_GATHERING
 */
// clang-format off
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
  macro(OMP_LOOP_STATIC, 0, arg)                                               \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
  macro(OMP_DISTRIBUTE, 0, arg)                                                \
  macro(OMP_BARRIER, 0, arg)                                                   \
  macro(OMP_CRITICAL, 0, arg)                                                  \
  macro(OMP_SINGLE, 0, arg)                                                    \
  macro(OMP_MASTER, 0, arg)                                                    \
  macro(OMP_MASKED, 0, arg)                                                    \
  macro(OMP_TEAMS, 0, arg)                                                     \
  macro(OMP_set_lock, 0, arg)                                                  \
  macro(OMP_test_lock, 0, arg)                                                 \
  macro(REDUCE_wait, 0, arg)                                                   \
  macro(REDUCE_nowait, 0, arg)                                                 \
  macro(OMP_TASKYIELD, 0, arg)                                                 \
  macro(OMP_TASKLOOP, 0, arg)                                                  \
  macro(TASK_executed, 0, arg)                                                 \
  macro(TASK_cancelled, 0, arg)                                                \
  macro(TASK_stolen, 0, arg)
// clang-format on
/*!
 * \brief Table of timers; add new timers to KMP_FOREACH_TIMER() here in
 * kmp_stats.h.
 *
 * @param macro a user defined macro that takes three arguments -
 *              macro(TIMER_NAME, flags, arg)
 * @param arg   a user defined argument that is forwarded unchanged to every
 *              invocation of the user defined macro
 *
 * \details A timer collects multiple samples of some count in each thread and
 * then finally aggregates all of the samples from all of the threads. For most
 * timers the printing code also provides an aggregation over the thread
 * totals; these are printed as TOTAL_foo. The count is normally a time (in
 * ticks), hence the name "timer", but it can be any value, so the same
 * mechanism is used for things like "number of arguments passed to fork".
 * For timers the threads are not significant; it is the individual
 * observations that count, so the statistics are at that level.
 * The table ends by expanding KMP_FOREACH_DEVELOPER_TIMER(macro, arg), and the
 * order of the entries determines the enumerator values of timer_e.
 * Format is "macro(name, flags, arg)"
 *
 * @ingroup STATS_GATHERING2
 */
// clang-format off
#define KMP_FOREACH_TIMER(macro, arg)                                          \
  macro(OMP_worker_thread_life, stats_flags_e::logEvent, arg)                  \
  macro(OMP_parallel, stats_flags_e::logEvent, arg)                            \
  macro(OMP_parallel_overhead, stats_flags_e::logEvent, arg)                   \
  macro(OMP_teams, stats_flags_e::logEvent, arg)                               \
  macro(OMP_teams_overhead, stats_flags_e::logEvent, arg)                      \
  macro(OMP_loop_static, 0, arg)                                               \
  macro(OMP_loop_static_scheduling, 0, arg)                                    \
  macro(OMP_loop_dynamic, 0, arg)                                              \
  macro(OMP_loop_dynamic_scheduling, 0, arg)                                   \
  macro(OMP_distribute, 0, arg)                                                \
  macro(OMP_distribute_scheduling, 0, arg)                                     \
  macro(OMP_critical, 0, arg)                                                  \
  macro(OMP_critical_wait, 0, arg)                                             \
  macro(OMP_single, 0, arg)                                                    \
  macro(OMP_master, 0, arg)                                                    \
  macro(OMP_masked, 0, arg)                                                    \
  macro(OMP_task_immediate, 0, arg)                                            \
  macro(OMP_task_taskwait, 0, arg)                                             \
  macro(OMP_task_taskyield, 0, arg)                                            \
  macro(OMP_task_taskgroup, 0, arg)                                            \
  macro(OMP_task_join_bar, 0, arg)                                             \
  macro(OMP_task_plain_bar, 0, arg)                                            \
  macro(OMP_taskloop_scheduling, 0, arg)                                       \
  macro(OMP_plain_barrier, stats_flags_e::logEvent, arg)                       \
  macro(OMP_idle, stats_flags_e::logEvent, arg)                                \
  macro(OMP_fork_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_join_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_serial, stats_flags_e::logEvent, arg)                              \
  macro(OMP_set_numthreads,                                                    \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_PARALLEL_args,                                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_iterations,                                            \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_total_iterations,                                      \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_iterations,                                           \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_total_iterations,                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_distribute_iterations,                                             \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
- // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
- // initializing OpenMP or being created by a primary
- // thread) until the thread is destroyed
- // OMP_parallel -- Time thread spends executing work directly
- // within a #pragma omp parallel
- // OMP_parallel_overhead -- Time thread spends setting up a parallel region
- // OMP_loop_static -- Time thread spends executing loop iterations from
- // a statically scheduled loop
- // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
- // from a statically scheduled loop
- // OMP_loop_dynamic -- Time thread spends executing loop iterations from
- // a dynamically scheduled loop
- // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
- // from a dynamically scheduled loop
- // OMP_critical -- Time thread spends executing critical section
- // OMP_critical_wait -- Time thread spends waiting to enter
- // a critical section
- // OMP_single -- Time spent executing a "single" region
- // OMP_master -- Time spent executing a "master" region
- // OMP_masked -- Time spent executing a "masked" region
- // OMP_task_immediate -- Time spent executing non-deferred tasks
- // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
- // construct
- // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
- // construct
- // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
- // construct
- // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
- // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
- // construct
- // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
- // construct
- // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
- // inside implicit barrier at end of worksharing
- // construct
- // OMP_idle -- Time worker threads spend waiting for next
- // parallel region
- // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
- // parallel region
- // OMP_join_barrier -- Time spent in the join barrier surrounding a
- // parallel region
- // OMP_serial -- Time thread zero spends executing serial code
- // OMP_set_numthreads -- Values passed to omp_set_num_threads
- // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
- // OMP_loop_static_iterations -- Number of iterations thread is assigned for
- // statically scheduled loops
- // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
- // dynamically scheduled loops
#if (KMP_DEVELOPER_STATS)
// Developer timers: of interest to people working on the runtime library, not
// to end users. They are only part of the timer table when
// KMP_DEVELOPER_STATS is non-zero, i.e. they must be enabled explicitly in
// addition to the other statistics.
//
// Descriptions carried over from the existing notes:
//   KMP_fork_barrier      -- time in __kmp_fork_barrier
//   KMP_join_barrier      -- time in __kmp_join_barrier
//   KMP_barrier           -- time in __kmp_barrier
//   KMP_end_split_barrier -- time in __kmp_end_split_barrier
//   KMP_setup_icv_copy    -- time in __kmp_setup_icv_copy
//   KMP_icv_copy          -- start/stop timer for any ICV copying
//   KMP_linear_gather     -- time in __kmp_linear_barrier_gather
//   KMP_linear_release    -- time in __kmp_linear_barrier_release
//   KMP_tree_gather       -- time in __kmp_tree_barrier_gather
//   KMP_tree_release      -- time in __kmp_tree_barrier_release
//   KMP_hyper_gather      -- time in __kmp_hyper_barrier_gather
//   KMP_hyper_release     -- time in __kmp_hyper_barrier_release
//   KMP_dist_gather       -- time in __kmp_dist_barrier_gather
//   KMP_dist_release      -- time in __kmp_dist_barrier_release
//
// NOTE(review): KMP_fork_barrier, KMP_join_barrier, KMP_barrier and
// KMP_icv_copy are described above but are not entries of the table below,
// and several table entries (e.g. KMP_fork_call, KMP_join_call, KMP_hier_*,
// USER_*, KMP_allocate_team, FOR_static_steal_*) have no description here.
// Confirm which side is stale.
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call, 0, arg)                                                 \
  macro(KMP_join_call, 0, arg)                                                 \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather, 0, arg)                                               \
  macro(KMP_hier_release, 0, arg)                                              \
  macro(KMP_hyper_gather, 0, arg)                                              \
  macro(KMP_hyper_release, 0, arg)                                             \
  macro(KMP_dist_gather, 0, arg)                                               \
  macro(KMP_dist_release, 0, arg)                                              \
  macro(KMP_linear_gather, 0, arg)                                             \
  macro(KMP_linear_release, 0, arg)                                            \
  macro(KMP_tree_gather, 0, arg)                                               \
  macro(KMP_tree_release, 0, arg)                                              \
  macro(USER_resume, 0, arg)                                                   \
  macro(USER_suspend, 0, arg)                                                  \
  macro(USER_mwait, 0, arg)                                                    \
  macro(KMP_allocate_team, 0, arg)                                             \
  macro(KMP_setup_icv_copy, 0, arg)                                            \
  macro(USER_icv_copy, 0, arg)                                                 \
  macro(FOR_static_steal_stolen,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(FOR_static_steal_chunks,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics are disabled: the table contributes no entries.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
// clang-format on
- /*!
- * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
- *
- * @param macro a user defined macro that takes three arguments -
- * macro(TIMER_NAME, flags, arg)
- * @param arg a user defined argument to send to the user defined macro
- *
- * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
- * BAD THINGS WILL HAPPEN!
- *
- * \details Explicit timers are ones where we need to allocate a timer itself
- * (as well as the accumulated timing statistics). We allocate these on a
- * per-thread basis, and explicitly start and stop them. Block timers just
- * allocate the timer itself on the stack, and use the destructor to notice
- * block exit; they don't need to be defined here. The name here should be the
- * same as that of a timer above.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
- #define ENUMERATE(name, ignore, prefix) prefix##name,
- enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
- enum explicit_timer_e {
- KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
- };
- enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
- #undef ENUMERATE
- /*
- * A logarithmic histogram. It accumulates the number of values in each power of
- * ten bin. So 1<=x<10, 10<=x<100, ...
- * Mostly useful where we have some big outliers and want to see information
- * about them.
- */
- class logHistogram {
- enum {
- numBins = 31, /* Number of powers of 10. If this changes you need to change
- * the initializer for binMax */
- /*
- * If you want to use this to analyse values that may be less than 1, (for
- * instance times in s), then the logOffset gives you negative powers.
- * In our case here, we're just looking at times in ticks, or counts, so we
- * can never see values with magnitude < 1 (other than zero), so we can set
- * it to 0. As above change the initializer if you change this.
- */
- logOffset = 0
- };
- uint32_t KMP_ALIGN_CACHE zeroCount;
- struct {
- uint32_t count;
- double total;
- } bins[numBins];
- static double binMax[numBins];
- #ifdef KMP_DEBUG
- uint64_t _total;
- void check() const {
- uint64_t t = zeroCount;
- for (int i = 0; i < numBins; i++)
- t += bins[i].count;
- KMP_DEBUG_ASSERT(t == _total);
- }
- #else
- void check() const {}
- #endif
- public:
- logHistogram() { reset(); }
- logHistogram(logHistogram const &o) {
- for (int i = 0; i < numBins; i++)
- bins[i] = o.bins[i];
- #ifdef KMP_DEBUG
- _total = o._total;
- #endif
- }
- void reset() {
- zeroCount = 0;
- for (int i = 0; i < numBins; i++) {
- bins[i].count = 0;
- bins[i].total = 0;
- }
- #ifdef KMP_DEBUG
- _total = 0;
- #endif
- }
- uint32_t count(int b) const { return bins[b + logOffset].count; }
- double total(int b) const { return bins[b + logOffset].total; }
- static uint32_t findBin(double sample);
- logHistogram &operator+=(logHistogram const &o) {
- zeroCount += o.zeroCount;
- for (int i = 0; i < numBins; i++) {
- bins[i].count += o.bins[i].count;
- bins[i].total += o.bins[i].total;
- }
- #ifdef KMP_DEBUG
- _total += o._total;
- check();
- #endif
- return *this;
- }
- void addSample(double sample);
- int minBin() const;
- int maxBin() const;
- std::string format(char) const;
- };
- class statistic {
- double KMP_ALIGN_CACHE minVal;
- double maxVal;
- double meanVal;
- double m2;
- uint64_t sampleCount;
- double offset;
- bool collectingHist;
- logHistogram hist;
- public:
- statistic(bool doHist = bool(KMP_STATS_HIST)) {
- reset();
- collectingHist = doHist;
- }
- statistic(statistic const &o)
- : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
- sampleCount(o.sampleCount), offset(o.offset),
- collectingHist(o.collectingHist), hist(o.hist) {}
- statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
- : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
- sampleCount(sc), offset(0.0), collectingHist(false) {}
- bool haveHist() const { return collectingHist; }
- double getMin() const { return minVal; }
- double getMean() const { return meanVal; }
- double getMax() const { return maxVal; }
- uint64_t getCount() const { return sampleCount; }
- double getSD() const { return sqrt(m2 / sampleCount); }
- double getTotal() const { return sampleCount * meanVal; }
- logHistogram const *getHist() const { return &hist; }
- void setOffset(double d) { offset = d; }
- void reset() {
- minVal = (std::numeric_limits<double>::max)();
- maxVal = -minVal;
- meanVal = 0.0;
- m2 = 0.0;
- sampleCount = 0;
- offset = 0.0;
- hist.reset();
- }
- void addSample(double sample);
- void scale(double factor);
- void scaleDown(double f) { scale(1. / f); }
- void forceCount(uint64_t count) { sampleCount = count; }
- statistic &operator+=(statistic const &other);
- std::string format(char unit, bool total = false) const;
- std::string formatHist(char unit) const { return hist.format(unit); }
- };
// Static description of one statistic: its printable name and a flags word
// (the users in this file test it against stats_flags_e bits).
struct statInfo {
  const char *name; // name returned by timeStat::name() / counter::name()
  uint32_t flags;   // bitwise OR of stats_flags_e values
};
- class timeStat : public statistic {
- static statInfo timerInfo[];
- public:
- timeStat() : statistic() {}
- static const char *name(timer_e e) { return timerInfo[e].name; }
- static bool noTotal(timer_e e) {
- return timerInfo[e].flags & stats_flags_e::noTotal;
- }
- static bool masterOnly(timer_e e) {
- return timerInfo[e].flags & stats_flags_e::onlyInMaster;
- }
- static bool workerOnly(timer_e e) {
- return timerInfo[e].flags & stats_flags_e::notInMaster;
- }
- static bool noUnits(timer_e e) {
- return timerInfo[e].flags & stats_flags_e::noUnits;
- }
- static bool logEvent(timer_e e) {
- return timerInfo[e].flags & stats_flags_e::logEvent;
- }
- static void clearEventFlags() {
- for (int i = 0; i < TIMER_LAST; i++) {
- timerInfo[i].flags &= (~(stats_flags_e::logEvent));
- }
- }
- };
- // Where we need explicitly to start and end the timer, this version can be used
- // Since these timers normally aren't nicely scoped, so don't have a good place
- // to live on the stack of the thread, they're more work to use.
- class explicitTimer {
- timeStat *stat;
- timer_e timerEnumValue;
- tsc_tick_count startTime;
- tsc_tick_count pauseStartTime;
- tsc_tick_count::tsc_interval_t totalPauseTime;
- public:
- explicitTimer(timeStat *s, timer_e te)
- : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
- totalPauseTime() {}
- // void setStat(timeStat *s) { stat = s; }
- void start(tsc_tick_count tick);
- void pause(tsc_tick_count tick) { pauseStartTime = tick; }
- void resume(tsc_tick_count tick) {
- totalPauseTime += (tick - pauseStartTime);
- }
- void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
- void reset() {
- startTime = 0;
- pauseStartTime = 0;
- totalPauseTime = 0;
- }
- timer_e get_type() const { return timerEnumValue; }
- };
- // Where you need to partition a threads clock ticks into separate states
- // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
- // DOING_NOTHING would render these conditions:
- // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
- // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
- // versa
- class partitionedTimers {
- private:
- std::vector<explicitTimer> timer_stack;
- public:
- partitionedTimers();
- void init(explicitTimer timer);
- void exchange(explicitTimer timer);
- void push(explicitTimer timer);
- void pop();
- void windup();
- };
- // Special wrapper around the partitioned timers to aid timing code blocks
- // It avoids the need to have an explicit end, leaving the scope suffices.
- class blockPartitionedTimer {
- partitionedTimers *part_timers;
- public:
- blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
- : part_timers(pt) {
- part_timers->push(timer);
- }
- ~blockPartitionedTimer() { part_timers->pop(); }
- };
- // Special wrapper around the thread state to aid in keeping state in code
- // blocks It avoids the need to have an explicit end, leaving the scope
- // suffices.
- class blockThreadState {
- stats_state_e *state_pointer;
- stats_state_e old_state;
- public:
- blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
- : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
- *state_pointer = new_state;
- }
- ~blockThreadState() { *state_pointer = old_state; }
- };
- // If all you want is a count, then you can use this...
- // The individual per-thread counts will be aggregated into a statistic at
- // program exit.
- class counter {
- uint64_t value;
- static const statInfo counterInfo[];
- public:
- counter() : value(0) {}
- void increment() { value++; }
- uint64_t getValue() const { return value; }
- void reset() { value = 0; }
- static const char *name(counter_e e) { return counterInfo[e].name; }
- static bool masterOnly(counter_e e) {
- return counterInfo[e].flags & stats_flags_e::onlyInMaster;
- }
- };
- /* ****************************************************************
- Class to implement an event
- There are four components to an event: start time, stop time
- nest_level, and timer_name.
- The start and stop time should be obvious (recorded in clock ticks).
- The nest_level relates to the bar width in the timeline graph.
- The timer_name is used to determine which timer event triggered this event.
- the interface to this class is through four read-only operations:
- 1) getStart() -- returns the start time as 64 bit integer
- 2) getStop() -- returns the stop time as 64 bit integer
- 3) getNestLevel() -- returns the nest level of the event
- 4) getTimerName() -- returns the timer name that triggered event
- *MORE ON NEST_LEVEL*
- The nest level is used in the bar graph that represents the timeline.
- Its main purpose is for showing how events are nested inside eachother.
- For example, say events, A, B, and C are recorded. If the timeline
- looks like this:
- Begin -------------------------------------------------------------> Time
- | | | | | |
- A B C C B A
- start start start end end end
- Then A, B, C will have a nest level of 1, 2, 3 respectively.
- These values are then used to calculate the barwidth so you can
- see that inside A, B has occurred, and inside B, C has occurred.
- Currently, this is shown with A's bar width being larger than B's
- bar width, and B's bar width being larger than C's bar width.
- **************************************************************** */
- class kmp_stats_event {
- uint64_t start;
- uint64_t stop;
- int nest_level;
- timer_e timer_name;
- public:
- kmp_stats_event()
- : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
- kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
- : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
- inline uint64_t getStart() const { return start; }
- inline uint64_t getStop() const { return stop; }
- inline int getNestLevel() const { return nest_level; }
- inline timer_e getTimerName() const { return timer_name; }
- };
- /* ****************************************************************
- Class to implement a dynamically expandable array of events
- ---------------------------------------------------------
- | event 1 | event 2 | event 3 | event 4 | ... | event N |
- ---------------------------------------------------------
- An event is pushed onto the back of this array at every
- explicitTimer->stop() call. The event records the thread #,
- start time, stop time, and nest level related to the bar width.
- The event vector starts at size INIT_SIZE and grows (doubles in size)
- if needed. An implication of this behavior is that log(N)
- reallocations are needed (where N is number of events). If you want
- to avoid reallocations, then set INIT_SIZE to a large value.
- the interface to this class is through six operations:
- 1) reset() -- sets the internal_size back to 0 but does not deallocate any
- memory
- 2) size() -- returns the number of valid elements in the vector
- 3) push_back(start, stop, nest, timer_name) -- pushes an event onto
- the back of the array
- 4) deallocate() -- frees all memory associated with the vector
- 5) sort() -- sorts the vector by start time
- 6) operator[index] or at(index) -- returns event reference at that index
- **************************************************************** */
- class kmp_stats_event_vector {
- kmp_stats_event *events;
- int internal_size;
- int allocated_size;
- static const int INIT_SIZE = 1024;
- public:
- kmp_stats_event_vector() {
- events =
- (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
- internal_size = 0;
- allocated_size = INIT_SIZE;
- }
- ~kmp_stats_event_vector() {}
- inline void reset() { internal_size = 0; }
- inline int size() const { return internal_size; }
- void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
- timer_e name) {
- int i;
- if (internal_size == allocated_size) {
- kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
- sizeof(kmp_stats_event) * allocated_size * 2);
- for (i = 0; i < internal_size; i++)
- tmp[i] = events[i];
- __kmp_free(events);
- events = tmp;
- allocated_size *= 2;
- }
- events[internal_size] =
- kmp_stats_event(start_time, stop_time, nest_level, name);
- internal_size++;
- return;
- }
- void deallocate();
- void sort();
- const kmp_stats_event &operator[](int index) const { return events[index]; }
- kmp_stats_event &operator[](int index) { return events[index]; }
- const kmp_stats_event &at(int index) const { return events[index]; }
- kmp_stats_event &at(int index) { return events[index]; }
- };
- /* ****************************************************************
- Class to implement a doubly-linked, circular, statistics list
- |---| ---> |---| ---> |---| ---> |---| ---> ... next
- | | | | | | | |
- |---| <--- |---| <--- |---| <--- |---| <--- ... prev
- Sentinel first second third
- Node node node node
- The Sentinel Node is the user handle on the list.
- The first node corresponds to thread 0's statistics.
- The second node corresponds to thread 1's statistics and so on...
- Each node has a _timers, _counters, and _explicitTimers array to hold that
- thread's statistics. The _explicitTimers point to the correct _timer and
- update its statistics at every stop() call. The explicitTimers' pointers are
- set up in the constructor. Each node also has an event vector to hold that
- thread's timing events. The event vector expands as necessary and records
- the start-stop times for each timer.
- The nestLevel variable is for plotting events and is related
- to the bar width in the timeline graph.
- Every thread will have a thread local pointer to its node in
- the list. The sentinel node is used by the primary thread to
- store "dummy" statistics before __kmp_create_worker() is called.
- **************************************************************** */
- class kmp_stats_list {
- int gtid;
- timeStat _timers[TIMER_LAST + 1];
- counter _counters[COUNTER_LAST + 1];
- explicitTimer thread_life_timer;
- partitionedTimers _partitionedTimers;
- int _nestLevel; // one per thread
- kmp_stats_event_vector _event_vector;
- kmp_stats_list *next;
- kmp_stats_list *prev;
- stats_state_e state;
- int thread_is_idle_flag;
- public:
- kmp_stats_list()
- : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
- TIMER_OMP_worker_thread_life),
- _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
- thread_is_idle_flag(0) {}
- ~kmp_stats_list() {}
- inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
- inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
- inline partitionedTimers *getPartitionedTimers() {
- return &_partitionedTimers;
- }
- inline timeStat *getTimers() { return _timers; }
- inline counter *getCounters() { return _counters; }
- inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
- inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
- inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
- inline void resetEventVector() { _event_vector.reset(); }
- inline void incrementNestValue() { _nestLevel++; }
- inline int getNestValue() { return _nestLevel; }
- inline void decrementNestValue() { _nestLevel--; }
- inline int getGtid() const { return gtid; }
- inline void setGtid(int newgtid) { gtid = newgtid; }
- inline void setState(stats_state_e newstate) { state = newstate; }
- inline stats_state_e getState() const { return state; }
- inline stats_state_e *getStatePointer() { return &state; }
- inline bool isIdle() { return thread_is_idle_flag == 1; }
- inline void setIdleFlag() { thread_is_idle_flag = 1; }
- inline void resetIdleFlag() { thread_is_idle_flag = 0; }
- kmp_stats_list *push_back(int gtid); // returns newly created list node
- inline void push_event(uint64_t start_time, uint64_t stop_time,
- int nest_level, timer_e name) {
- _event_vector.push_back(start_time, stop_time, nest_level, name);
- }
- void deallocate();
- class iterator;
- kmp_stats_list::iterator begin();
- kmp_stats_list::iterator end();
- int size();
- class iterator {
- kmp_stats_list *ptr;
- friend kmp_stats_list::iterator kmp_stats_list::begin();
- friend kmp_stats_list::iterator kmp_stats_list::end();
- public:
- iterator();
- ~iterator();
- iterator operator++();
- iterator operator++(int dummy);
- iterator operator--();
- iterator operator--(int dummy);
- bool operator!=(const iterator &rhs);
- bool operator==(const iterator &rhs);
- kmp_stats_list *operator*() const; // dereference operator
- };
- };
- /* ****************************************************************
- Class to encapsulate all output functions and the environment variables
- This module holds filenames for various outputs (normal stats, events, plot
- file), as well as coloring information for the plot file.
- The filename and flag variables are read from environment variables.
- These are read once by the constructor of the global variable
- __kmp_stats_output which calls init().
- During this init() call, event flags for the timeStat::timerInfo[] global
- array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
- The only interface function that is public is outputStats(heading). This
- function should print out everything it needs to, either to files or stderr,
- depending on the environment variables described below.
- ENVIRONMENT VARIABLES:
- KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
- file, otherwise, print to stderr
- KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
- either KMP_STATS_FILE or stderr
- KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
- otherwise, the plot file is sent to "events.plt"
- KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
- events
- KMP_STATS_EVENTS_FILE -- if set, all events are written to this file,
- otherwise, output is sent to "events.dat"
- **************************************************************** */
- class kmp_stats_output_module {
- public:
- // RGB triple with float components, laid out in r, g, b order.
- struct rgb_color {
- float r, g, b;
- };
- private:
- // The only non-static data member: each instance has its own file name.
- std::string outputFileName;
- // Static state shared by all instances.
- static const char *eventsFileName;
- static const char *plotFileName;
- static int printPerThreadFlag;
- static int printPerThreadEventsFlag;
- static const rgb_color globalColorArray[];
- static rgb_color timerColorInfo[];
- // Invoked from the constructor.
- void init();
- // Static helpers, declared here and defined out of line.
- static void setupEventColors();
- static void windupExplicitTimers();
- static void printPloticusFile();
- static void printHeaderInfo(FILE *statsOut);
- static void printTimerStats(FILE *statsOut, statistic const *theStats,
- statistic const *totalStats);
- static void printCounterStats(FILE *statsOut, statistic const *theStats);
- static void printCounters(FILE *statsOut, counter const *theCounters);
- static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
- int gtid);
- // Returns the timerColorInfo entry indexed directly by e (no range check).
- static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
- // True exactly when printPerThreadEventsFlag is non-zero.
- bool eventPrintingEnabled() const { return printPerThreadEventsFlag != 0; }
- public:
- // Constructing the module runs init() immediately.
- kmp_stats_output_module() { init(); }
- void outputStats(const char *heading);
- };
- #ifdef __cplusplus
- extern "C" {
- #endif
- void __kmp_stats_init();
- void __kmp_stats_fini();
- void __kmp_reset_stats();
- void __kmp_output_stats(const char *);
- void __kmp_accumulate_stats_at_exit(void);
- // thread local pointer to stats node within list
- extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
- // head of the stats list.
- extern kmp_stats_list *__kmp_stats_list;
- // lock for __kmp_stats_list
- extern kmp_tas_lock_t __kmp_stats_lock;
- // reference start time
- extern tsc_tick_count __kmp_stats_start_time;
- // interface to output
- extern kmp_stats_output_module __kmp_stats_output;
- #ifdef __cplusplus
- }
- #endif
- // Simple, standard interfaces that drop out completely if stats aren't enabled
- /*!
- * \brief Adds value to specified timer (name).
- *
- * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
- * @param value sample value to add to statistics for the timer; the whole
- * argument expression is converted to double
- *
- * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
- * a timer statistics. The value argument is parenthesized before the cast so
- * that a compound argument such as `a + b` or `c ? x : y` is converted as one
- * expression instead of the cast binding only to its first operand.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_COUNT_VALUE(name, value) \
- __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))
- /*!
- * \brief Increments specified counter (name).
- *
- * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
- *
- * \details Use KMP_COUNT_BLOCK(name) macro to increment a statistics
- * counter for the executing thread. The macro takes the counter name only and
- * expands to a call to increment() on the COUNTER_name counter.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_COUNT_BLOCK(name) \
- __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
- /*!
- * \brief Outputs the current thread statistics and resets them.
- *
- * @param heading_string heading put above the final stats output
- *
- * \details Explicitly stops all timers and outputs all stats. Environment
- * variable, `KMP_STATS_FILE=filename`, can be used to output the stats to a
- * file instead of stderr. Environment variable, `KMP_STATS_THREADS`, can be
- * used to output thread specific stats. It should be noted that all statistics
- * are reset when this macro is called.
- *
- * NOTE(review): this comment used to name the variables `OMPTB_STATSFILE` and
- * `OMPTB_STATSTHREADS` and stated that any defined value enables per-thread
- * output. The environment variable list in the kmp_stats_output_module
- * description names KMP_STATS_FILE and KMP_STATS_THREADS (set to "on")
- * instead; confirm the accepted values against the implementation.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
- /*!
- * \brief Initializes the partitioned timers to begin with name.
- *
- * @param name timer which you want this thread to begin with
- *
- * \details Expands to a call to init() on the partitioned timers reached
- * through __kmp_stats_thread_ptr, passing an explicitTimer built from the
- * TIMER_name timer and the TIMER_name enumerator.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_INIT_PARTITIONED_TIMERS(name) \
- __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
- __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
- /*!
- * \brief Declares a blockPartitionedTimer object for timer name.
- *
- * @param name timer name; pasted onto TIMER_ to select the timer
- *
- * \details Expands to the declaration of an object named __PBLOCKTIME__,
- * constructed from the partitioned timers reached through
- * __kmp_stats_thread_ptr and an explicitTimer for TIMER_name. Because the
- * object name is fixed, the macro can be used at most once per scope.
- * NOTE(review): presumably the object times the enclosing block from its
- * construction to its destruction -- confirm against blockPartitionedTimer.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_TIME_PARTITIONED_BLOCK(name) \
- blockPartitionedTimer __PBLOCKTIME__( \
- __kmp_stats_thread_ptr->getPartitionedTimers(), \
- explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
- TIMER_##name))
- /*!
- * \brief Pushes timer name onto the partitioned timers.
- *
- * @param name timer name; pasted onto TIMER_ to select the timer
- *
- * \details Expands to a call to push() on the partitioned timers reached
- * through __kmp_stats_thread_ptr, passing an explicitTimer for TIMER_name.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_PUSH_PARTITIONED_TIMER(name) \
- __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
- __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
- /*!
- * \brief Pops the partitioned timers.
- *
- * \details Takes no arguments; expands to a call to pop() on the partitioned
- * timers reached through __kmp_stats_thread_ptr.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_POP_PARTITIONED_TIMER() \
- __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
- /*!
- * \brief Exchanges the partitioned timers' current timer for name.
- *
- * @param name timer name; pasted onto TIMER_ to select the timer
- *
- * \details Expands to a call to exchange() on the partitioned timers reached
- * through __kmp_stats_thread_ptr, passing an explicitTimer for TIMER_name.
- * NOTE(review): the exact semantics of exchange() are defined elsewhere;
- * presumably it replaces the timer on top of the stack -- confirm.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
- __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
- __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
- /*!
- * \brief Sets the stats state of the executing thread.
- *
- * @param state_name new state value, passed to kmp_stats_list::setState()
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_SET_THREAD_STATE(state_name) \
- __kmp_stats_thread_ptr->setState(state_name)
- /*!
- * \brief Yields the stats state of the executing thread.
- *
- * \details Expands to a call to kmp_stats_list::getState() through
- * __kmp_stats_thread_ptr.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
- /*!
- * \brief Declares a blockThreadState object for state_name.
- *
- * @param state_name state value handed to the blockThreadState constructor
- *
- * \details Expands to the declaration of an object named __BTHREADSTATE__,
- * constructed from the address of the executing thread's state member and
- * state_name. Because the object name is fixed, the macro can be used at most
- * once per scope. NOTE(review): presumably the object sets the state for the
- * enclosing block and restores the previous one afterwards -- confirm against
- * blockThreadState.
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_SET_THREAD_STATE_BLOCK(state_name) \
- blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
- state_name)
- /*!
- * \brief Resets all stats (counters to 0, timers to 0 elapsed ticks).
- *
- * \details Reset all stats for all threads. Takes no arguments and expands to
- * a call to __kmp_reset_stats().
- *
- * @ingroup STATS_GATHERING
- */
- #define KMP_RESET_STATS() __kmp_reset_stats()
- #if (KMP_DEVELOPER_STATS)
- // Developer statistics enabled: each DEVELOPER macro forwards to the regular
- // macro of the same kind.
- #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
- #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
- #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
- #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
- // KMP_POP_PARTITIONED_TIMER() takes no arguments, so n is accepted (to match
- // the null definition's signature) and dropped instead of being forwarded;
- // forwarding a non-empty n would fail to preprocess.
- #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
- #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \
- KMP_EXCHANGE_PARTITIONED_TIMER(n)
- #else
- // Null definitions
- #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
- #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
- #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
- #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #endif
- #else // KMP_STATS_ENABLED
- // Null definitions: with stats disabled every statistics macro expands to the
- // no-op expression ((void)0), so call sites compile unchanged. Note that this
- // includes KMP_GET_THREAD_STATE(), which therefore yields no value here.
- #define KMP_COUNT_VALUE(n, v) ((void)0)
- #define KMP_COUNT_BLOCK(n) ((void)0)
- #define KMP_OUTPUT_STATS(heading_string) ((void)0)
- #define KMP_RESET_STATS() ((void)0)
- #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
- #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
- #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
- #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
- #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
- #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
- #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
- #define KMP_POP_PARTITIONED_TIMER() ((void)0)
- // Stub for the exchange macro as well, matching the enabled branch, so its
- // users still compile when stats are disabled.
- #define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
- #define KMP_SET_THREAD_STATE(state_name) ((void)0)
- #define KMP_GET_THREAD_STATE() ((void)0)
- #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
- #endif // KMP_STATS_ENABLED
- #endif // KMP_STATS_H
|