nanobenchmark.h 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. // Copyright 2017 Google Inc. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #ifndef ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_
  15. #define ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_
  16. // Benchmarks functions of a single integer argument with realistic branch
  17. // prediction hit rates. Uses a robust estimator to summarize the measurements.
  18. // The precision is about 0.2%.
  19. //
  20. // Examples: see nanobenchmark_test.cc.
  21. //
  22. // Background: Microbenchmarks such as http://github.com/google/benchmark
  23. // can measure elapsed times on the order of a microsecond. Shorter functions
  24. // are typically measured by repeating them thousands of times and dividing
  25. // the total elapsed time by this count. Unfortunately, repetition (especially
  26. // with the same input parameter!) influences the runtime. In time-critical
  27. // code, it is reasonable to expect warm instruction/data caches and TLBs,
  28. // but a perfect record of which branches will be taken is unrealistic.
  29. // Unless the application also repeatedly invokes the measured function with
  30. // the same parameter, the benchmark is measuring something very different -
  31. // a best-case result, almost as if the parameter were made a compile-time
  32. // constant. This may lead to erroneous conclusions about branch-heavy
  33. // algorithms outperforming branch-free alternatives.
  34. //
  35. // Our approach differs in three ways. Adding fences to the timer functions
  36. // reduces variability due to instruction reordering, improving the timer
  37. // resolution to about 40 CPU cycles. However, shorter functions must still
  38. // be invoked repeatedly. For more realistic branch prediction performance,
  39. // we vary the input parameter according to a user-specified distribution.
  40. // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
  41. // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
  42. // central tendency of the measurement samples with the "half sample mode",
  43. // which is more robust to outliers and skewed data than the mean or median.
  44. // NOTE: for compatibility with multiple translation units compiled with
  45. // distinct flags, avoid #including headers that define functions.
  46. #include <stddef.h>
  47. #include <stdint.h>
  48. #include "absl/base/config.h"
  49. namespace absl {
  50. ABSL_NAMESPACE_BEGIN
  51. namespace random_internal_nanobenchmark {
  52. // Input influencing the function being measured (e.g. number of bytes to copy).
  53. using FuncInput = size_t;
  54. // "Proof of work" returned by Func to ensure the compiler does not elide it.
  55. using FuncOutput = uint64_t;
  56. // Function to measure: either 1) a captureless lambda or function with two
  57. // arguments or 2) a lambda with capture, in which case the first argument
  58. // is reserved for use by MeasureClosure.
  59. using Func = FuncOutput (*)(const void*, FuncInput);
  60. // Internal parameters that determine precision/resolution/measuring time.
  61. struct Params {
  62. // For measuring timer overhead/resolution. Used in a nested loop =>
  63. // quadratic time, acceptable because we know timer overhead is "low".
  64. // constexpr because this is used to define array bounds.
  65. static constexpr size_t kTimerSamples = 256;
  66. // Best-case precision, expressed as a divisor of the timer resolution.
  67. // Larger => more calls to Func and higher precision.
  68. size_t precision_divisor = 1024;
  69. // Ratio between full and subset input distribution sizes. Cannot be less
  70. // than 2; larger values increase measurement time but more faithfully
  71. // model the given input distribution.
  72. size_t subset_ratio = 2;
  73. // Together with the estimated Func duration, determines how many times to
  74. // call Func before checking the sample variability. Larger values increase
  75. // measurement time, memory/cache use and precision.
  76. double seconds_per_eval = 4E-3;
  77. // The minimum number of samples before estimating the central tendency.
  78. size_t min_samples_per_eval = 7;
  79. // The mode is better than median for estimating the central tendency of
  80. // skewed/fat-tailed distributions, but it requires sufficient samples
  81. // relative to the width of half-ranges.
  82. size_t min_mode_samples = 64;
  83. // Maximum permissible variability (= median absolute deviation / center).
  84. double target_rel_mad = 0.002;
  85. // Abort after this many evals without reaching target_rel_mad. This
  86. // prevents infinite loops.
  87. size_t max_evals = 9;
  88. // Retry the measure loop up to this many times.
  89. size_t max_measure_retries = 2;
  90. // Whether to print additional statistics to stdout.
  91. bool verbose = true;
  92. };
  93. // Measurement result for each unique input.
  94. struct Result {
  95. FuncInput input;
  96. // Robust estimate (mode or median) of duration.
  97. float ticks;
  98. // Measure of variability (median absolute deviation relative to "ticks").
  99. float variability;
  100. };
  101. // Ensures the thread is running on the specified cpu, and no others.
  102. // Reduces noise due to desynchronized socket RDTSC and context switches.
  103. // If "cpu" is negative, pin to the currently running core.
  104. void PinThreadToCPU(const int cpu = -1);
  105. // Returns tick rate, useful for converting measurements to seconds. Invariant
  106. // means the tick counter frequency is independent of CPU throttling or sleep.
  107. // This call may be expensive, callers should cache the result.
  108. double InvariantTicksPerSecond();
  109. // Precisely measures the number of ticks elapsed when calling "func" with the
  110. // given inputs, shuffled to ensure realistic branch prediction hit rates.
  111. //
  112. // "func" returns a 'proof of work' to ensure its computations are not elided.
  113. // "arg" is passed to Func, or reserved for internal use by MeasureClosure.
  114. // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
  115. // "func". The values should be chosen to maximize coverage of "func". This
  116. // represents a distribution, so a value's frequency should reflect its
  117. // probability in the real application. Order does not matter; for example, a
  118. // uniform distribution over [0, 4) could be represented as {3,0,2,1}.
  119. // Returns how many Result were written to "results": one per unique input, or
  120. // zero if the measurement failed (an error message goes to stderr).
  121. size_t Measure(const Func func, const void* arg, const FuncInput* inputs,
  122. const size_t num_inputs, Result* results,
  123. const Params& p = Params());
  124. // Calls operator() of the given closure (lambda function).
  125. template <class Closure>
  126. static FuncOutput CallClosure(const void* f, const FuncInput input) {
  127. return (*reinterpret_cast<const Closure*>(f))(input);
  128. }
  129. // Same as Measure, except "closure" is typically a lambda function of
  130. // FuncInput -> FuncOutput with a capture list.
  131. template <class Closure>
  132. static inline size_t MeasureClosure(const Closure& closure,
  133. const FuncInput* inputs,
  134. const size_t num_inputs, Result* results,
  135. const Params& p = Params()) {
  136. return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
  137. reinterpret_cast<const void*>(&closure), inputs, num_inputs,
  138. results, p);
  139. }
  140. } // namespace random_internal_nanobenchmark
  141. ABSL_NAMESPACE_END
  142. } // namespace absl
  143. #endif // ABSL_RANDOM_INTERNAL_NANOBENCHMARK_H_