123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
- #include <util/system/defaults.h>
- #include "profile.h"
- #if defined(_unix_) && !defined(_bionic_) && !defined(_cygwin_)
- #include <signal.h>
- #include <sys/time.h>
- #include <sys/resource.h>
- #if defined(_darwin_)
- #include <sys/ucontext.h>
- #else
- #include <ucontext.h>
- #endif
- #include <dlfcn.h>
- #include <library/cpp/deprecated/atomic/atomic.h>
- #include <util/system/platform.h>
- #include <util/generic/hash.h>
- #include <util/generic/map.h>
- #include <util/generic/noncopyable.h>
- #include <util/generic/algorithm.h>
- #include <util/generic/vector.h>
- #include <util/stream/file.h>
- #include <util/string/util.h>
- #include <util/system/datetime.h>
- // This class sets SIGPROF handler and captures instruction pointer in it.
- class TExecutionSampler : TNonCopyable {
- public:
- typedef TVector<std::pair<void*, size_t>> TSampleVector;
- struct TStats {
- ui64 SavedSamples;
- ui64 DroppedSamples;
- ui64 SearchSkipCount;
- };
- // NOTE: There is no synchronization here as the instance is supposed to be
- // created on the main thread.
- static TExecutionSampler* Instance() {
- if (SInstance == nullptr) {
- SInstance = new TExecutionSampler();
- }
- return SInstance;
- }
- void Start() {
- // Set signal handler
- struct sigaction sa;
- sa.sa_sigaction = ProfilerSignalHandler;
- sigemptyset(&sa.sa_mask);
- sa.sa_flags = SA_SIGINFO;
- if (sigaction(SIGPROF, &sa, &OldSignalHandler) != 0)
- return;
- // Set interval timer
- itimerval tv;
- tv.it_interval.tv_sec = tv.it_value.tv_sec = 0;
- tv.it_interval.tv_usec = tv.it_value.tv_usec = SAMPLE_INTERVAL;
- setitimer(ITIMER_PROF, &tv, &OldTimerValue);
- Started = true;
- }
- void Stop(TSampleVector& sampleVector, TStats& stats) {
- // Reset signal handler and timer
- if (Started) {
- setitimer(ITIMER_PROF, &OldTimerValue, nullptr);
- sleep(1);
- }
- WaitForWriteFlag();
- if (Started) {
- sigaction(SIGPROF, &OldSignalHandler, nullptr);
- Started = false;
- }
- TExecutionSampler::TSampleVector hits;
- hits.reserve(Samples);
- for (size_t i = 0; i < SZ; ++i) {
- if (Ips[i].first != nullptr) {
- hits.push_back(Ips[i]);
- }
- }
- stats.SavedSamples = Samples;
- stats.DroppedSamples = AtomicGet(DroppedSamples);
- stats.SearchSkipCount = SearchSkipCount;
- AtomicUnlock(&WriteFlag);
- Sort(hits.begin(), hits.end(), TCompareFirst());
- sampleVector.swap(hits);
- }
- void ResetStats() {
- WaitForWriteFlag();
- Clear();
- AtomicUnlock(&WriteFlag);
- }
- private:
- static const size_t SZ = 2 * 1024 * 1024; // size of the hash table
- // inserts work faster if it's a power of 2
- static const int SAMPLE_INTERVAL = 1000; // in microseconds
- struct TCompareFirst {
- bool operator()(const std::pair<void*, size_t>& a, const std::pair<void*, size_t>& b) const {
- return a.first < b.first;
- }
- };
- TExecutionSampler()
- : Started(false)
- , Ips(SZ)
- , WriteFlag(0)
- , DroppedSamples(0)
- , Samples(0)
- , UniqueSamples(0)
- , SearchSkipCount(0)
- {
- }
- ~TExecutionSampler() = default;
- // Signal handler is not allowed to do anything that can deadlock with activity
- // on the thread to which the signal is delivered or corrupt data structures that
- // were in process of update.
- // One such thing is memory allocation. That's why a fixed size vector is
- // preallocated at start.
- static void ProfilerSignalHandler(int signal, siginfo_t* info, void* context) {
- (void)info;
- if (signal != SIGPROF) {
- return;
- }
- ucontext_t* ucontext = reinterpret_cast<ucontext_t*>(context);
- Y_ASSERT(SInstance != nullptr);
- SInstance->CaptureIP(GetIp(&ucontext->uc_mcontext));
- }
- void WaitForWriteFlag() {
- // Wait for write flag to be reset
- ui32 delay = 100;
- while (!AtomicTryLock(&WriteFlag)) {
- usleep(delay);
- delay += delay;
- delay = Min(delay, (ui32)5000);
- }
- }
- void CaptureIP(void* rip) {
- // Check if the handler on another thread is in the process of adding a sample
- // If this is the case, we just drop the current sample as this should happen
- // rarely.
- if (AtomicTryLock(&WriteFlag)) {
- AddSample(rip);
- AtomicUnlock(&WriteFlag);
- } else {
- AtomicIncrement(DroppedSamples);
- }
- }
- // Hash function applied to the addresses
- static inline ui32 Hash(void* key) {
- return ((size_t)key + (size_t)key / SZ) % SZ;
- }
- // Get instruction pointer from the context
- static inline void* GetIp(const mcontext_t* mctx) {
- #if defined _freebsd_
- #if defined _64_
- return (void*)mctx->mc_rip;
- #else
- return (void*)mctx->mc_eip;
- #endif
- #elif defined _linux_
- #if defined _64_
- #if defined(_arm_)
- return (void*)mctx->pc;
- #else
- return (void*)mctx->gregs[REG_RIP];
- #endif
- #else
- return (void*)mctx->gregs[REG_EIP];
- #endif
- #elif defined _darwin_
- #if defined _64_
- #if defined(_arm_)
- return (void*)(*mctx)->__ss.__pc;
- #else
- return (void*)(*mctx)->__ss.__rip;
- #endif
- #else
- #if defined(__IOS__)
- return (void*)(*mctx)->__ss.__pc;
- #else
- return (void*)(*mctx)->__ss.__eip;
- #endif
- #endif
- #endif
- }
- inline bool AddSample(void* key) {
- ui32 slot = Hash(key);
- ui32 prevSlot = (slot - 1) % SZ;
- while (key != Ips[slot].first && !IsSlotEmpty(slot) && slot != prevSlot) {
- slot = (slot + 1) % SZ;
- SearchSkipCount++;
- }
- if (key == Ips[slot].first) {
- // increment the count
- Ips[slot].second++;
- ++Samples;
- } else if (InsertsAllowed()) {
- // add new sample and set the count to 1
- Ips[slot].first = key;
- Ips[slot].second = 1;
- ++UniqueSamples;
- ++Samples;
- } else {
- // don't insert new sample if the search is becoming too slow
- AtomicIncrement(DroppedSamples);
- return false;
- }
- return true;
- }
- inline bool IsSlotEmpty(ui32 slot) const {
- return Ips[slot].first == nullptr;
- }
- inline bool InsertsAllowed() const {
- return UniqueSamples < SZ / 2;
- }
- void
- Clear() {
- Y_ASSERT(WriteFlag == 1);
- for (size_t i = 0; i < SZ; ++i) {
- Ips[i] = std::make_pair((void*)nullptr, (size_t)0);
- }
- Samples = 0;
- AtomicSet(DroppedSamples, 0);
- UniqueSamples = 0;
- SearchSkipCount = 0;
- }
- bool Started;
- struct sigaction OldSignalHandler;
- itimerval OldTimerValue;
- TVector<std::pair<void*, size_t>>
- Ips; // The hash table storing addresses and their hitcounts
- // TODO: on a big multiproc cache line false sharing by the flag and count might become an issue
- TAtomic WriteFlag; // Is used to syncronize access to the hash table
- TAtomic DroppedSamples; // "dropped sample" count will show how many times
- // a sample was dropped either because of write conflict
- // or because of the hash table had become too filled up
- ui64 Samples; // Saved samples count
- ui64 UniqueSamples; // Number of unique addresses
- ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions
- static TExecutionSampler* SInstance;
- };
- // Performs analysis of samples captured by TExecutionSampler
- class TSampleAnalyser : TNonCopyable {
- public:
- TSampleAnalyser(TExecutionSampler::TSampleVector& samples, const TExecutionSampler::TStats& stats, bool putTimeStamps = false)
- : Samples()
- , Stats(stats)
- , PutTimestamps(putTimeStamps)
- {
- Samples.swap(samples);
- }
- ~TSampleAnalyser() = default;
- void Analyze(FILE* out) const;
- private:
- TExecutionSampler::TSampleVector Samples;
- TExecutionSampler::TStats Stats;
- bool PutTimestamps;
- };
- void TSampleAnalyser::Analyze(FILE* out) const {
- fprintf(out, "samples: %" PRIu64 " unique: %" PRIu64 " dropped: %" PRIu64 " searchskips: %" PRIu64 "\n",
- (ui64)Stats.SavedSamples, (ui64)Samples.size(),
- (ui64)Stats.DroppedSamples, (ui64)Stats.SearchSkipCount);
- fprintf(out, "\nSamples:\n");
- size_t funcCnt = 0;
- void* prevModBase = (void*)-1;
- void* prevFunc = (void*)-1;
- for (size_t i = 0; i < Samples.size(); ++i) {
- // print cycle count once in a while to estimate time consumed by
- // dumping the samples
- if (PutTimestamps && (i % 1000 == 0)) {
- ui64 tm = GetCycleCount();
- fprintf(out, "TM: %" PRIu64 "\n", tm);
- }
- Dl_info addrInfo;
- if (dladdr(Samples[i].first, &addrInfo)) {
- if (addrInfo.dli_fbase != prevModBase || addrInfo.dli_saddr != prevFunc) {
- fprintf(out, "Func\t%" PRISZT "\t%p\t%p\t%s\t%s\n",
- funcCnt,
- addrInfo.dli_fbase,
- addrInfo.dli_saddr,
- addrInfo.dli_fname,
- addrInfo.dli_sname);
- prevModBase = addrInfo.dli_fbase;
- prevFunc = addrInfo.dli_saddr;
- ++funcCnt;
- }
- } else {
- fprintf(out, "[dladdr failed]\n");
- }
- fprintf(out, "%" PRISZT "\t%p\t%lu\n", i, Samples[i].first, Samples[i].second);
- }
- }
- TExecutionSampler* TExecutionSampler::SInstance = nullptr;
- // Starts capturing execution samples
- void BeginProfiling() {
- TExecutionSampler::Instance()->Start();
- }
- // Resets captured execution samples
- void ResetProfile() {
- TExecutionSampler::Instance()->ResetStats();
- }
- void DumpRUsage(FILE* out) {
- rusage ru;
- int e = getrusage(RUSAGE_SELF, &ru);
- if (e != 0)
- return;
- fprintf(out,
- "user time: %lf\n"
- "system time: %lf\n"
- "max RSS: %ld\n"
- "shared text: %ld\n"
- "unshared data: %ld\n"
- "unshared stack: %ld\n"
- "page reclaims: %ld\n"
- "page faults: %ld\n"
- "swaps: %ld\n"
- "block input ops: %ld\n"
- "block output ops: %ld\n"
- "msg sent: %ld\n"
- "msg received: %ld\n"
- "signals: %ld\n"
- "voluntary ctx switches: %ld\n"
- "involuntary ctx switches: %ld\n\n",
- ru.ru_utime.tv_sec + (ru.ru_utime.tv_usec * 0.000001),
- ru.ru_stime.tv_sec + (ru.ru_stime.tv_usec * 0.000001),
- ru.ru_maxrss, ru.ru_ixrss, ru.ru_idrss, ru.ru_isrss,
- ru.ru_minflt, ru.ru_majflt, ru.ru_nswap,
- ru.ru_inblock, ru.ru_oublock,
- ru.ru_msgsnd, ru.ru_msgrcv,
- ru.ru_nsignals,
- ru.ru_nvcsw, ru.ru_nivcsw);
- }
- // Pauses capturing execution samples and dumps them to the file
- // Samples are not cleared so that profiling can be continued by calling BeginProfiling()
- // or it can be started from scratch by calling ResetProfile() and then BeginProfiling()
- void EndProfiling(FILE* out) {
- DumpRUsage(out);
- TExecutionSampler::TSampleVector samples;
- TExecutionSampler::TStats stats;
- TExecutionSampler::Instance()->Stop(samples, stats);
- TSampleAnalyser analyzer(samples, stats);
- analyzer.Analyze(out);
- }
- void EndProfiling() {
- static unsigned cnt = 0;
- char nameBuf[256];
- snprintf(nameBuf, sizeof(nameBuf), "./%s.%d.%u.profile", getprogname(), (int)getpid(), cnt);
- FILE* out = fopen(nameBuf, "a");
- EndProfiling(out);
- fclose(out);
- ++cnt;
- }
- #else
- // NOTE: not supported on Windows
- void BeginProfiling() {
- }
- void ResetProfile() {
- }
- void EndProfiling(FILE*) {
- }
- void EndProfiling() {
- }
- #endif
|