#include #include "profile.h" #if defined(_unix_) && !defined(_bionic_) && !defined(_cygwin_) #include #include #include #if defined(_darwin_) #include #else #include #endif #include #include #include #include #include #include #include #include #include #include #include // This class sets SIGPROF handler and captures instruction pointer in it. class TExecutionSampler : TNonCopyable { public: typedef TVector> TSampleVector; struct TStats { ui64 SavedSamples; ui64 DroppedSamples; ui64 SearchSkipCount; }; // NOTE: There is no synchronization here as the instance is supposed to be // created on the main thread. static TExecutionSampler* Instance() { if (SInstance == nullptr) { SInstance = new TExecutionSampler(); } return SInstance; } void Start() { // Set signal handler struct sigaction sa; sa.sa_sigaction = ProfilerSignalHandler; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; if (sigaction(SIGPROF, &sa, &OldSignalHandler) != 0) return; // Set interval timer itimerval tv; tv.it_interval.tv_sec = tv.it_value.tv_sec = 0; tv.it_interval.tv_usec = tv.it_value.tv_usec = SAMPLE_INTERVAL; setitimer(ITIMER_PROF, &tv, &OldTimerValue); Started = true; } void Stop(TSampleVector& sampleVector, TStats& stats) { // Reset signal handler and timer if (Started) { setitimer(ITIMER_PROF, &OldTimerValue, nullptr); sleep(1); } WaitForWriteFlag(); if (Started) { sigaction(SIGPROF, &OldSignalHandler, nullptr); Started = false; } TExecutionSampler::TSampleVector hits; hits.reserve(Samples); for (size_t i = 0; i < SZ; ++i) { if (Ips[i].first != nullptr) { hits.push_back(Ips[i]); } } stats.SavedSamples = Samples; stats.DroppedSamples = AtomicGet(DroppedSamples); stats.SearchSkipCount = SearchSkipCount; AtomicUnlock(&WriteFlag); Sort(hits.begin(), hits.end(), TCompareFirst()); sampleVector.swap(hits); } void ResetStats() { WaitForWriteFlag(); Clear(); AtomicUnlock(&WriteFlag); } private: static const size_t SZ = 2 * 1024 * 1024; // size of the hash table // inserts work faster if it's a power of 2 static const int SAMPLE_INTERVAL = 1000; // in microseconds struct TCompareFirst { bool operator()(const std::pair& a, const std::pair& b) const { return a.first < b.first; } }; TExecutionSampler() : Started(false) , Ips(SZ) , WriteFlag(0) , DroppedSamples(0) , Samples(0) , UniqueSamples(0) , SearchSkipCount(0) { } ~TExecutionSampler() = default; // Signal handler is not allowed to do anything that can deadlock with activity // on the thread to which the signal is delivered or corrupt data structures that // were in process of update. // One such thing is memory allocation. That's why a fixed size vector is // preallocated at start. static void ProfilerSignalHandler(int signal, siginfo_t* info, void* context) { (void)info; if (signal != SIGPROF) { return; } ucontext_t* ucontext = reinterpret_cast(context); Y_ASSERT(SInstance != nullptr); SInstance->CaptureIP(GetIp(&ucontext->uc_mcontext)); } void WaitForWriteFlag() { // Wait for write flag to be reset ui32 delay = 100; while (!AtomicTryLock(&WriteFlag)) { usleep(delay); delay += delay; delay = Min(delay, (ui32)5000); } } void CaptureIP(void* rip) { // Check if the handler on another thread is in the process of adding a sample // If this is the case, we just drop the current sample as this should happen // rarely. if (AtomicTryLock(&WriteFlag)) { AddSample(rip); AtomicUnlock(&WriteFlag); } else { AtomicIncrement(DroppedSamples); } } // Hash function applied to the addresses static inline ui32 Hash(void* key) { return ((size_t)key + (size_t)key / SZ) % SZ; } // Get instruction pointer from the context static inline void* GetIp(const mcontext_t* mctx) { #if defined _freebsd_ #if defined _64_ return (void*)mctx->mc_rip; #else return (void*)mctx->mc_eip; #endif #elif defined _linux_ #if defined _64_ #if defined(_arm_) return (void*)mctx->pc; #else return (void*)mctx->gregs[REG_RIP]; #endif #else return (void*)mctx->gregs[REG_EIP]; #endif #elif defined _darwin_ #if defined _64_ #if defined(_arm_) return (void*)(*mctx)->__ss.__pc; #else return (void*)(*mctx)->__ss.__rip; #endif #else #if defined(__IOS__) return (void*)(*mctx)->__ss.__pc; #else return (void*)(*mctx)->__ss.__eip; #endif #endif #endif } inline bool AddSample(void* key) { ui32 slot = Hash(key); ui32 prevSlot = (slot - 1) % SZ; while (key != Ips[slot].first && !IsSlotEmpty(slot) && slot != prevSlot) { slot = (slot + 1) % SZ; SearchSkipCount++; } if (key == Ips[slot].first) { // increment the count Ips[slot].second++; ++Samples; } else if (InsertsAllowed()) { // add new sample and set the count to 1 Ips[slot].first = key; Ips[slot].second = 1; ++UniqueSamples; ++Samples; } else { // don't insert new sample if the search is becoming too slow AtomicIncrement(DroppedSamples); return false; } return true; } inline bool IsSlotEmpty(ui32 slot) const { return Ips[slot].first == nullptr; } inline bool InsertsAllowed() const { return UniqueSamples < SZ / 2; } void Clear() { Y_ASSERT(WriteFlag == 1); for (size_t i = 0; i < SZ; ++i) { Ips[i] = std::make_pair((void*)nullptr, (size_t)0); } Samples = 0; AtomicSet(DroppedSamples, 0); UniqueSamples = 0; SearchSkipCount = 0; } bool Started; struct sigaction OldSignalHandler; itimerval OldTimerValue; TVector> Ips; // The hash table storing addresses and their hitcounts // TODO: on a big multiproc cache line false sharing by the flag and count might become an issue TAtomic WriteFlag; // Is used to syncronize access to the hash table TAtomic DroppedSamples; // "dropped sample" count will show how many times // a sample was dropped either because of write conflict // or because of the hash table had become too filled up ui64 Samples; // Saved samples count ui64 UniqueSamples; // Number of unique addresses ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions static TExecutionSampler* SInstance; }; // Performs analysis of samples captured by TExecutionSampler class TSampleAnalyser : TNonCopyable { public: TSampleAnalyser(TExecutionSampler::TSampleVector& samples, const TExecutionSampler::TStats& stats, bool putTimeStamps = false) : Samples() , Stats(stats) , PutTimestamps(putTimeStamps) { Samples.swap(samples); } ~TSampleAnalyser() = default; void Analyze(FILE* out) const; private: TExecutionSampler::TSampleVector Samples; TExecutionSampler::TStats Stats; bool PutTimestamps; }; void TSampleAnalyser::Analyze(FILE* out) const { fprintf(out, "samples: %" PRIu64 " unique: %" PRIu64 " dropped: %" PRIu64 " searchskips: %" PRIu64 "\n", (ui64)Stats.SavedSamples, (ui64)Samples.size(), (ui64)Stats.DroppedSamples, (ui64)Stats.SearchSkipCount); fprintf(out, "\nSamples:\n"); size_t funcCnt = 0; void* prevModBase = (void*)-1; void* prevFunc = (void*)-1; for (size_t i = 0; i < Samples.size(); ++i) { // print cycle count once in a while to estimate time consumed by // dumping the samples if (PutTimestamps && (i % 1000 == 0)) { ui64 tm = GetCycleCount(); fprintf(out, "TM: %" PRIu64 "\n", tm); } Dl_info addrInfo; if (dladdr(Samples[i].first, &addrInfo)) { if (addrInfo.dli_fbase != prevModBase || addrInfo.dli_saddr != prevFunc) { fprintf(out, "Func\t%" PRISZT "\t%p\t%p\t%s\t%s\n", funcCnt, addrInfo.dli_fbase, addrInfo.dli_saddr, addrInfo.dli_fname, addrInfo.dli_sname); prevModBase = addrInfo.dli_fbase; prevFunc = addrInfo.dli_saddr; ++funcCnt; } } else { fprintf(out, "[dladdr failed]\n"); } fprintf(out, "%" PRISZT "\t%p\t%lu\n", i, Samples[i].first, Samples[i].second); } } TExecutionSampler* TExecutionSampler::SInstance = nullptr; // Starts capturing execution samples void BeginProfiling() { TExecutionSampler::Instance()->Start(); } // Resets captured execution samples void ResetProfile() { TExecutionSampler::Instance()->ResetStats(); } void DumpRUsage(FILE* out) { rusage ru; int e = getrusage(RUSAGE_SELF, &ru); if (e != 0) return; fprintf(out, "user time: %lf\n" "system time: %lf\n" "max RSS: %ld\n" "shared text: %ld\n" "unshared data: %ld\n" "unshared stack: %ld\n" "page reclaims: %ld\n" "page faults: %ld\n" "swaps: %ld\n" "block input ops: %ld\n" "block output ops: %ld\n" "msg sent: %ld\n" "msg received: %ld\n" "signals: %ld\n" "voluntary ctx switches: %ld\n" "involuntary ctx switches: %ld\n\n", ru.ru_utime.tv_sec + (ru.ru_utime.tv_usec * 0.000001), ru.ru_stime.tv_sec + (ru.ru_stime.tv_usec * 0.000001), ru.ru_maxrss, ru.ru_ixrss, ru.ru_idrss, ru.ru_isrss, ru.ru_minflt, ru.ru_majflt, ru.ru_nswap, ru.ru_inblock, ru.ru_oublock, ru.ru_msgsnd, ru.ru_msgrcv, ru.ru_nsignals, ru.ru_nvcsw, ru.ru_nivcsw); } // Pauses capturing execution samples and dumps them to the file // Samples are not cleared so that profiling can be continued by calling BeginProfiling() // or it can be started from scratch by calling ResetProfile() and then BeginProfiling() void EndProfiling(FILE* out) { DumpRUsage(out); TExecutionSampler::TSampleVector samples; TExecutionSampler::TStats stats; TExecutionSampler::Instance()->Stop(samples, stats); TSampleAnalyser analyzer(samples, stats); analyzer.Analyze(out); } void EndProfiling() { static unsigned cnt = 0; char nameBuf[256]; snprintf(nameBuf, sizeof(nameBuf), "./%s.%d.%u.profile", getprogname(), (int)getpid(), cnt); FILE* out = fopen(nameBuf, "a"); EndProfiling(out); fclose(out); ++cnt; } #else // NOTE: not supported on Windows void BeginProfiling() { } void ResetProfile() { } void EndProfiling(FILE*) { } void EndProfiling() { } #endif