profile.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. #include <util/system/defaults.h>
  2. #include "profile.h"
  3. #if defined(_unix_) && !defined(_bionic_) && !defined(_cygwin_)
  4. #include <signal.h>
  5. #include <sys/time.h>
  6. #include <sys/resource.h>
  7. #if defined(_darwin_)
  8. #include <sys/ucontext.h>
  9. #else
  10. #include <ucontext.h>
  11. #endif
  12. #include <dlfcn.h>
  13. #include <library/cpp/deprecated/atomic/atomic.h>
  14. #include <util/system/platform.h>
  15. #include <util/generic/hash.h>
  16. #include <util/generic/map.h>
  17. #include <util/generic/noncopyable.h>
  18. #include <util/generic/algorithm.h>
  19. #include <util/generic/vector.h>
  20. #include <util/stream/file.h>
  21. #include <util/string/util.h>
  22. #include <util/system/datetime.h>
  23. // This class sets SIGPROF handler and captures instruction pointer in it.
  24. class TExecutionSampler : TNonCopyable {
  25. public:
  26. typedef TVector<std::pair<void*, size_t>> TSampleVector;
  27. struct TStats {
  28. ui64 SavedSamples;
  29. ui64 DroppedSamples;
  30. ui64 SearchSkipCount;
  31. };
  32. // NOTE: There is no synchronization here as the instance is supposed to be
  33. // created on the main thread.
  34. static TExecutionSampler* Instance() {
  35. if (SInstance == nullptr) {
  36. SInstance = new TExecutionSampler();
  37. }
  38. return SInstance;
  39. }
  40. void Start() {
  41. // Set signal handler
  42. struct sigaction sa;
  43. sa.sa_sigaction = ProfilerSignalHandler;
  44. sigemptyset(&sa.sa_mask);
  45. sa.sa_flags = SA_SIGINFO;
  46. if (sigaction(SIGPROF, &sa, &OldSignalHandler) != 0)
  47. return;
  48. // Set interval timer
  49. itimerval tv;
  50. tv.it_interval.tv_sec = tv.it_value.tv_sec = 0;
  51. tv.it_interval.tv_usec = tv.it_value.tv_usec = SAMPLE_INTERVAL;
  52. setitimer(ITIMER_PROF, &tv, &OldTimerValue);
  53. Started = true;
  54. }
  55. void Stop(TSampleVector& sampleVector, TStats& stats) {
  56. // Reset signal handler and timer
  57. if (Started) {
  58. setitimer(ITIMER_PROF, &OldTimerValue, nullptr);
  59. sleep(1);
  60. }
  61. WaitForWriteFlag();
  62. if (Started) {
  63. sigaction(SIGPROF, &OldSignalHandler, nullptr);
  64. Started = false;
  65. }
  66. TExecutionSampler::TSampleVector hits;
  67. hits.reserve(Samples);
  68. for (size_t i = 0; i < SZ; ++i) {
  69. if (Ips[i].first != nullptr) {
  70. hits.push_back(Ips[i]);
  71. }
  72. }
  73. stats.SavedSamples = Samples;
  74. stats.DroppedSamples = AtomicGet(DroppedSamples);
  75. stats.SearchSkipCount = SearchSkipCount;
  76. AtomicUnlock(&WriteFlag);
  77. Sort(hits.begin(), hits.end(), TCompareFirst());
  78. sampleVector.swap(hits);
  79. }
  80. void ResetStats() {
  81. WaitForWriteFlag();
  82. Clear();
  83. AtomicUnlock(&WriteFlag);
  84. }
  85. private:
  86. static const size_t SZ = 2 * 1024 * 1024; // size of the hash table
  87. // inserts work faster if it's a power of 2
  88. static const int SAMPLE_INTERVAL = 1000; // in microseconds
  89. struct TCompareFirst {
  90. bool operator()(const std::pair<void*, size_t>& a, const std::pair<void*, size_t>& b) const {
  91. return a.first < b.first;
  92. }
  93. };
  94. TExecutionSampler()
  95. : Started(false)
  96. , Ips(SZ)
  97. , WriteFlag(0)
  98. , DroppedSamples(0)
  99. , Samples(0)
  100. , UniqueSamples(0)
  101. , SearchSkipCount(0)
  102. {
  103. }
  104. ~TExecutionSampler() = default;
  105. // Signal handler is not allowed to do anything that can deadlock with activity
  106. // on the thread to which the signal is delivered or corrupt data structures that
  107. // were in process of update.
  108. // One such thing is memory allocation. That's why a fixed size vector is
  109. // preallocated at start.
  110. static void ProfilerSignalHandler(int signal, siginfo_t* info, void* context) {
  111. (void)info;
  112. if (signal != SIGPROF) {
  113. return;
  114. }
  115. ucontext_t* ucontext = reinterpret_cast<ucontext_t*>(context);
  116. Y_ASSERT(SInstance != nullptr);
  117. SInstance->CaptureIP(GetIp(&ucontext->uc_mcontext));
  118. }
  119. void WaitForWriteFlag() {
  120. // Wait for write flag to be reset
  121. ui32 delay = 100;
  122. while (!AtomicTryLock(&WriteFlag)) {
  123. usleep(delay);
  124. delay += delay;
  125. delay = Min(delay, (ui32)5000);
  126. }
  127. }
  128. void CaptureIP(void* rip) {
  129. // Check if the handler on another thread is in the process of adding a sample
  130. // If this is the case, we just drop the current sample as this should happen
  131. // rarely.
  132. if (AtomicTryLock(&WriteFlag)) {
  133. AddSample(rip);
  134. AtomicUnlock(&WriteFlag);
  135. } else {
  136. AtomicIncrement(DroppedSamples);
  137. }
  138. }
  139. // Hash function applied to the addresses
  140. static inline ui32 Hash(void* key) {
  141. return ((size_t)key + (size_t)key / SZ) % SZ;
  142. }
  143. // Get instruction pointer from the context
  144. static inline void* GetIp(const mcontext_t* mctx) {
  145. #if defined _freebsd_
  146. #if defined _64_
  147. return (void*)mctx->mc_rip;
  148. #else
  149. return (void*)mctx->mc_eip;
  150. #endif
  151. #elif defined _linux_
  152. #if defined _64_
  153. #if defined(_arm_)
  154. return (void*)mctx->pc;
  155. #else
  156. return (void*)mctx->gregs[REG_RIP];
  157. #endif
  158. #else
  159. return (void*)mctx->gregs[REG_EIP];
  160. #endif
  161. #elif defined _darwin_
  162. #if defined _64_
  163. #if defined(_arm_)
  164. return (void*)(*mctx)->__ss.__pc;
  165. #else
  166. return (void*)(*mctx)->__ss.__rip;
  167. #endif
  168. #else
  169. #if defined(__IOS__)
  170. return (void*)(*mctx)->__ss.__pc;
  171. #else
  172. return (void*)(*mctx)->__ss.__eip;
  173. #endif
  174. #endif
  175. #endif
  176. }
  177. inline bool AddSample(void* key) {
  178. ui32 slot = Hash(key);
  179. ui32 prevSlot = (slot - 1) % SZ;
  180. while (key != Ips[slot].first && !IsSlotEmpty(slot) && slot != prevSlot) {
  181. slot = (slot + 1) % SZ;
  182. SearchSkipCount++;
  183. }
  184. if (key == Ips[slot].first) {
  185. // increment the count
  186. Ips[slot].second++;
  187. ++Samples;
  188. } else if (InsertsAllowed()) {
  189. // add new sample and set the count to 1
  190. Ips[slot].first = key;
  191. Ips[slot].second = 1;
  192. ++UniqueSamples;
  193. ++Samples;
  194. } else {
  195. // don't insert new sample if the search is becoming too slow
  196. AtomicIncrement(DroppedSamples);
  197. return false;
  198. }
  199. return true;
  200. }
  201. inline bool IsSlotEmpty(ui32 slot) const {
  202. return Ips[slot].first == nullptr;
  203. }
  204. inline bool InsertsAllowed() const {
  205. return UniqueSamples < SZ / 2;
  206. }
  207. void
  208. Clear() {
  209. Y_ASSERT(WriteFlag == 1);
  210. for (size_t i = 0; i < SZ; ++i) {
  211. Ips[i] = std::make_pair((void*)nullptr, (size_t)0);
  212. }
  213. Samples = 0;
  214. AtomicSet(DroppedSamples, 0);
  215. UniqueSamples = 0;
  216. SearchSkipCount = 0;
  217. }
  218. bool Started;
  219. struct sigaction OldSignalHandler;
  220. itimerval OldTimerValue;
  221. TVector<std::pair<void*, size_t>>
  222. Ips; // The hash table storing addresses and their hitcounts
  223. // TODO: on a big multiproc cache line false sharing by the flag and count might become an issue
  224. TAtomic WriteFlag; // Is used to syncronize access to the hash table
  225. TAtomic DroppedSamples; // "dropped sample" count will show how many times
  226. // a sample was dropped either because of write conflict
  227. // or because of the hash table had become too filled up
  228. ui64 Samples; // Saved samples count
  229. ui64 UniqueSamples; // Number of unique addresses
  230. ui64 SearchSkipCount; // Total number of linear hash table probes due to collisions
  231. static TExecutionSampler* SInstance;
  232. };
  233. // Performs analysis of samples captured by TExecutionSampler
  234. class TSampleAnalyser : TNonCopyable {
  235. public:
  236. TSampleAnalyser(TExecutionSampler::TSampleVector& samples, const TExecutionSampler::TStats& stats, bool putTimeStamps = false)
  237. : Samples()
  238. , Stats(stats)
  239. , PutTimestamps(putTimeStamps)
  240. {
  241. Samples.swap(samples);
  242. }
  243. ~TSampleAnalyser() = default;
  244. void Analyze(FILE* out) const;
  245. private:
  246. TExecutionSampler::TSampleVector Samples;
  247. TExecutionSampler::TStats Stats;
  248. bool PutTimestamps;
  249. };
  250. void TSampleAnalyser::Analyze(FILE* out) const {
  251. fprintf(out, "samples: %" PRIu64 " unique: %" PRIu64 " dropped: %" PRIu64 " searchskips: %" PRIu64 "\n",
  252. (ui64)Stats.SavedSamples, (ui64)Samples.size(),
  253. (ui64)Stats.DroppedSamples, (ui64)Stats.SearchSkipCount);
  254. fprintf(out, "\nSamples:\n");
  255. size_t funcCnt = 0;
  256. void* prevModBase = (void*)-1;
  257. void* prevFunc = (void*)-1;
  258. for (size_t i = 0; i < Samples.size(); ++i) {
  259. // print cycle count once in a while to estimate time consumed by
  260. // dumping the samples
  261. if (PutTimestamps && (i % 1000 == 0)) {
  262. ui64 tm = GetCycleCount();
  263. fprintf(out, "TM: %" PRIu64 "\n", tm);
  264. }
  265. Dl_info addrInfo;
  266. if (dladdr(Samples[i].first, &addrInfo)) {
  267. if (addrInfo.dli_fbase != prevModBase || addrInfo.dli_saddr != prevFunc) {
  268. fprintf(out, "Func\t%" PRISZT "\t%p\t%p\t%s\t%s\n",
  269. funcCnt,
  270. addrInfo.dli_fbase,
  271. addrInfo.dli_saddr,
  272. addrInfo.dli_fname,
  273. addrInfo.dli_sname);
  274. prevModBase = addrInfo.dli_fbase;
  275. prevFunc = addrInfo.dli_saddr;
  276. ++funcCnt;
  277. }
  278. } else {
  279. fprintf(out, "[dladdr failed]\n");
  280. }
  281. fprintf(out, "%" PRISZT "\t%p\t%lu\n", i, Samples[i].first, Samples[i].second);
  282. }
  283. }
  284. TExecutionSampler* TExecutionSampler::SInstance = nullptr;
  285. // Starts capturing execution samples
  286. void BeginProfiling() {
  287. TExecutionSampler::Instance()->Start();
  288. }
  289. // Resets captured execution samples
  290. void ResetProfile() {
  291. TExecutionSampler::Instance()->ResetStats();
  292. }
  293. void DumpRUsage(FILE* out) {
  294. rusage ru;
  295. int e = getrusage(RUSAGE_SELF, &ru);
  296. if (e != 0)
  297. return;
  298. fprintf(out,
  299. "user time: %lf\n"
  300. "system time: %lf\n"
  301. "max RSS: %ld\n"
  302. "shared text: %ld\n"
  303. "unshared data: %ld\n"
  304. "unshared stack: %ld\n"
  305. "page reclaims: %ld\n"
  306. "page faults: %ld\n"
  307. "swaps: %ld\n"
  308. "block input ops: %ld\n"
  309. "block output ops: %ld\n"
  310. "msg sent: %ld\n"
  311. "msg received: %ld\n"
  312. "signals: %ld\n"
  313. "voluntary ctx switches: %ld\n"
  314. "involuntary ctx switches: %ld\n\n",
  315. ru.ru_utime.tv_sec + (ru.ru_utime.tv_usec * 0.000001),
  316. ru.ru_stime.tv_sec + (ru.ru_stime.tv_usec * 0.000001),
  317. ru.ru_maxrss, ru.ru_ixrss, ru.ru_idrss, ru.ru_isrss,
  318. ru.ru_minflt, ru.ru_majflt, ru.ru_nswap,
  319. ru.ru_inblock, ru.ru_oublock,
  320. ru.ru_msgsnd, ru.ru_msgrcv,
  321. ru.ru_nsignals,
  322. ru.ru_nvcsw, ru.ru_nivcsw);
  323. }
  324. // Pauses capturing execution samples and dumps them to the file
  325. // Samples are not cleared so that profiling can be continued by calling BeginProfiling()
  326. // or it can be started from scratch by calling ResetProfile() and then BeginProfiling()
  327. void EndProfiling(FILE* out) {
  328. DumpRUsage(out);
  329. TExecutionSampler::TSampleVector samples;
  330. TExecutionSampler::TStats stats;
  331. TExecutionSampler::Instance()->Stop(samples, stats);
  332. TSampleAnalyser analyzer(samples, stats);
  333. analyzer.Analyze(out);
  334. }
  335. void EndProfiling() {
  336. static unsigned cnt = 0;
  337. char nameBuf[256];
  338. snprintf(nameBuf, sizeof(nameBuf), "./%s.%d.%u.profile", getprogname(), (int)getpid(), cnt);
  339. FILE* out = fopen(nameBuf, "a");
  340. EndProfiling(out);
  341. fclose(out);
  342. ++cnt;
  343. }
  344. #else
  345. // NOTE: not supported on Windows
  346. void BeginProfiling() {
  347. }
  348. void ResetProfile() {
  349. }
  350. void EndProfiling(FILE*) {
  351. }
  352. void EndProfiling() {
  353. }
  354. #endif