perf_trampoline.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. /*
  2. Perf trampoline instrumentation
  3. ===============================
  4. This file contains instrumentation that allows associating
  5. calls to the CPython eval loop with the names of the Python
  6. functions and the files being executed.
  7. Many native performance profilers like the Linux perf tools are
  8. only available to 'see' the C stack when sampling from the profiled
  9. process. This means that if we have the following python code:
  10. import time
  11. def foo(n):
  12. # Some CPU intensive code
  13. def bar(n):
  14. foo(n)
  15. def baz(n):
  16. bar(n)
  17. baz(10000000)
  18. A performance profiler that is only able to see native frames will
  19. produce the following backtrace when sampling from foo():
  20. _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
  21. _PyEval_Vector
  22. _PyFunction_Vectorcall
  23. PyObject_Vectorcall
  24. call_function
  25. _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
  26. _PyEval_EvalFrame
  27. _PyEval_Vector
  28. _PyFunction_Vectorcall
  29. PyObject_Vectorcall
  30. call_function
  31. _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
  32. _PyEval_EvalFrame
  33. _PyEval_Vector
  34. _PyFunction_Vectorcall
  35. PyObject_Vectorcall
  36. call_function
  37. ...
  38. Py_RunMain
  39. Because the profiler is only able to see the native frames and the native
  40. function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
  41. then the profiler and any reporter generated by it will not be able to
  42. associate the names of the Python functions and the filenames associated with
  43. those calls, rendering the results useless in the Python world.
  44. To fix this problem, we introduce the concept of a trampoline frame. A
  45. trampoline frame is a piece of code that is unique per Python code object that
  46. is executed before entering the CPython eval loop. This piece of code just
  47. calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
  48. forwards all the arguments received. In this way, when a profiler samples
  49. frames from the previous example it will see:
  50. _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
  51. [Jit compiled code 3]
  52. _PyEval_Vector
  53. _PyFunction_Vectorcall
  54. PyObject_Vectorcall
  55. call_function
  56. _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
  57. [Jit compiled code 2]
  58. _PyEval_EvalFrame
  59. _PyEval_Vector
  60. _PyFunction_Vectorcall
  61. PyObject_Vectorcall
  62. call_function
  63. _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
  64. [Jit compiled code 1]
  65. _PyEval_EvalFrame
  66. _PyEval_Vector
  67. _PyFunction_Vectorcall
  68. PyObject_Vectorcall
  69. call_function
  70. ...
  71. Py_RunMain
  72. When we generate every unique copy of the trampoline (what here we called "[Jit
  73. compiled code N]") we write the relationship between the compiled code and the
  74. Python function that is associated with it. Every profiler requires this
  75. information in a different format. For example, the Linux "perf" profiler
  76. requires a file in "/tmp/perf-PID.map" (name and location not configurable)
  77. with the following format:
  78. <compiled code address> <compiled code size> <name of the compiled code>
  79. If this file is available when "perf" generates reports, it will automatically
  80. associate every trampoline with the Python function that it is associated with
  81. allowing it to generate reports that include Python information. These reports
  82. then can also be filtered in a way that *only* Python information appears.
  83. Notice that for this to work, there must be a unique copy of the trampoline
  84. per Python code object even if the code in the trampoline is the same. To
  85. achieve this we have an assembly template in Objects/asm_trampoline.S that is
  86. compiled into the Python executable/shared library. This template generates a
  87. symbol that maps the start of the assembly code and another that marks the end
  88. of the assembly code for the trampoline. Then, every time we need a unique
  89. trampoline for a Python code object, we copy the assembly code into a mmaped
  90. area that has executable permissions and we return the start of that area as
  91. our trampoline function.
  92. Asking for a mmap-ed memory area for every trampoline is very wasteful so we
  93. allocate big arenas of memory in a single mmap call, we populate the entire
  94. arena with copies of the trampoline (this allows us to not have to invalidate
  95. the icache for the instructions in the page) and then we return the next
  96. available chunk every time someone asks for a new trampoline. We keep a linked
  97. list of arenas in case the current memory arena is exhausted and another one is
  98. needed.
  99. For the best results, Python should be compiled with
  100. CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
  101. profilers to unwind using only the frame pointer and not on DWARF debug
  102. information (note that as trampolines are dynamically generated there won't be
  103. any DWARF information available for them).
  104. */
  105. #include "Python.h"
  106. #include "pycore_ceval.h"
  107. #include "pycore_frame.h"
  108. #include "pycore_interp.h"
  109. #ifdef PY_HAVE_PERF_TRAMPOLINE
  110. #include <fcntl.h>
  111. #include <stdio.h>
  112. #include <stdlib.h>
  113. #include <sys/mman.h>
  114. #include <sys/types.h>
  115. #include <unistd.h>
  116. #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
  117. #define PY_HAVE_INVALIDATE_ICACHE
  118. #if defined(__clang__) || defined(__GNUC__)
  119. extern void __clear_cache(void *, void*);
  120. #endif
  121. static void invalidate_icache(char* begin, char*end) {
  122. #if defined(__clang__) || defined(__GNUC__)
  123. return __clear_cache(begin, end);
  124. #else
  125. return;
  126. #endif
  127. }
  128. #endif
/* The function pointer is passed as last argument. The other three arguments
 * are passed in the same order as the function requires. This results in
 * shorter, more efficient ASM code for trampoline.
 */
// Signature of a frame evaluation function such as _PyEval_EvalFrameDefault.
typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                  int throwflag);
// Signature of a generated trampoline: the evaluator's three arguments
// followed by the evaluator that the trampoline should call.
typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
                                   py_evaluator);
// Only the addresses of the two symbols below are used in this file; they
// delimit the bytes of the trampoline template that is copied into arenas.
extern void *_Py_trampoline_func_start;  // Start of the template of the
                                         // assembly trampoline
extern void *
    _Py_trampoline_func_end;  // End of the template of the assembly trampoline
  141. struct code_arena_st {
  142. char *start_addr; // Start of the memory arena
  143. char *current_addr; // Address of the current trampoline within the arena
  144. size_t size; // Size of the memory arena
  145. size_t size_left; // Remaining size of the memory arena
  146. size_t code_size; // Size of the code of every trampoline in the arena
  147. struct code_arena_st
  148. *prev; // Pointer to the arena or NULL if this is the first arena.
  149. };
  150. typedef struct code_arena_st code_arena_t;
  151. typedef struct trampoline_api_st trampoline_api_t;
// Shorthand names for the perf trampoline fields stored under
// _PyRuntime.ceval.perf; the rest of this file uses these names instead of
// spelling out the full member path.
#define perf_status _PyRuntime.ceval.perf.status
#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
#define perf_map_file _PyRuntime.ceval.perf.map_file
  157. static void
  158. perf_map_write_entry(void *state, const void *code_addr,
  159. unsigned int code_size, PyCodeObject *co)
  160. {
  161. const char *entry = "";
  162. if (co->co_qualname != NULL) {
  163. entry = PyUnicode_AsUTF8(co->co_qualname);
  164. }
  165. const char *filename = "";
  166. if (co->co_filename != NULL) {
  167. filename = PyUnicode_AsUTF8(co->co_filename);
  168. }
  169. size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
  170. char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
  171. if (perf_map_entry == NULL) {
  172. return;
  173. }
  174. snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
  175. PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
  176. PyMem_RawFree(perf_map_entry);
  177. }
  178. static void*
  179. perf_map_init_state(void)
  180. {
  181. PyUnstable_PerfMapState_Init();
  182. return NULL;
  183. }
  184. static int
  185. perf_map_free_state(void *state)
  186. {
  187. PyUnstable_PerfMapState_Fini();
  188. return 0;
  189. }
  190. _PyPerf_Callbacks _Py_perfmap_callbacks = {
  191. &perf_map_init_state,
  192. &perf_map_write_entry,
  193. &perf_map_free_state,
  194. };
  195. static int
  196. new_code_arena(void)
  197. {
  198. // non-trivial programs typically need 64 to 256 kiB.
  199. size_t mem_size = 4096 * 16;
  200. assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
  201. char *memory =
  202. mmap(NULL, // address
  203. mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
  204. -1, // fd (not used here)
  205. 0); // offset (not used here)
  206. if (memory == MAP_FAILED) {
  207. PyErr_SetFromErrno(PyExc_OSError);
  208. _PyErr_WriteUnraisableMsg(
  209. "Failed to create new mmap for perf trampoline", NULL);
  210. perf_status = PERF_STATUS_FAILED;
  211. return -1;
  212. }
  213. void *start = &_Py_trampoline_func_start;
  214. void *end = &_Py_trampoline_func_end;
  215. size_t code_size = end - start;
  216. // TODO: Check the effect of alignment of the code chunks. Initial investigation
  217. // showed that this has no effect on performance in x86-64 or aarch64 and the current
  218. // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
  219. //
  220. // We should check the values in the future and see if there is a
  221. // measurable performance improvement by rounding trampolines up to 32-bit
  222. // or 64-bit alignment.
  223. size_t n_copies = mem_size / code_size;
  224. for (size_t i = 0; i < n_copies; i++) {
  225. memcpy(memory + i * code_size, start, code_size * sizeof(char));
  226. }
  227. // Some systems may prevent us from creating executable code on the fly.
  228. int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
  229. if (res == -1) {
  230. PyErr_SetFromErrno(PyExc_OSError);
  231. munmap(memory, mem_size);
  232. _PyErr_WriteUnraisableMsg(
  233. "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
  234. NULL);
  235. return -1;
  236. }
  237. #ifdef PY_HAVE_INVALIDATE_ICACHE
  238. // Before the JIT can run a block of code that has been emitted it must invalidate
  239. // the instruction cache on some platforms like arm and aarch64.
  240. invalidate_icache(memory, memory + mem_size);
  241. #endif
  242. code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
  243. if (new_arena == NULL) {
  244. PyErr_NoMemory();
  245. munmap(memory, mem_size);
  246. _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
  247. NULL);
  248. return -1;
  249. }
  250. new_arena->start_addr = memory;
  251. new_arena->current_addr = memory;
  252. new_arena->size = mem_size;
  253. new_arena->size_left = mem_size;
  254. new_arena->code_size = code_size;
  255. new_arena->prev = perf_code_arena;
  256. perf_code_arena = new_arena;
  257. return 0;
  258. }
  259. static void
  260. free_code_arenas(void)
  261. {
  262. code_arena_t *cur = perf_code_arena;
  263. code_arena_t *prev;
  264. perf_code_arena = NULL; // invalid static pointer
  265. while (cur) {
  266. munmap(cur->start_addr, cur->size);
  267. prev = cur->prev;
  268. PyMem_RawFree(cur);
  269. cur = prev;
  270. }
  271. }
  272. static inline py_trampoline
  273. code_arena_new_code(code_arena_t *code_arena)
  274. {
  275. py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
  276. code_arena->size_left -= code_arena->code_size;
  277. code_arena->current_addr += code_arena->code_size;
  278. return trampoline;
  279. }
  280. static inline py_trampoline
  281. compile_trampoline(void)
  282. {
  283. if ((perf_code_arena == NULL) ||
  284. (perf_code_arena->size_left <= perf_code_arena->code_size)) {
  285. if (new_code_arena() < 0) {
  286. return NULL;
  287. }
  288. }
  289. assert(perf_code_arena->size_left <= perf_code_arena->size);
  290. return code_arena_new_code(perf_code_arena);
  291. }
  292. static PyObject *
  293. py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
  294. int throw)
  295. {
  296. if (perf_status == PERF_STATUS_FAILED ||
  297. perf_status == PERF_STATUS_NO_INIT) {
  298. goto default_eval;
  299. }
  300. PyCodeObject *co = frame->f_code;
  301. py_trampoline f = NULL;
  302. assert(extra_code_index != -1);
  303. int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
  304. if (ret != 0 || f == NULL) {
  305. // This is the first time we see this code object so we need
  306. // to compile a trampoline for it.
  307. py_trampoline new_trampoline = compile_trampoline();
  308. if (new_trampoline == NULL) {
  309. goto default_eval;
  310. }
  311. trampoline_api.write_state(trampoline_api.state, new_trampoline,
  312. perf_code_arena->code_size, co);
  313. _PyCode_SetExtra((PyObject *)co, extra_code_index,
  314. (void *)new_trampoline);
  315. f = new_trampoline;
  316. }
  317. assert(f != NULL);
  318. return f(ts, frame, throw, _PyEval_EvalFrameDefault);
  319. default_eval:
  320. // Something failed, fall back to the default evaluator.
  321. return _PyEval_EvalFrameDefault(ts, frame, throw);
  322. }
  323. #endif // PY_HAVE_PERF_TRAMPOLINE
  324. int
  325. _PyIsPerfTrampolineActive(void)
  326. {
  327. #ifdef PY_HAVE_PERF_TRAMPOLINE
  328. PyThreadState *tstate = _PyThreadState_GET();
  329. return tstate->interp->eval_frame == py_trampoline_evaluator;
  330. #endif
  331. return 0;
  332. }
  333. void
  334. _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
  335. {
  336. if (callbacks == NULL) {
  337. return;
  338. }
  339. #ifdef PY_HAVE_PERF_TRAMPOLINE
  340. callbacks->init_state = trampoline_api.init_state;
  341. callbacks->write_state = trampoline_api.write_state;
  342. callbacks->free_state = trampoline_api.free_state;
  343. #endif
  344. return;
  345. }
  346. int
  347. _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
  348. {
  349. if (callbacks == NULL) {
  350. return -1;
  351. }
  352. #ifdef PY_HAVE_PERF_TRAMPOLINE
  353. if (trampoline_api.state) {
  354. _PyPerfTrampoline_Fini();
  355. }
  356. trampoline_api.init_state = callbacks->init_state;
  357. trampoline_api.write_state = callbacks->write_state;
  358. trampoline_api.free_state = callbacks->free_state;
  359. trampoline_api.state = NULL;
  360. #endif
  361. return 0;
  362. }
  363. void _PyPerfTrampoline_FreeArenas(void) {
  364. #ifdef PY_HAVE_PERF_TRAMPOLINE
  365. free_code_arenas();
  366. #endif
  367. return;
  368. }
  369. int
  370. _PyPerfTrampoline_Init(int activate)
  371. {
  372. #ifdef PY_HAVE_PERF_TRAMPOLINE
  373. PyThreadState *tstate = _PyThreadState_GET();
  374. if (tstate->interp->eval_frame &&
  375. tstate->interp->eval_frame != py_trampoline_evaluator) {
  376. PyErr_SetString(PyExc_RuntimeError,
  377. "Trampoline cannot be initialized as a custom eval "
  378. "frame is already present");
  379. return -1;
  380. }
  381. if (!activate) {
  382. tstate->interp->eval_frame = NULL;
  383. perf_status = PERF_STATUS_NO_INIT;
  384. }
  385. else {
  386. tstate->interp->eval_frame = py_trampoline_evaluator;
  387. if (new_code_arena() < 0) {
  388. return -1;
  389. }
  390. extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
  391. if (extra_code_index == -1) {
  392. return -1;
  393. }
  394. if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
  395. trampoline_api.state = trampoline_api.init_state();
  396. }
  397. perf_status = PERF_STATUS_OK;
  398. }
  399. #endif
  400. return 0;
  401. }
  402. int
  403. _PyPerfTrampoline_Fini(void)
  404. {
  405. #ifdef PY_HAVE_PERF_TRAMPOLINE
  406. if (perf_status != PERF_STATUS_OK) {
  407. return 0;
  408. }
  409. PyThreadState *tstate = _PyThreadState_GET();
  410. if (tstate->interp->eval_frame == py_trampoline_evaluator) {
  411. tstate->interp->eval_frame = NULL;
  412. }
  413. if (perf_status == PERF_STATUS_OK) {
  414. trampoline_api.free_state(trampoline_api.state);
  415. }
  416. extra_code_index = -1;
  417. perf_status = PERF_STATUS_NO_INIT;
  418. #endif
  419. return 0;
  420. }
  421. PyStatus
  422. _PyPerfTrampoline_AfterFork_Child(void)
  423. {
  424. #ifdef PY_HAVE_PERF_TRAMPOLINE
  425. // Restart trampoline in file in child.
  426. int was_active = _PyIsPerfTrampolineActive();
  427. _PyPerfTrampoline_Fini();
  428. PyUnstable_PerfMapState_Fini();
  429. if (was_active) {
  430. _PyPerfTrampoline_Init(1);
  431. }
  432. #endif
  433. return PyStatus_Ok();
  434. }