// ceval_macros.h

// Macros needed by ceval.c and bytecodes.c

/* Computed GOTOs, or
       the-optimization-commonly-but-improperly-known-as-"threaded code"
   using gcc's labels-as-values extension
   (http://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html).

   The traditional bytecode evaluation loop uses a "switch" statement, which
   decent compilers will optimize as a single indirect branch instruction
   combined with a lookup table of jump addresses. However, since the
   indirect jump instruction is shared by all opcodes, the CPU will have a
   hard time making the right prediction for where to jump next (actually,
   it will always be wrong except in the uncommon case of a sequence of
   several identical opcodes).

   "Threaded code", in contrast, uses an explicit jump table and an explicit
   indirect jump instruction at the end of each opcode. Since the jump
   instruction is at a different address for each opcode, the CPU will make a
   separate prediction for each of these instructions, which is equivalent to
   predicting the second opcode of each opcode pair. These predictions have
   a much better chance of turning out valid, especially in small bytecode loops.

   A mispredicted branch on a modern CPU flushes the whole pipeline and
   can cost several CPU cycles (depending on the pipeline depth),
   and potentially many more instructions (depending on the pipeline width).
   A correctly predicted branch, however, is nearly free.

   At the time of this writing, the "threaded code" version is up to 15-20%
   faster than the normal "switch" version, depending on the compiler and the
   CPU architecture.

   NOTE: care must be taken that the compiler doesn't try to "optimize" the
   indirect jumps by sharing them between all opcodes. Such optimizations
   can be disabled on gcc by using the -fno-gcse flag (or possibly
   -fno-crossjumping).
*/
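/* Illustrative sketch (not part of this header, and not CPython's actual
   dispatch code): the shape of the two dispatch styles described above,
   using made-up opcodes OP_ADD and OP_LOAD.

       // switch dispatch: every opcode returns to one shared indirect branch
       for (;;) {
           switch (*ip++) {
               case OP_ADD:  ...; break;
               case OP_LOAD: ...; break;
           }
       }

       // threaded dispatch (gcc labels-as-values): each opcode body ends in
       // its own indirect branch, so the CPU predicts them independently
       static void *targets[] = { &&op_add, &&op_load };
       goto *targets[*ip++];
       op_add:  ...; goto *targets[*ip++];
       op_load: ...; goto *targets[*ip++];
*/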
/* Use macros rather than inline functions, to make it as clear as possible
 * to the C compiler that the tracing check is a simple test then branch.
 * We want to be sure that the compiler knows this before it generates
 * the CFG.
 */

#ifdef WITH_DTRACE
#define OR_DTRACE_LINE | (PyDTrace_LINE_ENABLED() ? 255 : 0)
#else
#define OR_DTRACE_LINE
#endif

#ifdef HAVE_COMPUTED_GOTOS
    #ifndef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 1
    #endif
#else
    #if defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
    #error "Computed gotos are not supported on this compiler."
    #endif
    #undef USE_COMPUTED_GOTOS
    #define USE_COMPUTED_GOTOS 0
#endif
#ifdef Py_STATS
#define INSTRUCTION_START(op) \
    do { \
        frame->prev_instr = next_instr++; \
        OPCODE_EXE_INC(op); \
        if (_Py_stats) _Py_stats->opcode_stats[lastopcode].pair_count[op]++; \
        lastopcode = op; \
    } while (0)
#else
#define INSTRUCTION_START(op) (frame->prev_instr = next_instr++)
#endif
#if USE_COMPUTED_GOTOS
#  define TARGET(op) TARGET_##op: INSTRUCTION_START(op);
#  define DISPATCH_GOTO() goto *opcode_targets[opcode]
#else
#  define TARGET(op) case op: TARGET_##op: INSTRUCTION_START(op);
#  define DISPATCH_GOTO() goto dispatch_opcode
#endif
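/* For example (LOAD_FAST stands in for any opcode), with computed gotos
   TARGET(LOAD_FAST) expands to

       TARGET_LOAD_FAST: INSTRUCTION_START(LOAD_FAST);

   and without them to

       case LOAD_FAST: TARGET_LOAD_FAST: INSTRUCTION_START(LOAD_FAST);

   so the same instruction bodies serve both as goto labels (reached through
   opcode_targets[]) and as cases of the dispatch switch. */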
/* PRE_DISPATCH_GOTO() does lltrace if enabled. Normally a no-op */
#ifdef LLTRACE
#define PRE_DISPATCH_GOTO() if (lltrace) { \
    lltrace_instruction(frame, stack_pointer, next_instr); }
#else
#define PRE_DISPATCH_GOTO() ((void)0)
#endif

/* Do interpreter dispatch accounting for tracing and instrumentation */
#define DISPATCH() \
    { \
        NEXTOPARG(); \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

#define DISPATCH_SAME_OPARG() \
    { \
        opcode = next_instr->op.code; \
        PRE_DISPATCH_GOTO(); \
        DISPATCH_GOTO(); \
    }

#define DISPATCH_INLINED(NEW_FRAME) \
    do { \
        assert(tstate->interp->eval_frame == NULL); \
        _PyFrame_SetStackPointer(frame, stack_pointer); \
        frame->prev_instr = next_instr - 1; \
        (NEW_FRAME)->previous = frame; \
        frame = cframe.current_frame = (NEW_FRAME); \
        CALL_STAT_INC(inlined_py_calls); \
        goto start_frame; \
    } while (0)

#define CHECK_EVAL_BREAKER() \
    _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \
    if (_Py_atomic_load_relaxed_int32(&tstate->interp->ceval.eval_breaker)) { \
        goto handle_eval_breaker; \
    }
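/* Illustrative sketch (simplified; the real instruction bodies are generated
   from bytecodes.c): a handler is introduced with TARGET() and ends by
   dispatching to the next instruction, e.g.

       TARGET(POP_TOP) {
           PyObject *value = POP();
           Py_DECREF(value);
           DISPATCH();
       }
*/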
/* Tuple access macros */
#ifndef Py_DEBUG
#define GETITEM(v, i) PyTuple_GET_ITEM((v), (i))
#else
static inline PyObject *
GETITEM(PyObject *v, Py_ssize_t i) {
    assert(PyTuple_Check(v));
    assert(i >= 0);
    assert(i < PyTuple_GET_SIZE(v));
    return PyTuple_GET_ITEM(v, i);
}
#endif

/* Code access macros */

/* The integer overflow is checked by an assertion below. */
#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(frame->f_code)))
#define NEXTOPARG()  do { \
        _Py_CODEUNIT word = *next_instr; \
        opcode = word.op.code; \
        oparg = word.op.arg; \
    } while (0)
#define JUMPTO(x) (next_instr = _PyCode_CODE(frame->f_code) + (x))
#define JUMPBY(x) (next_instr += (x))
/* OpCode prediction macros
    Some opcodes tend to come in pairs thus making it possible to
    predict the second code when the first is run.  For example,
    COMPARE_OP is often followed by POP_JUMP_IF_FALSE or POP_JUMP_IF_TRUE.

    Verifying the prediction costs a single high-speed test of a register
    variable against a constant.  If the pairing was good, then the
    processor's own internal branch prediction has a high likelihood of
    success, resulting in a nearly zero-overhead transition to the
    next opcode.  A successful prediction saves a trip through the eval-loop
    including its unpredictable switch-case branch.  Combined with the
    processor's internal branch prediction, a successful PREDICT has the
    effect of making the two opcodes run as if they were a single new opcode
    with the bodies combined.

    If collecting opcode statistics, your choices are to either keep the
    predictions turned on and interpret the results as if some opcodes
    had been combined, or turn off predictions so that the opcode frequency
    counter updates for both opcodes.

    Opcode prediction is disabled with threaded code, since the latter allows
    the CPU to record separate branch prediction information for each
    opcode.
*/

#define PREDICT_ID(op)          PRED_##op

#if USE_COMPUTED_GOTOS
#define PREDICT(op)             if (0) goto PREDICT_ID(op)
#else
#define PREDICT(next_op) \
    do { \
        _Py_CODEUNIT word = *next_instr; \
        opcode = word.op.code; \
        if (opcode == next_op) { \
            oparg = word.op.arg; \
            INSTRUCTION_START(next_op); \
            goto PREDICT_ID(next_op); \
        } \
    } while(0)
#endif
#define PREDICTED(op)           PREDICT_ID(op):
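/* Usage sketch (illustrative, matching the COMPARE_OP example above): the
   body of COMPARE_OP would end with

       PREDICT(POP_JUMP_IF_FALSE);
       DISPATCH();

   and the body of POP_JUMP_IF_FALSE would begin with

       PREDICTED(POP_JUMP_IF_FALSE)

   so that, without computed gotos, a correct prediction jumps straight to
   the PRED_POP_JUMP_IF_FALSE label instead of going back through the
   switch. */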
/* Stack manipulation macros */

/* The stack can grow at most MAXINT deep, as co_nlocals and
   co_stacksize are ints. */
#define STACK_LEVEL()     ((int)(stack_pointer - _PyFrame_Stackbase(frame)))
#define STACK_SIZE()      (frame->f_code->co_stacksize)
#define EMPTY()           (STACK_LEVEL() == 0)
#define TOP()             (stack_pointer[-1])
#define SECOND()          (stack_pointer[-2])
#define THIRD()           (stack_pointer[-3])
#define FOURTH()          (stack_pointer[-4])
#define PEEK(n)           (stack_pointer[-(n)])
#define POKE(n, v)        (stack_pointer[-(n)] = (v))
#define SET_TOP(v)        (stack_pointer[-1] = (v))
#define SET_SECOND(v)     (stack_pointer[-2] = (v))
#define BASIC_STACKADJ(n) (stack_pointer += n)
#define BASIC_PUSH(v)     (*stack_pointer++ = (v))
#define BASIC_POP()       (*--stack_pointer)

#ifdef Py_DEBUG
#define PUSH(v) do { \
        BASIC_PUSH(v); \
        assert(STACK_LEVEL() <= STACK_SIZE()); \
    } while (0)
#define POP() (assert(STACK_LEVEL() > 0), BASIC_POP())
#define STACK_GROW(n) do { \
        assert(n >= 0); \
        BASIC_STACKADJ(n); \
        assert(STACK_LEVEL() <= STACK_SIZE()); \
    } while (0)
#define STACK_SHRINK(n) do { \
        assert(n >= 0); \
        assert(STACK_LEVEL() >= n); \
        BASIC_STACKADJ(-(n)); \
    } while (0)
#else
#define PUSH(v)           BASIC_PUSH(v)
#define POP()             BASIC_POP()
#define STACK_GROW(n)     BASIC_STACKADJ(n)
#define STACK_SHRINK(n)   BASIC_STACKADJ(-(n))
#endif
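/* Illustrative sketch (simplified BINARY_OP-style handler) of how these
   macros combine: two operands are consumed and replaced by one result, so
   the stack level drops by one.  PyNumber_Add() is just an example operation.

       PyObject *right = PEEK(1);
       PyObject *left = PEEK(2);
       PyObject *res = PyNumber_Add(left, right);
       Py_DECREF(left);
       Py_DECREF(right);
       STACK_SHRINK(2);               // operands are gone...
       if (res == NULL) goto error;
       PUSH(res);                     // ...replaced by the result
*/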
/* Local variable macros */

#define GETLOCAL(i)     (frame->localsplus[i])

/* The SETLOCAL() macro must not DECREF the local variable in-place and
   then store the new value; it must copy the old value to a temporary
   value, then store the new value, and then DECREF the temporary value.
   This is because it is possible that during the DECREF the frame is
   accessed by other code (e.g. a __del__ method or gc.collect()) and the
   variable would be pointing to already-freed memory. */
#define SETLOCAL(i, value)      do { PyObject *tmp = GETLOCAL(i); \
                                     GETLOCAL(i) = value; \
                                     Py_XDECREF(tmp); } while (0)
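/* For contrast, the unsafe ordering the comment above warns against would
   look like this (do NOT do this):

       Py_XDECREF(GETLOCAL(i));   // __del__ or gc.collect() may run here
                                  // and read the slot...
       GETLOCAL(i) = value;       // ...which still points at freed memory
*/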
#define GO_TO_INSTRUCTION(op) goto PREDICT_ID(op)

#ifdef Py_STATS
#define UPDATE_MISS_STATS(INSTNAME)                              \
    do {                                                         \
        STAT_INC(opcode, miss);                                  \
        STAT_INC((INSTNAME), miss);                              \
        /* The counter is always the first cache entry: */       \
        if (ADAPTIVE_COUNTER_IS_ZERO(next_instr->cache)) {       \
            STAT_INC((INSTNAME), deopt);                         \
        }                                                        \
        else {                                                   \
            /* This is about to be (incorrectly) incremented: */ \
            STAT_DEC((INSTNAME), deferred);                      \
        }                                                        \
    } while (0)
#else
#define UPDATE_MISS_STATS(INSTNAME) ((void)0)
#endif

#define DEOPT_IF(COND, INSTNAME)                            \
    if ((COND)) {                                           \
        /* This is only a single jump on release builds! */ \
        UPDATE_MISS_STATS((INSTNAME));                      \
        assert(_PyOpcode_Deopt[opcode] == (INSTNAME));      \
        GO_TO_INSTRUCTION(INSTNAME);                        \
    }
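/* Usage sketch (simplified from a specialized instruction such as
   BINARY_OP_ADD_FLOAT): a specialized handler first checks the assumptions
   it was specialized under and deoptimizes to the generic instruction if
   they no longer hold, e.g.

       DEOPT_IF(!PyFloat_CheckExact(left), BINARY_OP);
       DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP);

   which jumps to the PREDICTED(BINARY_OP) label via GO_TO_INSTRUCTION(). */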
#define GLOBALS() frame->f_globals
#define BUILTINS() frame->f_builtins
#define LOCALS() frame->f_locals

#define DTRACE_FUNCTION_ENTRY()  \
    if (PyDTrace_FUNCTION_ENTRY_ENABLED()) { \
        dtrace_function_entry(frame); \
    }

#define ADAPTIVE_COUNTER_IS_ZERO(COUNTER) \
    (((COUNTER) >> ADAPTIVE_BACKOFF_BITS) == 0)

#define ADAPTIVE_COUNTER_IS_MAX(COUNTER) \
    (((COUNTER) >> ADAPTIVE_BACKOFF_BITS) == ((1 << MAX_BACKOFF_VALUE) - 1))

#define DECREMENT_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        assert(!ADAPTIVE_COUNTER_IS_ZERO((COUNTER))); \
        (COUNTER) -= (1 << ADAPTIVE_BACKOFF_BITS); \
    } while (0);

#define INCREMENT_ADAPTIVE_COUNTER(COUNTER) \
    do { \
        assert(!ADAPTIVE_COUNTER_IS_MAX((COUNTER))); \
        (COUNTER) += (1 << ADAPTIVE_BACKOFF_BITS); \
    } while (0);
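/* Typical use in an adaptive instruction (simplified, illustrative sketch;
   the cache struct shown is just one example): the counter's value lives
   above ADAPTIVE_BACKOFF_BITS, so these macros step it without touching the
   backoff bits, and when it reaches zero the instruction re-specializes:

       _PyBinaryOpCache *cache = (_PyBinaryOpCache *)next_instr;
       if (ADAPTIVE_COUNTER_IS_ZERO(cache->counter)) {
           // attempt (re-)specialization, then DISPATCH_SAME_OPARG()
       }
       else {
           DECREMENT_ADAPTIVE_COUNTER(cache->counter);
           // fall through to the generic implementation
       }
*/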
#define NAME_ERROR_MSG "name '%.200s' is not defined"

#define KWNAMES_LEN() \
    (kwnames == NULL ? 0 : ((int)PyTuple_GET_SIZE(kwnames)))

#define DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dval, result) \
do { \
    if (Py_REFCNT(left) == 1) { \
        ((PyFloatObject *)left)->ob_fval = (dval); \
        _Py_DECREF_SPECIALIZED(right, _PyFloat_ExactDealloc); \
        result = (left); \
    } \
    else if (Py_REFCNT(right) == 1) { \
        ((PyFloatObject *)right)->ob_fval = (dval); \
        _Py_DECREF_NO_DEALLOC(left); \
        result = (right); \
    } \
    else { \
        result = PyFloat_FromDouble(dval); \
        if ((result) == NULL) goto error; \
        _Py_DECREF_NO_DEALLOC(left); \
        _Py_DECREF_NO_DEALLOC(right); \
    } \
} while (0)
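/* Usage sketch (simplified from a float specialization): after computing

       double dsum = ((PyFloatObject *)left)->ob_fval +
                     ((PyFloatObject *)right)->ob_fval;

   the handler calls

       DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dsum, res);

   which stores the sum into whichever operand is about to die (refcount 1),
   and only allocates a fresh float object if neither can be reused. */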
// If a trace function sets a new f_lineno and
// *then* raises, we use the destination when searching
// for an exception handler, displaying the traceback, and so on.
#define INSTRUMENTED_JUMP(src, dest, event) \
do { \
    _PyFrame_SetStackPointer(frame, stack_pointer); \
    next_instr = _Py_call_instrumentation_jump(tstate, event, frame, src, dest); \
    stack_pointer = _PyFrame_GetStackPointer(frame); \
    if (next_instr == NULL) { \
        next_instr = (dest)+1; \
        goto error; \
    } \
} while (0);