tokenizer.h

#ifndef Py_TOKENIZER_H
#define Py_TOKENIZER_H
#ifdef __cplusplus
extern "C" {
#endif

#include "object.h"

/* Tokenizer interface */

#include "pycore_token.h" /* For token types */

#define MAXINDENT 100       /* Max indentation level */
#define MAXLEVEL 200        /* Max parentheses level */
#define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */
enum decoding_state {
    STATE_INIT,
    STATE_SEEK_CODING,
    STATE_NORMAL
};
enum interactive_underflow_t {
    /* Normal mode of operation: return a new token when asked in interactive mode */
    IUNDERFLOW_NORMAL,
    /* Forcefully return ENDMARKER when asked for a new token in interactive mode. This
     * can be used to prevent the tokenizer from prompting the user for new tokens */
    IUNDERFLOW_STOP,
};
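/* Illustrative use only (not defined by this header): a caller that has
 * already collected all interactive input and only wants to drain the
 * remaining tokens could do something like
 *
 *     tok->interactive_underflow = IUNDERFLOW_STOP;
 *
 * so that the next request for a token yields ENDMARKER instead of
 * prompting again; see the callers of this field for authoritative usage. */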
struct token {
    int level;                /* Bracket nesting depth when the token was produced */
    int lineno, col_offset, end_lineno, end_col_offset; /* Source location of the token */
    const char *start, *end;  /* Token text: [start, end) in the tokenizer's buffer */
    PyObject *metadata;       /* Optional extra token data, or NULL */
};
enum tokenizer_mode_kind_t {
    TOK_REGULAR_MODE,
    TOK_FSTRING_MODE,
};

#define MAX_EXPR_NESTING 3
typedef struct _tokenizer_mode {
    enum tokenizer_mode_kind_t kind;

    int curly_bracket_depth;
    int curly_bracket_expr_start_depth;

    char f_string_quote;
    int f_string_quote_size;
    int f_string_raw;
    const char* f_string_start;
    const char* f_string_multi_line_start;
    int f_string_line_start;

    Py_ssize_t f_string_start_offset;
    Py_ssize_t f_string_multi_line_start_offset;

    Py_ssize_t last_expr_size;
    Py_ssize_t last_expr_end;
    char* last_expr_buffer;
    int f_string_debug;
} tokenizer_mode;
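/* Orientation note (summary added for this header; tokenizer.c holds the
 * authoritative logic): tok_state keeps an array of these modes in
 * tok_mode_stack below. A TOK_FSTRING_MODE entry is pushed when an f-string
 * starts and popped when it ends, which is how nested f-strings are tracked
 * up to MAXFSTRINGLEVEL levels deep; the curly_bracket_* fields track the
 * '{...}' replacement fields inside the current f-string. */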
/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;                    /* Input buffer, or NULL; malloc'ed if fp != NULL or readline != NULL */
    char *cur;                    /* Next character in buffer */
    char *inp;                    /* End of data in buffer */
    int fp_interactive;           /* If the file descriptor is interactive */
    char *interactive_src_start;  /* The start of the source parsed so far in interactive mode */
    char *interactive_src_end;    /* The end of the source parsed so far in interactive mode */
    const char *end;              /* End of input buffer if buf != NULL */
    const char *start;            /* Start of current token if not NULL */
    int done;                     /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;                     /* Rest of input; NULL if tokenizing a string */
    int tabsize;                  /* Tab spacing */
    int indent;                   /* Current indentation index */
    int indstack[MAXINDENT];      /* Stack of indents */
    int atbol;                    /* Nonzero if at begin of new line */
    int pendin;                   /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt; /* For interactive prompting */
    int lineno;                   /* Current line number */
    int first_lineno;             /* First line of a single line or multi line string
                                     expression (cf. issue 16806) */
    int starting_col_offset;      /* The column offset at the beginning of a token */
    int col_offset;               /* Current col offset */
    int level;                    /* () [] {} parentheses nesting level;
                                     used to allow free continuations inside them */
    char parenstack[MAXLEVEL];    /* Opening bracket character at each nesting level */
    int parenlinenostack[MAXLEVEL]; /* Line on which each open bracket was seen */
    int parencolstack[MAXLEVEL];  /* Column at which each open bracket was seen */
    PyObject *filename;           /* Filename used in error and warning messages */
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT];   /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state;
    int decoding_erred;           /* Whether erred in decoding */
    char *encoding;               /* Source encoding. */
    int cont_line;                /* Whether we are in a continuation line. */
    const char *line_start;       /* Pointer to start of current line */
    const char *multi_line_start; /* Pointer to start of first line of a single line
                                     or multi line string expression (cf. issue 16806) */
    PyObject *decoding_readline;  /* open(...).readline */
    PyObject *decoding_buffer;
    PyObject *readline;           /* readline() function */
    const char *enc;              /* Encoding for the current str. */
    char *str;                    /* Source string being tokenized (if tokenizing from a string) */
    char *input;                  /* Tokenizer's newline translated copy of the string. */
    int type_comments;            /* Whether to look for type comments */

    /* async/await related fields (still needed depending on feature_version) */
    int async_hacks;              /* =1 if async/await aren't always keywords */
    int async_def;                /* =1 if tokens are inside an 'async def' body. */
    int async_def_indent;         /* Indentation level of the outermost 'async def'. */
    int async_def_nl;             /* =1 if the outermost 'async def' had at least one
                                     NEWLINE token after it. */

    /* How to proceed when asked for a new token in interactive mode */
    enum interactive_underflow_t interactive_underflow;
    int report_warnings;          /* Whether to emit tokenizer warnings */

    // TODO: Factor this into its own thing
    tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL]; /* Stack of regular/f-string modes */
    int tok_mode_stack_index;     /* Index of the mode currently on top of the stack */
    int tok_extra_tokens;
    int comment_newline;
    int implicit_newline;
#ifdef Py_DEBUG
    int debug;
#endif
};

extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                               const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
extern void _PyToken_Free(struct token *);
extern void _PyToken_Init(struct token *);
extern int _PyTokenizer_Get(struct tok_state *, struct token *);
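/* Typical driver loop (a minimal sketch added for orientation, not taken from
 * this repository; 'exec_input' and 'preserve_crlf' are assumed names for the
 * two int arguments to _PyTokenizer_FromString -- see tokenizer.c and its
 * callers in Parser/ for the authoritative usage):
 *
 *     struct tok_state *tok = _PyTokenizer_FromString(source, exec_input, preserve_crlf);
 *     if (tok == NULL) {
 *         return NULL;                          // allocation or decoding failure
 *     }
 *     struct token tkn;
 *     int type;
 *     do {
 *         _PyToken_Init(&tkn);
 *         type = _PyTokenizer_Get(tok, &tkn);   // returns a token type from pycore_token.h
 *         // ... use [tkn.start, tkn.end) and the location fields here ...
 *         _PyToken_Free(&tkn);
 *     } while (type != ENDMARKER && type != ERRORTOKEN);
 *     _PyTokenizer_Free(tok);
 */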

#define tok_dump _Py_tok_dump

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKENIZER_H */