/* tokenizer.h */
  1. #ifndef Py_TOKENIZER_H
  2. #define Py_TOKENIZER_H
  3. #ifdef __cplusplus
  4. extern "C" {
  5. #endif
  6. #include "object.h"
  7. /* Tokenizer interface */
  8. #include "pycore_token.h" /* For token types */
  /* Fixed capacities used to size the arrays embedded in struct tok_state. */
  #define MAXINDENT 100       /* Max indentation level */
  #define MAXLEVEL 200        /* Max parentheses level */
  #define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */
  /* Values of the decoding_state field of struct tok_state, which that struct
   * groups under "Stuff for PEP 0263". The individual states are not
   * documented in this header. */
  enum decoding_state {
      STATE_INIT,
      STATE_SEEK_CODING,
      STATE_NORMAL
  };
  /* How to proceed when asked for a new token in interactive mode. */
  enum interactive_underflow_t {
      /* Normal mode of operation: return a new token when asked in interactive
       * mode. */
      IUNDERFLOW_NORMAL,
      /* Forcefully return ENDMARKER when asked for a new token in interactive
       * mode. This can be used to prevent the tokenizer from prompting the
       * user for new tokens. */
      IUNDERFLOW_STOP,
  };
  22. struct token {
  23. int level;
  24. int lineno, col_offset, end_lineno, end_col_offset;
  25. const char *start, *end;
  26. PyObject *metadata;
  27. };
  /* Discriminator stored in the kind field of tokenizer_mode. */
  enum tokenizer_mode_kind_t {
      TOK_REGULAR_MODE,
      TOK_FSTRING_MODE,
  };
  /* NOTE(review): not referenced anywhere else in this header; presumably a
   * limit on nested expressions -- confirm against the users of this macro. */
  #define MAX_EXPR_NESTING 3
  33. typedef struct _tokenizer_mode {
  34. enum tokenizer_mode_kind_t kind;
  35. int curly_bracket_depth;
  36. int curly_bracket_expr_start_depth;
  37. char f_string_quote;
  38. int f_string_quote_size;
  39. int f_string_raw;
  40. const char *f_string_start;
  41. const char *f_string_multi_line_start;
  42. int f_string_line_start;
  43. Py_ssize_t f_string_start_offset;
  44. Py_ssize_t f_string_multi_line_start_offset;
  45. Py_ssize_t last_expr_size;
  46. Py_ssize_t last_expr_end;
  47. char *last_expr_buffer;
  48. int f_string_debug;
  49. int in_format_spec;
  50. } tokenizer_mode;
  51. /* Tokenizer state */
  52. struct tok_state {
  53. /* Input state; buf <= cur <= inp <= end */
  54. /* NB an entire line is held in the buffer */
  55. char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL or readline !=
  56. NULL */
  57. char *cur; /* Next character in buffer */
  58. char *inp; /* End of data in buffer */
  59. int fp_interactive; /* If the file descriptor is interactive */
  60. char *interactive_src_start; /* The start of the source parsed so far in
  61. interactive mode */
  62. char *interactive_src_end; /* The end of the source parsed so far in
  63. interactive mode */
  64. const char *end; /* End of input buffer if buf != NULL */
  65. const char *start; /* Start of current token if not NULL */
  66. int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
  67. /* NB If done != E_OK, cur must be == inp!!! */
  68. FILE *fp; /* Rest of input; NULL if tokenizing a string */
  69. int tabsize; /* Tab spacing */
  70. int indent; /* Current indentation index */
  71. int indstack[MAXINDENT]; /* Stack of indents */
  72. int atbol; /* Nonzero if at begin of new line */
  73. int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
  74. const char *prompt, *nextprompt; /* For interactive prompting */
  75. int lineno; /* Current line number */
  76. int first_lineno; /* First line of a single line or multi line string
  77. expression (cf. issue 16806) */
  78. int starting_col_offset; /* The column offset at the beginning of a token */
  79. int col_offset; /* Current col offset */
  80. int level; /* () [] {} Parentheses nesting level */
  81. /* Used to allow free continuations inside them */
  82. char parenstack[MAXLEVEL];
  83. int parenlinenostack[MAXLEVEL];
  84. int parencolstack[MAXLEVEL];
  85. PyObject *filename;
  86. /* Stuff for checking on different tab sizes */
  87. int altindstack[MAXINDENT]; /* Stack of alternate indents */
  88. /* Stuff for PEP 0263 */
  89. enum decoding_state decoding_state;
  90. int decoding_erred; /* whether erred in decoding */
  91. char *encoding; /* Source encoding. */
  92. int cont_line; /* whether we are in a continuation line. */
  93. const char *line_start; /* pointer to start of current line */
  94. const char *multi_line_start; /* pointer to start of first line of
  95. a single line or multi line string
  96. expression (cf. issue 16806) */
  97. PyObject *decoding_readline; /* open(...).readline */
  98. PyObject *decoding_buffer;
  99. PyObject *readline; /* readline() function */
  100. const char *enc; /* Encoding for the current str. */
  101. char *str; /* Source string being tokenized (if tokenizing from a string)*/
  102. char *input; /* Tokenizer's newline translated copy of the string. */
  103. int type_comments; /* Whether to look for type comments */
  104. /* async/await related fields (still needed depending on feature_version) */
  105. int async_hacks; /* =1 if async/await aren't always keywords */
  106. int async_def; /* =1 if tokens are inside an 'async def' body. */
  107. int async_def_indent; /* Indentation level of the outermost 'async def'. */
  108. int async_def_nl; /* =1 if the outermost 'async def' had at least one
  109. NEWLINE token after it. */
  110. /* How to proceed when asked for a new token in interactive mode */
  111. enum interactive_underflow_t interactive_underflow;
  112. int report_warnings;
  113. // TODO: Factor this into its own thing
  114. tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
  115. int tok_mode_stack_index;
  116. int tok_extra_tokens;
  117. int comment_newline;
  118. int implicit_newline;
  119. #ifdef Py_DEBUG
  120. int debug;
  121. #endif
  122. };
  123. extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
  124. extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
  125. extern struct tok_state *_PyTokenizer_FromReadline(PyObject *, const char *,
  126. int, int);
  127. extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char *,
  128. const char *, const char *);
  129. extern void _PyTokenizer_Free(struct tok_state *);
  130. extern void _PyToken_Free(struct token *);
  131. extern void _PyToken_Init(struct token *);
  132. extern int _PyTokenizer_Get(struct tok_state *, struct token *);
  /* Alias: every use of tok_dump expands to _Py_tok_dump. */
  #define tok_dump _Py_tok_dump
  134. #ifdef __cplusplus
  135. }
  136. #endif
  137. #endif /* !Py_TOKENIZER_H */