gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include the logic for creating a lexer instance and managing the buffer for the different modes.
---------
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
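For orientation, a rough sketch of the resulting layout. The lexer/ entries are the files added in the diff below; the tokenizer/ entries follow the description in the message above and are not part of this excerpt:

    Parser/
        lexer/              (added in this commit)
            buffer.c/.h     tok->buf growth and f-string pointer fixups
            lexer.c/.h      the lexeme-producing loop behind _PyTokenizer_Get
            state.c/.h      tok_state and token lifecycle, token setup helpers
        tokenizer/          (one wrapper per input mode: file, string,
                             utf-8, readline; not shown in this excerpt)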
commit 01481f2dc1 (parent eb50cd37ea), committed via GitHub
Parser/lexer/buffer.c (new file, 76 lines)
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
   them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start_offset = mode->f_string_start - tok->buf;
        mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
    }
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start = tok->buf + mode->f_string_start_offset;
        mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
    }
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    return 1;
}
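_PyLexer_tok_reserve_buf above relies on a classic pattern: since PyMem_Realloc may move the block, every pointer into tok->buf is first encoded as an offset from the base (with -1 standing in for NULL), and rebased against the new base after the realloc succeeds; growth is by at least 50% of the current size (oldsize >> 1). A minimal standalone sketch of the same pattern follows; the struct and function names here are hypothetical, not CPython API:

    #include <stddef.h>
    #include <stdlib.h>

    struct cursor_buf {
        char *buf;   /* base of the heap block */
        char *cur;   /* read cursor, points into buf */
        char *end;   /* one past the last byte of the block */
    };

    /* Grow by at least `extra` bytes and by at least 50% of the current
     * size, mirroring the oldsize >> 1 policy above. */
    static int
    reserve(struct cursor_buf *b, size_t extra)
    {
        size_t oldsize = (size_t)(b->end - b->buf);
        size_t grow = oldsize / 2 > extra ? oldsize / 2 : extra;
        size_t newsize = oldsize + grow;

        /* 1. Encode interior pointers as offsets: realloc may move the block. */
        ptrdiff_t cur_off = b->cur - b->buf;

        char *newbuf = realloc(b->buf, newsize);
        if (newbuf == NULL) {
            return 0;  /* the old block is still valid; just report failure */
        }

        /* 2. Rebase every saved offset against the (possibly new) base. */
        b->buf = newbuf;
        b->cur = newbuf + cur_off;
        b->end = newbuf + newsize;
        return 1;
    }

The f-string remember/restore pair plays the same role for the pointers stored inside each tokenizer_mode on the mode stack, which the reserve function cannot see directly.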
Parser/lexer/buffer.h (new file, 10 lines)
@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif
Parser/lexer/lexer.c (new file, 1419 lines)
(File diff suppressed because it is too large.)
Parser/lexer/lexer.h (new file, 10 lines)
@@ -0,0 +1,10 @@
#ifndef _PY_LEXER_LEXER_H_
#define _PY_LEXER_LEXER_H_

#include "state.h"

int _PyLexer_update_fstring_expr(struct tok_state *tok, char cur);

int _PyTokenizer_Get(struct tok_state *, struct token *);

#endif
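_PyTokenizer_Get is the single entry point the per-mode wrappers drive. A hedged sketch of such a driver loop, using only the signatures declared in lexer.h and state.h; the function name is hypothetical, `tok` is assumed to come from one of the wrapper constructors in Parser/tokenizer/ (not in this excerpt), and ENDMARKER/ERRORTOKEN are the token types from pycore_token.h:

    #include "pycore_token.h"  /* ENDMARKER, ERRORTOKEN */
    #include "state.h"
    #include "lexer.h"

    /* Hypothetical driver: pull tokens until end of input or a lexer error. */
    static int
    consume_all_tokens(struct tok_state *tok)
    {
        struct token t;
        int type;
        do {
            _PyToken_Init(&t);                 /* metadata starts out NULL */
            type = _PyTokenizer_Get(tok, &t);  /* fills lineno/col/start/end */
            /* ... hand the token to the parser here ... */
            _PyToken_Free(&t);                 /* drops the metadata reference */
        } while (type != ENDMARKER && type != ERRORTOKEN);
        return type == ENDMARKER;
    }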
Parser/lexer/state.c (new file, 149 lines)
@@ -0,0 +1,149 @@
#include "Python.h"
#include "pycore_pystate.h"
#include "pycore_token.h"
#include "errcode.h"

#include "state.h"

/* Never change this */
#define TABSIZE 8

/* Create and initialize a new tok_state structure */
struct tok_state *
_PyTokenizer_tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
        sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->starting_col_offset = -1;
    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->readline = NULL;
    tok->type_comments = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->underflow = NULL;
    tok->str = NULL;
    tok->report_warnings = 1;
    tok->tok_extra_tokens = 0;
    tok->comment_newline = 0;
    tok->implicit_newline = 0;
    tok->tok_mode_stack[0] = (tokenizer_mode){.kind = TOK_REGULAR_MODE, .f_string_quote = '\0', .f_string_quote_size = 0, .f_string_debug = 0};
    tok->tok_mode_stack_index = 0;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}

static void
free_fstring_expressions(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        if (mode->last_expr_buffer != NULL) {
            PyMem_Free(mode->last_expr_buffer);
            mode->last_expr_buffer = NULL;
            mode->last_expr_size = 0;
            mode->last_expr_end = -1;
        }
    }
}

/* Free a tok_state structure */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);
    if ((tok->readline != NULL || tok->fp != NULL) && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    free_fstring_expressions(tok);
    PyMem_Free(tok);
}

void
_PyToken_Free(struct token *token) {
    Py_XDECREF(token->metadata);
}

void
_PyToken_Init(struct token *token) {
    token->metadata = NULL;
}

int
_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                                  int end_col_offset, const char *start, const char *end)
{
    token->level = tok->level;
    token->lineno = token->end_lineno = tok->lineno;
    token->col_offset = col_offset;
    token->end_col_offset = end_col_offset;
    token->start = start;
    token->end = end;
    return type;
}

int
_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
    token->level = tok->level;
    if (ISSTRINGLIT(type)) {
        token->lineno = tok->first_lineno;
    }
    else {
        token->lineno = tok->lineno;
    }
    token->end_lineno = tok->lineno;
    token->col_offset = token->end_col_offset = -1;
    token->start = start;
    token->end = end;

    if (start != NULL && end != NULL) {
        token->col_offset = tok->starting_col_offset;
        token->end_col_offset = tok->col_offset;
    }
    return type;
}
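_PyTokenizer_tok_new and _PyTokenizer_Free bracket the tok_state lifecycle: tok_new hands back a fully zeroed/defaulted state with no input source attached, and Free releases only what the state owns (note that tok->buf is freed only in fp/readline modes, matching the "malloc'ed if fp != NULL or readline != NULL" comment in state.h). A hedged sketch of how a wrapper might use the pair; the function name here is hypothetical:

    #include "state.h"

    /* Hypothetical helper: allocate a bare state for a wrapper to fill in. */
    static struct tok_state *
    make_bare_tok_state(void)
    {
        struct tok_state *tok = _PyTokenizer_tok_new();
        if (tok == NULL) {
            return NULL;  /* PyMem_Malloc failed inside tok_new */
        }
        /* A real wrapper would now attach an input source (tok->fp, tok->str,
         * or tok->readline) and install a matching tok->underflow callback,
         * then eventually release everything with _PyTokenizer_Free(tok). */
        return tok;
    }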
Parser/lexer/state.h (new file, 141 lines)
@@ -0,0 +1,141 @@
#ifndef _PY_LEXER_H_
#define _PY_LEXER_H_

#include "object.h"

#define MAXINDENT 100       /* Max indentation level */
#define MAXLEVEL 200        /* Max parentheses level */
#define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */

#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0)
#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0)

enum decoding_state {
    STATE_INIT,
    STATE_SEEK_CODING,
    STATE_NORMAL
};

enum interactive_underflow_t {
    /* Normal mode of operation: return a new token when asked in interactive mode */
    IUNDERFLOW_NORMAL,
    /* Forcefully return ENDMARKER when asked for a new token in interactive mode. This
     * can be used to prevent the tokenizer from prompting the user for new tokens */
    IUNDERFLOW_STOP,
};

struct token {
    int level;
    int lineno, col_offset, end_lineno, end_col_offset;
    const char *start, *end;
    PyObject *metadata;
};

enum tokenizer_mode_kind_t {
    TOK_REGULAR_MODE,
    TOK_FSTRING_MODE,
};

#define MAX_EXPR_NESTING 3

typedef struct _tokenizer_mode {
    enum tokenizer_mode_kind_t kind;

    int curly_bracket_depth;
    int curly_bracket_expr_start_depth;

    char f_string_quote;
    int f_string_quote_size;
    int f_string_raw;
    const char* f_string_start;
    const char* f_string_multi_line_start;
    int f_string_line_start;

    Py_ssize_t f_string_start_offset;
    Py_ssize_t f_string_multi_line_start_offset;

    Py_ssize_t last_expr_size;
    Py_ssize_t last_expr_end;
    char* last_expr_buffer;
    int f_string_debug;
} tokenizer_mode;

/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL or readline != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    int fp_interactive; /* If the file descriptor is interactive */
    char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
    char *interactive_src_end;   /* The end of the source parsed so far in interactive mode */
    const char *end;    /* End of input buffer if buf != NULL */
    const char *start;  /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT]; /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt; /* For interactive prompting */
    int lineno;         /* Current line number */
    int first_lineno;   /* First line of a single line or multi line string
                           expression (cf. issue 16806) */
    int starting_col_offset; /* The column offset at the beginning of a token */
    int col_offset;     /* Current col offset */
    int level;          /* () [] {} Parentheses nesting level */
                        /* Used to allow free continuations inside them */
    char parenstack[MAXLEVEL];
    int parenlinenostack[MAXLEVEL];
    int parencolstack[MAXLEVEL];
    PyObject *filename;
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT]; /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state;
    int decoding_erred;  /* whether erred in decoding */
    char *encoding;      /* Source encoding. */
    int cont_line;       /* whether we are in a continuation line. */
    const char* line_start;       /* pointer to start of current line */
    const char* multi_line_start; /* pointer to start of first line of
                                     a single line or multi line string
                                     expression (cf. issue 16806) */
    PyObject *decoding_readline;  /* open(...).readline */
    PyObject *decoding_buffer;
    PyObject *readline;  /* readline() function */
    const char* enc;     /* Encoding for the current str. */
    char* str;           /* Source string being tokenized (if tokenizing from a string) */
    char* input;         /* Tokenizer's newline translated copy of the string. */

    int type_comments;   /* Whether to look for type comments */

    /* How to proceed when asked for a new token in interactive mode */
    enum interactive_underflow_t interactive_underflow;
    int (*underflow)(struct tok_state *); /* Function to call when buffer is empty and we need to refill it */

    int report_warnings;
    // TODO: Factor this into its own thing
    tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
    int tok_mode_stack_index;
    int tok_extra_tokens;
    int comment_newline;
    int implicit_newline;
#ifdef Py_DEBUG
    int debug;
#endif
};

int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                                      int end_col_offset, const char *start, const char *end);
int _PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end);

struct tok_state *_PyTokenizer_tok_new(void);
void _PyTokenizer_Free(struct tok_state *);
void _PyToken_Free(struct token *);
void _PyToken_Init(struct token *);

#endif
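The comment at the top of struct tok_state documents the central buffer invariant, buf <= cur <= inp <= end, which _PyLexer_tok_reserve_buf is careful to preserve across reallocation. An illustrative debug check for it; this helper is hypothetical, not part of the diff:

    #include <assert.h>
    #include "state.h"

    /* Hypothetical debug helper: verify the documented pointer ordering. */
    static void
    check_tok_buffer_invariant(const struct tok_state *tok)
    {
        if (tok->buf != NULL) {
            assert(tok->buf <= tok->cur);  /* cursor never precedes the base */
            assert(tok->cur <= tok->inp);  /* never read past the filled data */
            assert(tok->inp <= tok->end);  /* filled data fits in the block */
        }
    }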