gh-104169: Refactor tokenizer into lexer and wrappers (#110684)

* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
Lysandros Nikolaou
2023-10-11 17:14:44 +02:00
committed by GitHub
parent eb50cd37ea
commit 01481f2dc1
29 changed files with 3185 additions and 2988 deletions

76
Parser/lexer/buffer.c Normal file
View File

@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"
#include "state.h"
/* For every entry of tok->tok_mode_stack from 0 up to tok_mode_stack_index,
   save the position of its f-string pointers as offsets from tok->buf.
   The offsets make it possible to rebuild the pointers once tok->buf has
   been reallocated. */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
    const char *base = tok->buf;

    /* The entries are independent of each other, so the visiting order is
       irrelevant. */
    for (int i = 0; i <= tok->tok_mode_stack_index; i++) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start_offset = m->f_string_start - base;
        m->f_string_multi_line_start_offset = m->f_string_multi_line_start - base;
    }
}
/* Counterpart of _PyLexer_remember_fstring_buffers: for every entry of
   tok->tok_mode_stack from 0 up to tok_mode_stack_index, recompute the
   f-string pointers from the saved offsets and the current tok->buf. */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
    char *base = tok->buf;

    for (int i = 0; i <= tok->tok_mode_stack_index; i++) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        m->f_string_start = base + m->f_string_start_offset;
        m->f_string_multi_line_start = base + m->f_string_multi_line_start_offset;
    }
}
/* Make sure the input buffer of TOK is large enough, growing it with
   PyMem_Realloc when it is not.

   The capacity asked for is the amount of data currently in the buffer
   (tok->inp - tok->buf) plus the larger of SIZE and half of that amount.
   Nothing happens when the buffer (tok->end - tok->buf) is already that big.

   When the buffer is reallocated, the pointers into it are rebased onto the
   new allocation: cur, inp, end, start, line_start, multi_line_start and the
   f-string pointers of every tokenizer mode.  A NULL start or
   multi_line_start stays NULL.  line_start is only carried over while
   tok->start is not NULL; otherwise it is reset to NULL.

   Return 1 on success.  If the reallocation fails, set tok->done to E_NOMEM
   and return 0; tok->buf and the pointers into it are left as they were. */
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Turn the pointers into offsets (-1 meaning "no pointer") before the
           old buffer goes away. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        /* Deliberately keyed on tok->start, not on tok->line_start. */
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        /* Test for NULL explicitly: subtracting tok->buf from a null pointer
           is undefined and only yielded a negative offset by accident. */
        Py_ssize_t multi_line_start = tok->multi_line_start == NULL
            ? -1
            : tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    return 1;
}

10
Parser/lexer/buffer.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

/* Forward declaration, so that this header also works when it is included
   before "state.h".  Without it, `struct tok_state` in each prototype below
   would declare a new type local to that parameter list, incompatible with
   the real structure. */
struct tok_state;

/* Save the f-string pointers of every tokenizer mode as offsets from
   tok->buf. */
void _PyLexer_remember_fstring_buffers(struct tok_state *tok);

/* Rebuild the f-string pointers of every tokenizer mode from the saved
   offsets and the current tok->buf. */
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);

/* Grow the buffer of TOK if needed.  Return 1 on success; on allocation
   failure set tok->done to E_NOMEM and return 0. */
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif

1419
Parser/lexer/lexer.c Normal file

File diff suppressed because it is too large Load Diff

10
Parser/lexer/lexer.h Normal file
View File

@@ -0,0 +1,10 @@
/* Declarations of the lexer entry points.
   NOTE(review): the definitions are not part of this view (presumably
   Parser/lexer/lexer.c); confirm the exact contracts there. */
#ifndef _PY_LEXER_LEXER_H_
#define _PY_LEXER_LEXER_H_
/* Provides struct tok_state and struct token used in the prototypes below. */
#include "state.h"
/* Takes the tokenizer state and a single character; returns an int.
   NOTE(review): presumably maintains the f-string expression buffer of the
   current tokenizer mode -- verify against the definition. */
int _PyLexer_update_fstring_expr(struct tok_state *tok, char cur);
/* Takes the tokenizer state and a token to fill in; returns an int.
   NOTE(review): presumably the type of the token produced -- verify against
   the definition. */
int _PyTokenizer_Get(struct tok_state *, struct token *);
#endif

149
Parser/lexer/state.c Normal file
View File

@@ -0,0 +1,149 @@
#include "Python.h"
#include "pycore_pystate.h"
#include "pycore_token.h"
#include "errcode.h"
#include "state.h"
/* Tab size stored in tok->tabsize by _PyTokenizer_tok_new.
   Never change this */
#define TABSIZE 8
/* Create and initialize a new tok_state structure */
struct tok_state *
_PyTokenizer_tok_new(void)
{
struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
sizeof(struct tok_state));
if (tok == NULL)
return NULL;
tok->buf = tok->cur = tok->inp = NULL;
tok->fp_interactive = 0;
tok->interactive_src_start = NULL;
tok->interactive_src_end = NULL;
tok->start = NULL;
tok->end = NULL;
tok->done = E_OK;
tok->fp = NULL;
tok->input = NULL;
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0;
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
tok->starting_col_offset = -1;
tok->col_offset = -1;
tok->level = 0;
tok->altindstack[0] = 0;
tok->decoding_state = STATE_INIT;
tok->decoding_erred = 0;
tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
tok->readline = NULL;
tok->type_comments = 0;
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->underflow = NULL;
tok->str = NULL;
tok->report_warnings = 1;
tok->tok_extra_tokens = 0;
tok->comment_newline = 0;
tok->implicit_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
#ifdef Py_DEBUG
tok->debug = _Py_GetConfig()->parser_debug;
#endif
return tok;
}
/* Release the last_expr_buffer of every tok_mode_stack entry from
   tok_mode_stack_index down to 0, and reset the bookkeeping that goes with
   it.  Entries without a buffer are left untouched. */
static void
free_fstring_expressions(struct tok_state *tok)
{
    for (int i = tok->tok_mode_stack_index; i >= 0; i--) {
        tokenizer_mode *m = &tok->tok_mode_stack[i];
        if (m->last_expr_buffer == NULL) {
            continue;
        }
        PyMem_Free(m->last_expr_buffer);
        m->last_expr_buffer = NULL;
        m->last_expr_size = 0;
        m->last_expr_end = -1;
    }
}
/* Destroy TOK: drop the object references it holds, release the memory
   blocks it owns and finally free the structure itself. */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    /* tok->buf is released here only for readline- or file-based input. */
    const int free_buf = (tok->readline != NULL || tok->fp != NULL);

    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);

    /* PyMem_Free(NULL) does nothing, so unset pointers need no guard. */
    PyMem_Free(tok->encoding);
    if (free_buf) {
        PyMem_Free(tok->buf);
    }
    PyMem_Free(tok->input);
    PyMem_Free(tok->interactive_src_start);

    free_fstring_expressions(tok);
    PyMem_Free(tok);
}
/* Drop the reference held in token->metadata, if any.  The token structure
   itself is not freed. */
void
_PyToken_Free(struct token *token)
{
    Py_XDECREF(token->metadata);
}
/* Prepare TOKEN for use by clearing its metadata pointer.  No other field is
   touched. */
void
_PyToken_Init(struct token *token)
{
    token->metadata = NULL;
}
/* Fill in TOKEN using caller-supplied column offsets.

   The token gets the current nesting level and line number of TOK (used for
   both lineno and end_lineno), the given COL_OFFSET / END_COL_OFFSET, and
   the START / END pointers.  token->metadata is not modified.

   Return TYPE unchanged. */
int
_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                                  int end_col_offset, const char *start, const char *end)
{
    const int line = tok->lineno;

    token->start = start;
    token->end = end;
    token->col_offset = col_offset;
    token->end_col_offset = end_col_offset;
    token->lineno = line;
    token->end_lineno = line;
    token->level = tok->level;
    return type;
}
/* Fill in TOKEN from the current state of TOK.

   START and END must be both NULL or both non-NULL (checked with assert).
   The starting line is tok->first_lineno when ISSTRINGLIT(type) holds and
   tok->lineno otherwise; the ending line is always tok->lineno.  The column
   offsets are taken from tok->starting_col_offset and tok->col_offset when
   START and END are given, and are -1 otherwise.  token->metadata is not
   modified.

   Return TYPE unchanged. */
int
_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));

    token->level = tok->level;
    token->lineno = ISSTRINGLIT(type) ? tok->first_lineno : tok->lineno;
    token->end_lineno = tok->lineno;
    token->start = start;
    token->end = end;

    if (start != NULL && end != NULL) {
        token->col_offset = tok->starting_col_offset;
        token->end_col_offset = tok->col_offset;
    }
    else {
        token->col_offset = -1;
        token->end_col_offset = -1;
    }
    return type;
}

141
Parser/lexer/state.h Normal file
View File

@@ -0,0 +1,141 @@
#ifndef _PY_LEXER_H_
#define _PY_LEXER_H_
#include "object.h"
#define MAXINDENT 100       /* Max indentation level */
#define MAXLEVEL 200        /* Max parentheses level */
#define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */

/* Nonzero when the tok_mode_stack_index member of *tok is greater than 0.
   The argument is parenthesized so that any pointer expression (for example
   `p + 1`) can be passed. */
#define INSIDE_FSTRING(tok) ((tok)->tok_mode_stack_index > 0)

/* Nonzero when the curly_bracket_expr_start_depth member of *tok is not
   negative.  Despite the parameter name, that member belongs to
   tokenizer_mode, so the argument is a tokenizer_mode pointer. */
#define INSIDE_FSTRING_EXPR(tok) ((tok)->curly_bracket_expr_start_depth >= 0)
/* Value of the decoding_state field of struct tok_state (listed there under
   "Stuff for PEP 0263").  A tok_state created by _PyTokenizer_tok_new starts
   in STATE_INIT.
   NOTE(review): the transitions between the states happen outside this view;
   confirm the meaning of STATE_SEEK_CODING and STATE_NORMAL there. */
enum decoding_state {
STATE_INIT,
STATE_SEEK_CODING,
STATE_NORMAL
};
/* How the tokenizer proceeds when asked for a new token in interactive mode
   (see the interactive_underflow field of struct tok_state). */
enum interactive_underflow_t {
/* Normal mode of operation: return a new token when asked in interactive mode */
IUNDERFLOW_NORMAL,
/* Forcefully return ENDMARKER when asked for a new token in interactive mode. This
 * can be used to prevent the tokenizer from prompting the user for new tokens */
IUNDERFLOW_STOP,
};
/* A token as filled in by _PyLexer_token_setup and
   _PyLexer_type_comment_token_setup. */
struct token {
/* Copied from tok->level when the token is set up */
int level;
/* Starting/ending line and column; the columns are -1 when
   _PyLexer_token_setup is called without start/end pointers */
int lineno, col_offset, end_lineno, end_col_offset;
/* Start and end of the token text as passed to the setup functions; both may
   be NULL */
const char *start, *end;
/* Set to NULL by _PyToken_Init and released with Py_XDECREF by
   _PyToken_Free.  NOTE(review): what gets stored here is decided outside
   this view. */
PyObject *metadata;
};
/* Kind of an entry of tok_state.tok_mode_stack.  The bottom entry (index 0)
   is created as TOK_REGULAR_MODE by _PyTokenizer_tok_new.
   NOTE(review): presumably TOK_FSTRING_MODE entries are pushed while an
   f-string is being tokenized -- confirm in the lexer. */
enum tokenizer_mode_kind_t {
TOK_REGULAR_MODE,
TOK_FSTRING_MODE,
};
/* NOTE(review): not referenced by any code visible in this view; confirm
   where (or whether) this limit is enforced. */
#define MAX_EXPR_NESTING 3
/* One entry of tok_state.tok_mode_stack. */
typedef struct _tokenizer_mode {
enum tokenizer_mode_kind_t kind;
int curly_bracket_depth;
/* INSIDE_FSTRING_EXPR() is true while this is >= 0 */
int curly_bracket_expr_start_depth;
char f_string_quote;
int f_string_quote_size;
int f_string_raw;
/* The two pointers below are treated as pointers into tok->buf: they are
   converted to offsets and back around a buffer reallocation */
const char* f_string_start;
const char* f_string_multi_line_start;
int f_string_line_start;
/* Offsets of the two pointers above from tok->buf, written by
   _PyLexer_remember_fstring_buffers and consumed by
   _PyLexer_restore_fstring_buffers */
Py_ssize_t f_string_start_offset;
Py_ssize_t f_string_multi_line_start_offset;
/* Bookkeeping for last_expr_buffer; reset to 0 and -1 respectively when the
   buffer is freed */
Py_ssize_t last_expr_size;
Py_ssize_t last_expr_end;
/* Freed with PyMem_Free when the tok_state is destroyed; NULL when unset */
char* last_expr_buffer;
int f_string_debug;
} tokenizer_mode;
/* Tokenizer state.
   Created by _PyTokenizer_tok_new and destroyed by _PyTokenizer_Free. */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
/* NB an entire line is held in the buffer */
char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL or readline != NULL */
char *cur; /* Next character in buffer */
char *inp; /* End of data in buffer */
int fp_interactive; /* If the file descriptor is interactive */
char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
const char *end; /* End of input buffer if buf != NULL */
const char *start; /* Start of current token if not NULL */
int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
/* NB If done != E_OK, cur must be == inp!!! */
FILE *fp; /* Rest of input; NULL if tokenizing a string */
int tabsize; /* Tab spacing */
int indent; /* Current indentation index */
int indstack[MAXINDENT]; /* Stack of indents */
int atbol; /* Nonzero if at begin of new line */
int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
const char *prompt, *nextprompt; /* For interactive prompting */
int lineno; /* Current line number */
int first_lineno; /* First line of a single line or multi line string
expression (cf. issue 16806) */
int starting_col_offset; /* The column offset at the beginning of a token */
int col_offset; /* Current col offset */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
/* NOTE(review): the three stacks below are presumably indexed by `level` and
   hold the bracket character, line and column of each open bracket --
   confirm in the lexer. */
char parenstack[MAXLEVEL];
int parenlinenostack[MAXLEVEL];
int parencolstack[MAXLEVEL];
/* Released with Py_XDECREF in _PyTokenizer_Free */
PyObject *filename;
/* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */
char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string
expression (cf. issue 16806) */
PyObject *decoding_readline; /* open(...).readline */
/* Released with Py_XDECREF in _PyTokenizer_Free */
PyObject *decoding_buffer;
PyObject *readline; /* readline() function */
const char* enc; /* Encoding for the current str. */
char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */
int type_comments; /* Whether to look for type comments */
/* How to proceed when asked for a new token in interactive mode */
enum interactive_underflow_t interactive_underflow;
int (*underflow)(struct tok_state *); /* Function to call when buffer is empty and we need to refill it*/
/* Set to 1 by _PyTokenizer_tok_new */
int report_warnings;
// TODO: Factor this into its own thing
/* Stack of tokenizer modes; entry 0 is created as TOK_REGULAR_MODE */
tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
/* Index of the top entry of tok_mode_stack; INSIDE_FSTRING() tests whether
   it is greater than 0 */
int tok_mode_stack_index;
/* The three flags below are set to 0 by _PyTokenizer_tok_new.
   NOTE(review): their consumers are outside this view. */
int tok_extra_tokens;
int comment_newline;
int implicit_newline;
#ifdef Py_DEBUG
/* Copied from _Py_GetConfig()->parser_debug when the state is created */
int debug;
#endif
};
/* Fill in TOKEN from TOK (level, line number) and the given column offsets
   and start/end pointers; return TYPE. */
int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
int end_col_offset, const char *start, const char *end);
/* Fill in TOKEN from TOK; START and END must be both NULL or both non-NULL.
   The column offsets are taken from TOK when START/END are given and are -1
   otherwise.  Return TYPE. */
int _PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end);
/* Allocate and initialize a tok_state; return NULL if the allocation
   fails. */
struct tok_state *_PyTokenizer_tok_new(void);
/* Release the references and memory held by a tok_state, then the structure
   itself. */
void _PyTokenizer_Free(struct tok_state *);
/* Drop the reference held in token->metadata (the token itself is not
   freed). */
void _PyToken_Free(struct token *);
/* Set token->metadata to NULL. */
void _PyToken_Init(struct token *);
#endif