gh-63161: Fix PEP 263 support (GH-139481)
* Support non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors in comments for the UTF-8 encoding.
* Include the decoding error position for the default encoding in SyntaxError.
This commit is contained in:
@@ -224,6 +224,8 @@ class ExceptionTests(unittest.TestCase):
|
|||||||
if not isinstance(src, str):
|
if not isinstance(src, str):
|
||||||
src = src.decode(encoding, 'replace')
|
src = src.decode(encoding, 'replace')
|
||||||
line = src.split('\n')[lineno-1]
|
line = src.split('\n')[lineno-1]
|
||||||
|
if lineno == 1:
|
||||||
|
line = line.removeprefix('\ufeff')
|
||||||
self.assertIn(line, cm.exception.text)
|
self.assertIn(line, cm.exception.text)
|
||||||
|
|
||||||
def test_error_offset_continuation_characters(self):
|
def test_error_offset_continuation_characters(self):
|
||||||
@@ -239,7 +241,9 @@ class ExceptionTests(unittest.TestCase):
|
|||||||
check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
|
check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
|
||||||
check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
|
check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
|
||||||
2, 19, encoding='cp1251')
|
2, 19, encoding='cp1251')
|
||||||
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
|
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
|
||||||
|
check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
|
||||||
|
check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
|
||||||
check('x = "a', 1, 5)
|
check('x = "a', 1, 5)
|
||||||
check('lambda x: x = 2', 1, 1)
|
check('lambda x: x = 2', 1, 1)
|
||||||
check('f{a + b + c}', 1, 2)
|
check('f{a + b + c}', 1, 2)
|
||||||
@@ -287,7 +291,7 @@ class ExceptionTests(unittest.TestCase):
|
|||||||
check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
|
check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
|
||||||
check("(1+)", 1, 4)
|
check("(1+)", 1, 4)
|
||||||
check("[interesting\nfoo()\n", 1, 1)
|
check("[interesting\nfoo()\n", 1, 1)
|
||||||
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
|
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
|
||||||
check("""f'''
|
check("""f'''
|
||||||
{
|
{
|
||||||
(123_a)
|
(123_a)
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
|
from test import support
|
||||||
|
from test.support import script_helper
|
||||||
from test.support.os_helper import TESTFN, unlink, rmtree
|
from test.support.os_helper import TESTFN, unlink, rmtree
|
||||||
from test.support.import_helper import unload
|
from test.support.import_helper import unload
|
||||||
import importlib
|
import importlib
|
||||||
@@ -64,7 +65,7 @@ class MiscSourceEncodingTest(unittest.TestCase):
|
|||||||
# two bytes in common with the UTF-8 BOM
|
# two bytes in common with the UTF-8 BOM
|
||||||
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
|
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
|
||||||
|
|
||||||
@requires_subprocess()
|
@support.requires_subprocess()
|
||||||
def test_20731(self):
|
def test_20731(self):
|
||||||
sub = subprocess.Popen([sys.executable,
|
sub = subprocess.Popen([sys.executable,
|
||||||
os.path.join(os.path.dirname(__file__),
|
os.path.join(os.path.dirname(__file__),
|
||||||
@@ -267,6 +268,17 @@ class AbstractSourceEncodingTest:
|
|||||||
b'print(ascii("\xc3\xa4"))\n')
|
b'print(ascii("\xc3\xa4"))\n')
|
||||||
self.check_script_output(src, br"'\xc3\u20ac'")
|
self.check_script_output(src, br"'\xc3\u20ac'")
|
||||||
|
|
||||||
|
def test_first_utf8_coding_line_error(self):
|
||||||
|
src = (b'#coding:ascii \xc3\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
|
||||||
|
|
||||||
|
def test_second_utf8_coding_line_error(self):
|
||||||
|
src = (b'#!/usr/bin/python\n'
|
||||||
|
b'#coding:ascii \xc3\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
|
||||||
|
|
||||||
def test_utf8_bom(self):
|
def test_utf8_bom(self):
|
||||||
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
|
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
|
||||||
self.check_script_output(src, br"'\xe4'")
|
self.check_script_output(src, br"'\xe4'")
|
||||||
@@ -282,10 +294,80 @@ class AbstractSourceEncodingTest:
|
|||||||
b'print(ascii("\xc3\xa4"))\n')
|
b'print(ascii("\xc3\xa4"))\n')
|
||||||
self.check_script_output(src, br"'\xe4'")
|
self.check_script_output(src, br"'\xe4'")
|
||||||
|
|
||||||
def test_utf8_non_utf8_comment_line_error(self):
|
def test_utf8_bom_and_non_utf8_first_coding_line(self):
|
||||||
|
src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"encoding problem: iso-8859-15 with BOM",
|
||||||
|
lineno=1)
|
||||||
|
|
||||||
|
def test_utf8_bom_and_non_utf8_second_coding_line(self):
|
||||||
|
src = (b'\xef\xbb\xbf#first\n'
|
||||||
|
b'#coding:iso-8859-15\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"encoding problem: iso-8859-15 with BOM",
|
||||||
|
lineno=2)
|
||||||
|
|
||||||
|
def test_non_utf8_shebang(self):
|
||||||
|
src = (b'#!/home/\xa4/bin/python\n'
|
||||||
|
b'#coding:iso-8859-15\n'
|
||||||
|
b'print(ascii("\xc3\xa4"))\n')
|
||||||
|
self.check_script_output(src, br"'\xc3\u20ac'")
|
||||||
|
|
||||||
|
def test_utf8_shebang_error(self):
|
||||||
|
src = (b'#!/home/\xc3\xa4/bin/python\n'
|
||||||
|
b'#coding:ascii\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
|
||||||
|
|
||||||
|
def test_non_utf8_shebang_error(self):
|
||||||
|
src = (b'#!/home/\xa4/bin/python\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
|
||||||
|
lineno=1)
|
||||||
|
|
||||||
|
def test_non_utf8_second_line_error(self):
|
||||||
|
src = (b'#first\n'
|
||||||
|
b'#second\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"Non-UTF-8 code starting with .* on line 2",
|
||||||
|
lineno=2)
|
||||||
|
|
||||||
|
def test_non_utf8_third_line_error(self):
|
||||||
|
src = (b'#first\n'
|
||||||
|
b'#second\n'
|
||||||
|
b'#third\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"Non-UTF-8 code starting with .* on line 3",
|
||||||
|
lineno=3)
|
||||||
|
|
||||||
|
def test_utf8_bom_non_utf8_third_line_error(self):
|
||||||
|
src = (b'\xef\xbb\xbf#first\n'
|
||||||
|
b'#second\n'
|
||||||
|
b'#third\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"Non-UTF-8 code starting with .* on line 3|"
|
||||||
|
br"'utf-8' codec can't decode byte",
|
||||||
|
lineno=3)
|
||||||
|
|
||||||
|
def test_utf_8_non_utf8_third_line_error(self):
|
||||||
|
src = (b'#coding: utf-8\n'
|
||||||
|
b'#second\n'
|
||||||
|
b'#third\xa4\n'
|
||||||
|
b'raise RuntimeError\n')
|
||||||
|
self.check_script_error(src,
|
||||||
|
br"Non-UTF-8 code starting with .* on line 3|"
|
||||||
|
br"'utf-8' codec can't decode byte",
|
||||||
|
lineno=3)
|
||||||
|
|
||||||
|
def test_utf8_non_utf8_third_line_error(self):
|
||||||
src = (b'#coding: utf8\n'
|
src = (b'#coding: utf8\n'
|
||||||
b'#\n'
|
b'#second\n'
|
||||||
b'#\xa4\n'
|
b'#third\xa4\n'
|
||||||
b'raise RuntimeError\n')
|
b'raise RuntimeError\n')
|
||||||
self.check_script_error(src,
|
self.check_script_error(src,
|
||||||
br"'utf-8' codec can't decode byte|"
|
br"'utf-8' codec can't decode byte|"
|
||||||
@@ -326,7 +408,7 @@ class AbstractSourceEncodingTest:
|
|||||||
class UTF8ValidatorTest(unittest.TestCase):
|
class UTF8ValidatorTest(unittest.TestCase):
|
||||||
@unittest.skipIf(not sys.platform.startswith("linux"),
|
@unittest.skipIf(not sys.platform.startswith("linux"),
|
||||||
"Too slow to run on non-Linux platforms")
|
"Too slow to run on non-Linux platforms")
|
||||||
@requires_resource('cpu')
|
@support.requires_resource('cpu')
|
||||||
def test_invalid_utf8(self):
|
def test_invalid_utf8(self):
|
||||||
# This is a port of test_utf8_decode_invalid_sequences in
|
# This is a port of test_utf8_decode_invalid_sequences in
|
||||||
# test_unicode.py to exercise the separate utf8 validator in
|
# test_unicode.py to exercise the separate utf8 validator in
|
||||||
@@ -392,19 +474,29 @@ class UTF8ValidatorTest(unittest.TestCase):
|
|||||||
check(b'\xF4'+cb+b'\xBF\xBF')
|
check(b'\xF4'+cb+b'\xBF\xBF')
|
||||||
|
|
||||||
|
|
||||||
|
@support.force_not_colorized_test_class
|
||||||
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
||||||
|
|
||||||
def check_script_output(self, src, expected):
|
def check_script_output(self, src, expected):
|
||||||
with captured_stdout() as stdout:
|
with support.captured_stdout() as stdout:
|
||||||
exec(src)
|
exec(src)
|
||||||
out = stdout.getvalue().encode('latin1')
|
out = stdout.getvalue().encode('latin1')
|
||||||
self.assertEqual(out.rstrip(), expected)
|
self.assertEqual(out.rstrip(), expected)
|
||||||
|
|
||||||
def check_script_error(self, src, expected):
|
def check_script_error(self, src, expected, lineno=...):
|
||||||
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
|
with self.assertRaises(SyntaxError) as cm:
|
||||||
exec(src)
|
exec(src)
|
||||||
|
exc = cm.exception
|
||||||
|
self.assertRegex(str(exc), expected.decode())
|
||||||
|
if lineno is not ...:
|
||||||
|
self.assertEqual(exc.lineno, lineno)
|
||||||
|
line = src.splitlines()[lineno-1].decode(errors='replace')
|
||||||
|
if lineno == 1:
|
||||||
|
line = line.removeprefix('\ufeff')
|
||||||
|
self.assertEqual(line, exc.text)
|
||||||
|
|
||||||
|
|
||||||
|
@support.force_not_colorized_test_class
|
||||||
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
||||||
|
|
||||||
def check_script_output(self, src, expected):
|
def check_script_output(self, src, expected):
|
||||||
@@ -415,13 +507,22 @@ class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
|||||||
res = script_helper.assert_python_ok(fn)
|
res = script_helper.assert_python_ok(fn)
|
||||||
self.assertEqual(res.out.rstrip(), expected)
|
self.assertEqual(res.out.rstrip(), expected)
|
||||||
|
|
||||||
def check_script_error(self, src, expected):
|
def check_script_error(self, src, expected, lineno=...):
|
||||||
with tempfile.TemporaryDirectory() as tmpd:
|
with tempfile.TemporaryDirectory() as tmpd:
|
||||||
fn = os.path.join(tmpd, 'test.py')
|
fn = os.path.join(tmpd, 'test.py')
|
||||||
with open(fn, 'wb') as fp:
|
with open(fn, 'wb') as fp:
|
||||||
fp.write(src)
|
fp.write(src)
|
||||||
res = script_helper.assert_python_failure(fn)
|
res = script_helper.assert_python_failure(fn)
|
||||||
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
|
err = res.err.rstrip()
|
||||||
|
self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected)
|
||||||
|
if lineno is not ...:
|
||||||
|
self.assertIn(f', line {lineno}\n'.encode(),
|
||||||
|
err.replace(os.linesep.encode(), b'\n'))
|
||||||
|
line = src.splitlines()[lineno-1].decode(errors='replace')
|
||||||
|
if lineno == 1:
|
||||||
|
line = line.removeprefix('\ufeff')
|
||||||
|
self.assertIn(line.encode(), err)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
Support non-UTF-8 shebang and comments in Python source files if a non-UTF-8
|
||||||
|
encoding is specified. Detect decoding errors in comments for the default (UTF-8)
|
||||||
|
encoding. Show the line and position of the decoding error for the default encoding
|
||||||
|
in a traceback. In a traceback, show the line containing the coding cookie when it
|
||||||
|
conflicts with the BOM.
|
||||||
@@ -2,6 +2,7 @@
|
|||||||
#include <errcode.h>
|
#include <errcode.h>
|
||||||
|
|
||||||
#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
|
#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
|
||||||
|
#include "pycore_runtime.h" // _Py_ID()
|
||||||
#include "lexer/state.h"
|
#include "lexer/state.h"
|
||||||
#include "lexer/lexer.h"
|
#include "lexer/lexer.h"
|
||||||
#include "pegen.h"
|
#include "pegen.h"
|
||||||
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
|
|||||||
PyObject *value;
|
PyObject *value;
|
||||||
PyObject *tback;
|
PyObject *tback;
|
||||||
PyErr_Fetch(&type, &value, &tback);
|
PyErr_Fetch(&type, &value, &tback);
|
||||||
|
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
|
||||||
|
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
PyErr_Restore(type, value, tback);
|
||||||
|
return;
|
||||||
|
}
|
||||||
errstr = PyObject_Str(value);
|
errstr = PyObject_Str(value);
|
||||||
if (!errstr) {
|
if (!errstr) {
|
||||||
goto error;
|
goto error;
|
||||||
|
|||||||
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
tok_underflow_file(struct tok_state *tok) {
|
tok_underflow_file(struct tok_state *tok)
|
||||||
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
|
{
|
||||||
tok->cur = tok->inp = tok->buf;
|
|
||||||
}
|
|
||||||
if (tok->decoding_state == STATE_INIT) {
|
if (tok->decoding_state == STATE_INIT) {
|
||||||
/* We have not yet determined the encoding.
|
/* We have not yet determined the encoding.
|
||||||
If an encoding is found, use the file-pointer
|
If an encoding is found, use the file-pointer
|
||||||
@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
|
|||||||
}
|
}
|
||||||
assert(tok->decoding_state != STATE_INIT);
|
assert(tok->decoding_state != STATE_INIT);
|
||||||
}
|
}
|
||||||
|
int raw = tok->decoding_readline == NULL;
|
||||||
|
if (raw && tok->decoding_state != STATE_NORMAL) {
|
||||||
|
/* Keep the first line in the buffer to validate it later if
|
||||||
|
* the encoding has not yet been determined. */
|
||||||
|
}
|
||||||
|
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
|
||||||
|
tok->cur = tok->inp = tok->buf;
|
||||||
|
}
|
||||||
/* Read until '\n' or EOF */
|
/* Read until '\n' or EOF */
|
||||||
if (tok->decoding_readline != NULL) {
|
if (!raw) {
|
||||||
/* We already have a codec associated with this input. */
|
/* We already have a codec associated with this input. */
|
||||||
if (!tok_readline_recode(tok)) {
|
if (!tok_readline_recode(tok)) {
|
||||||
return 0;
|
return 0;
|
||||||
@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {
|
|||||||
|
|
||||||
ADVANCE_LINENO();
|
ADVANCE_LINENO();
|
||||||
if (tok->decoding_state != STATE_NORMAL) {
|
if (tok->decoding_state != STATE_NORMAL) {
|
||||||
if (tok->lineno > 2) {
|
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
|
||||||
tok->decoding_state = STATE_NORMAL;
|
|
||||||
}
|
|
||||||
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
|
|
||||||
tok, fp_setreadl))
|
tok, fp_setreadl))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
if (tok->lineno >= 2) {
|
||||||
|
tok->decoding_state = STATE_NORMAL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/* The default encoding is UTF-8, so make sure we don't have any
|
if (raw && tok->decoding_state == STATE_NORMAL) {
|
||||||
non-UTF-8 sequences in it. */
|
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
|
||||||
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
|
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
|
||||||
_PyTokenizer_error_ret(tok);
|
if (!tok->encoding) {
|
||||||
return 0;
|
/* The default encoding is UTF-8, so make sure we don't have any
|
||||||
|
non-UTF-8 sequences in it. */
|
||||||
|
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
|
||||||
|
_PyTokenizer_error_ret(tok);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
|
||||||
|
tok->encoding, NULL);
|
||||||
|
if (tmp == NULL) {
|
||||||
|
_PyTokenizer_error_ret(tok);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
Py_DECREF(tmp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
assert(tok->done == E_OK);
|
assert(tok->done == E_OK);
|
||||||
return tok->done == E_OK;
|
return tok->done == E_OK;
|
||||||
|
|||||||
@@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
|
args = Py_BuildValue("(O(OiiNii))", errmsg,
|
||||||
col_offset, errtext, tok->lineno, end_col_offset);
|
tok->filename ? tok->filename : Py_None,
|
||||||
|
tok->lineno, col_offset, errtext,
|
||||||
|
tok->lineno, end_col_offset);
|
||||||
if (args) {
|
if (args) {
|
||||||
PyErr_SetObject(PyExc_SyntaxError, args);
|
PyErr_SetObject(PyExc_SyntaxError, args);
|
||||||
Py_DECREF(args);
|
Py_DECREF(args);
|
||||||
@@ -422,10 +424,13 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
|
|||||||
tok->encoding = cs;
|
tok->encoding = cs;
|
||||||
} else { /* then, compare cs with BOM */
|
} else { /* then, compare cs with BOM */
|
||||||
if (strcmp(tok->encoding, cs) != 0) {
|
if (strcmp(tok->encoding, cs) != 0) {
|
||||||
_PyTokenizer_error_ret(tok);
|
tok->line_start = line;
|
||||||
PyErr_Format(PyExc_SyntaxError,
|
tok->cur = (char *)line;
|
||||||
"encoding problem: %s with BOM", cs);
|
assert(size <= INT_MAX);
|
||||||
|
_PyTokenizer_syntaxerror_known_range(tok, 0, (int)size,
|
||||||
|
"encoding problem: %s with BOM", cs);
|
||||||
PyMem_Free(cs);
|
PyMem_Free(cs);
|
||||||
|
_PyTokenizer_error_ret(tok);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
PyMem_Free(cs);
|
PyMem_Free(cs);
|
||||||
@@ -496,24 +501,38 @@ valid_utf8(const unsigned char* s)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
|
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
|
||||||
{
|
{
|
||||||
int badchar = 0;
|
const char *badchar = NULL;
|
||||||
unsigned char *c;
|
const char *c;
|
||||||
int length;
|
int length;
|
||||||
for (c = (unsigned char *)line; *c; c += length) {
|
int col_offset = 0;
|
||||||
if (!(length = valid_utf8(c))) {
|
const char *line_start = line;
|
||||||
badchar = *c;
|
for (c = line; *c; c += length) {
|
||||||
|
if (!(length = valid_utf8((const unsigned char *)c))) {
|
||||||
|
badchar = c;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
col_offset++;
|
||||||
|
if (*c == '\n') {
|
||||||
|
lineno++;
|
||||||
|
col_offset = 0;
|
||||||
|
line_start = c + 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (badchar) {
|
if (badchar) {
|
||||||
PyErr_Format(PyExc_SyntaxError,
|
tok->lineno = lineno;
|
||||||
"Non-UTF-8 code starting with '\\x%.2x' "
|
tok->line_start = line_start;
|
||||||
"in file %U on line %i, "
|
tok->cur = (char *)badchar;
|
||||||
"but no encoding declared; "
|
_PyTokenizer_syntaxerror_known_range(tok,
|
||||||
"see https://peps.python.org/pep-0263/ for details",
|
col_offset + 1, col_offset + 1,
|
||||||
badchar, tok->filename, tok->lineno);
|
"Non-UTF-8 code starting with '\\x%.2x'"
|
||||||
|
"%s%V on line %i, "
|
||||||
|
"but no encoding declared; "
|
||||||
|
"see https://peps.python.org/pep-0263/ for details",
|
||||||
|
(unsigned char)*badchar,
|
||||||
|
tok->filename ? " in file " : "", tok->filename, "",
|
||||||
|
lineno);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *),
|
|||||||
struct tok_state *tok);
|
struct tok_state *tok);
|
||||||
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
|
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
|
||||||
int set_readline(struct tok_state *, const char *));
|
int set_readline(struct tok_state *, const char *));
|
||||||
int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok);
|
int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno);
|
||||||
|
|
||||||
#ifdef Py_DEBUG
|
#ifdef Py_DEBUG
|
||||||
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);
|
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) {
|
|||||||
ADVANCE_LINENO();
|
ADVANCE_LINENO();
|
||||||
/* The default encoding is UTF-8, so make sure we don't have any
|
/* The default encoding is UTF-8, so make sure we don't have any
|
||||||
non-UTF-8 sequences in it. */
|
non-UTF-8 sequences in it. */
|
||||||
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
|
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) {
|
||||||
_PyTokenizer_error_ret(tok);
|
_PyTokenizer_error_ret(tok);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
|
|||||||
/* need to check line 1 and 2 separately since check_coding_spec
|
/* need to check line 1 and 2 separately since check_coding_spec
|
||||||
assumes a single line as input */
|
assumes a single line as input */
|
||||||
if (newl[0]) {
|
if (newl[0]) {
|
||||||
|
tok->lineno = 1;
|
||||||
if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
|
if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
|
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
|
||||||
|
tok->lineno = 2;
|
||||||
if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
|
if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
|
||||||
tok, buf_setreadl))
|
tok, buf_setreadl))
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
tok->lineno = 0;
|
||||||
if (tok->enc != NULL) {
|
if (tok->enc != NULL) {
|
||||||
assert(utf8 == NULL);
|
assert(utf8 == NULL);
|
||||||
utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
|
utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
|
||||||
@@ -102,6 +105,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
|
|||||||
return _PyTokenizer_error_ret(tok);
|
return _PyTokenizer_error_ret(tok);
|
||||||
str = PyBytes_AS_STRING(utf8);
|
str = PyBytes_AS_STRING(utf8);
|
||||||
}
|
}
|
||||||
|
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
|
||||||
|
return _PyTokenizer_error_ret(tok);
|
||||||
|
}
|
||||||
assert(tok->decoding_buffer == NULL);
|
assert(tok->decoding_buffer == NULL);
|
||||||
tok->decoding_buffer = utf8; /* CAUTION */
|
tok->decoding_buffer = utf8; /* CAUTION */
|
||||||
return str;
|
return str;
|
||||||
|
|||||||
Reference in New Issue
Block a user