From 5ba0a1aa1fe386fbc863d3fe8f32dfbfe2b1bded Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Sun, 9 Nov 2025 12:37:34 +0000 Subject: [PATCH] gh-136702: Deprecate passing non-ascii *encoding* (str) to `encodings.normalize_encoding` (#140030) Closes #136702 --- Doc/deprecations/pending-removal-in-3.17.rst | 6 ++++++ Lib/email/_header_value_parser.py | 4 ++++ Lib/email/utils.py | 4 ++++ Lib/encodings/__init__.py | 8 +++++++- Lib/test/test_codecs.py | 10 +++++++--- Lib/test/test_email/test_email.py | 3 ++- Lib/test/test_email/test_headerregistry.py | 10 +++++++++- .../2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst | 3 +++ 8 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst diff --git a/Doc/deprecations/pending-removal-in-3.17.rst b/Doc/deprecations/pending-removal-in-3.17.rst index 0a1c2f08ca..e769c9d371 100644 --- a/Doc/deprecations/pending-removal-in-3.17.rst +++ b/Doc/deprecations/pending-removal-in-3.17.rst @@ -23,6 +23,12 @@ Pending removal in Python 3.17 (Contributed by Shantanu Jain in :gh:`91896`.) +* :mod:`encodings`: + + - Passing non-ascii *encoding* names to :func:`encodings.normalize_encoding` + is deprecated and scheduled for removal in Python 3.17. + (Contributed by Stan Ulbrych in :gh:`136702`) + * :mod:`typing`: - Before Python 3.14, old-style unions were implemented using the private class diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 91243378dc..c7f665b399 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -796,6 +796,10 @@ class MimeParameters(TokenList): value = urllib.parse.unquote(value, encoding='latin-1') else: try: + # Explicitly look up the codec for warning generation, see gh-140030 + # Can be removed in 3.17 + import codecs + codecs.lookup(charset) value = value.decode(charset, 'surrogateescape') except (LookupError, UnicodeEncodeError): # XXX: there should really be a custom defect for diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 3de1f0d24a..d4824dc360 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -460,6 +460,10 @@ def collapse_rfc2231_value(value, errors='replace', charset = fallback_charset rawbytes = bytes(text, 'raw-unicode-escape') try: + # Explicitly look up the codec for warning generation, see gh-140030 + # Can be removed in 3.17 + import codecs + codecs.lookup(charset) return str(rawbytes, charset, errors) except LookupError: # charset is not a known codec. diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index e7e4ca3358..e205ec3263 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,7 +26,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs import sys @@ -56,6 +56,12 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") + if not encoding.isascii(): + import warnings + warnings.warn( + "Support for non-ascii encoding names will be removed in 3.17", + DeprecationWarning, stacklevel=2) + return _normalize_encoding(encoding) def search_function(encoding): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c35a450894..f1f0ac5ad3 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3886,15 +3886,14 @@ class CodecNameNormalizationTest(unittest.TestCase): self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa---8', 2, 3, 4)) - self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4)) self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4)) + with self.assertWarns(DeprecationWarning): + self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. @@ -3902,6 +3901,11 @@ class CodecNameNormalizationTest(unittest.TestCase): self.assertEqual(normalize('utf.8'), 'utf.8') self.assertEqual(normalize('utf...8'), 'utf...8') + # Non-ASCII *encoding* is deprecated. + with self.assertWarnsRegex(DeprecationWarning, + "Support for non-ascii encoding names will be removed in 3.17"): + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 4cd587bcd7..1900adf463 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -5738,7 +5738,8 @@ Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt """ msg = email.message_from_string(m) - self.assertEqual(msg.get_filename(), 'myfile.txt') + with self.assertWarns(DeprecationWarning): + self.assertEqual(msg.get_filename(), 'myfile.txt') def test_rfc2231_single_tick_in_filename_extended(self): eq = self.assertEqual diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index ff7a6da644..1d0d0a49a8 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -247,7 +247,15 @@ class TestContentTypeHeader(TestHeaderBase): decoded = args[2] if l>2 and args[2] is not DITTO else source header = 'Content-Type:' + ' ' if source else '' folded = args[3] if l>3 else header + decoded + '\n' - h = self.make_header('Content-Type', source) + # Both rfc2231 test cases with utf-8%E2%80%9D raise warnings, + # clear encoding cache to ensure test isolation. + if 'utf-8%E2%80%9D' in source and 'ascii' not in source: + import encodings + encodings._cache.clear() + with self.assertWarns(DeprecationWarning): + h = self.make_header('Content-Type', source) + else: + h = self.make_header('Content-Type', source) self.assertEqual(h.content_type, content_type) self.assertEqual(h.maintype, maintype) self.assertEqual(h.subtype, subtype) diff --git a/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst new file mode 100644 index 0000000000..88303f017f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst @@ -0,0 +1,3 @@ +:mod:`encodings`: Deprecate passing a non-ascii *encoding* name to +:func:`encodings.normalize_encoding` and schedule removal of support for +Python 3.17.