gh-136702: Deprecate passing non-ascii *encoding* (str) to encodings.normalize_encoding (#140030)

Closes #136702
2025-11-09 12:37:34 +00:00
parent 7ae440f262
commit 5ba0a1aa1f
8 changed files with 42 additions and 6 deletions
--- a/Doc/deprecations/pending-removal-in-3.17.rst
+++ b/Doc/deprecations/pending-removal-in-3.17.rst
@@ -23,6 +23,12 @@ Pending removal in Python 3.17
    (Contributed by Shantanu Jain in :gh:`91896`.)
 * :mod:`encodings`:
  - Passing non-ASCII *encoding* names to :func:`encodings.normalize_encoding`
    is deprecated and scheduled for removal in Python 3.17.
    (Contributed by Stan Ulbrych in :gh:`136702`.)
 * :mod:`typing`:
  - Before Python 3.14, old-style unions were implemented using the private class
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -796,6 +796,10 @@ class MimeParameters(TokenList):
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            # Explicitly look up the codec for warning generation, see gh-140030
                            # Can be removed in 3.17
                            import codecs
                            codecs.lookup(charset)
                            value = value.decode(charset, 'surrogateescape')
                        except (LookupError, UnicodeEncodeError):
                            # XXX: there should really be a custom defect for
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -460,6 +460,10 @@ def collapse_rfc2231_value(value, errors='replace',
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        # Explicitly look up the codec for warning generation, see gh-140030
        # Can be removed in 3.17
        import codecs
        codecs.lookup(charset)
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -26,7 +26,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
-"""#"
+"""
 import codecs
 import sys
@@ -56,6 +56,12 @@ def normalize_encoding(encoding):
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")
    if not encoding.isascii():
        import warnings
        warnings.warn(
            "Support for non-ascii encoding names will be removed in 3.17",
            DeprecationWarning, stacklevel=2)
    return _normalize_encoding(encoding)
 def search_function(encoding):
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3886,15 +3886,14 @@ class CodecNameNormalizationTest(unittest.TestCase):
        self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4))
        self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4))
        self.assertEqual(codecs.lookup('TEST.AAA   8'), ('test.aaa---8', 2, 3, 4))
        self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4))
        self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4))
        self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4))
        with self.assertWarns(DeprecationWarning):
            self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4))
    def test_encodings_normalize_encoding(self):
        # encodings.normalize_encoding() ignores non-ASCII characters.
        normalize = encodings.normalize_encoding
        self.assertEqual(normalize('utf_8'), 'utf_8')
        self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
        self.assertEqual(normalize('utf   8'), 'utf_8')
        # encodings.normalize_encoding() doesn't convert
        # characters to lower case.
@@ -3902,6 +3901,11 @@ class CodecNameNormalizationTest(unittest.TestCase):
        self.assertEqual(normalize('utf.8'), 'utf.8')
        self.assertEqual(normalize('utf...8'), 'utf...8')
        # Non-ASCII *encoding* is deprecated.
        with self.assertWarnsRegex(DeprecationWarning,
                "Support for non-ascii encoding names will be removed in 3.17"):
            self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
 if __name__ == "__main__":
    unittest.main()
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -5738,7 +5738,8 @@ Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
 """
        msg = email.message_from_string(m)
-        self.assertEqual(msg.get_filename(), 'myfile.txt')
+        with self.assertWarns(DeprecationWarning):
            self.assertEqual(msg.get_filename(), 'myfile.txt')
    def test_rfc2231_single_tick_in_filename_extended(self):
        eq = self.assertEqual
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -247,7 +247,15 @@ class TestContentTypeHeader(TestHeaderBase):
        decoded =  args[2] if l>2 and args[2] is not DITTO else source
        header = 'Content-Type:' + ' ' if source else ''
        folded = args[3] if l>3 else header + decoded + '\n'
-        h = self.make_header('Content-Type', source)
+        # Both rfc2231 test cases with utf-8%E2%80%9D raise warnings,
        # clear encoding cache to ensure test isolation.
        if 'utf-8%E2%80%9D' in source and 'ascii' not in source:
            import encodings
            encodings._cache.clear()
            with self.assertWarns(DeprecationWarning):
                h = self.make_header('Content-Type', source)
        else:
            h = self.make_header('Content-Type', source)
        self.assertEqual(h.content_type, content_type)
        self.assertEqual(h.maintype, maintype)
        self.assertEqual(h.subtype, subtype)
--- a/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst
+++ b/Misc/NEWS.d/next/Library/2025-10-13-11-25-41.gh-issue-136702.uvLGK1.rst
@@ -0,0 +1,3 @@
 :mod:`encodings`: Deprecate passing a non-ASCII *encoding* name to
 :func:`encodings.normalize_encoding`; support for such names is scheduled
 for removal in Python 3.17.