Skip to content

Commit 18b07d7

Browse files
bpo-36819: Fix crashes in built-in encoders with weird error handlers (GH-28593)
If the error handler returns a position less than or equal to the starting position of the non-encodable characters, most of the built-in encoders did not properly resize the output buffer. This led to out-of-bounds writes and segfaults.
1 parent 614420d commit 18b07d7

File tree

4 files changed

+222
-32
lines changed

4 files changed

+222
-32
lines changed

Lib/test/test_codeccallbacks.py

Lines changed: 168 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import codecs
22
import html.entities
3+
import itertools
34
import sys
45
import unicodedata
56
import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
2223
self.pos = len(exc.object)
2324
return ("<?>", oldpos)
2425

26+
class RepeatedPosReturn:
27+
def __init__(self, repl="<?>"):
28+
self.repl = repl
29+
self.pos = 0
30+
self.count = 0
31+
32+
def handle(self, exc):
33+
if self.count > 0:
34+
self.count -= 1
35+
return (self.repl, self.pos)
36+
return (self.repl, exc.end)
37+
2538
# A UnicodeEncodeError object with a bad start attribute
2639
class BadStartUnicodeEncodeError(UnicodeEncodeError):
2740
def __init__(self):
@@ -783,20 +796,104 @@ def test_lookup(self):
783796
codecs.lookup_error("namereplace")
784797
)
785798

786-
def test_unencodablereplacement(self):
799+
def test_encode_nonascii_replacement(self):
800+
def handle(exc):
801+
if isinstance(exc, UnicodeEncodeError):
802+
return (repl, exc.end)
803+
raise TypeError("don't know how to handle %r" % exc)
804+
codecs.register_error("test.replacing", handle)
805+
806+
for enc, input, repl in (
807+
("ascii", "[¤]", "abc"),
808+
("iso-8859-1", "[€]", "½¾"),
809+
("iso-8859-15", "[¤]", "œŸ"),
810+
):
811+
res = input.encode(enc, "test.replacing")
812+
self.assertEqual(res, ("[" + repl + "]").encode(enc))
813+
814+
for enc, input, repl in (
815+
("utf-8", "[\udc80]", "\U0001f40d"),
816+
("utf-16", "[\udc80]", "\U0001f40d"),
817+
("utf-32", "[\udc80]", "\U0001f40d"),
818+
):
819+
with self.subTest(encoding=enc):
820+
with self.assertRaises(UnicodeEncodeError) as cm:
821+
input.encode(enc, "test.replacing")
822+
exc = cm.exception
823+
self.assertEqual(exc.start, 1)
824+
self.assertEqual(exc.end, 2)
825+
self.assertEqual(exc.object, input)
826+
827+
def test_encode_unencodable_replacement(self):
787828
def unencrepl(exc):
788829
if isinstance(exc, UnicodeEncodeError):
789-
return ("\u4242", exc.end)
830+
return (repl, exc.end)
790831
else:
791832
raise TypeError("don't know how to handle %r" % exc)
792833
codecs.register_error("test.unencreplhandler", unencrepl)
793-
for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
794-
self.assertRaises(
795-
UnicodeEncodeError,
796-
"\u4242".encode,
797-
enc,
798-
"test.unencreplhandler"
799-
)
834+
835+
for enc, input, repl in (
836+
("ascii", "[¤]", "½"),
837+
("iso-8859-1", "[€]", "œ"),
838+
("iso-8859-15", "[¤]", "½"),
839+
("utf-8", "[\udc80]", "\udcff"),
840+
("utf-16", "[\udc80]", "\udcff"),
841+
("utf-32", "[\udc80]", "\udcff"),
842+
):
843+
with self.subTest(encoding=enc):
844+
with self.assertRaises(UnicodeEncodeError) as cm:
845+
input.encode(enc, "test.unencreplhandler")
846+
exc = cm.exception
847+
self.assertEqual(exc.start, 1)
848+
self.assertEqual(exc.end, 2)
849+
self.assertEqual(exc.object, input)
850+
851+
def test_encode_bytes_replacement(self):
852+
def handle(exc):
853+
if isinstance(exc, UnicodeEncodeError):
854+
return (repl, exc.end)
855+
raise TypeError("don't know how to handle %r" % exc)
856+
codecs.register_error("test.replacing", handle)
857+
858+
# It works even if the bytes sequence is not decodable.
859+
for enc, input, repl in (
860+
("ascii", "[¤]", b"\xbd\xbe"),
861+
("iso-8859-1", "[€]", b"\xbd\xbe"),
862+
("iso-8859-15", "[¤]", b"\xbd\xbe"),
863+
("utf-8", "[\udc80]", b"\xbd\xbe"),
864+
("utf-16le", "[\udc80]", b"\xbd\xbe"),
865+
("utf-16be", "[\udc80]", b"\xbd\xbe"),
866+
("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
867+
("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
868+
):
869+
with self.subTest(encoding=enc):
870+
res = input.encode(enc, "test.replacing")
871+
self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
872+
873+
def test_encode_odd_bytes_replacement(self):
874+
def handle(exc):
875+
if isinstance(exc, UnicodeEncodeError):
876+
return (repl, exc.end)
877+
raise TypeError("don't know how to handle %r" % exc)
878+
codecs.register_error("test.replacing", handle)
879+
880+
input = "[\udc80]"
881+
# Tests in which the replacement bytestring does not contain a whole number
881+
# of code units.
883+
for enc, repl in (
884+
*itertools.product(("utf-16le", "utf-16be"),
885+
[b"a", b"abc"]),
886+
*itertools.product(("utf-32le", "utf-32be"),
887+
[b"a", b"ab", b"abc", b"abcde"]),
888+
):
889+
with self.subTest(encoding=enc, repl=repl):
890+
with self.assertRaises(UnicodeEncodeError) as cm:
891+
input.encode(enc, "test.replacing")
892+
exc = cm.exception
893+
self.assertEqual(exc.start, 1)
894+
self.assertEqual(exc.end, 2)
895+
self.assertEqual(exc.object, input)
896+
self.assertEqual(exc.reason, "surrogates not allowed")
800897

801898
def test_badregistercall(self):
802899
# enhance coverage of:
@@ -940,6 +1037,68 @@ def __getitem__(self, key):
9401037
self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
9411038
self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
9421039

1040+
def test_decodehelper_bug36819(self):
1041+
handler = RepeatedPosReturn("x")
1042+
codecs.register_error("test.bug36819", handler.handle)
1043+
1044+
testcases = [
1045+
("ascii", b"\xff"),
1046+
("utf-8", b"\xff"),
1047+
("utf-16be", b'\xdc\x80'),
1048+
("utf-32be", b'\x00\x00\xdc\x80'),
1049+
("iso-8859-6", b"\xff"),
1050+
]
1051+
for enc, bad in testcases:
1052+
input = "abcd".encode(enc) + bad
1053+
with self.subTest(encoding=enc):
1054+
handler.count = 50
1055+
decoded = input.decode(enc, "test.bug36819")
1056+
self.assertEqual(decoded, 'abcdx' * 51)
1057+
1058+
def test_encodehelper_bug36819(self):
1059+
handler = RepeatedPosReturn()
1060+
codecs.register_error("test.bug36819", handler.handle)
1061+
1062+
input = "abcd\udc80"
1063+
encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in
1064+
encodings += ["iso-8859-15"] # charmap codec
1065+
if sys.platform == 'win32':
1066+
encodings = ["mbcs", "oem"] # code page codecs
1067+
1068+
handler.repl = "\udcff"
1069+
for enc in encodings:
1070+
with self.subTest(encoding=enc):
1071+
handler.count = 50
1072+
with self.assertRaises(UnicodeEncodeError) as cm:
1073+
input.encode(enc, "test.bug36819")
1074+
exc = cm.exception
1075+
self.assertEqual(exc.start, 4)
1076+
self.assertEqual(exc.end, 5)
1077+
self.assertEqual(exc.object, input)
1078+
if sys.platform == "win32":
1079+
handler.count = 50
1080+
with self.assertRaises(UnicodeEncodeError) as cm:
1081+
codecs.code_page_encode(437, input, "test.bug36819")
1082+
exc = cm.exception
1083+
self.assertEqual(exc.start, 4)
1084+
self.assertEqual(exc.end, 5)
1085+
self.assertEqual(exc.object, input)
1086+
1087+
handler.repl = "x"
1088+
for enc in encodings:
1089+
with self.subTest(encoding=enc):
1090+
# Without the fix, the interpreter should segfault after a handful of attempts.
1091+
# 50 was chosen to try to ensure a segfault without a fix,
1092+
# but not OOM a machine with one.
1093+
handler.count = 50
1094+
encoded = input.encode(enc, "test.bug36819")
1095+
self.assertEqual(encoded.decode(enc), "abcdx" * 51)
1096+
if sys.platform == "win32":
1097+
handler.count = 50
1098+
encoded = codecs.code_page_encode(437, input, "test.bug36819")
1099+
self.assertEqual(encoded[0].decode(), "abcdx" * 51)
1100+
self.assertEqual(encoded[1], len(input))
1101+
9431102
def test_translatehelper(self):
9441103
# enhance coverage of:
9451104
# Objects/unicodeobject.c::unicode_encode_call_errorhandler()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix crashes in built-in encoders with error handlers that return a position
2+
less than or equal to the starting position of non-encodable characters.

Objects/stringlib/codecs.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
387387
if (!rep)
388388
goto error;
389389

390-
/* subtract preallocated bytes */
391-
writer->min_size -= max_char_size * (newpos - startpos);
390+
if (newpos < startpos) {
391+
writer->overallocate = 1;
392+
p = _PyBytesWriter_Prepare(writer, p,
393+
max_char_size * (startpos - newpos));
394+
if (p == NULL)
395+
goto error;
396+
}
397+
else {
398+
/* subtract preallocated bytes */
399+
writer->min_size -= max_char_size * (newpos - startpos);
400+
/* Only overallocate the buffer if it's not the last write */
401+
writer->overallocate = (newpos < size);
402+
}
392403

393404
if (PyBytes_Check(rep)) {
394405
p = _PyBytesWriter_WriteBytes(writer, p,

0 commit comments

Comments
 (0)