1
1
import codecs
2
2
import html .entities
3
+ import itertools
3
4
import sys
4
5
import unicodedata
5
6
import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
22
23
self .pos = len (exc .object )
23
24
return ("<?>" , oldpos )
24
25
26
class RepeatedPosReturn:
    """Codec error handler that can keep sending the codec back to ``pos``.

    While ``count`` is positive, each call to :meth:`handle` decrements it
    and asks the codec to resume at ``self.pos``.  Once ``count`` is
    exhausted the handler asks the codec to resume at ``exc.end``.
    The replacement returned is always ``self.repl``.
    """

    def __init__(self, repl="<?>"):
        # Replacement value, resume position, and number of repeats left.
        self.repl, self.pos, self.count = repl, 0, 0

    def handle(self, exc):
        """Return ``(replacement, resume_position)`` for the error *exc*."""
        if self.count <= 0:
            # No repeats left: continue after the offending input.
            return (self.repl, exc.end)
        self.count -= 1
        return (self.repl, self.pos)
37
+
25
38
# A UnicodeEncodeError object with a bad start attribute
26
39
class BadStartUnicodeEncodeError (UnicodeEncodeError ):
27
40
def __init__ (self ):
@@ -783,20 +796,104 @@ def test_lookup(self):
783
796
codecs .lookup_error ("namereplace" )
784
797
)
785
798
786
- def test_unencodablereplacement (self ):
799
+ def test_encode_nonascii_replacement (self ):
800
+ def handle (exc ):
801
+ if isinstance (exc , UnicodeEncodeError ):
802
+ return (repl , exc .end )
803
+ raise TypeError ("don't know how to handle %r" % exc )
804
+ codecs .register_error ("test.replacing" , handle )
805
+
806
+ for enc , input , repl in (
807
+ ("ascii" , "[¤]" , "abc" ),
808
+ ("iso-8859-1" , "[€]" , "½¾" ),
809
+ ("iso-8859-15" , "[¤]" , "œŸ" ),
810
+ ):
811
+ res = input .encode (enc , "test.replacing" )
812
+ self .assertEqual (res , ("[" + repl + "]" ).encode (enc ))
813
+
814
+ for enc , input , repl in (
815
+ ("utf-8" , "[\udc80 ]" , "\U0001f40d " ),
816
+ ("utf-16" , "[\udc80 ]" , "\U0001f40d " ),
817
+ ("utf-32" , "[\udc80 ]" , "\U0001f40d " ),
818
+ ):
819
+ with self .subTest (encoding = enc ):
820
+ with self .assertRaises (UnicodeEncodeError ) as cm :
821
+ input .encode (enc , "test.replacing" )
822
+ exc = cm .exception
823
+ self .assertEqual (exc .start , 1 )
824
+ self .assertEqual (exc .end , 2 )
825
+ self .assertEqual (exc .object , input )
826
+
827
+ def test_encode_unencodable_replacement (self ):
787
828
def unencrepl (exc ):
788
829
if isinstance (exc , UnicodeEncodeError ):
789
- return (" \u4242 " , exc .end )
830
+ return (repl , exc .end )
790
831
else :
791
832
raise TypeError ("don't know how to handle %r" % exc )
792
833
codecs .register_error ("test.unencreplhandler" , unencrepl )
793
- for enc in ("ascii" , "iso-8859-1" , "iso-8859-15" ):
794
- self .assertRaises (
795
- UnicodeEncodeError ,
796
- "\u4242 " .encode ,
797
- enc ,
798
- "test.unencreplhandler"
799
- )
834
+
835
+ for enc , input , repl in (
836
+ ("ascii" , "[¤]" , "½" ),
837
+ ("iso-8859-1" , "[€]" , "œ" ),
838
+ ("iso-8859-15" , "[¤]" , "½" ),
839
+ ("utf-8" , "[\udc80 ]" , "\udcff " ),
840
+ ("utf-16" , "[\udc80 ]" , "\udcff " ),
841
+ ("utf-32" , "[\udc80 ]" , "\udcff " ),
842
+ ):
843
+ with self .subTest (encoding = enc ):
844
+ with self .assertRaises (UnicodeEncodeError ) as cm :
845
+ input .encode (enc , "test.unencreplhandler" )
846
+ exc = cm .exception
847
+ self .assertEqual (exc .start , 1 )
848
+ self .assertEqual (exc .end , 2 )
849
+ self .assertEqual (exc .object , input )
850
+
851
+ def test_encode_bytes_replacement (self ):
852
+ def handle (exc ):
853
+ if isinstance (exc , UnicodeEncodeError ):
854
+ return (repl , exc .end )
855
+ raise TypeError ("don't know how to handle %r" % exc )
856
+ codecs .register_error ("test.replacing" , handle )
857
+
858
+ # It works even if the bytes sequence is not decodable.
859
+ for enc , input , repl in (
860
+ ("ascii" , "[¤]" , b"\xbd \xbe " ),
861
+ ("iso-8859-1" , "[€]" , b"\xbd \xbe " ),
862
+ ("iso-8859-15" , "[¤]" , b"\xbd \xbe " ),
863
+ ("utf-8" , "[\udc80 ]" , b"\xbd \xbe " ),
864
+ ("utf-16le" , "[\udc80 ]" , b"\xbd \xbe " ),
865
+ ("utf-16be" , "[\udc80 ]" , b"\xbd \xbe " ),
866
+ ("utf-32le" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
867
+ ("utf-32be" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
868
+ ):
869
+ with self .subTest (encoding = enc ):
870
+ res = input .encode (enc , "test.replacing" )
871
+ self .assertEqual (res , "[" .encode (enc ) + repl + "]" .encode (enc ))
872
+
873
+ def test_encode_odd_bytes_replacement (self ):
874
+ def handle (exc ):
875
+ if isinstance (exc , UnicodeEncodeError ):
876
+ return (repl , exc .end )
877
+ raise TypeError ("don't know how to handle %r" % exc )
878
+ codecs .register_error ("test.replacing" , handle )
879
+
880
+ input = "[\udc80 ]"
881
+ # Tests in which the replacement bytestring contains not whole number
882
+ # of code units.
883
+ for enc , repl in (
884
+ * itertools .product (("utf-16le" , "utf-16be" ),
885
+ [b"a" , b"abc" ]),
886
+ * itertools .product (("utf-32le" , "utf-32be" ),
887
+ [b"a" , b"ab" , b"abc" , b"abcde" ]),
888
+ ):
889
+ with self .subTest (encoding = enc , repl = repl ):
890
+ with self .assertRaises (UnicodeEncodeError ) as cm :
891
+ input .encode (enc , "test.replacing" )
892
+ exc = cm .exception
893
+ self .assertEqual (exc .start , 1 )
894
+ self .assertEqual (exc .end , 2 )
895
+ self .assertEqual (exc .object , input )
896
+ self .assertEqual (exc .reason , "surrogates not allowed" )
800
897
801
898
def test_badregistercall (self ):
802
899
# enhance coverage of:
@@ -940,6 +1037,68 @@ def __getitem__(self, key):
940
1037
self .assertRaises (ValueError , codecs .charmap_encode , "\xff " , err , D ())
941
1038
self .assertRaises (TypeError , codecs .charmap_encode , "\xff " , err , {0xff : 300 })
942
1039
1040
def test_decodehelper_bug36819(self):
    """Decoders must cope with a handler that keeps rewinding to 0.

    ``RepeatedPosReturn`` sends the decoder back to position 0 for
    ``count`` calls and then lets it continue after the bad bytes.  With
    ``count == 50`` the ``abcd`` prefix is decoded 51 times, each time
    followed by the replacement ``"x"``.
    """
    handler = RepeatedPosReturn("x")
    codecs.register_error("test.bug36819", handler.handle)

    # (encoding, byte sequence that the encoding cannot decode).
    # The bad sequence must be the very end of the data: any trailing
    # byte would be decoded after the last replacement and break the
    # expected 'abcdx' * 51 result.
    testcases = [
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-16be", b'\xdc\x80'),
        ("utf-32be", b'\x00\x00\xdc\x80'),
        ("iso-8859-6", b"\xff"),
    ]
    for enc, bad in testcases:
        data = "abcd".encode(enc) + bad
        with self.subTest(encoding=enc):
            # The counter is consumed by each run, so re-arm it per case.
            handler.count = 50
            decoded = data.decode(enc, "test.bug36819")
            self.assertEqual(decoded, 'abcdx' * 51)
1057
+
1058
def test_encodehelper_bug36819(self):
    """Encoders must cope with a handler that keeps rewinding to 0.

    ``RepeatedPosReturn`` sends the encoder back to position 0 for
    ``count`` calls and then lets it continue after the lone surrogate.
    Two replacements are exercised: an unencodable one, which must
    raise ``UnicodeEncodeError`` for the original character, and an
    encodable one, which must yield ``"abcdx"`` repeated 51 times.
    """
    handler = RepeatedPosReturn()
    codecs.register_error("test.bug36819", handler.handle)

    text = "abcd\udc80"
    encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
    encodings += ["iso-8859-15"]  # charmap codec
    if sys.platform == 'win32':
        # Extend rather than replace, so that the platform-independent
        # codecs above are still exercised on Windows.
        encodings += ["mbcs", "oem"]  # code page codecs

    # Replacement that no tested codec can encode: the error must be
    # reported against the original input.
    handler.repl = "\udcff"
    for enc in encodings:
        with self.subTest(encoding=enc):
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as cm:
                text.encode(enc, "test.bug36819")
            exc = cm.exception
            self.assertEqual(exc.start, 4)
            self.assertEqual(exc.end, 5)
            self.assertEqual(exc.object, text)
    if sys.platform == "win32":
        handler.count = 50
        with self.assertRaises(UnicodeEncodeError) as cm:
            codecs.code_page_encode(437, text, "test.bug36819")
        exc = cm.exception
        self.assertEqual(exc.start, 4)
        self.assertEqual(exc.end, 5)
        self.assertEqual(exc.object, text)

    # Encodable replacement: the output must round-trip.
    handler.repl = "x"
    for enc in encodings:
        with self.subTest(encoding=enc):
            # The interpreter should segfault after a handful of attempts.
            # 50 was chosen to try to ensure a segfault without a fix,
            # but not OOM a machine with one.
            handler.count = 50
            encoded = text.encode(enc, "test.bug36819")
            self.assertEqual(encoded.decode(enc), "abcdx" * 51)
    if sys.platform == "win32":
        handler.count = 50
        encoded = codecs.code_page_encode(437, text, "test.bug36819")
        # code_page_encode() returns (bytes, number of characters consumed).
        self.assertEqual(encoded[0].decode(), "abcdx" * 51)
        self.assertEqual(encoded[1], len(text))
1101
+
943
1102
def test_translatehelper (self ):
944
1103
# enhance coverage of:
945
1104
# Objects/unicodeobject.c::unicode_encode_call_errorhandler()
0 commit comments