@@ -201,6 +201,11 @@ static PyObject *
201
201
unicode_decode_utf8 (const char * s , Py_ssize_t size ,
202
202
_Py_error_handler error_handler , const char * errors ,
203
203
Py_ssize_t * consumed );
204
/* Forward declaration of the UTF-8 decoder that writes into a caller-supplied
 * _PyUnicodeWriter instead of creating its own result object.  It returns an
 * int status: the definition later in this file returns 0 on success and -1
 * on failure. */
+ static int
205
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
206
+ const char * s , Py_ssize_t size ,
207
+ _Py_error_handler error_handler , const char * errors ,
208
+ Py_ssize_t * consumed );
204
209
#ifdef Py_DEBUG
205
210
static inline int unicode_is_finalizing (void );
206
211
static int unicode_is_singleton (PyObject * unicode );
@@ -2376,14 +2381,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2376
2381
}
2377
2382
2378
2383
static int
2379
- unicode_fromformat_write_cstr (_PyUnicodeWriter * writer , const char * str ,
2384
+ unicode_fromformat_write_utf8 (_PyUnicodeWriter * writer , const char * str ,
2380
2385
Py_ssize_t width , Py_ssize_t precision , int flags )
2381
2386
{
2382
2387
/* UTF-8 */
2383
2388
Py_ssize_t length ;
2384
- PyObject * unicode ;
2385
- int res ;
2386
-
2387
2389
if (precision == -1 ) {
2388
2390
length = strlen (str );
2389
2391
}
@@ -2393,13 +2395,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2393
2395
length ++ ;
2394
2396
}
2395
2397
}
2396
- unicode = PyUnicode_DecodeUTF8Stateful (str , length , "replace" , NULL );
2397
- if (unicode == NULL )
2398
- return -1 ;
2399
2398
2400
- res = unicode_fromformat_write_str (writer , unicode , width , -1 , flags );
2401
- Py_DECREF (unicode );
2402
- return res ;
2399
+ if (width < 0 ) {
2400
+ return unicode_decode_utf8_writer (writer , str , length ,
2401
+ _Py_ERROR_UNKNOWN , "replace" , NULL );
2402
+ }
2403
+ else {
2404
+ PyObject * unicode = PyUnicode_DecodeUTF8Stateful (str , length ,
2405
+ "replace" , NULL );
2406
+ if (unicode == NULL )
2407
+ return -1 ;
2408
+
2409
+ int res = unicode_fromformat_write_str (writer , unicode ,
2410
+ width , -1 , flags );
2411
+ Py_DECREF (unicode );
2412
+ return res ;
2413
+ }
2403
2414
}
2404
2415
2405
2416
static int
@@ -2699,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2699
2710
else {
2700
2711
/* UTF-8 */
2701
2712
const char * s = va_arg (* vargs , const char * );
2702
- if (unicode_fromformat_write_cstr (writer , s , width , precision , flags ) < 0 )
2713
+ if (unicode_fromformat_write_utf8 (writer , s , width , precision , flags ) < 0 )
2703
2714
return NULL ;
2704
2715
}
2705
2716
break ;
@@ -2738,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2738
2749
}
2739
2750
else {
2740
2751
assert (str != NULL );
2741
- if (unicode_fromformat_write_cstr (writer , str , width , precision , flags ) < 0 )
2752
+ if (unicode_fromformat_write_utf8 (writer , str , width , precision , flags ) < 0 )
2742
2753
return NULL ;
2743
2754
}
2744
2755
break ;
@@ -4736,65 +4747,56 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4736
4747
return p - start ;
4737
4748
}
4738
4749
4739
- static PyObject *
4740
- unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4741
- _Py_error_handler error_handler , const char * errors ,
4742
- Py_ssize_t * consumed )
4743
- {
4744
- if (size == 0 ) {
4745
- if (consumed )
4746
- * consumed = 0 ;
4747
- _Py_RETURN_UNICODE_EMPTY ();
4748
- }
4749
-
4750
- /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4751
- if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4752
- if (consumed ) {
4753
- * consumed = 1 ;
4754
- }
4755
- return get_latin1_char ((unsigned char )s [0 ]);
4756
- }
4757
4750
4751
+ static int
4752
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4753
+ const char * s , Py_ssize_t size ,
4754
+ _Py_error_handler error_handler , const char * errors ,
4755
+ Py_ssize_t * consumed )
4756
+ {
4758
4757
const char * starts = s ;
4759
4758
const char * end = s + size ;
4760
4759
4761
4760
// fast path: try ASCII string.
4762
- PyObject * u = PyUnicode_New (size , 127 );
4763
- if (u == NULL ) {
4764
- return NULL ;
4761
+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4762
+ return -1 ;
4765
4763
}
4766
- s += ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4767
- if (s == end ) {
4768
- if (consumed ) {
4769
- * consumed = size ;
4764
+
4765
+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4766
+ if (writer -> kind == PyUnicode_1BYTE_KIND
4767
+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4768
+ {
4769
+ Py_ssize_t decoded = ascii_decode (s , end , dest );
4770
+ writer -> pos += decoded ;
4771
+
4772
+ if (decoded == size ) {
4773
+ if (consumed ) {
4774
+ * consumed = size ;
4775
+ }
4776
+ return 0 ;
4770
4777
}
4771
- return u ;
4778
+ s += decoded ;
4772
4779
}
4773
4780
4774
- // Use _PyUnicodeWriter after fast path is failed.
4775
- _PyUnicodeWriter writer ;
4776
- _PyUnicodeWriter_InitWithBuffer (& writer , u );
4777
- writer .pos = s - starts ;
4778
-
4779
4781
Py_ssize_t startinpos , endinpos ;
4780
4782
const char * errmsg = "" ;
4781
4783
PyObject * error_handler_obj = NULL ;
4782
4784
PyObject * exc = NULL ;
4783
4785
4784
4786
while (s < end ) {
4785
4787
Py_UCS4 ch ;
4786
- int kind = writer . kind ;
4788
+ int kind = writer -> kind ;
4787
4789
4788
4790
if (kind == PyUnicode_1BYTE_KIND ) {
4789
- if (PyUnicode_IS_ASCII (writer . buffer ))
4790
- ch = asciilib_utf8_decode (& s , end , writer . data , & writer . pos );
4791
+ if (PyUnicode_IS_ASCII (writer -> buffer ))
4792
+ ch = asciilib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4791
4793
else
4792
- ch = ucs1lib_utf8_decode (& s , end , writer . data , & writer . pos );
4794
+ ch = ucs1lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4793
4795
} else if (kind == PyUnicode_2BYTE_KIND ) {
4794
- ch = ucs2lib_utf8_decode (& s , end , writer . data , & writer . pos );
4796
+ ch = ucs2lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4795
4797
} else {
4796
4798
assert (kind == PyUnicode_4BYTE_KIND );
4797
- ch = ucs4lib_utf8_decode (& s , end , writer . data , & writer . pos );
4799
+ ch = ucs4lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4798
4800
}
4799
4801
4800
4802
switch (ch ) {
@@ -4825,7 +4827,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4825
4827
endinpos = startinpos + ch - 1 ;
4826
4828
break ;
4827
4829
default :
4828
- if (_PyUnicodeWriter_WriteCharInline (& writer , ch ) < 0 )
4830
+ if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
4829
4831
goto onError ;
4830
4832
continue ;
4831
4833
}
@@ -4839,7 +4841,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4839
4841
break ;
4840
4842
4841
4843
case _Py_ERROR_REPLACE :
4842
- if (_PyUnicodeWriter_WriteCharInline (& writer , 0xfffd ) < 0 )
4844
+ if (_PyUnicodeWriter_WriteCharInline (writer , 0xfffd ) < 0 )
4843
4845
goto onError ;
4844
4846
s += (endinpos - startinpos );
4845
4847
break ;
@@ -4848,13 +4850,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4848
4850
{
4849
4851
Py_ssize_t i ;
4850
4852
4851
- if (_PyUnicodeWriter_PrepareKind (& writer , PyUnicode_2BYTE_KIND ) < 0 )
4853
+ if (_PyUnicodeWriter_PrepareKind (writer , PyUnicode_2BYTE_KIND ) < 0 )
4852
4854
goto onError ;
4853
4855
for (i = startinpos ; i < endinpos ; i ++ ) {
4854
4856
ch = (Py_UCS4 )(unsigned char )(starts [i ]);
4855
- PyUnicode_WRITE (writer . kind , writer . data , writer . pos ,
4857
+ PyUnicode_WRITE (writer -> kind , writer -> data , writer -> pos ,
4856
4858
ch + 0xdc00 );
4857
- writer . pos ++ ;
4859
+ writer -> pos ++ ;
4858
4860
}
4859
4861
s += (endinpos - startinpos );
4860
4862
break ;
@@ -4865,8 +4867,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4865
4867
errors , & error_handler_obj ,
4866
4868
"utf-8" , errmsg ,
4867
4869
& starts , & end , & startinpos , & endinpos , & exc , & s ,
4868
- & writer ))
4870
+ writer )) {
4869
4871
goto onError ;
4872
+ }
4873
+
4874
/* Make sure the writer can hold the remaining (end - s) input bytes before
 * the decode loop resumes. */
+ if (_PyUnicodeWriter_Prepare (writer , end - s , 127 ) < 0 ) {
4875
+ goto onError ; /* leave via onError so error_handler_obj and exc are released */
4876
+ }
4870
4877
}
4871
4878
}
4872
4879
@@ -4876,13 +4883,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4876
4883
4877
4884
Py_XDECREF (error_handler_obj );
4878
4885
Py_XDECREF (exc );
4879
- return _PyUnicodeWriter_Finish ( & writer ) ;
4886
+ return 0 ;
4880
4887
4881
4888
onError :
4882
4889
Py_XDECREF (error_handler_obj );
4883
4890
Py_XDECREF (exc );
4884
- _PyUnicodeWriter_Dealloc (& writer );
4885
- return NULL ;
4891
+ return -1 ;
4892
+ }
4893
+
4894
+
4895
/* Decode `size` bytes of UTF-8 starting at `s`.
 *
 * Two shortcuts are handled here without a writer: empty input, and a single
 * byte below 128.  Everything else is decoded by unicode_decode_utf8_writer()
 * into a local _PyUnicodeWriter, and the value of _PyUnicodeWriter_Finish()
 * is returned.
 *
 * `error_handler`, `errors` and `consumed` are forwarded unchanged to
 * unicode_decode_utf8_writer().  On the two shortcut paths `consumed`, when
 * non-NULL, is set here to the number of bytes handled (0 or 1).
 *
 * Returns NULL when unicode_decode_utf8_writer() reports failure (< 0); the
 * local writer is deallocated first. */
+ static PyObject *
4896
+ unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4897
+ _Py_error_handler error_handler , const char * errors ,
4898
+ Py_ssize_t * consumed )
4899
+ {
4900
+ if (size == 0 ) { /* empty input: nothing to decode */
4901
+ if (consumed )
4902
+ * consumed = 0 ;
4903
+ _Py_RETURN_UNICODE_EMPTY ();
4904
+ }
4905
+
4906
+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4907
+ if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4908
+ if (consumed ) {
4909
+ * consumed = 1 ;
4910
+ }
4911
+ return get_latin1_char ((unsigned char )s [0 ]);
4912
+ }
4913
+
4914
+ _PyUnicodeWriter writer ; /* local writer: deallocated on failure, finished on success */
4915
+ _PyUnicodeWriter_Init (& writer );
4916
+
4917
+ if (unicode_decode_utf8_writer (& writer , s , size ,
4918
+ error_handler , errors ,
4919
+ consumed ) < 0 ) {
4920
+ _PyUnicodeWriter_Dealloc (& writer ); /* discard whatever was written before the failure */
4921
+ return NULL ;
4922
+ }
4923
+ return _PyUnicodeWriter_Finish (& writer );
4886
4924
}
4887
4925
4888
4926
0 commit comments