Skip to content

Commit 6c8aedc

Browse files
committed
gh-119182: Optimize PyUnicode_FromFormat() UTF-8 decoder
Add unicode_decode_utf8_writer() to write characters directly into a _PyUnicodeWriter, avoiding the creation of a temporary string. Optimize PyUnicode_FromFormat() by using the new unicode_decode_utf8_writer(). Rename unicode_fromformat_write_cstr() to unicode_fromformat_write_utf8(). Microbenchmark of the following code: return PyUnicode_FromFormat( "%s %s %s %s %s.", "format", "multiple", "utf8", "short", "strings"); Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster.
1 parent 87939bd commit 6c8aedc

File tree

1 file changed

+96
-58
lines changed

1 file changed

+96
-58
lines changed

Objects/unicodeobject.c

Lines changed: 96 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,11 @@ static PyObject *
201201
unicode_decode_utf8(const char *s, Py_ssize_t size,
202202
_Py_error_handler error_handler, const char *errors,
203203
Py_ssize_t *consumed);
204+
static int
205+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
206+
const char *s, Py_ssize_t size,
207+
_Py_error_handler error_handler, const char *errors,
208+
Py_ssize_t *consumed);
204209
#ifdef Py_DEBUG
205210
static inline int unicode_is_finalizing(void);
206211
static int unicode_is_singleton(PyObject *unicode);
@@ -2376,14 +2381,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
23762381
}
23772382

23782383
static int
2379-
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2384+
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
23802385
Py_ssize_t width, Py_ssize_t precision, int flags)
23812386
{
23822387
/* UTF-8 */
23832388
Py_ssize_t length;
2384-
PyObject *unicode;
2385-
int res;
2386-
23872389
if (precision == -1) {
23882390
length = strlen(str);
23892391
}
@@ -2393,13 +2395,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
23932395
length++;
23942396
}
23952397
}
2396-
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2397-
if (unicode == NULL)
2398-
return -1;
23992398

2400-
res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2401-
Py_DECREF(unicode);
2402-
return res;
2399+
if (width < 0) {
2400+
return unicode_decode_utf8_writer(writer, str, length,
2401+
_Py_ERROR_UNKNOWN, "replace", NULL);
2402+
}
2403+
else {
2404+
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2405+
"replace", NULL);
2406+
if (unicode == NULL)
2407+
return -1;
2408+
2409+
int res = unicode_fromformat_write_str(writer, unicode,
2410+
width, -1, flags);
2411+
Py_DECREF(unicode);
2412+
return res;
2413+
}
24032414
}
24042415

24052416
static int
@@ -2699,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
26992710
else {
27002711
/* UTF-8 */
27012712
const char *s = va_arg(*vargs, const char*);
2702-
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
2713+
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
27032714
return NULL;
27042715
}
27052716
break;
@@ -2738,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27382749
}
27392750
else {
27402751
assert(str != NULL);
2741-
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
2752+
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
27422753
return NULL;
27432754
}
27442755
break;
@@ -4736,65 +4747,56 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47364747
return p - start;
47374748
}
47384749

4739-
static PyObject *
4740-
unicode_decode_utf8(const char *s, Py_ssize_t size,
4741-
_Py_error_handler error_handler, const char *errors,
4742-
Py_ssize_t *consumed)
4743-
{
4744-
if (size == 0) {
4745-
if (consumed)
4746-
*consumed = 0;
4747-
_Py_RETURN_UNICODE_EMPTY();
4748-
}
4749-
4750-
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4751-
if (size == 1 && (unsigned char)s[0] < 128) {
4752-
if (consumed) {
4753-
*consumed = 1;
4754-
}
4755-
return get_latin1_char((unsigned char)s[0]);
4756-
}
47574750

4751+
static int
4752+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4753+
const char *s, Py_ssize_t size,
4754+
_Py_error_handler error_handler, const char *errors,
4755+
Py_ssize_t *consumed)
4756+
{
47584757
const char *starts = s;
47594758
const char *end = s + size;
47604759

47614760
// fast path: try ASCII string.
4762-
PyObject *u = PyUnicode_New(size, 127);
4763-
if (u == NULL) {
4764-
return NULL;
4761+
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4762+
return -1;
47654763
}
4766-
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4767-
if (s == end) {
4768-
if (consumed) {
4769-
*consumed = size;
4764+
4765+
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4766+
if (writer->kind == PyUnicode_1BYTE_KIND
4767+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4768+
{
4769+
Py_ssize_t decoded = ascii_decode(s, end, dest);
4770+
writer->pos += decoded;
4771+
4772+
if (decoded == size) {
4773+
if (consumed) {
4774+
*consumed = size;
4775+
}
4776+
return 0;
47704777
}
4771-
return u;
4778+
s += decoded;
47724779
}
47734780

4774-
// Use _PyUnicodeWriter after fast path is failed.
4775-
_PyUnicodeWriter writer;
4776-
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4777-
writer.pos = s - starts;
4778-
47794781
Py_ssize_t startinpos, endinpos;
47804782
const char *errmsg = "";
47814783
PyObject *error_handler_obj = NULL;
47824784
PyObject *exc = NULL;
47834785

47844786
while (s < end) {
47854787
Py_UCS4 ch;
4786-
int kind = writer.kind;
4788+
int kind = writer->kind;
47874789

47884790
if (kind == PyUnicode_1BYTE_KIND) {
4789-
if (PyUnicode_IS_ASCII(writer.buffer))
4790-
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4791+
if (PyUnicode_IS_ASCII(writer->buffer))
4792+
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
47914793
else
4792-
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4794+
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
47934795
} else if (kind == PyUnicode_2BYTE_KIND) {
4794-
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4796+
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
47954797
} else {
47964798
assert(kind == PyUnicode_4BYTE_KIND);
4797-
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4799+
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
47984800
}
47994801

48004802
switch (ch) {
@@ -4825,7 +4827,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48254827
endinpos = startinpos + ch - 1;
48264828
break;
48274829
default:
4828-
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4830+
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
48294831
goto onError;
48304832
continue;
48314833
}
@@ -4839,7 +4841,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48394841
break;
48404842

48414843
case _Py_ERROR_REPLACE:
4842-
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4844+
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
48434845
goto onError;
48444846
s += (endinpos - startinpos);
48454847
break;
@@ -4848,13 +4850,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48484850
{
48494851
Py_ssize_t i;
48504852

4851-
if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4853+
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
48524854
goto onError;
48534855
for (i=startinpos; i<endinpos; i++) {
48544856
ch = (Py_UCS4)(unsigned char)(starts[i]);
4855-
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4857+
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
48564858
ch + 0xdc00);
4857-
writer.pos++;
4859+
writer->pos++;
48584860
}
48594861
s += (endinpos - startinpos);
48604862
break;
@@ -4865,8 +4867,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48654867
errors, &error_handler_obj,
48664868
"utf-8", errmsg,
48674869
&starts, &end, &startinpos, &endinpos, &exc, &s,
4868-
&writer))
4870+
writer)) {
48694871
goto onError;
4872+
}
4873+
4874+
if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
4875+
return -1;
4876+
}
48704877
}
48714878
}
48724879

@@ -4876,13 +4883,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48764883

48774884
Py_XDECREF(error_handler_obj);
48784885
Py_XDECREF(exc);
4879-
return _PyUnicodeWriter_Finish(&writer);
4886+
return 0;
48804887

48814888
onError:
48824889
Py_XDECREF(error_handler_obj);
48834890
Py_XDECREF(exc);
4884-
_PyUnicodeWriter_Dealloc(&writer);
4885-
return NULL;
4891+
return -1;
4892+
}
4893+
4894+
4895+
static PyObject *
4896+
unicode_decode_utf8(const char *s, Py_ssize_t size,
4897+
_Py_error_handler error_handler, const char *errors,
4898+
Py_ssize_t *consumed)
4899+
{
4900+
if (size == 0) {
4901+
if (consumed)
4902+
*consumed = 0;
4903+
_Py_RETURN_UNICODE_EMPTY();
4904+
}
4905+
4906+
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4907+
if (size == 1 && (unsigned char)s[0] < 128) {
4908+
if (consumed) {
4909+
*consumed = 1;
4910+
}
4911+
return get_latin1_char((unsigned char)s[0]);
4912+
}
4913+
4914+
_PyUnicodeWriter writer;
4915+
_PyUnicodeWriter_Init(&writer);
4916+
4917+
if (unicode_decode_utf8_writer(&writer, s, size,
4918+
error_handler, errors,
4919+
consumed) < 0) {
4920+
_PyUnicodeWriter_Dealloc(&writer);
4921+
return NULL;
4922+
}
4923+
return _PyUnicodeWriter_Finish(&writer);
48864924
}
48874925

48884926

0 commit comments

Comments
 (0)