Skip to content

Commit 84e20ea

Browse files
committed
gh-101178: C implementation of base64._a85encode
Initially done to reduce the huge memory consumption of the previous implementation for large inputs, as no memory-friendly pure-Python approach was found that did not introduce a performance regression. This implementation also greatly improves performance in all cases. Signed-off-by: Romuald Brunet <[email protected]>
1 parent e7f00cd commit 84e20ea

File tree

3 files changed

+207
-27
lines changed

3 files changed

+207
-27
lines changed

Lib/base64.py

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -298,27 +298,12 @@ def b16decode(s, casefold=False):
298298

299299
def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
300300
# Helper function for a85encode and b85encode
301+
# chars2 is now unused
301302
if not isinstance(b, bytes_types):
302303
b = memoryview(b).tobytes()
303304

304-
padding = (-len(b)) % 4
305-
if padding:
306-
b = b + b'\0' * padding
307-
words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)
308-
309-
chunks = [b'z' if foldnuls and not word else
310-
b'y' if foldspaces and word == 0x20202020 else
311-
(chars2[word // 614125] +
312-
chars2[word // 85 % 7225] +
313-
chars[word % 85])
314-
for word in words]
315-
316-
if padding and not pad:
317-
if chunks[-1] == b'z':
318-
chunks[-1] = chars[0] * 5
319-
chunks[-1] = chunks[-1][:-padding]
320-
321-
return b''.join(chunks)
305+
return binascii.b2a_base85(b, chars=chars, pad=pad,
306+
foldnuls=foldnuls, foldspaces=foldspaces)
322307

323308
def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
324309
"""Encode bytes-like object b using Ascii85 and return a bytes object.
@@ -337,14 +322,13 @@ def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
337322
adobe controls whether the encoded byte sequence is framed with <~ and ~>,
338323
which is used by the Adobe implementation.
339324
"""
340-
global _a85chars, _a85chars2
325+
global _a85chars
341326
# Delay the initialization of tables to not waste memory
342327
# if the function is never called
343-
if _a85chars2 is None:
328+
if _a85chars is None:
344329
_a85chars = [bytes((i,)) for i in range(33, 118)]
345-
_a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]
346330

347-
result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)
331+
result = _85encode(b, b''.join(_a85chars), None, pad, True, foldspaces)
348332

349333
if adobe:
350334
result = _A85START + result
@@ -445,13 +429,12 @@ def b85encode(b, pad=False):
445429
If pad is true, the input is padded with b'\\0' so its length is a multiple of
446430
4 bytes before encoding.
447431
"""
448-
global _b85chars, _b85chars2
432+
global _b85chars
449433
# Delay the initialization of tables to not waste memory
450434
# if the function is never called
451-
if _b85chars2 is None:
435+
if _b85chars is None:
452436
_b85chars = [bytes((i,)) for i in _b85alphabet]
453-
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
454-
return _85encode(b, _b85chars, _b85chars2, pad)
437+
return _85encode(b, _b85alphabet, None, pad)
455438

456439
def b85decode(b):
457440
"""Decode the base85-encoded bytes-like object or ASCII string b

Modules/binascii.c

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,13 +1239,101 @@ binascii_b2a_qp_impl(PyObject *module, Py_buffer *data, int quotetabs,
12391239
return rv;
12401240
}
12411241

1242+
/*[clinic input]
1243+
binascii.b2a_base85
1244+
1245+
data: Py_buffer
1246+
chars: Py_buffer
1247+
pad: bool = False
1248+
foldnuls: bool = False
1249+
foldspaces: bool = False
1250+
1251+
Utility method used by the base64 module to encode a85/b85 data
1252+
1253+
data: bytes
1254+
chars: 85 bytes conversion table
1255+
pad: use NULL-paded input if necessary
1256+
foldnuls: replace NULL chunks by 'z'
1257+
foldspaces: replace space-only chucks by 'y'
1258+
1259+
[clinic start generated code]*/
1260+
1261+
static PyObject *
1262+
binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, Py_buffer *chars,
1263+
int pad, int foldnuls, int foldspaces)
1264+
/*[clinic end generated code: output=0a92b3c535580aa0 input=a2d8ae712ed5adba]*/
1265+
{
1266+
if (chars->len != 85) {
1267+
PyErr_SetString(PyExc_ValueError,
1268+
"chars must be exactly 85 bytes long");
1269+
return NULL;
1270+
}
1271+
1272+
_PyBytesWriter writer;
1273+
_PyBytesWriter_Init(&writer);
1274+
1275+
const size_t bin_len = data->len;
1276+
1277+
// Allocate up to maxium encoded length, adjusted at end
1278+
const size_t ascii_len = ((bin_len + 3) / 4) * 5;
1279+
1280+
unsigned char *ascii_data = _PyBytesWriter_Alloc(&writer, ascii_len);
1281+
if (ascii_data == NULL) {
1282+
PyErr_NoMemory();
1283+
return NULL;
1284+
}
1285+
1286+
const unsigned char *table = chars->buf;
1287+
const unsigned char *bin_data = data->buf;
1288+
1289+
size_t i, j;
1290+
for (i = 0; i < bin_len; i += 4) {
1291+
const size_t chunk_size = (bin_len - i >= 4) ? 4 : (bin_len - i);
1292+
1293+
// translate chunk to 32bit integer
1294+
uint32_t value = 0;
1295+
for (j = 0; j < chunk_size; j++) {
1296+
value = (value << 8) | bin_data[i + j];
1297+
}
1298+
value <<= (4 - chunk_size) * 8;
1299+
1300+
if (foldnuls && value == 0) {
1301+
*ascii_data++ = 'z';
1302+
} else if (foldspaces && value == 0x20202020) {
1303+
*ascii_data++ = 'y';
1304+
} else {
1305+
for (j = 0; j < 5 ; j++) {
1306+
ascii_data[4 - j] = table[value % 85];
1307+
value /= 85;
1308+
}
1309+
ascii_data += 5;
1310+
}
1311+
}
1312+
1313+
// In case `i` went over the input size, we may need to shorten the output
1314+
const size_t overflow = (i - bin_len);
1315+
1316+
if (overflow && !pad && foldnuls && ascii_data[-1] == 'z') {
1317+
ascii_data--;
1318+
memset(ascii_data, table[0], 5);
1319+
ascii_data += 5;
1320+
}
1321+
1322+
if (!pad) {
1323+
ascii_data -= overflow;
1324+
}
1325+
1326+
return _PyBytesWriter_Finish(&writer, ascii_data);
1327+
}
1328+
12421329
/* List of functions defined in the module */
12431330

12441331
static struct PyMethodDef binascii_module_methods[] = {
12451332
BINASCII_A2B_UU_METHODDEF
12461333
BINASCII_B2A_UU_METHODDEF
12471334
BINASCII_A2B_BASE64_METHODDEF
12481335
BINASCII_B2A_BASE64_METHODDEF
1336+
BINASCII_B2A_BASE85_METHODDEF
12491337
BINASCII_A2B_HEX_METHODDEF
12501338
BINASCII_B2A_HEX_METHODDEF
12511339
BINASCII_HEXLIFY_METHODDEF

Modules/clinic/binascii.c.h

Lines changed: 110 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)