Skip to content

Commit f9932f5

Browse files
dg-pb, kumaraditya303, and serhiy-storchaka
authored
gh-119109: improve functools.partial vectorcall with keywords (#124584)
Co-authored-by: Kumar Aditya <[email protected]> Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 6ea4258 commit f9932f5

File tree

2 files changed

+140
-53
lines changed

2 files changed

+140
-53
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:func:`functools.partial` calls are now faster when keyword arguments are used.

Modules/_functoolsmodule.c

Lines changed: 139 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -367,19 +367,6 @@ partial_descr_get(PyObject *self, PyObject *obj, PyObject *type)
367367
return PyMethod_New(self, obj);
368368
}
369369

370-
/* Merging keyword arguments using the vectorcall convention is messy, so
371-
* if we would need to do that, we stop using vectorcall and fall back
372-
* to using partial_call() instead. */
373-
Py_NO_INLINE static PyObject *
374-
partial_vectorcall_fallback(PyThreadState *tstate, partialobject *pto,
375-
PyObject *const *args, size_t nargsf,
376-
PyObject *kwnames)
377-
{
378-
pto->vectorcall = NULL;
379-
Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
380-
return _PyObject_MakeTpCall(tstate, (PyObject *)pto, args, nargs, kwnames);
381-
}
382-
383370
static PyObject *
384371
partial_vectorcall(PyObject *self, PyObject *const *args,
385372
size_t nargsf, PyObject *kwnames)
@@ -388,10 +375,7 @@ partial_vectorcall(PyObject *self, PyObject *const *args,
388375
PyThreadState *tstate = _PyThreadState_GET();
389376
Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
390377

391-
/* pto->kw is mutable, so need to check every time */
392-
if (PyDict_GET_SIZE(pto->kw)) {
393-
return partial_vectorcall_fallback(tstate, pto, args, nargsf, kwnames);
394-
}
378+
/* Placeholder check */
395379
Py_ssize_t pto_phcount = pto->phcount;
396380
if (nargs < pto_phcount) {
397381
PyErr_Format(PyExc_TypeError,
@@ -400,50 +384,143 @@ partial_vectorcall(PyObject *self, PyObject *const *args,
400384
return NULL;
401385
}
402386

403-
Py_ssize_t nargskw = nargs;
404-
if (kwnames != NULL) {
405-
nargskw += PyTuple_GET_SIZE(kwnames);
406-
}
407-
408387
PyObject **pto_args = _PyTuple_ITEMS(pto->args);
409388
Py_ssize_t pto_nargs = PyTuple_GET_SIZE(pto->args);
389+
Py_ssize_t pto_nkwds = PyDict_GET_SIZE(pto->kw);
390+
Py_ssize_t nkwds = kwnames == NULL ? 0 : PyTuple_GET_SIZE(kwnames);
391+
Py_ssize_t nargskw = nargs + nkwds;
392+
393+
/* Special cases */
394+
if (!pto_nkwds) {
395+
/* Fast path if we're called without arguments */
396+
if (nargskw == 0) {
397+
return _PyObject_VectorcallTstate(tstate, pto->fn, pto_args,
398+
pto_nargs, NULL);
399+
}
410400

411-
/* Fast path if we're called without arguments */
412-
if (nargskw == 0) {
413-
return _PyObject_VectorcallTstate(tstate, pto->fn,
414-
pto_args, pto_nargs, NULL);
401+
/* Use PY_VECTORCALL_ARGUMENTS_OFFSET to prepend a single
402+
* positional argument. */
403+
if (pto_nargs == 1 && (nargsf & PY_VECTORCALL_ARGUMENTS_OFFSET)) {
404+
PyObject **newargs = (PyObject **)args - 1;
405+
PyObject *tmp = newargs[0];
406+
newargs[0] = pto_args[0];
407+
PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, newargs,
408+
nargs + 1, kwnames);
409+
newargs[0] = tmp;
410+
return ret;
411+
}
415412
}
416413

417-
/* Fast path using PY_VECTORCALL_ARGUMENTS_OFFSET to prepend a single
418-
* positional argument */
419-
if (pto_nargs == 1 && (nargsf & PY_VECTORCALL_ARGUMENTS_OFFSET)) {
420-
PyObject **newargs = (PyObject **)args - 1;
421-
PyObject *tmp = newargs[0];
422-
newargs[0] = pto_args[0];
423-
PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn,
424-
newargs, nargs + 1, kwnames);
425-
newargs[0] = tmp;
426-
return ret;
427-
}
414+
/* Total sizes */
415+
Py_ssize_t tot_nargs = pto_nargs + nargs - pto_phcount;
416+
Py_ssize_t tot_nkwds = pto_nkwds + nkwds;
417+
Py_ssize_t tot_nargskw = tot_nargs + tot_nkwds;
428418

429-
PyObject *small_stack[_PY_FASTCALL_SMALL_STACK];
430-
PyObject **stack;
419+
PyObject *pto_kw_merged = NULL; // pto_kw with duplicates merged (if any)
420+
PyObject *tot_kwnames;
431421

432-
Py_ssize_t tot_nargskw = pto_nargs + nargskw - pto_phcount;
433-
if (tot_nargskw <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) {
422+
/* Allocate Stack
423+
* Note, _PY_FASTCALL_SMALL_STACK is optimal for positional only
424+
* This case might have keyword arguments
425+
* furthermore, it might use extra stack space for temporary key storage
426+
* thus, double small_stack size is used, which is 10 * 8 = 80 bytes */
427+
PyObject *small_stack[_PY_FASTCALL_SMALL_STACK * 2];
428+
PyObject **tmp_stack, **stack;
429+
Py_ssize_t init_stack_size = tot_nargskw;
430+
if (pto_nkwds) {
431+
// If pto_nkwds, allocate additional space for temporary new keys
432+
init_stack_size += nkwds;
433+
}
434+
if (init_stack_size <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) {
434435
stack = small_stack;
435436
}
436437
else {
437-
stack = PyMem_Malloc(tot_nargskw * sizeof(PyObject *));
438+
stack = PyMem_Malloc(init_stack_size * sizeof(PyObject *));
438439
if (stack == NULL) {
439-
PyErr_NoMemory();
440-
return NULL;
440+
return PyErr_NoMemory();
441441
}
442442
}
443443

444-
Py_ssize_t tot_nargs;
444+
/* Copy keywords to stack */
445+
if (!pto_nkwds) {
446+
tot_kwnames = kwnames;
447+
if (nkwds) {
448+
/* if !pto_nkwds & nkwds, then simply append kw */
449+
memcpy(stack + tot_nargs, args + nargs, nkwds * sizeof(PyObject*));
450+
}
451+
}
452+
else {
453+
/* stack is now [<positionals>, <pto_kwds>, <kwds>, <kwds_keys>]
454+
* Will resize later to [<positionals>, <merged_kwds>] */
455+
PyObject *key, *val;
456+
457+
/* Merge kw to pto_kw or add to tail (if not duplicate) */
458+
Py_ssize_t n_tail = 0;
459+
for (Py_ssize_t i = 0; i < nkwds; ++i) {
460+
key = PyTuple_GET_ITEM(kwnames, i);
461+
val = args[nargs + i];
462+
if (PyDict_Contains(pto->kw, key)) {
463+
if (pto_kw_merged == NULL) {
464+
pto_kw_merged = PyDict_Copy(pto->kw);
465+
if (pto_kw_merged == NULL) {
466+
goto error;
467+
}
468+
}
469+
if (PyDict_SetItem(pto_kw_merged, key, val) < 0) {
470+
Py_DECREF(pto_kw_merged);
471+
goto error;
472+
}
473+
}
474+
else {
475+
/* Copy keyword tail to stack */
476+
stack[tot_nargs + pto_nkwds + n_tail] = val;
477+
stack[tot_nargskw + n_tail] = key;
478+
n_tail++;
479+
}
480+
}
481+
Py_ssize_t n_merges = nkwds - n_tail;
482+
483+
/* Create total kwnames */
484+
tot_kwnames = PyTuple_New(tot_nkwds - n_merges);
485+
if (tot_kwnames == NULL) {
486+
Py_XDECREF(pto_kw_merged);
487+
goto error;
488+
}
489+
for (Py_ssize_t i = 0; i < n_tail; ++i) {
490+
key = Py_NewRef(stack[tot_nargskw + i]);
491+
PyTuple_SET_ITEM(tot_kwnames, pto_nkwds + i, key);
492+
}
493+
494+
/* Copy pto_keywords with overlapping call keywords merged
495+
* Note, tail is already copied. */
496+
Py_ssize_t pos = 0, i = 0;
497+
while (PyDict_Next(n_merges ? pto_kw_merged : pto->kw, &pos, &key, &val)) {
498+
assert(i < pto_nkwds);
499+
PyTuple_SET_ITEM(tot_kwnames, i, Py_NewRef(key));
500+
stack[tot_nargs + i] = val;
501+
i++;
502+
}
503+
assert(i == pto_nkwds);
504+
Py_XDECREF(pto_kw_merged);
505+
506+
/* Resize Stack if the removing overallocation saves some noticable memory
507+
* NOTE: This whole block can be removed without breaking anything */
508+
Py_ssize_t noveralloc = n_merges + nkwds;
509+
if (stack != small_stack && noveralloc > 6 && noveralloc > init_stack_size / 10) {
510+
tmp_stack = PyMem_Realloc(stack, (tot_nargskw - n_merges) * sizeof(PyObject *));
511+
if (tmp_stack == NULL) {
512+
Py_DECREF(tot_kwnames);
513+
if (stack != small_stack) {
514+
PyMem_Free(stack);
515+
}
516+
return PyErr_NoMemory();
517+
}
518+
stack = tmp_stack;
519+
}
520+
}
521+
522+
/* Copy Positionals to stack */
445523
if (pto_phcount) {
446-
tot_nargs = pto_nargs + nargs - pto_phcount;
447524
Py_ssize_t j = 0; // New args index
448525
for (Py_ssize_t i = 0; i < pto_nargs; i++) {
449526
if (pto_args[i] == pto->placeholder) {
@@ -455,22 +532,31 @@ partial_vectorcall(PyObject *self, PyObject *const *args,
455532
}
456533
}
457534
assert(j == pto_phcount);
458-
if (nargskw > pto_phcount) {
459-
memcpy(stack + pto_nargs, args + j, (nargskw - j) * sizeof(PyObject*));
535+
/* Add remaining args from new_args */
536+
if (nargs > pto_phcount) {
537+
memcpy(stack + pto_nargs, args + j, (nargs - j) * sizeof(PyObject*));
460538
}
461539
}
462540
else {
463-
tot_nargs = pto_nargs + nargs;
464-
/* Copy to new stack, using borrowed references */
465541
memcpy(stack, pto_args, pto_nargs * sizeof(PyObject*));
466-
memcpy(stack + pto_nargs, args, nargskw * sizeof(PyObject*));
542+
memcpy(stack + pto_nargs, args, nargs * sizeof(PyObject*));
467543
}
468-
PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn,
469-
stack, tot_nargs, kwnames);
544+
545+
PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, stack,
546+
tot_nargs, tot_kwnames);
470547
if (stack != small_stack) {
471548
PyMem_Free(stack);
472549
}
550+
if (pto_nkwds) {
551+
Py_DECREF(tot_kwnames);
552+
}
473553
return ret;
554+
555+
error:
556+
if (stack != small_stack) {
557+
PyMem_Free(stack);
558+
}
559+
return NULL;
474560
}
475561

476562
/* Set pto->vectorcall depending on the parameters of the partial object */

0 commit comments

Comments
 (0)