From aa420375e24777fd52f04af5141c13637989a202 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 23 May 2025 22:47:32 +0700 Subject: [PATCH 01/32] Implemented NumbaExecutionEngine --- pandas/core/apply.py | 73 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2c96f1ef020ac..fe87b1d2beaa6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -45,9 +45,9 @@ ABCSeries, ) -from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core._numba.executor import generate_apply_looper from pandas.core.util.numba_ import ( get_jit_arguments, prepare_function_arguments, @@ -178,6 +178,57 @@ def apply( """ +class NumbaExecutionEngine(BaseExecutionEngine): + """ + Numba-based execution engine for pandas apply and map operations. + """ + + @staticmethod + def map( + data: np.ndarray | Series | DataFrame, + func, + args: tuple, + kwargs: dict, + engine_kwargs: dict | None, + skip_na: bool, + ): + """ + Elementwise map for the Numba engine. Currently not supported. + """ + raise NotImplementedError("Numba map is not implemented yet.") + + @staticmethod + def apply( + data: np.ndarray | Series | DataFrame, + func, + args: tuple, + kwargs: dict, + engine_kwargs: dict | None, + axis: int | str, + ): + """ + Apply `func` along the given axis using Numba. 
+ """ + + looper_args, looper_kwargs = prepare_function_arguments( + func, # type: ignore[arg-type] + args, + kwargs, + num_required_args=1, + ) + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + func, # type: ignore[arg-type] + **get_jit_arguments(engine_kwargs) + ) + result = nb_looper(data, axis, *looper_args) + # If we made the result 2-D, squeeze it back to 1-D + return np.squeeze(result) + + def frame_apply( obj: DataFrame, func: AggFuncType, @@ -1094,23 +1145,15 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - args, kwargs = prepare_function_arguments( - self.func, # type: ignore[arg-type] + engine_obj = NumbaExecutionEngine() + result = engine_obj.apply( + self.values, + self.func, self.args, self.kwargs, - num_required_args=1, - ) - # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has - # incompatible type "Callable[..., Any] | str | list[Callable - # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | - # list[Callable[..., Any] | str]]"; expected "Hashable" - nb_looper = generate_apply_looper( - self.func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs), + engine_kwargs, + self.axis, ) - result = nb_looper(self.values, self.axis, *args) - # If we made the result 2-D, squeeze it back to 1-D - result = np.squeeze(result) else: result = np.apply_along_axis( wrap_function(self.func), From db9f3b000f237a1fc580f3361e0984b410ee9d3e Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 06:51:23 +0700 Subject: [PATCH 02/32] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab3316e7fca4c..6948ffcde40b2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ 
b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) +- Added :class:`pandas.core.apply.NumbaExecutionEngine` as the built-in ``numba`` execution engine for ``apply`` and ``map`` operations (:issue:`61458`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` From 4cb240d95c139ef8956a0430287559a5d75a73bc Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 06:56:06 +0700 Subject: [PATCH 03/32] precommit --- pandas/core/apply.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fe87b1d2beaa6..ba240813d3229 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -45,9 +45,9 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core._numba.executor import generate_apply_looper from pandas.core.util.numba_ import ( get_jit_arguments, prepare_function_arguments, @@ -211,7 +211,7 @@ def apply( """ looper_args, looper_kwargs = prepare_function_arguments( - func, # type: ignore[arg-type] + func, # type: ignore[arg-type] args, kwargs, num_required_args=1, @@ -221,8 +221,8 @@ def apply( # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # 
list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs) + func, # type: ignore[arg-type] + **get_jit_arguments(engine_kwargs), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 97d9063dcc65968956b282b13fbb49337f0388b2 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 07:21:10 +0700 Subject: [PATCH 04/32] Match function arguments --- pandas/core/apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ba240813d3229..3d760eaa8705a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -189,7 +189,7 @@ def map( func, args: tuple, kwargs: dict, - engine_kwargs: dict | None, + decorator: Callable | None, skip_na: bool, ): """ @@ -203,7 +203,7 @@ def apply( func, args: tuple, kwargs: dict, - engine_kwargs: dict | None, + decorator: Callable, axis: int | str, ): """ @@ -222,7 +222,7 @@ def apply( # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs), + **get_jit_arguments(decorator), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 69e0e355e14312e19b1341157d1b6e100f8dcb3d Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 07:54:50 +0700 Subject: [PATCH 05/32] Fix CI --- pandas/core/apply.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3d760eaa8705a..b765088308b2d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -209,9 +209,12 @@ def apply( """ Apply `func` along the given axis using Numba. 
""" + engine_kwargs: dict[str, bool] | None = ( + decorator if isinstance(decorator, dict) else None + ) looper_args, looper_kwargs = prepare_function_arguments( - func, # type: ignore[arg-type] + func, args, kwargs, num_required_args=1, @@ -221,8 +224,8 @@ def apply( # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - func, # type: ignore[arg-type] - **get_jit_arguments(decorator), + func, + **get_jit_arguments(engine_kwargs), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 736507949fbc217fae93d061f02ab3f9e2899f05 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 28 May 2025 16:41:30 +0700 Subject: [PATCH 06/32] updated whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6948ffcde40b2..ea9b06a58be92 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,8 +30,8 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`DataFrame.apply` accepts Numba as an engine by passing the JIT decorator directly, e.g. 
``df.apply(func, engine=numba.jit)`` (:issue:`61458`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) -- Added :class:`pandas.core.apply.NumbaExecutionEngine` as the built-in ``numba`` execution engine for ``apply`` and ``map`` operations (:issue:`61458`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` From c605857d16bde78f6a4b0cc04556bcf24f7844bc Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 29 May 2025 22:18:39 +0700 Subject: [PATCH 07/32] Updated conditions and delegate method to numba.jit --- pandas/core/apply.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b765088308b2d..a4cce45758feb 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -13,6 +13,7 @@ cast, ) +import numba import numpy as np from pandas._libs.internals import BlockValuesRefs @@ -1148,8 +1149,9 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_obj = NumbaExecutionEngine() - result = engine_obj.apply( + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + result = numba.jit.__pandas_udf__.apply( self.values, self.func, self.args, From 24a06150e01028a38f3466ded5c85e143ea41aef Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 3 Jun 2025 18:37:58 +0700 Subject: [PATCH 08/32] Added try and except to catch ImportError --- pandas/core/apply.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a4cce45758feb..760fd111f21ce 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py 
@@ -13,7 +13,6 @@ cast, ) -import numba import numpy as np from pandas._libs.internals import BlockValuesRefs @@ -1149,16 +1148,31 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) + try: + import numba + + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + result = numba.jit.__pandas_udf__.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) + else: + raise ImportError + except ImportError: + engine_obj = NumbaExecutionEngine() + result = engine_obj.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) else: result = np.apply_along_axis( wrap_function(self.func), From b7a2ecbae9e64ae4f8e155a1cb0edb96f44b8e6b Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 10 Jun 2025 13:28:12 +0700 Subject: [PATCH 09/32] Use import_optional_dependency to load Numba --- pandas/core/apply.py | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 760fd111f21ce..f54c27c93a2a9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1148,31 +1148,19 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - try: - import numba - - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) - else: - raise ImportError - except ImportError: - engine_obj = NumbaExecutionEngine() - result = engine_obj.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) + numba = 
import_optional_dependency("numba") + + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + + result = numba.jit.__pandas_udf__.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) else: result = np.apply_along_axis( wrap_function(self.func), From 6f4fb501550fbd69ed10df1ca6c5c6107c26084e Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 14:38:23 +0700 Subject: [PATCH 10/32] Updated engine handling: normalizing numba to a fake decorator and updating empty or python string condition --- pandas/core/frame.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8053c17437c5e..593e59457518d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,7 +129,7 @@ roperator, ) from pandas.core.accessor import Accessor -from pandas.core.apply import reconstruct_and_relabel_result +from pandas.core.apply import NumbaExecutionEngine, reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( @@ -10616,14 +10616,14 @@ def apply( significant amount of time to run. Fast functions are unlikely to run faster with JIT compilation. 
""" - if engine is None or isinstance(engine, str): - from pandas.core.apply import frame_apply - - if engine is None: - engine = "python" + if engine == "numba": + numba = import_optional_dependency("numba") + numba_jit = numba.jit(**engine_kwargs) + numba_jit.__pandas_udf__ = NumbaExecutionEngine + engine = numba_jit - if engine not in ["python", "numba"]: - raise ValueError(f"Unknown engine '{engine}'") + if engine is None or engine == "python": + from pandas.core.apply import frame_apply op = frame_apply( self, @@ -10632,7 +10632,7 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, - engine=engine, + engine="python", engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, From 221cf7cd7b8f987bc149ef84d0f8365138e44f30 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 14:47:27 +0700 Subject: [PATCH 11/32] Added check for empty engine_kwargs --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 593e59457518d..8c971603076cc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10618,7 +10618,10 @@ def apply( """ if engine == "numba": numba = import_optional_dependency("numba") - numba_jit = numba.jit(**engine_kwargs) + if engine_kwargs is not None: + numba_jit = numba.jit(**engine_kwargs) + else: + numba_jit = numba.jit() numba_jit.__pandas_udf__ = NumbaExecutionEngine engine = numba_jit From ed8dc7f15472c28c3d8fe425aa7a086fab05fea4 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 17:13:25 +0700 Subject: [PATCH 12/32] Moved checks from Apply.apply to NumbaExecutionEngine.apply --- pandas/core/apply.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f54c27c93a2a9..290e72974e9aa 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -209,6 +209,30 @@ def apply( """ Apply `func` along the given axis using 
Numba. """ + + if is_list_like(func): + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) + + if isinstance(func, str): + raise NotImplementedError( + "the 'numba' engine doesn't support using " + "a string as the callable function" + ) + + elif isinstance(func, np.ufunc): + raise NotImplementedError( + "the 'numba' engine doesn't support " + "using a numpy ufunc as the callable function" + ) + + # check for data typing + if not isinstance(data, np.ndarray): + if len(data.columns) == 0 and len(data.index) == 0: + return data.copy() # mimic apply_empty_result() + return FrameApply.apply_standard() + engine_kwargs: dict[str, bool] | None = ( decorator if isinstance(decorator, dict) else None ) @@ -1011,10 +1035,6 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support lists of callables yet" - ) return self.apply_list_or_dict_like() # all empty @@ -1023,20 +1043,10 @@ def apply(self) -> DataFrame | Series: # string dispatch if isinstance(self.func, str): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support using " - "a string as the callable function" - ) return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support " - "using a numpy ufunc as the callable function" - ) with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.func) # _constructor will retain self.index and self.columns @@ -1044,10 +1054,6 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support result_type='broadcast'" - ) return self.apply_broadcast(self.obj) # one axis empty From 
65b9d320ffe83cac78f2640673fb7ca0bb49a778 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 18 Jun 2025 10:17:31 +0700 Subject: [PATCH 13/32] Fixed CI, removed unused numba checks, updated raw=false condition, updated engine checks --- pandas/core/apply.py | 43 +++++++++++++------------------------------ pandas/core/frame.py | 13 ++++++++++--- 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 290e72974e9aa..949959de7cbcd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -230,8 +230,12 @@ def apply( # check for data typing if not isinstance(data, np.ndarray): if len(data.columns) == 0 and len(data.index) == 0: - return data.copy() # mimic apply_empty_result() - return FrameApply.apply_standard() + return data.copy() # mimic apply_empty_result() + # TODO: + # Rewrite FrameApply.apply_series_numba() logic without FrameApply object + raise NotImplementedError( + "raw=False is not yet supported in NumbaExecutionEngine." + ) engine_kwargs: dict[str, bool] | None = ( decorator if isinstance(decorator, dict) else None @@ -780,12 +784,6 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: Result when self.func is a list-like or dict-like, None otherwise. """ - if self.engine == "numba": - raise NotImplementedError( - "The 'numba' engine doesn't support list-like/" - "dict likes of callables yet." 
- ) - if self.axis == 1 and isinstance(self.obj, ABCDataFrame): return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T @@ -1153,28 +1151,13 @@ def wrapper(*args, **kwargs): return wrapper - if engine == "numba": - numba = import_optional_dependency("numba") - - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) - else: - result = np.apply_along_axis( - wrap_function(self.func), - self.axis, - self.values, - *self.args, - **self.kwargs, - ) + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c971603076cc..78c9fc3fc10f1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,7 +129,10 @@ roperator, ) from pandas.core.accessor import Accessor -from pandas.core.apply import NumbaExecutionEngine, reconstruct_and_relabel_result +from pandas.core.apply import ( + NumbaExecutionEngine, + reconstruct_and_relabel_result, +) from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( @@ -10625,9 +10628,12 @@ def apply( numba_jit.__pandas_udf__ = NumbaExecutionEngine engine = numba_jit - if engine is None or engine == "python": + if engine is None or isinstance(engine, str): from pandas.core.apply import frame_apply + if engine not in ["python"] and engine is not None: + raise ValueError(f"Unknown engine '{engine}'") + op = frame_apply( self, func=func, @@ -10641,7 +10647,8 @@ def apply( kwargs=kwargs, ) return op.apply().__finalize__(self, method="apply") - elif hasattr(engine, "__pandas_udf__"): + + if hasattr(engine, "__pandas_udf__"): if result_type is not None: raise NotImplementedError( f"{result_type=} only implemented 
for the default engine" From 2703f86b21b2ddc6de149f58cc6844c143bd4345 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 19 Jun 2025 11:28:47 +0700 Subject: [PATCH 14/32] Implement and refactor raw=False logic into NumbaExecutionEngine.apply --- pandas/core/apply.py | 128 +++++++++++++++++++++++++++++++++++++++++-- pandas/core/frame.py | 2 +- 2 files changed, 125 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 949959de7cbcd..9ab915f76cd2a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -231,10 +231,8 @@ def apply( if not isinstance(data, np.ndarray): if len(data.columns) == 0 and len(data.index) == 0: return data.copy() # mimic apply_empty_result() - # TODO: - # Rewrite FrameApply.apply_series_numba() logic without FrameApply object - raise NotImplementedError( - "raw=False is not yet supported in NumbaExecutionEngine." + return NumbaExecutionEngine.apply_raw_false( + data, func, args, kwargs, decorator, axis ) engine_kwargs: dict[str, bool] | None = ( @@ -259,6 +257,128 @@ def apply( # If we made the result 2-D, squeeze it back to 1-D return np.squeeze(result) + @staticmethod + def apply_raw_false( + data: Series | DataFrame, + func, + args: tuple, + kwargs: dict, + decorator: Callable, + axis: int | str, + ): + from pandas import ( + DataFrame, + Series, + ) + + engine_kwargs: dict[str, bool] | None = ( + decorator if isinstance(decorator, dict) else {} + ) + + if engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not data.index.is_unique or not data.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) + NumbaExecutionEngine.validate_values_for_numba(data) + results = NumbaExecutionEngine.apply_with_numba( + data, func, args, kwargs, engine_kwargs, axis + ) + + if results: + sample = next(iter(results.values())) + if isinstance(sample, 
Series): + df_result = DataFrame.from_dict( + results, orient="index" if axis == 1 else "columns" + ) + return df_result + else: + return Series(results) + + return DataFrame() if isinstance(data, DataFrame) else Series() + + @staticmethod + def validate_values_for_numba(df: DataFrame) -> None: + for colname, dtype in df.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have numeric dtype. Found '{dtype}'." + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} uses extension array dtype, " + "not supported by Numba." + ) + + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, axis, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") + from pandas import Series + from pandas.core._numba.extensions import maybe_cast_str + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names_index, index, *args): + results = {} + for i in range(values.shape[1 - axis]): + if axis == 0 or axis == "index": + arr = values[:, i] + result_key = index[i] + arr_index = col_names_index + else: + arr = values[i].copy() + result_key = index[i] + arr_index = col_names_index + ser = Series( + arr, + index=arr_index, + name=maybe_cast_str(result_key), + ) + results[result_key] = jitted_udf(ser, *args) + + return results + + return numba_func + + @staticmethod + def apply_with_numba( + data, func, args, kwargs, engine_kwargs, axis=0 + ) -> dict[int, Any]: + func = cast(Callable, func) + args, kwargs = prepare_function_arguments( + func, args, kwargs, num_required_args=1 + ) + nb_func = NumbaExecutionEngine.generate_numba_apply_func( + func, axis, **get_jit_arguments(engine_kwargs) + ) + + from pandas.core._numba.extensions import set_numba_data + + # Convert from numba dict to regular dict + # Our isinstance 
checks in the df constructor don't pass for numbas typed dict + + if axis == 0 or axis == "index": + col_names_index = data.index + result_index = data.columns + else: + col_names_index = data.columns + result_index = data.index + + with ( + set_numba_data(result_index) as index, + set_numba_data(col_names_index) as columns, + ): + res = dict(nb_func(data.values, columns, index, *args)) + + return res + def frame_apply( obj: DataFrame, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 78c9fc3fc10f1..bc8b8d6d2ac13 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10648,7 +10648,7 @@ def apply( ) return op.apply().__finalize__(self, method="apply") - if hasattr(engine, "__pandas_udf__"): + elif hasattr(engine, "__pandas_udf__"): if result_type is not None: raise NotImplementedError( f"{result_type=} only implemented for the default engine" From 347463e958518b8a5cb6cf2bd3405da17f8de0f5 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 19 Jun 2025 17:31:12 +0700 Subject: [PATCH 15/32] Fix CI, update validate_values_for_numba params --- pandas/core/apply.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9ab915f76cd2a..1316a66543e84 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -271,7 +271,7 @@ def apply_raw_false( Series, ) - engine_kwargs: dict[str, bool] | None = ( + engine_kwargs: dict[str, bool] = ( decorator if isinstance(decorator, dict) else {} ) @@ -301,17 +301,27 @@ def apply_raw_false( return DataFrame() if isinstance(data, DataFrame) else Series() @staticmethod - def validate_values_for_numba(df: DataFrame) -> None: - for colname, dtype in df.dtypes.items(): - if not is_numeric_dtype(dtype): + def validate_values_for_numba(obj: Series | DataFrame) -> None: + if isinstance(obj, Series): + if not is_numeric_dtype(obj.dtype): raise ValueError( - f"Column {colname} must have numeric dtype. Found '{dtype}'." 
+ f"Series must have numeric dtype. Found '{dtype}'." ) - if is_extension_array_dtype(dtype): + if is_extension_array_dtype(obj.dtype): raise ValueError( - f"Column {colname} uses extension array dtype, " - "not supported by Numba." + f"Series uses extension array dtype, not supported by Numba." ) + else: + for colname, dtype in obj.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have numeric dtype. Found '{dtype}'." + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} uses extension array dtype, " + "not supported by Numba." + ) @staticmethod @functools.cache From 77eb1461166c8cedf55418ec0161262736f3df11 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 19 Jun 2025 17:33:36 +0700 Subject: [PATCH 16/32] Adjust error messages --- pandas/core/apply.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1316a66543e84..2ca53c06eaec4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -305,22 +305,25 @@ def validate_values_for_numba(obj: Series | DataFrame) -> None: if isinstance(obj, Series): if not is_numeric_dtype(obj.dtype): raise ValueError( - f"Series must have numeric dtype. Found '{dtype}'." + f"Series must have a numeric dtype. " + f"Found '{dtype}' instead" ) if is_extension_array_dtype(obj.dtype): raise ValueError( - f"Series uses extension array dtype, not supported by Numba." + f"Series is backed by an extension array, " + f"which is not supported by the numba engine." ) else: for colname, dtype in obj.dtypes.items(): if not is_numeric_dtype(dtype): raise ValueError( - f"Column {colname} must have numeric dtype. Found '{dtype}'." + f"Column {colname} must have a numeric dtype. " + f"Found '{dtype}' instead" ) if is_extension_array_dtype(dtype): raise ValueError( - f"Column {colname} uses extension array dtype, " - "not supported by Numba." 
+ f"Column {colname} is backed by an extension array, " + f"which is not supported by the numba engine." ) @staticmethod From 90f264f88b67406c1308d9295bbc0add9148e355 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 19 Jun 2025 17:46:57 +0700 Subject: [PATCH 17/32] Remove Numba-specific logic from FrameApply, added Series import to validate_values_for_numba --- pandas/core/apply.py | 144 +------------------------------------------ 1 file changed, 2 insertions(+), 142 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2ca53c06eaec4..a57bca3353b2a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -302,6 +302,7 @@ def apply_raw_false( @staticmethod def validate_values_for_numba(obj: Series | DataFrame) -> None: + from pandas import Series if isinstance(obj, Series): if not is_numeric_dtype(obj.dtype): raise ValueError( @@ -1115,32 +1116,6 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series]: pass - @staticmethod - @functools.cache - @abc.abstractmethod - def generate_numba_apply_func( - func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: - pass - - @abc.abstractmethod - def apply_with_numba(self): - pass - - def validate_values_for_numba(self) -> None: - # Validate column dtyps all OK - for colname, dtype in self.obj.dtypes.items(): - if not is_numeric_dtype(dtype): - raise ValueError( - f"Column {colname} must have a numeric dtype. " - f"Found '{dtype}' instead" - ) - if is_extension_array_dtype(dtype): - raise ValueError( - f"Column {colname} is backed by an extension array, " - f"which is not supported by the numba engine." 
- ) - @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -1327,10 +1302,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result def apply_standard(self): - if self.engine == "python": - results, res_index = self.apply_series_generator() - else: - results, res_index = self.apply_series_numba() + results, res_index = self.apply_series_generator() # wrap results return self.wrap_results(results, res_index) @@ -1352,19 +1324,6 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index - def apply_series_numba(self): - if self.engine_kwargs.get("parallel", False): - raise NotImplementedError( - "Parallel apply is not supported when raw=False and engine='numba'" - ) - if not self.obj.index.is_unique or not self.columns.is_unique: - raise NotImplementedError( - "The index/columns must be unique when raw=False and engine='numba'" - ) - self.validate_values_for_numba() - results = self.apply_with_numba() - return results, self.result_index - def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series @@ -1404,54 +1363,6 @@ class FrameRowApply(FrameApply): def series_generator(self) -> Generator[Series]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) - @staticmethod - @functools.cache - def generate_numba_apply_func( - func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: - numba = import_optional_dependency("numba") - from pandas import Series - - # Import helper from extensions to cast string object -> np strings - # Note: This also has the side effect of loading our numba extensions - from pandas.core._numba.extensions import maybe_cast_str - - jitted_udf = numba.extending.register_jitable(func) - - # Currently the parallel argument doesn't get passed through here - # (it's disabled) since the dicts in numba aren't thread-safe. 
- @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index, *args): - results = {} - for j in range(values.shape[1]): - # Create the series - ser = Series( - values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) - ) - results[j] = jitted_udf(ser, *args) - return results - - return numba_func - - def apply_with_numba(self) -> dict[int, Any]: - func = cast(Callable, self.func) - args, kwargs = prepare_function_arguments( - func, self.args, self.kwargs, num_required_args=1 - ) - nb_func = self.generate_numba_apply_func( - func, **get_jit_arguments(self.engine_kwargs) - ) - from pandas.core._numba.extensions import set_numba_data - - index = self.obj.index - columns = self.obj.columns - - # Convert from numba dict to regular dict - # Our isinstance checks in the df constructor don't pass for numbas typed dict - with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index, *args)) - return res - @property def result_index(self) -> Index: return self.columns @@ -1545,57 +1456,6 @@ def series_generator(self) -> Generator[Series]: mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) yield ser - @staticmethod - @functools.cache - def generate_numba_apply_func( - func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: - numba = import_optional_dependency("numba") - from pandas import Series - from pandas.core._numba.extensions import maybe_cast_str - - jitted_udf = numba.extending.register_jitable(func) - - @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index, *args): - results = {} - # Currently the parallel argument doesn't get passed through here - # (it's disabled) since the dicts in numba aren't thread-safe. 
- for i in range(values.shape[0]): - # Create the series - # TODO: values corrupted without the copy - ser = Series( - values[i].copy(), - index=col_names_index, - name=maybe_cast_str(index[i]), - ) - results[i] = jitted_udf(ser, *args) - - return results - - return numba_func - - def apply_with_numba(self) -> dict[int, Any]: - func = cast(Callable, self.func) - args, kwargs = prepare_function_arguments( - func, self.args, self.kwargs, num_required_args=1 - ) - nb_func = self.generate_numba_apply_func( - func, **get_jit_arguments(self.engine_kwargs) - ) - - from pandas.core._numba.extensions import set_numba_data - - # Convert from numba dict to regular dict - # Our isinstance checks in the df constructor don't pass for numbas typed dict - with ( - set_numba_data(self.obj.index) as index, - set_numba_data(self.columns) as columns, - ): - res = dict(nb_func(self.values, columns, index, *args)) - - return res - @property def result_index(self) -> Index: return self.index From f8f116639b1f0bc11f6f54dd7b542baee442c30a Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 19 Jun 2025 17:52:08 +0700 Subject: [PATCH 18/32] pre-commit --- pandas/core/apply.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a57bca3353b2a..d5c1bf5fef686 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -303,16 +303,16 @@ def apply_raw_false( @staticmethod def validate_values_for_numba(obj: Series | DataFrame) -> None: from pandas import Series + if isinstance(obj, Series): if not is_numeric_dtype(obj.dtype): raise ValueError( - f"Series must have a numeric dtype. " - f"Found '{dtype}' instead" + f"Series must have a numeric dtype. Found '{obj.dtype}' instead" ) if is_extension_array_dtype(obj.dtype): raise ValueError( - f"Series is backed by an extension array, " - f"which is not supported by the numba engine." + "Series is backed by an extension array, " + "which is not supported by the numba engine." 
) else: for colname, dtype in obj.dtypes.items(): From bc2939b85cd3a814b76d015b9818eea62efc289b Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 21 Jun 2025 19:30:01 +0700 Subject: [PATCH 19/32] Updated with reviewer suggestions and added axis normalizing --- pandas/core/apply.py | 104 +++++++++++++++++++++++-------------------- pandas/core/frame.py | 10 ++--- 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d5c1bf5fef686..6828594be4408 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,7 +49,6 @@ import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.util.numba_ import ( - get_jit_arguments, prepare_function_arguments, ) @@ -195,7 +194,7 @@ def map( """ Elementwise map for the Numba engine. Currently not supported. """ - raise NotImplementedError("Numba map is not implemented yet.") + raise NotImplementedError("The Numba engine is not implemented for the map method yet.") @staticmethod def apply( @@ -210,35 +209,27 @@ def apply( Apply `func` along the given axis using Numba. 
""" - if is_list_like(func): - raise NotImplementedError( - "the 'numba' engine doesn't support lists of callables yet" - ) - - if isinstance(func, str): - raise NotImplementedError( - "the 'numba' engine doesn't support using " - "a string as the callable function" - ) + NumbaExecutionEngine.check_numba_support(func) - elif isinstance(func, np.ufunc): - raise NotImplementedError( - "the 'numba' engine doesn't support " - "using a numpy ufunc as the callable function" - ) + # normalize axis values + if axis in (0, "index"): + axis = 0 + else: + axis = 1 # check for data typing if not isinstance(data, np.ndarray): - if len(data.columns) == 0 and len(data.index) == 0: + if data.empty: return data.copy() # mimic apply_empty_result() + NumbaExecutionEngine.validate_values_for_numba_raw_false( + data, + decorator if isinstance(decorator, dict) else {} + ) + return NumbaExecutionEngine.apply_raw_false( data, func, args, kwargs, decorator, axis ) - engine_kwargs: dict[str, bool] | None = ( - decorator if isinstance(decorator, dict) else None - ) - looper_args, looper_kwargs = prepare_function_arguments( func, args, @@ -249,14 +240,33 @@ def apply( # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" - nb_looper = generate_apply_looper( + numba_looper = generate_apply_looper( func, - **get_jit_arguments(engine_kwargs), + decorator, ) - result = nb_looper(data, axis, *looper_args) + result = numba_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D return np.squeeze(result) + @staticmethod + def check_numba_support(func): + if is_list_like(func): + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) + + elif isinstance(func, str): + raise NotImplementedError( + "the 'numba' engine doesn't support using " + "a string as the callable function" + ) + + elif isinstance(func, 
np.ufunc): + raise NotImplementedError( + "the 'numba' engine doesn't support " + "using a numpy ufunc as the callable function" + ) + @staticmethod def apply_raw_false( data: Series | DataFrame, @@ -271,21 +281,8 @@ def apply_raw_false( Series, ) - engine_kwargs: dict[str, bool] = ( - decorator if isinstance(decorator, dict) else {} - ) - - if engine_kwargs.get("parallel", False): - raise NotImplementedError( - "Parallel apply is not supported when raw=False and engine='numba'" - ) - if not data.index.is_unique or not data.columns.is_unique: - raise NotImplementedError( - "The index/columns must be unique when raw=False and engine='numba'" - ) - NumbaExecutionEngine.validate_values_for_numba(data) results = NumbaExecutionEngine.apply_with_numba( - data, func, args, kwargs, engine_kwargs, axis + data, func, args, kwargs, decorator, axis ) if results: @@ -301,21 +298,30 @@ def apply_raw_false( return DataFrame() if isinstance(data, DataFrame) else Series() @staticmethod - def validate_values_for_numba(obj: Series | DataFrame) -> None: + def validate_values_for_numba_raw_false(data: Series | DataFrame, engine_kwargs: dict[str, bool]) -> None: from pandas import Series - if isinstance(obj, Series): - if not is_numeric_dtype(obj.dtype): + if engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not data.index.is_unique or not data.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) + + if isinstance(data, Series): + if not is_numeric_dtype(data.dtype): raise ValueError( - f"Series must have a numeric dtype. Found '{obj.dtype}' instead" + f"Series must have a numeric dtype. Found '{data.dtype}' instead" ) - if is_extension_array_dtype(obj.dtype): + if is_extension_array_dtype(data.dtype): raise ValueError( "Series is backed by an extension array, " "which is not supported by the numba engine." 
) else: - for colname, dtype in obj.dtypes.items(): + for colname, dtype in data.dtypes.items(): if not is_numeric_dtype(dtype): raise ValueError( f"Column {colname} must have a numeric dtype. " @@ -330,7 +336,7 @@ def validate_values_for_numba(obj: Series | DataFrame) -> None: @staticmethod @functools.cache def generate_numba_apply_func( - func, axis, nogil=True, nopython=True, parallel=False + func, axis, decorator: Callable ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: numba = import_optional_dependency("numba") from pandas import Series @@ -338,7 +344,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) - @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + @decorator # type: ignore def numba_func(values, col_names_index, index, *args): results = {} for i in range(values.shape[1 - axis]): @@ -363,14 +369,14 @@ def numba_func(values, col_names_index, index, *args): @staticmethod def apply_with_numba( - data, func, args, kwargs, engine_kwargs, axis=0 + data, func, args, kwargs, decorator, axis=0 ) -> dict[int, Any]: func = cast(Callable, func) args, kwargs = prepare_function_arguments( func, args, kwargs, num_required_args=1 ) nb_func = NumbaExecutionEngine.generate_numba_apply_func( - func, axis, **get_jit_arguments(engine_kwargs) + func, axis, decorator ) from pandas.core._numba.extensions import set_numba_data diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc8b8d6d2ac13..7476bdd21ca05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10621,17 +10621,13 @@ def apply( """ if engine == "numba": numba = import_optional_dependency("numba") - if engine_kwargs is not None: - numba_jit = numba.jit(**engine_kwargs) - else: - numba_jit = numba.jit() - numba_jit.__pandas_udf__ = NumbaExecutionEngine - engine = numba_jit + engine = numba.jit(**engine_kwargs or {}) + engine.__pandas_udf__ = NumbaExecutionEngine if engine is None or isinstance(engine, str): from pandas.core.apply 
import frame_apply - if engine not in ["python"] and engine is not None: + if engine not in ["python", None]: raise ValueError(f"Unknown engine '{engine}'") op = frame_apply( From 176753bff295e9bcf82a8c5db5a5519fcce3ccb0 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 21 Jun 2025 19:30:26 +0700 Subject: [PATCH 20/32] Updated executor to accept decorator --- pandas/core/_numba/executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 3f3ebe8dbe023..0f9539470e053 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -18,14 +18,14 @@ @functools.cache -def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): +def generate_apply_looper(func, decorator: Callable): if TYPE_CHECKING: import numba else: numba = import_optional_dependency("numba") nb_compat_func = jit_user_function(func) - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + @decorator # type: ignore def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape From cf3e39288ca90c17c5c5875936c536fa5ce2b2ff Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 21 Jun 2025 23:13:21 +0700 Subject: [PATCH 21/32] Fix CI and pre-commit --- pandas/core/_numba/executor.py | 2 +- pandas/core/apply.py | 23 +++++++++++------------ pandas/core/frame.py | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0f9539470e053..8faee288686f5 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -25,7 +25,7 @@ def generate_apply_looper(func, decorator: Callable): numba = import_optional_dependency("numba") nb_compat_func = jit_user_function(func) - @decorator # type: ignore + @decorator def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape diff --git 
a/pandas/core/apply.py b/pandas/core/apply.py index 6828594be4408..fab95cd664ef2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -194,7 +194,9 @@ def map( """ Elementwise map for the Numba engine. Currently not supported. """ - raise NotImplementedError("The Numba engine is not implemented for the map method yet.") + raise NotImplementedError( + "The Numba engine is not implemented for the map method yet." + ) @staticmethod def apply( @@ -222,9 +224,8 @@ def apply( if data.empty: return data.copy() # mimic apply_empty_result() NumbaExecutionEngine.validate_values_for_numba_raw_false( - data, - decorator if isinstance(decorator, dict) else {} - ) + data, decorator if isinstance(decorator, dict) else {} + ) return NumbaExecutionEngine.apply_raw_false( data, func, args, kwargs, decorator, axis @@ -298,7 +299,9 @@ def apply_raw_false( return DataFrame() if isinstance(data, DataFrame) else Series() @staticmethod - def validate_values_for_numba_raw_false(data: Series | DataFrame, engine_kwargs: dict[str, bool]) -> None: + def validate_values_for_numba_raw_false( + data: Series | DataFrame, engine_kwargs: dict[str, bool] + ) -> None: from pandas import Series if engine_kwargs.get("parallel", False): @@ -344,7 +347,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) - @decorator # type: ignore + @decorator def numba_func(values, col_names_index, index, *args): results = {} for i in range(values.shape[1 - axis]): @@ -368,16 +371,12 @@ def numba_func(values, col_names_index, index, *args): return numba_func @staticmethod - def apply_with_numba( - data, func, args, kwargs, decorator, axis=0 - ) -> dict[int, Any]: + def apply_with_numba(data, func, args, kwargs, decorator, axis=0) -> dict[int, Any]: func = cast(Callable, func) args, kwargs = prepare_function_arguments( func, args, kwargs, num_required_args=1 ) - nb_func = NumbaExecutionEngine.generate_numba_apply_func( - func, axis, decorator - ) + nb_func = 
NumbaExecutionEngine.generate_numba_apply_func(func, axis, decorator) from pandas.core._numba.extensions import set_numba_data diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7476bdd21ca05..18ae32e4747f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10621,10 +10621,11 @@ def apply( """ if engine == "numba": numba = import_optional_dependency("numba") - engine = numba.jit(**engine_kwargs or {}) - engine.__pandas_udf__ = NumbaExecutionEngine + numba_jit = numba.jit(**engine_kwargs or {}) + numba_jit.__pandas_udf__ = NumbaExecutionEngine + engine = numba_jit - if engine is None or isinstance(engine, str): + elif engine is None or isinstance(engine, str): from pandas.core.apply import frame_apply if engine not in ["python", None]: From a4bac1804a8f20cb171da2916b0b1e2c73cbabca Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sun, 22 Jun 2025 18:52:22 +0700 Subject: [PATCH 22/32] Silence pyright warning for untyped decorator --- pandas/core/_numba/executor.py | 2 +- pandas/core/apply.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 8faee288686f5..b28fd58a012d5 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -25,7 +25,7 @@ def generate_apply_looper(func, decorator: Callable): numba = import_optional_dependency("numba") nb_compat_func = jit_user_function(func) - @decorator + @decorator # pyright: ignore[reportUntypedFunctionDecorator] def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fab95cd664ef2..1e34e33538e3e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -347,7 +347,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) - @decorator + @decorator # pyright: ignore[reportUntypedFunctionDecorator] def numba_func(values, col_names_index, index, *args): 
results = {} for i in range(values.shape[1 - axis]): From ca91e898b8035e68e4c930d25bd7c9d64a484be5 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sun, 22 Jun 2025 23:09:15 +0700 Subject: [PATCH 23/32] Revert elif to if --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 18ae32e4747f5..2af124ab2997a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10625,7 +10625,7 @@ def apply( numba_jit.__pandas_udf__ = NumbaExecutionEngine engine = numba_jit - elif engine is None or isinstance(engine, str): + if engine is None or isinstance(engine, str): from pandas.core.apply import frame_apply if engine not in ["python", None]: From e337cb8ff939b0f047a4e43b3b065846f51d2544 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Mon, 23 Jun 2025 21:59:49 +0700 Subject: [PATCH 24/32] Fix engine_kwargs handling in validate_values_for_numba_raw_false --- pandas/core/apply.py | 3 ++- pandas/core/frame.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1e34e33538e3e..a3681e5dc59b1 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,6 +49,7 @@ import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.util.numba_ import ( + get_jit_arguments, prepare_function_arguments, ) @@ -224,7 +225,7 @@ def apply( if data.empty: return data.copy() # mimic apply_empty_result() NumbaExecutionEngine.validate_values_for_numba_raw_false( - data, decorator if isinstance(decorator, dict) else {} + data, get_jit_arguments(decorator.engine_kwargs) ) return NumbaExecutionEngine.apply_raw_false( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2af124ab2997a..97e6b0645f49e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10623,6 +10623,7 @@ def apply( numba = import_optional_dependency("numba") numba_jit = numba.jit(**engine_kwargs or {}) 
numba_jit.__pandas_udf__ = NumbaExecutionEngine + numba_jit.engine_kwargs = engine_kwargs engine = numba_jit if engine is None or isinstance(engine, str): From 30703b9d56c0fbc212840a5bbd611f4302e2a962 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 24 Jun 2025 14:31:04 +0700 Subject: [PATCH 25/32] Fix CI and removed references to engine and enginekwargs in FrameApply --- pandas/core/apply.py | 35 ++++++----------------------------- pandas/core/frame.py | 2 -- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a3681e5dc59b1..0f9c066b62b29 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -224,8 +224,11 @@ def apply( if not isinstance(data, np.ndarray): if data.empty: return data.copy() # mimic apply_empty_result() + engine_kwargs = ( + decorator.engine_kwargs if hasattr(decorator, "engine_kwargs") else {} + ) NumbaExecutionEngine.validate_values_for_numba_raw_false( - data, get_jit_arguments(decorator.engine_kwargs) + data, get_jit_arguments(engine_kwargs) ) return NumbaExecutionEngine.apply_raw_false( @@ -407,8 +410,6 @@ def frame_apply( raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", - engine: str = "python", - engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -432,8 +433,6 @@ def frame_apply( raw=raw, result_type=result_type, by_row=by_row, - engine=engine, - engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -450,8 +449,6 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat", "_compat"] = "compat", - engine: str = "python", - engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: @@ -464,9 +461,6 @@ def __init__( self.args = args or () self.kwargs = kwargs or {} - self.engine = engine - self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs - if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( "invalid value 
for result_type, must be one " @@ -1085,8 +1079,6 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat"] = False, - engine: str = "python", - engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: @@ -1098,8 +1090,6 @@ def __init__( raw, result_type, by_row=by_row, - engine=engine, - engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -1174,7 +1164,7 @@ def apply(self) -> DataFrame | Series: # raw elif self.raw: - return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) + return self.apply_raw() return self.apply_standard() @@ -1247,7 +1237,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self, engine="python", engine_kwargs=None): + def apply_raw(self): """apply to the values as a numpy array""" def wrap_function(func): @@ -1674,11 +1664,6 @@ def agg_or_apply_list_like( def agg_or_apply_dict_like( self, op_name: Literal["agg", "apply"] ) -> DataFrame | Series: - from pandas.core.groupby.generic import ( - DataFrameGroupBy, - SeriesGroupBy, - ) - assert op_name in ["agg", "apply"] obj = self.obj @@ -1693,14 +1678,6 @@ def agg_or_apply_dict_like( selected_obj = obj._selected_obj selection = obj._selection - is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) - - # Numba Groupby engine/engine-kwargs passthrough - if is_groupby: - engine = self.kwargs.get("engine", None) - engine_kwargs = self.kwargs.get("engine_kwargs", None) - kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs}) - with com.temp_setattr( obj, "as_index", True, condition=hasattr(obj, "as_index") ): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97e6b0645f49e..fc80904c6df73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10639,8 +10639,6 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, - engine="python", - engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) From b39a8d14f36f653fde27b887f7c3d22ea97c3240 Mon Sep 17 00:00:00 2001 
From: arthurlw Date: Thu, 26 Jun 2025 18:02:34 +0700 Subject: [PATCH 26/32] Added test for axis with string input --- pandas/tests/apply/test_numba.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 75bc3f5b74b9d..bcae69e5fa9c8 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -127,3 +127,16 @@ def test_numba_unsupported_dtypes(apply_axis): "which is not supported by the numba engine.", ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) + +@pytest.mark.parametrize("axis, expected", [ + ("index", pd.Series([5.0, 7.0, 9.0], index=["a", "b", "c"])), + ("columns", pd.Series([6.0, 15.0], index=[0, 1])) +]) +def test_numba_apply_with_string_axis(axis, expected): + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + + def f(x): + return x.sum() + + result = df.apply(f, engine="numba", axis=axis, raw=True) + tm.assert_series_equal(result, expected) From 2b59eeba4fe7d60cc7f9533f2c6b5d90941129a1 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 26 Jun 2025 18:06:41 +0700 Subject: [PATCH 27/32] pre-commit --- pandas/tests/apply/test_numba.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index bcae69e5fa9c8..ca4a7a93ff55d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -8,6 +8,7 @@ from pandas import ( DataFrame, Index, + Series, ) import pandas._testing as tm from pandas.util.version import Version @@ -128,12 +129,16 @@ def test_numba_unsupported_dtypes(apply_axis): ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) -@pytest.mark.parametrize("axis, expected", [ - ("index", pd.Series([5.0, 7.0, 9.0], index=["a", "b", "c"])), - ("columns", pd.Series([6.0, 15.0], index=[0, 1])) -]) + +@pytest.mark.parametrize( + "axis, expected", + [ + ("index", Series([5.0, 7.0, 
9.0], index=["a", "b", "c"])), + ("columns", Series([6.0, 15.0], index=[0, 1])), + ], +) def test_numba_apply_with_string_axis(axis, expected): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) def f(x): return x.sum() From f59fb52b01f3d5cc93004f5ea51af32b2c87fc1b Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 27 Jun 2025 20:59:45 +0700 Subject: [PATCH 28/32] Revert test for numba apply with string axis due to docstring validation issues --- pandas/tests/apply/test_numba.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index ca4a7a93ff55d..75bc3f5b74b9d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -8,7 +8,6 @@ from pandas import ( DataFrame, Index, - Series, ) import pandas._testing as tm from pandas.util.version import Version @@ -128,20 +127,3 @@ def test_numba_unsupported_dtypes(apply_axis): "which is not supported by the numba engine.", ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) - - -@pytest.mark.parametrize( - "axis, expected", - [ - ("index", Series([5.0, 7.0, 9.0], index=["a", "b", "c"])), - ("columns", Series([6.0, 15.0], index=[0, 1])), - ], -) -def test_numba_apply_with_string_axis(axis, expected): - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) - - def f(x): - return x.sum() - - result = df.apply(f, engine="numba", axis=axis, raw=True) - tm.assert_series_equal(result, expected) From b08c36131e899bdabd2ec674118ea9316c81278d Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 4 Jul 2025 13:08:58 +0700 Subject: [PATCH 29/32] Updated engine_kwargs extraction logic and added test --- pandas/core/apply.py | 7 ++++--- pandas/core/frame.py | 1 - pandas/core/util/numba_.py | 21 +++++++++++++++++++++ pandas/tests/apply/test_numba.py | 11 +++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff 
--git a/pandas/core/apply.py b/pandas/core/apply.py index 0f9c066b62b29..60509bea442c1 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,6 +49,7 @@ import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.util.numba_ import ( + extract_numba_options, get_jit_arguments, prepare_function_arguments, ) @@ -224,9 +225,9 @@ def apply( if not isinstance(data, np.ndarray): if data.empty: return data.copy() # mimic apply_empty_result() - engine_kwargs = ( - decorator.engine_kwargs if hasattr(decorator, "engine_kwargs") else {} - ) + + engine_kwargs = extract_numba_options(decorator) + NumbaExecutionEngine.validate_values_for_numba_raw_false( data, get_jit_arguments(engine_kwargs) ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc80904c6df73..dc1ecfd2cf6ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10623,7 +10623,6 @@ def apply( numba = import_optional_dependency("numba") numba_jit = numba.jit(**engine_kwargs or {}) numba_jit.__pandas_udf__ = NumbaExecutionEngine - numba_jit.engine_kwargs = engine_kwargs engine = numba_jit if engine is None or isinstance(engine, str): diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index d3f00c08e0e2c..4c1e01cb422d5 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -148,3 +148,24 @@ def prepare_function_arguments( args = args[num_required_args:] return args, kwargs + + +def extract_numba_options(decorator): + """ + Extract targetoptions from a numba.jit decorator + """ + try: + closure = decorator.__closure__ + if closure is None: + return {} + freevars = decorator.__code__.co_freevars + if "targetoptions" not in freevars: + return {} + idx = freevars.index("targetoptions") + cell = closure[idx] + targetoptions = cell.cell_contents + if isinstance(targetoptions, dict): + return targetoptions + return {} + except Exception: + return {} diff --git a/pandas/tests/apply/test_numba.py 
b/pandas/tests/apply/test_numba.py index 75bc3f5b74b9d..270ef43249eaa 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -2,6 +2,7 @@ import pytest from pandas.compat import is_platform_arm +from pandas.core.util.numba_ import extract_numba_options import pandas.util._test_decorators as td import pandas as pd @@ -127,3 +128,13 @@ def test_numba_unsupported_dtypes(apply_axis): "which is not supported by the numba engine.", ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) + + +@pytest.mark.parametrize("jit_args", [ + {"parallel": True, "nogil": True}, + {"parallel": False, "nogil": False}, +]) +def test_extract_numba_options_from_user_decorated_function(jit_args): + extracted = extract_numba_options(numba.jit(**jit_args)) + for k, v in jit_args.items(): + assert extracted.get(k) == v From f33047335f1611a76454d573204c519a5dfc613c Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 4 Jul 2025 22:33:30 +0700 Subject: [PATCH 30/32] Fix CI and pre-commit --- pandas/core/util/numba_.py | 2 +- pandas/tests/apply/test_numba.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index 4c1e01cb422d5..d6dfb1abbb3d2 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -150,7 +150,7 @@ def prepare_function_arguments( return args, kwargs -def extract_numba_options(decorator): +def extract_numba_options(decorator: Callable): """ Extract targetoptions from a numba.jit decorator """ diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 270ef43249eaa..5bb105bb59d5f 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -2,7 +2,6 @@ import pytest from pandas.compat import is_platform_arm -from pandas.core.util.numba_ import extract_numba_options import pandas.util._test_decorators as td import pandas as pd @@ -11,6 +10,7 @@ Index, ) import pandas._testing as tm 
+from pandas.core.util.numba_ import extract_numba_options from pandas.util.version import Version pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()] @@ -130,10 +130,13 @@ def test_numba_unsupported_dtypes(apply_axis): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) -@pytest.mark.parametrize("jit_args", [ - {"parallel": True, "nogil": True}, - {"parallel": False, "nogil": False}, -]) +@pytest.mark.parametrize( + "jit_args", + [ + {"parallel": True, "nogil": True}, + {"parallel": False, "nogil": False}, + ], +) def test_extract_numba_options_from_user_decorated_function(jit_args): extracted = extract_numba_options(numba.jit(**jit_args)) for k, v in jit_args.items(): From c05b1a7b7eeb20a65226e3d7cd4014fe517da16f Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 5 Jul 2025 00:18:45 +0700 Subject: [PATCH 31/32] Add return type annotation for extract_numba_options --- pandas/core/util/numba_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index d6dfb1abbb3d2..bc56bde5a15d6 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -150,7 +150,7 @@ def prepare_function_arguments( return args, kwargs -def extract_numba_options(decorator: Callable): +def extract_numba_options(decorator: Callable) -> dict: """ Extract targetoptions from a numba.jit decorator """ From 4b650d025c902a019f6edb50589255a74c16d383 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 10 Jul 2025 15:11:28 +0700 Subject: [PATCH 32/32] Updated with reviewer suggestions --- pandas/core/apply.py | 68 ++++++++++++++------------------ pandas/core/util/numba_.py | 45 ++++++++++++++------- pandas/tests/apply/test_numba.py | 4 ++ 3 files changed, 65 insertions(+), 52 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f74930f95c9a6..04540e5023cf4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -216,11 +216,8 @@ def apply( 
NumbaExecutionEngine.check_numba_support(func) - # normalize axis values - if axis in (0, "index"): - axis = 0 - else: - axis = 1 + if not isinstance(data, np.ndarray): + axis = data._get_axis_number(cast(Axis, axis)) # check for data typing if not isinstance(data, np.ndarray): @@ -230,7 +227,7 @@ def apply( engine_kwargs = extract_numba_options(decorator) NumbaExecutionEngine.validate_values_for_numba_raw_false( - data, get_jit_arguments(engine_kwargs) + data, **get_jit_arguments(engine_kwargs) ) return NumbaExecutionEngine.apply_raw_false( @@ -288,9 +285,29 @@ def apply_raw_false( Series, ) - results = NumbaExecutionEngine.apply_with_numba( - data, func, args, kwargs, decorator, axis + func = cast(Callable, func) + args, kwargs = prepare_function_arguments( + func, args, kwargs, num_required_args=1 ) + nb_func = NumbaExecutionEngine.generate_numba_apply_func(func, axis, decorator) + + from pandas.core._numba.extensions import set_numba_data + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + + if axis == 0: + col_names_index = data.index + result_index = data.columns + else: + col_names_index = data.columns + result_index = data.index + + with ( + set_numba_data(result_index) as index, + set_numba_data(col_names_index) as columns, + ): + results = dict(nb_func(data.values, columns, index, *args)) if results: sample = next(iter(results.values())) @@ -306,11 +323,14 @@ def apply_raw_false( @staticmethod def validate_values_for_numba_raw_false( - data: Series | DataFrame, engine_kwargs: dict[str, bool] + data: Series | DataFrame, + nopython: bool | None = None, + nogil: bool | None = None, + parallel: bool | None = None, ) -> None: from pandas import Series - if engine_kwargs.get("parallel", False): + if parallel: raise NotImplementedError( "Parallel apply is not supported when raw=False and engine='numba'" ) @@ -376,34 +396,6 @@ def numba_func(values, col_names_index, index, *args): return 
numba_func - @staticmethod - def apply_with_numba(data, func, args, kwargs, decorator, axis=0) -> dict[int, Any]: - func = cast(Callable, func) - args, kwargs = prepare_function_arguments( - func, args, kwargs, num_required_args=1 - ) - nb_func = NumbaExecutionEngine.generate_numba_apply_func(func, axis, decorator) - - from pandas.core._numba.extensions import set_numba_data - - # Convert from numba dict to regular dict - # Our isinstance checks in the df constructor don't pass for numbas typed dict - - if axis == 0 or axis == "index": - col_names_index = data.index - result_index = data.columns - else: - col_names_index = data.columns - result_index = data.index - - with ( - set_numba_data(result_index) as index, - set_numba_data(col_names_index) as columns, - ): - res = dict(nb_func(data.values, columns, index, *args)) - - return res - def frame_apply( obj: DataFrame, diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index bc56bde5a15d6..6e9081283b014 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -152,20 +152,37 @@ def prepare_function_arguments( def extract_numba_options(decorator: Callable) -> dict: """ - Extract targetoptions from a numba.jit decorator + Extract the `targetoptions` dictionary from a numba.jit decorator. + + The `targetoptions` closure variable stores the keyword arguments + passed to the `@numba.jit` decorator when it is created. + + This function returns a dictionary with the following keys, + if present in the decorator: + - nopython + - nogil + - parallel + + Parameters + ---------- + decorator : Callable + The decorator returned by `numba.jit(...)` or `numba.njit(...)`. + + Returns + ------- + dict + A dictionary with the extracted numba compilation options.
""" - try: - closure = decorator.__closure__ - if closure is None: - return {} - freevars = decorator.__code__.co_freevars - if "targetoptions" not in freevars: - return {} - idx = freevars.index("targetoptions") - cell = closure[idx] - targetoptions = cell.cell_contents - if isinstance(targetoptions, dict): - return targetoptions + closure = decorator.__closure__ + if closure is None: return {} - except Exception: + freevars = decorator.__code__.co_freevars + if "targetoptions" not in freevars: return {} + idx = freevars.index("targetoptions") + cell = closure[idx] + targetoptions = cell.cell_contents + if isinstance(targetoptions, dict): + relevant_keys = {"nopython", "nogil", "parallel"} + return {k: v for k, v in targetoptions.items() if k in relevant_keys} + return {} diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 5bb105bb59d5f..20a7e9cb50327 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -141,3 +141,7 @@ def test_extract_numba_options_from_user_decorated_function(jit_args): extracted = extract_numba_options(numba.jit(**jit_args)) for k, v in jit_args.items(): assert extracted.get(k) == v + + extracted = extract_numba_options(numba.njit(**jit_args)) + for k, v in jit_args.items(): + assert extracted.get(k) == v