From 2b0e85ee5cbaffc3766da272ad7095879bcb8af4 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Thu, 10 Nov 2022 21:24:53 +0100
Subject: [PATCH 01/13] ENH: Compile Factorizer class for all numeric dtypes

---
 pandas/_libs/hashtable.pyi                 | 132 +++++++++++++++++++++
 pandas/_libs/hashtable.pyx                 |  49 --------
 pandas/_libs/hashtable_class_helper.pxi.in |  50 ++++++++
 3 files changed, 182 insertions(+), 49 deletions(-)

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 18ebc1ff2bd1f..14c35ec1eb990 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -39,6 +39,138 @@ class Int64Factorizer(Factorizer):
         mask=...,
     ) -> npt.NDArray[np.intp]: ...

+class UInt64Factorizer(Factorizer):
+    table: UInt64HashTable
+    uniques: UInt64Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const uint64_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Int32Factorizer(Factorizer):
+    table: Int32HashTable
+    uniques: Int32Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const int32_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class UInt32Factorizer(Factorizer):
+    table: UInt32HashTable
+    uniques: UInt32Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const uint32_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Int16Factorizer(Factorizer):
+    table: Int16HashTable
+    uniques: Int16Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const int16_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class UInt16Factorizer(Factorizer):
+    table: UInt16HashTable
+    uniques: UInt16Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const uint16_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Int8Factorizer(Factorizer):
+    table: Int8HashTable
+    uniques: Int8Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const int8_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class UInt8Factorizer(Factorizer):
+    table: UInt8HashTable
+    uniques: UInt8Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const uint8_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Float64Factorizer(Factorizer):
+    table: Float64HashTable
+    uniques: Float64Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const float64_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Float32Factorizer(Factorizer):
+    table: Float32HashTable
+    uniques: Float32Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const float32_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Complex64Factorizer(Factorizer):
+    table: Complex64HashTable
+    uniques: Complex64Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const complex64_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
+class Complex128Factorizer(Factorizer):
+    table: Complex128HashTable
+    uniques: Complex128Vector
+    def factorize(
+        self,
+        values: np.ndarray,  # const complex128_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+
 class Int64Vector:
     def __init__(self, *args) -> None: ...
     def __len__(self) -> int: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index e4e9b24d725c6..13da0457868b8 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -69,17 +69,6 @@ else:
     raise ValueError(np.dtype(np.intp))


-cdef class Factorizer:
-    cdef readonly:
-        Py_ssize_t count
-
-    def __cinit__(self, size_hint: int):
-        self.count = 0
-
-    def get_count(self) -> int:
-        return self.count
-
-
 cdef class ObjectFactorizer(Factorizer):
     cdef public:
         PyObjectHashTable table
@@ -117,41 +106,3 @@ cdef class ObjectFactorizer(Factorizer):
                                        self.count, na_sentinel, na_value)
         self.count = len(self.uniques)
         return labels
-
-
-cdef class Int64Factorizer(Factorizer):
-    cdef public:
-        Int64HashTable table
-        Int64Vector uniques
-
-    def __cinit__(self, size_hint: int):
-        self.table = Int64HashTable(size_hint)
-        self.uniques = Int64Vector()
-
-    def factorize(self, const int64_t[:] values,
-                  na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
-        """
-        Returns
-        -------
-        ndarray[intp_t]
-
-        Examples
-        --------
-        Factorize values with nans replaced by na_sentinel
-
-        >>> fac = Int64Factorizer(3)
-        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
-        array([0, 1, 2])
-        """
-        cdef:
-            ndarray[intp_t] labels
-
-        if self.uniques.external_view_exists:
-            uniques = Int64Vector()
-            uniques.extend(self.uniques.to_array())
-            self.uniques = uniques
-        labels = self.table.get_labels(values, self.uniques,
-                                       self.count, na_sentinel,
-                                       na_value=na_value, mask=mask)
-        self.count = len(self.uniques)
-        return labels
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index bda8cd83c0605..cf65d7c21d1a4 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -101,6 +101,18 @@ from pandas._libs.khash cimport (
 from pandas._libs.tslibs.util cimport get_c_string
 from pandas._libs.missing cimport C_NA

+
+cdef class Factorizer:
+    cdef readonly:
+        Py_ssize_t count
+
+    def __cinit__(self, size_hint: int):
+        self.count = 0
+
+    def get_count(self) -> int:
+        return self.count
+
+
 {{py:

 # name, dtype, c_type
@@ -876,6 +888,44 @@ cdef class {{name}}HashTable(HashTable):

         return np.asarray(labels), arr_uniques
 {{endif}}
+
+cdef class {{name}}Factorizer(Factorizer):
+    cdef public:
+        {{name}}HashTable table
+        {{name}}Vector uniques
+
+    def __cinit__(self, size_hint: int):
+        self.table = {{name}}HashTable(size_hint)
+        self.uniques = {{name}}Vector()
+
+    def factorize(self, const {{c_type}}[:] values,
+                  na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
+        """
+        Returns
+        -------
+        ndarray[intp_t]
+
+        Examples
+        --------
+        Factorize values with nans replaced by na_sentinel
+
+        >>> fac = {{name}}Factorizer(3)
+        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
+        array([0, 1, 2])
+        """
+        cdef:
+            ndarray[intp_t] labels
+
+        if self.uniques.external_view_exists:
+            uniques = {{name}}Vector()
+            uniques.extend(self.uniques.to_array())
+            self.uniques = uniques
+        labels = self.table.get_labels(values, self.uniques,
+                                       self.count, na_sentinel,
+                                       na_value=na_value, mask=mask)
+        self.count = len(self.uniques)
+        return labels
+
 {{endfor}}

From 6969a802d4e1435d1d125b349a931285ec2cf85a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sun, 13 Nov 2022 17:37:48 +0100
Subject: [PATCH 02/13] Fix test

---
 pandas/_libs/hashtable_class_helper.pxi.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index cf65d7c21d1a4..bb3d850803e1e 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -910,7 +910,7 @@ cdef class {{name}}Factorizer(Factorizer):
         Factorize values with nans replaced by na_sentinel

         >>> fac = {{name}}Factorizer(3)
-        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
+        >>> fac.factorize(np.array([1,2,3], dtype={{dtype}}), na_sentinel=20)
         array([0, 1, 2])
         """
         cdef:

From 61a5928f6bc8da349e716a21e57fd03bf654426d Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Mon, 14 Nov 2022 22:59:09 +0100
Subject: [PATCH 03/13] Fix test

---
 pandas/_libs/hashtable_class_helper.pxi.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index bb3d850803e1e..8eaec40b16ec2 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -910,7 +910,7 @@ cdef class {{name}}Factorizer(Factorizer):
         Factorize values with nans replaced by na_sentinel

         >>> fac = {{name}}Factorizer(3)
-        >>> fac.factorize(np.array([1,2,3], dtype={{dtype}}), na_sentinel=20)
+        >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
         array([0, 1, 2])
         """
         cdef:

From 07429b733d2989e9539002ef9e2f20d6fe8c3619 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Wed, 16 Nov 2022 17:22:36 +0100
Subject: [PATCH 04/13] Implement merge for masked arrays

---
 pandas/_libs/hashtable.pyi                 | 109 ++-------------------
 pandas/_libs/hashtable.pyx                 |   5 +-
 pandas/_libs/hashtable_class_helper.pxi.in |   2 +
 pandas/core/reshape/merge.py               |  79 +++++++++------
 4 files changed, 65 insertions(+), 130 deletions(-)

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 14c35ec1eb990..af47e6c408c05 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -1,4 +1,5 @@
 from typing import (
+    Any,
     Hashable,
     Literal,
 )
@@ -13,163 +14,69 @@ def unique_label_indices(

 class Factorizer:
     count: int
+    uniques: Any
     def __init__(self, size_hint: int) -> None: ...
     def get_count(self) -> int: ...
-
-class ObjectFactorizer(Factorizer):
-    table: PyObjectHashTable
-    uniques: ObjectVector
     def factorize(
         self,
-        values: npt.NDArray[np.object_],
+        values: np.ndarray,
         sort: bool = ...,
         na_sentinel=...,
         na_value=...,
+        mask=...,
     ) -> npt.NDArray[np.intp]: ...

+class ObjectFactorizer(Factorizer):
+    table: PyObjectHashTable
+    uniques: ObjectVector
+
 class Int64Factorizer(Factorizer):
     table: Int64HashTable
     uniques: Int64Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const int64_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class UInt64Factorizer(Factorizer):
     table: UInt64HashTable
     uniques: UInt64Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const uint64_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Int32Factorizer(Factorizer):
     table: Int32HashTable
     uniques: Int32Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const int32_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class UInt32Factorizer(Factorizer):
     table: UInt32HashTable
     uniques: UInt32Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const uint32_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Int16Factorizer(Factorizer):
     table: Int16HashTable
     uniques: Int16Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const int16_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class UInt16Factorizer(Factorizer):
     table: UInt16HashTable
     uniques: UInt16Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const uint16_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Int8Factorizer(Factorizer):
     table: Int8HashTable
     uniques: Int8Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const int8_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class UInt8Factorizer(Factorizer):
     table: UInt8HashTable
     uniques: UInt8Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const uint8_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Float64Factorizer(Factorizer):
     table: Float64HashTable
     uniques: Float64Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const float64_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Float32Factorizer(Factorizer):
     table: Float32HashTable
     uniques: Float32Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const float32_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Complex64Factorizer(Factorizer):
     table: Complex64HashTable
     uniques: Complex64Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const complex64_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Complex128Factorizer(Factorizer):
     table: Complex128HashTable
     uniques: Complex128Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const complex128_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...

 class Int64Vector:
     def __init__(self, *args) -> None: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 13da0457868b8..badd3095805fa 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -79,7 +79,7 @@ cdef class ObjectFactorizer(Factorizer):
         self.uniques = ObjectVector()

     def factorize(
-        self, ndarray[object] values, na_sentinel=-1, na_value=None
+        self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None
     ) -> np.ndarray:
         """

@@ -98,6 +98,9 @@ cdef class ObjectFactorizer(Factorizer):
         cdef:
             ndarray[intp_t] labels

+        if mask is not None:
+            raise NotImplementedError("Mask is not implemented for object.")
+
         if self.uniques.external_view_exists:
             uniques = ObjectVector()
             uniques.extend(self.uniques.to_array())
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 8eaec40b16ec2..3687725f3f83b 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -112,6 +112,8 @@ cdef class Factorizer:
     def get_count(self) -> int:
         return self.count

+    def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray:
+        pass

 {{py:

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 74a1051825820..60a69eee9b1c1 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -79,7 +79,10 @@
     Series,
 )
 import pandas.core.algorithms as algos
-from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays import (
+    BaseMaskedArray,
+    ExtensionArray,
+)
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 import pandas.core.common as com
 from pandas.core.construction import extract_array
@@ -92,6 +95,21 @@
     from pandas.core import groupby
     from pandas.core.arrays import DatetimeArray

+_factorizers = {
+    np.int64: libhashtable.Int64Factorizer,
+    np.int32: libhashtable.Int32Factorizer,
+    np.int16: libhashtable.Int16Factorizer,
+    np.int8: libhashtable.Int8Factorizer,
+    np.uint64: libhashtable.UInt64Factorizer,
+    np.uint32: libhashtable.UInt32Factorizer,
+    np.uint16: libhashtable.UInt16Factorizer,
+    np.uint8: libhashtable.UInt8Factorizer,
+    np.float64: libhashtable.Float64Factorizer,
+    np.float32: libhashtable.Float32Factorizer,
+    np.complex64: libhashtable.Complex64Factorizer,
+    np.complex128: libhashtable.Complex128Factorizer,
+}
+

 @Substitution("\nleft : DataFrame or named Series")
 @Appender(_merge_doc, indents=0)
@@ -2335,25 +2353,28 @@ def _factorize_keys(
         rk = ensure_int64(rk.codes)

     elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
-        lk, _ = lk._values_for_factorize()
-
-        # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
-        # "_values_for_factorize"
-        rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]
-
-    klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
-    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
-        # GH#23917 TODO: needs tests for case where lk is integer-dtype
-        # and rk is datetime-dtype
-        klass = libhashtable.Int64Factorizer
-        lk = ensure_int64(np.asarray(lk))
-        rk = ensure_int64(np.asarray(rk))
-
-    elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
-        # GH#23917 TODO: Needs tests for non-matching dtypes
-        klass = libhashtable.Int64Factorizer
-        lk = ensure_int64(np.asarray(lk, dtype=np.int64))
-        rk = ensure_int64(np.asarray(rk, dtype=np.int64))
+        if not isinstance(lk, BaseMaskedArray):
+            lk, _ = lk._values_for_factorize()
+
+            # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
"_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + + klass: type[libhashtable.Factorizer] + if is_numeric_dtype(lk.dtype): + if not is_dtype_equal(lk, rk): + dtype = find_common_type([lk.dtype, rk.dtype]) + # Argument 1 to "astype" of "ndarray" has incompatible type + # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], + # Type[Any], _SupportsDType[dtype[Any]]]" + lk = lk.astype(dtype) # type: ignore[arg-type] + rk = rk.astype(dtype) # type: ignore[arg-type] + if isinstance(lk, BaseMaskedArray): + # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; + # expected type "Type[object]" + klass = _factorizers[lk.dtype.type] # type: ignore[index] + else: + klass = _factorizers[lk.dtype.type] else: klass = libhashtable.ObjectFactorizer @@ -2362,14 +2383,16 @@ def _factorize_keys( rizer = klass(max(len(lk), len(rk))) - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - rlab = rizer.factorize(rk) # type: ignore[arg-type] + if isinstance(lk, BaseMaskedArray): + assert isinstance(rk, BaseMaskedArray) + llab = rizer.factorize(lk._data, mask=lk._mask) + rlab = rizer.factorize(rk._data, mask=lk._mask) + else: + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + llab = rizer.factorize(lk) # type: ignore[arg-type] + rlab = rizer.factorize(rk) # type: ignore[arg-type] assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype From 126fa68634a8ff41896ae01d97f50d1ed369f4e9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 19 Nov 2022 16:59:30 +0100 Subject: [PATCH 05/13] Fix bugs --- pandas/core/reshape/merge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 60a69eee9b1c1..c7c3dda00207d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -104,10 +104,12 @@ np.uint32: libhashtable.UInt32Factorizer, np.uint16: libhashtable.UInt16Factorizer, np.uint8: libhashtable.UInt8Factorizer, + np.bool_: libhashtable.UInt8Factorizer, np.float64: libhashtable.Float64Factorizer, np.float32: libhashtable.Float32Factorizer, np.complex64: libhashtable.Complex64Factorizer, np.complex128: libhashtable.Complex128Factorizer, + np.object_: libhashtable.ObjectFactorizer, } @@ -2386,7 +2388,7 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=lk._mask) + rlab = rizer.factorize(rk._data, mask=rk._mask) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], From 6dffb54219a6094c8a3d72a08234e0af3242b5b9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 19 Nov 2022 17:03:46 +0100 Subject: [PATCH 06/13] Add asv --- asv_bench/benchmarks/join_merge.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git 
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index d9fb3c8a8ff89..e192ebdc8dbe6 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -273,6 +273,40 @@ def time_merge_dataframes_cross(self, sort):
         merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)


+class MergeEA:
+
+    params = [
+        "Int64",
+        "Int32",
+        "Int16",
+        "Int8",
+        "UInt64",
+        "UInt32",
+        "UInt16",
+        "UInt8",
+        "Float64",
+        "Float32",
+    ]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        N = 10_000
+        indices = np.arange(1, N)
+        key = np.tile(indices[:8000], 10)
+        self.left = DataFrame(
+            {"key": Series(key, dtype=dtype), "value": np.random.randn(80000)}
+        )
+        self.right = DataFrame(
+            {
+                "key": Series(indices[2000:], dtype=dtype),
+                "value2": np.random.randn(8000),
+            }
+        )
+
+    def time_merge(self, dtype):
+        merge(self.left, self.right)
+
+
 class I8Merge:

     params = ["inner", "outer", "left", "right"]

From 5dfd7cddbacf07ea5d6d2a4a149a4d50343b8fa7 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Wed, 23 Nov 2022 22:38:46 +0000
Subject: [PATCH 07/13] Add tests

---
 asv_bench/benchmarks/join_merge.py       |  2 +-
 doc/source/whatsnew/v2.0.0.rst           |  1 +
 pandas/core/reshape/merge.py             | 20 +++++++++++++++-----
 pandas/tests/reshape/merge/conftest.py   |  7 +++++++
 pandas/tests/reshape/merge/test_merge.py | 24 ++++++++++++++++++++++++
 5 files changed, 48 insertions(+), 6 deletions(-)
 create mode 100644 pandas/tests/reshape/merge/conftest.py

diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index e192ebdc8dbe6..61be7ffa479ca 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -299,7 +299,7 @@ def setup(self, dtype):
         self.right = DataFrame(
             {
                 "key": Series(indices[2000:], dtype=dtype),
-                "value2": np.random.randn(8000),
+                "value2": np.random.randn(7999),
             }
         )

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index be81e462c30ea..ad843fec3a601 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -57,6 +57,7 @@ Other enhancements
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
+- Added support for extension array dtypes in :func:`merge` (:issue:`44240`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 9bdd239dafe77..3aa5446bb4c8c 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -42,6 +42,7 @@
 )
 from pandas.util._exceptions import find_stack_level

+from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.common import (
     ensure_float64,
@@ -2366,11 +2367,20 @@ def _factorize_keys(
     if is_numeric_dtype(lk.dtype):
         if not is_dtype_equal(lk, rk):
             dtype = find_common_type([lk.dtype, rk.dtype])
to "astype" of "ndarray" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], - # Type[Any], _SupportsDType[dtype[Any]]]" - lk = lk.astype(dtype) # type: ignore[arg-type] - rk = rk.astype(dtype) # type: ignore[arg-type] + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + if not isinstance(lk, ExtensionArray): + lk = cls._from_sequence(lk, dtype=dtype, copy=False) + else: + lk = lk.astype(dtype) + + if not isinstance(rk, ExtensionArray): + rk = cls._from_sequence(rk, dtype=dtype, copy=False) + else: + rk = rk.astype(dtype) + else: + lk = lk.astype(dtype) + rk = rk.astype(dtype) if isinstance(lk, BaseMaskedArray): # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; # expected type "Type[object]" diff --git a/pandas/tests/reshape/merge/conftest.py b/pandas/tests/reshape/merge/conftest.py new file mode 100644 index 0000000000000..45b1e83a258fa --- /dev/null +++ b/pandas/tests/reshape/merge/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +@pytest.fixture(params=["left", "right", "inner", "outer"]) +def how(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e4638c43e5a66..698cb68a6a795 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2714,3 +2714,27 @@ def test_merge_different_index_names(): result = merge(left, right, left_on="c", right_on="d") expected = DataFrame({"a_x": [1], "a_y": 1}) tm.assert_frame_equal(result, expected) + + +def test_merge_ea(any_numeric_ea_dtype, how): + # GH#44240 + left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype) + right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype) + result = left.merge(right, how=how) + expected = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}, dtype=any_numeric_ea_dtype) + tm.assert_frame_equal(result, expected) + + +def test_merge_ea_and_non_ea(any_numeric_ea_dtype, how): + # GH#44240 + left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype) + right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype.lower()) + result = left.merge(right, how=how) + expected = DataFrame( + { + "a": Series([1, 2, 3], dtype=any_numeric_ea_dtype), + "b": Series([1, 1, 1], dtype=any_numeric_ea_dtype), + "c": Series([2, 2, 2], dtype=any_numeric_ea_dtype.lower()), + } + ) + tm.assert_frame_equal(result, expected) From 711b91addf3fce5a6bc86dc8015809f83dbac22e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 23:35:37 +0000 Subject: [PATCH 08/13] Fix overflow tests --- pandas/core/reshape/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3aa5446bb4c8c..e818e367ca83d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -98,6 +98,7 @@ _factorizers = { np.int64: libhashtable.Int64Factorizer, + np.longlong: libhashtable.Int64Factorizer, np.int32: libhashtable.Int32Factorizer, np.int16: libhashtable.Int16Factorizer, np.int8: libhashtable.Int8Factorizer, From eead959e75ec850e69f1e04003be994ce9b9150a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 00:35:46 +0100 Subject: [PATCH 09/13] Add versionner --- asv_bench/asv.conf.json | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 4a0c882640eb6..b8136652f342a 100644 --- 
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -42,6 +42,7 @@
     // followed by the pip installed packages).
     "matrix": {
         "numpy": [],
+        "versioneer[toml]": [],
         "Cython": ["0.29.32"],
         "matplotlib": [],
         "sqlalchemy": [],

From 38d558c64add42c309c788d2dfa10024b5671eb2 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Tue, 29 Nov 2022 00:38:11 +0100
Subject: [PATCH 10/13] Add versioneer

---
 asv_bench/asv.conf.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index b8136652f342a..31e01fd810ac8 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -127,5 +127,5 @@
     },

     "build_command": ["python setup.py build -j4",
-    "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"],
+    "PIP_NO_BUILD_ISOLATION=false python -mpip wheel versioneer[toml] --no-deps --no-index -w {build_cache_dir} {build_dir}"],
 }

From 832ed522861ddf450fb636256ad00b0aeeb7b0c1 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Tue, 29 Nov 2022 00:49:57 +0100
Subject: [PATCH 11/13] Remove dtypes

---
 asv_bench/benchmarks/join_merge.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 61be7ffa479ca..fdbf325dcf997 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -279,11 +279,9 @@ class MergeEA:
         "Int64",
         "Int32",
         "Int16",
-        "Int8",
         "UInt64",
         "UInt32",
         "UInt16",
-        "UInt8",
         "Float64",
         "Float32",
     ]

From 5185a33e9ae1f863ac06f8b476c5d15d760cdc21 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Tue, 29 Nov 2022 00:50:25 +0100
Subject: [PATCH 12/13] Remove versioneer

---
 asv_bench/asv.conf.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 31e01fd810ac8..4a0c882640eb6 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -42,7 +42,6 @@
     // followed by the pip installed packages).
"matrix": { "numpy": [], - "versioneer[toml]": [], "Cython": ["0.29.32"], "matplotlib": [], "sqlalchemy": [], @@ -127,5 +126,5 @@ }, "build_command": ["python setup.py build -j4", - "PIP_NO_BUILD_ISOLATION=false python -mpip wheel versioneer[toml] --no-deps --no-index -w {build_cache_dir} {build_dir}"], + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], } From 899d6677b22e57192f9f97565983b8d01c4aae8c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 21:36:25 +0100 Subject: [PATCH 13/13] Remove fixture --- pandas/tests/reshape/merge/conftest.py | 7 ------- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 2 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 pandas/tests/reshape/merge/conftest.py diff --git a/pandas/tests/reshape/merge/conftest.py b/pandas/tests/reshape/merge/conftest.py deleted file mode 100644 index 45b1e83a258fa..0000000000000 --- a/pandas/tests/reshape/merge/conftest.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest - - -@pytest.fixture(params=["left", "right", "inner", "outer"]) -def how(request): - """Boolean sort keyword for concat and DataFrame.append.""" - return request.param diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 698cb68a6a795..946e7e48148b4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2716,20 +2716,20 @@ def test_merge_different_index_names(): tm.assert_frame_equal(result, expected) -def test_merge_ea(any_numeric_ea_dtype, how): +def test_merge_ea(any_numeric_ea_dtype, join_type): # GH#44240 left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype) right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype) - result = left.merge(right, how=how) + result = left.merge(right, how=join_type) expected = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}, dtype=any_numeric_ea_dtype) tm.assert_frame_equal(result, expected) -def test_merge_ea_and_non_ea(any_numeric_ea_dtype, how): +def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type): # GH#44240 left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype) right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype.lower()) - result = left.merge(right, how=how) + result = left.merge(right, how=join_type) expected = DataFrame( { "a": Series([1, 2, 3], dtype=any_numeric_ea_dtype),