From c79a4bde3eddb1d4c0c0e034d84d5d8d864af79d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Mar 2020 17:59:00 -0800 Subject: [PATCH 1/4] CLN: avoid values_from_object in reshape.merge --- pandas/core/reshape/merge.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c301d6e7c7155..d82a4185ecf65 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -45,6 +45,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc from pandas.core.internals import _transform_index, concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible @@ -1849,9 +1850,15 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): # Some pre-processing for non-ndarray lk / rk + lk = extract_array(lk, extract_numpy=True) + rk = extract_array(rk, extract_numpy=True) + if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = getattr(lk, "_values", lk)._data - rk = getattr(rk, "_values", rk)._data + # Extract the ndarray (UTC-localized) values + # Note: we dont need the dtypes to match, as these can still be compared + lk = lk._data + rk = rk._data + # TODO: this is equivalent to _values_for_factorize()[0]; more idiomatic? elif ( is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) @@ -1878,15 +1885,18 @@ def _factorize_keys(lk, rk, sort=True): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) + lk = ensure_int64(np.asarray(lk)) + rk = ensure_int64(np.asarray(rk)) elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( rk.dtype.type, (np.timedelta64, np.datetime64) ): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) + # TODO: above we extracted UTC-localized if both were dt64tz, but what + # if only one is? then np.asarray will return object-dtype here? + + lk = ensure_int64(np.asarray(lk)) + rk = ensure_int64(np.asarray(rk)) else: klass = libhashtable.Factorizer lk = ensure_object(lk) From f039233674ae738de7c820be3c9eb6a92d140a33 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Mar 2020 19:02:08 -0800 Subject: [PATCH 2/4] cleanup --- pandas/core/reshape/merge.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d82a4185ecf65..354e50449a8df 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1873,11 +1873,7 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif ( - is_extension_array_dtype(lk.dtype) - and is_extension_array_dtype(rk.dtype) - and lk.dtype == rk.dtype - ): + elif is_extension_array_dtype(lk.dtype) and lk.dtype == rk.dtype: lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1887,16 +1883,13 @@ def _factorize_keys(lk, rk, sort=True): klass = libhashtable.Int64Factorizer lk = ensure_int64(np.asarray(lk)) rk = ensure_int64(np.asarray(rk)) - elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( - rk.dtype.type, (np.timedelta64, np.datetime64) - ): + + elif needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer - # TODO: above we extracted UTC-localized if both were dt64tz, but what - # if only one is? then np.asarray will return object-dtype here? + lk = ensure_int64(np.asarray(lk, dtype=np.int64)) + rk = ensure_int64(np.asarray(rk, dtype=np.int64)) - lk = ensure_int64(np.asarray(lk)) - rk = ensure_int64(np.asarray(rk)) else: klass = libhashtable.Factorizer lk = ensure_object(lk) From 24b0b80f218dec84a81d2b7a7d1b13aa8b7a5e6d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Mar 2020 08:38:43 -0700 Subject: [PATCH 3/4] use values_for_factorize --- pandas/core/reshape/merge.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c5980fb406188..d429b3171b6e7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1824,12 +1824,11 @@ def _factorize_keys(lk, rk, sort=True): lk = extract_array(lk, extract_numpy=True) rk = extract_array(rk, extract_numpy=True) - if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): + if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - lk = lk._data - rk = rk._data - # TODO: this is equivalent to _values_for_factorize()[0]; more idiomatic? + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif ( is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) From 227836f51726cbef20041bbb02b2b10000d9af4b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Mar 2020 19:21:12 -0700 Subject: [PATCH 4/4] use is_dtype_equal --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d429b3171b6e7..d26b21e082f21 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1843,7 +1843,7 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_extension_array_dtype(lk.dtype) and lk.dtype == rk.dtype: + elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1854,7 +1854,7 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(np.asarray(lk)) rk = ensure_int64(np.asarray(rk)) - elif needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: + elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer lk = ensure_int64(np.asarray(lk, dtype=np.int64))