From 07239bb9b27536e874d3ad1240e39d2e2ee24d49 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 00:11:31 +0100 Subject: [PATCH 1/2] BUG: merge not raising for String and numeric merges --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 18 +++++++++++------- pandas/tests/reshape/merge/test_join.py | 4 ++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ad44e87cacf82..3b71a61794756 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -664,6 +664,7 @@ Reshaping - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f07c4fb8f7d5f..cbf3ec70cf353 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1355,8 +1355,12 @@ def _maybe_coerce_merge_keys(self) -> None: lk_is_cat = isinstance(lk.dtype, CategoricalDtype) rk_is_cat = isinstance(rk.dtype, CategoricalDtype) - lk_is_object = is_object_dtype(lk.dtype) - rk_is_object = is_object_dtype(rk.dtype) + lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype( + lk.dtype + ) + rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype( + rk.dtype + ) # if either left or right is a categorical # then the must match exactly in categories & ordered @@ -1451,14 +1455,14 @@ def _maybe_coerce_merge_keys(self) -> None: # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif (lk_is_object and is_bool_dtype(rk.dtype)) or ( - is_bool_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or ( + is_bool_dtype(lk.dtype) and rk_is_object_or_string ): pass # object values are allowed to be merged - elif (lk_is_object and is_numeric_dtype(rk.dtype)) or ( - is_numeric_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or ( + is_numeric_dtype(lk.dtype) and rk_is_object_or_string ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) @@ -1497,7 +1501,7 @@ def _maybe_coerce_merge_keys(self) -> None: # allows datetime with different resolutions continue - elif lk_is_object and rk_is_object: + elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue # Houston, we have a problem! diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 3408e6e4731bd..1203d8a923068 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -150,8 +150,8 @@ def test_join_on(self, target_source): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object columns for key 'A'. " - "If you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object|string columns for key " + "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") From 3e63fb99b2a2d28f1b246319e0b8fc3a6fca9095 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 00:38:58 +0100 Subject: [PATCH 2/2] Add coverage --- pandas/tests/reshape/merge/test_join.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 1203d8a923068..580b0c4d7b7b8 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -118,7 +120,10 @@ def test_handle_overlap_arbitrary_key(self, df, df2): assert "key1.foo" in joined assert "key2.bar" in joined - def test_join_on(self, target_source): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_on(self, target_source, infer_string): target, source = target_source merged = target.join(source, on="C")