-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG: Index.union() inconsistent with non-unique Indexes #36299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 53 commits
b8f27e2
070abad
7a90556
dbcd0a7
7a9aec7
01d55c6
8463078
cddd7f9
3c6b079
90056b7
46b7c6c
9f55905
870c0ac
97fe91f
e724ade
920f9ff
c36ccff
396c70f
1062778
cf06418
6f04408
167d695
0b3b548
73d9ab3
561f835
59dbdf6
51131be
04817b4
a1887c7
fe41a6f
b63a732
fa49dfe
2a6f6ed
d80949c
48e041a
b57f00f
aa4533a
8fd90a3
091942a
446eb50
6b8fa64
484f4f8
5209bf0
6cabad8
b80fbdd
60bceec
2547f65
25885d4
fcc4635
ccaa0c1
f4ee466
20e62e6
e92ab7a
7ffa07a
76ded89
0af939d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2277,3 +2277,29 @@ def _sort_tuples(values: np.ndarray): | |
arrays, _ = to_arrays(values, None) | ||
indexer = lexsort_indexer(arrays, orders=True) | ||
return values[indexer] | ||
|
||
|
||
def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray: | ||
""" | ||
Extracts the union from lvals and rvals with respect to duplicates and nans in | ||
both arrays. | ||
|
||
Parameters | ||
---------- | ||
lvals: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. types here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we duplicating them if signature is typed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. afaik yes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
left values which is ordered in front. | ||
rvals: | ||
right values ordered after lvals. | ||
|
||
Returns | ||
------- | ||
np.ndarray containing the unsorted union of both arrays | ||
""" | ||
indexer = [] | ||
l_count = value_counts(lvals, dropna=False) | ||
r_count = value_counts(rvals, dropna=False) | ||
l_count, r_count = l_count.align(r_count, fill_value=0) | ||
unique_array = unique(np.append(lvals, rvals)) | ||
for i, value in enumerate(unique_array): | ||
indexer += [i] * int(max(l_count[value], r_count[value])) | ||
return unique_array.take(indexer) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2935,32 +2935,44 @@ def _union(self, other, sort): | |
lvals = self._values | ||
rvals = other._values | ||
|
||
if sort is None and self.is_monotonic and other.is_monotonic: | ||
if ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a comment here on when these branches are taken (similar to what you did below). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
sort is None | ||
and self.is_monotonic | ||
and other.is_monotonic | ||
and not (self.has_duplicates and other.has_duplicates) | ||
): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Both are unique and monotonic, so can use outer join | ||
try: | ||
result = self._outer_indexer(lvals, rvals)[0] | ||
return self._outer_indexer(lvals, rvals)[0] | ||
except (TypeError, IncompatibleFrequency): | ||
# incomparable objects | ||
result = list(lvals) | ||
value_list = list(lvals) | ||
|
||
# worth making this faster? a very unusual case | ||
value_set = set(lvals) | ||
result.extend([x for x in rvals if x not in value_set]) | ||
result = Index(result)._values # do type inference here | ||
else: | ||
# find indexes of things in "other" that are not in "self" | ||
if self.is_unique: | ||
indexer = self.get_indexer(other) | ||
missing = (indexer == -1).nonzero()[0] | ||
else: | ||
missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) | ||
value_list.extend([x for x in rvals if x not in value_set]) | ||
return Index(value_list)._values # do type inference here | ||
|
||
if len(missing) > 0: | ||
other_diff = algos.take_nd(rvals, missing, allow_fill=False) | ||
result = concat_compat((lvals, other_diff)) | ||
elif not other.is_unique and not self.is_unique: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# self and other both have duplicates | ||
result = algos.union_with_duplicates(lvals, rvals) | ||
return _maybe_try_sort(result, sort) | ||
|
||
else: | ||
result = lvals | ||
# Either other or self is not unique | ||
# find indexes of things in "other" that are not in "self" | ||
if self.is_unique: | ||
indexer = self.get_indexer(other) | ||
missing = (indexer == -1).nonzero()[0] | ||
else: | ||
missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) | ||
|
||
if len(missing) > 0: | ||
other_diff = algos.take_nd(rvals, missing, allow_fill=False) | ||
result = concat_compat((lvals, other_diff)) | ||
else: | ||
result = lvals | ||
|
||
if not self.is_monotonic or not other.is_monotonic: | ||
result = _maybe_try_sort(result, sort) | ||
|
||
return result | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -501,6 +501,81 @@ def check_intersection_commutative(left, right): | |
assert idx.intersection(idx_non_unique).is_unique | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"cls", | ||
[ | ||
Int64Index, | ||
Float64Index, | ||
DatetimeIndex, | ||
CategoricalIndex, | ||
TimedeltaIndex, | ||
lambda x: Index(x, dtype=object), | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
], | ||
) | ||
def test_union_duplicate_index_subsets_of_each_other(cls): | ||
# GH#31326 | ||
a = cls([1, 2, 2, 3]) | ||
b = cls([3, 3, 4]) | ||
expected = cls([1, 2, 2, 3, 3, 4]) | ||
if cls is CategoricalIndex: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
expected = Index([1, 2, 2, 3, 3, 4]) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result = a.union(b) | ||
tm.assert_index_equal(result, expected) | ||
result = a.union(b, sort=False) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"cls", | ||
[ | ||
Int64Index, | ||
Float64Index, | ||
DatetimeIndex, | ||
CategoricalIndex, | ||
TimedeltaIndex, | ||
lambda x: Index(x, dtype=object), | ||
], | ||
) | ||
def test_union_with_duplicate_index(cls): | ||
# GH#36289 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
a = cls([1, 0, 0]) | ||
b = cls([0, 1]) | ||
expected = cls([0, 0, 1]) | ||
|
||
result = a.union(b) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = a.union(b) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_union_duplicate_index_different_dtypes(): | ||
# GH#36289 | ||
a = Index([1, 2, 2, 3]) | ||
b = Index(["1", "0", "0"]) | ||
expected = Index([1, 2, 2, 3, "1", "0", "0"]) | ||
result = a.union(b, sort=False) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_union_same_value_duplicated_in_both(): | ||
# GH#36289 | ||
a = Index([0, 0, 1]) | ||
b = Index([0, 0, 1, 2]) | ||
result = a.union(b) | ||
expected = Index([0, 0, 1, 2]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_union_nan_in_both(): | ||
# GH#36289 | ||
a = Index([np.nan, 1, 2, 2]) | ||
b = Index([np.nan, 1, 1, 2]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it matter if the duplicated entry is nan? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No it should not, parametrized to test |
||
result = a.union(b, sort=False) | ||
expected = Index([np.nan, 1.0, 1.0, 2.0, 2.0]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
class TestSetOpsUnsorted: | ||
# These may eventually belong in a dtype-specific test_setops, or | ||
# parametrized over a more general fixture | ||
|
Uh oh!
There was an error while loading. Please reload this page.