From c13af19cfc9562fc7b4e0248e7c6c49e993ac57a Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 9 Jan 2020 23:52:55 +0800 Subject: [PATCH 01/44] ENH: Added DataFrame.differences and Series.differences (GH30429) --- pandas/core/frame.py | 141 ++++++++++++++++++ pandas/core/series.py | 120 +++++++++++++++ .../tests/frame/methods/test_differences.py | 130 ++++++++++++++++ .../tests/series/methods/test_differences.py | 85 +++++++++++ 4 files changed, 476 insertions(+) create mode 100644 pandas/tests/frame/methods/test_differences.py create mode 100644 pandas/tests/series/methods/test_differences.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 538d0feade96f..c8bbab101bea7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5304,6 +5304,147 @@ def _construct_result(self, result) -> "DataFrame": out.columns = self.columns return out + def differences(self, other, axis=1, keep_indices=False, keep_values=False): + """ + Compare to another DataFrame and show the differences. + + The axis on which to stack results and how much information to + preserve can be customized. + + Note that NaNs are considered not different from other NaNs. + + Parameters + ---------- + other : DataFrame + Object to compare with. + + axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine how the differences are stacked. + * 0, or 'index' : Stack differences on neighbouring rows. + * 1, or 'columns' : Stack differences on neighbouring columns. + + keep_indices: bool, default False + Whether to keep the rows and columns that are equal, or drop them. + + keep_values: bool, default False + Whether to keep the values that are equal, or show as NaNs. + + Returns + ------- + DataFrame + DataFrame that shows the differences stacked side by side. + + See Also + -------- + Series.differences: Show differences. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "col1": ["a", "a", "b", "b", "a"], + ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], + ... 
"col3": [1.0, 2.0, 3.0, 4.0, 5.0] + ... }, + ... columns=["col1", "col2", "col3"], + ... ) + >>> df + col1 col2 col3 + 0 a 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 3.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + >>> df2 = df.copy() + >>> df2.loc[0, 'col1'] = 'c' + >>> df2.loc[2, 'col3'] = 4.0 + >>> df2 + col1 col2 col3 + 0 c 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 4.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + Stack the differences on columns + + >>> df.differences(df2) + col1 col3 + self other self other + 0 a c NaN NaN + 2 NaN NaN 3.0 4.0 + + Stack the differences on rows + + >>> df.differences(df2, axis=0) + col1 col3 + 0 self a NaN + other c NaN + 2 self NaN 3.0 + other NaN 4.0 + + Keep all the original indices (rows and columns) + + >>> df.differences(df2, keep_indices=True) + col1 col2 col3 + self other self other self other + 0 a c NaN NaN NaN NaN + 1 NaN NaN NaN NaN NaN NaN + 2 NaN NaN NaN NaN 3.0 4.0 + 3 NaN NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN NaN + + Keep all original indices and data + + >>> df.differences(df2, keep_indices=True, keep_values=True) + col1 col2 col3 + self other self other self other + 0 a c 1.0 1.0 1.0 1.0 + 1 a a 2.0 2.0 2.0 2.0 + 2 b b 3.0 3.0 3.0 4.0 + 3 b b NaN NaN 4.0 4.0 + 4 a a 5.0 5.0 5.0 5.0 + """ + from pandas.core.reshape.concat import concat + + mask = ~((self == other) | (self.isna() & other.isna())) + keys = ["self", "other"] + + if not keep_values: + self = self.where(mask) + other = other.where(mask) + + if not keep_indices: + cmask = mask.any() + rmask = mask.any(axis=1) + self = self.loc[rmask, cmask] + other = other.loc[rmask, cmask] + + axis = self._get_axis_number(axis) + diff = concat([self, other], axis=axis, keys=keys) + + ax = diff._get_axis(axis) + ax_names = np.array(ax.names) + + # set index names to positions to avoid confusion + ax.names = np.arange(len(ax_names)) + + # bring self-other to inner level + order = list(range(1, ax.nlevels)) + [0] + diff = diff.reorder_levels(order, axis=axis) + + # restore the index names in order + 
diff._get_axis(axis=axis).names = ax_names[order] + + # reorder axis to keep things organized + indices = ( + np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + ) + diff = diff.take(indices, axis=axis) + + return diff + def combine( self, other: "DataFrame", func, fill_value=None, overwrite=True ) -> "DataFrame": diff --git a/pandas/core/series.py b/pandas/core/series.py index 446654374f37c..96a015253080d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2566,6 +2566,126 @@ def _binop(self, other, func, level=None, fill_value=None): ret = ops._construct_result(self, result, new_index, name) return ret + def differences(self, other, axis=1, keep_indices=False, keep_values=False): + """ + Compare to another Series and show the differences. + + The axis on which to stack results and how much information to + preserve can be customized. + + Note that NaNs are considered not different from other NaNs. + + Parameters + ---------- + other : Series + Object to compare with. + + axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine how the differences are stacked. + * 0, or 'index' : Stack differences on neighbouring indices. + * 1, or 'columns' : Stack differences on neighbouring columns. + + keep_indices: bool, default False + Whether to keep the indices that are equal, or drop them. + + keep_values: bool, default False + Whether to keep the values that are equal, or show as NaNs. + + Returns + ------- + Series or DataFrame + If axis is 0 or 'index' the result will be a Series. + If axis is 1 or 'columns' the result will be a DataFrame. + + See Also + -------- + DataFrame.differences: Show differences. 
+ + Examples + -------- + >>> s1 = pd.Series(["a", "b", "c", "d", "e"]) + >>> s2 = pd.Series(["a", "a", "c", "b", "e"]) + + Stack the differences on columns + + >>> s1.differences(s2) + self other + 1 b a + 3 d b + + Stack the differences on indices + + >>> s1.differences(s2, axis=0) + 1 self b + other a + 3 self d + other b + dtype: object + + Keep all the original indices + + >>> s1.differences(s2, keep_indices=True) + self other + 0 NaN NaN + 1 b a + 2 NaN NaN + 3 d b + 4 NaN NaN + + Keep all original indices and data + + >>> s1.differences(s2, keep_indices=True, keep_values=True) + self other + 0 a a + 1 b a + 2 c c + 3 d b + 4 e e + """ + from pandas.core.reshape.concat import concat + + mask = ~((self == other) | (self.isna() & other.isna())) + keys = ["self", "other"] + + if not keep_values: + self = self.where(mask) + other = other.where(mask) + + if not keep_indices: + self = self[mask] + other = other[mask] + + if axis in (1, "columns"): + axis = 1 + else: + axis = self._get_axis_number(axis) + + diff = concat([self, other], axis=axis, keys=keys) + + if axis == 1: + return diff + + ax = diff._get_axis(axis) + ax_names = np.array(ax.names) + + # set index names to positions to avoid confusion + ax.names = np.arange(len(ax_names)) + + # bring self-other to inner level + order = list(range(1, ax.nlevels)) + [0] + diff = diff.reorder_levels(order) + + # restore the index names in order + diff._get_axis(axis=axis).names = ax_names[order] + + # reorder axis to keep things organized + indices = ( + np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + ) + diff = diff.take(indices, axis=axis) + + return diff + def combine(self, other, func, fill_value=None): """ Combine the Series with a Series or scalar according to `func`. 
diff --git a/pandas/tests/frame/methods/test_differences.py b/pandas/tests/frame/methods/test_differences.py new file mode 100644 index 0000000000000..058888f8551e2 --- /dev/null +++ b/pandas/tests/frame/methods/test_differences.py @@ -0,0 +1,130 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) +def test_differences_axis(axis): + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + result = df.differences(df2, axis=axis) + + if axis in (1, "columns"): + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]], + index=indices, + columns=columns, + ) + else: + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + columns = pd.Index(["col1", "col3"]) + expected = pd.DataFrame( + [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]], + index=indices, + columns=columns, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_indices, keep_values", + [ + (True, False), + (False, True), + (True, True), + # False, False case is already covered in test_differences_axis + ], +) +def test_differences_various_formats(keep_indices, keep_values): + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + result = df.differences(df2, keep_indices=keep_indices, keep_values=keep_values) + + if keep_indices: + indices = pd.Index([0, 1, 2]) + columns = pd.MultiIndex.from_product( + [["col1", "col2", "col3"], ["self", "other"]] + ) + if keep_values: + expected = pd.DataFrame( + [ 
+ ["a", "c", 1.0, 1.0, 1.0, 1.0], + ["b", "b", 2.0, 2.0, 2.0, 2.0], + ["c", "c", np.nan, np.nan, 3.0, 4.0], + ], + index=indices, + columns=columns, + ) + else: + expected = pd.DataFrame( + [ + ["a", "c", np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0], + ], + index=indices, + columns=columns, + ) + else: + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + + +def test_differences_with_equal_nulls(): + # We want to make sure two NaNs are considered the same + # and dropped where applicable + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + + result = df.differences(df2) + indices = pd.Index([0]) + columns = pd.MultiIndex.from_product([["col1"], ["self", "other"]]) + expected = pd.DataFrame([["a", "c"]], index=indices, columns=columns) + tm.assert_frame_equal(result, expected) + + +def test_differences_with_non_equal_nulls(): + # We want to make sure the relevant NaNs do not get dropped + # even if the entire row or column are NaNs + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = np.nan + + result = df.differences(df2) + + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]], + index=indices, + columns=columns, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_differences.py 
b/pandas/tests/series/methods/test_differences.py new file mode 100644 index 0000000000000..58780b129905c --- /dev/null +++ b/pandas/tests/series/methods/test_differences.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) +def test_differences_axis(axis): + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", "z"]) + + result = s1.differences(s2, axis=axis) + + if axis in (1, "columns"): + indices = pd.Index([0, 2]) + columns = pd.Index(["self", "other"]) + expected = pd.DataFrame( + [["a", "x"], ["c", "z"]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + else: + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + expected = pd.Series(["a", "x", "c", "z"], index=indices) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_indices, keep_values", + [ + (True, False), + (False, True), + (True, True), + # False, False case is already covered in test_differences_axis + ], +) +def test_differences_various_formats(keep_indices, keep_values): + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", "z"]) + + result = s1.differences(s2, keep_indices=keep_indices, keep_values=keep_values) + + if keep_indices: + indices = pd.Index([0, 1, 2]) + columns = pd.Index(["self", "other"]) + if keep_values: + expected = pd.DataFrame( + [["a", "x"], ["b", "b"], ["c", "z"]], index=indices, columns=columns + ) + else: + expected = pd.DataFrame( + [["a", "x"], [np.nan, np.nan], ["c", "z"]], + index=indices, + columns=columns, + ) + else: + indices = pd.Index([0, 2]) + columns = pd.Index(["self", "other"]) + expected = pd.DataFrame( + [["a", "x"], ["c", "z"]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + + +def test_differences_with_equal_nulls(): + # We want to make sure two NaNs are considered the same + # and dropped where applicable + s1 = 
pd.Series(["a", "b", np.nan]) + s2 = pd.Series(["x", "b", np.nan]) + + result = s1.differences(s2) + expected = pd.DataFrame([["a", "x"]], columns=["self", "other"]) + tm.assert_frame_equal(result, expected) + + +def test_differences_with_non_equal_nulls(): + # We want to make sure the relevant NaNs do not get dropped + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", np.nan]) + + result = s1.differences(s2, axis=0) + + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + expected = pd.Series(["a", "x", "c", np.nan], index=indices) + tm.assert_series_equal(result, expected) From 8f5d0fb4370785c688fff5144c5d6697ecbc639c Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 10 Jan 2020 00:42:26 +0800 Subject: [PATCH 02/44] CLN: reformatted docstring (GH30429) --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8bbab101bea7..1cf6a38f48363 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5323,10 +5323,10 @@ def differences(self, other, axis=1, keep_indices=False, keep_values=False): * 0, or 'index' : Stack differences on neighbouring rows. * 1, or 'columns' : Stack differences on neighbouring columns. - keep_indices: bool, default False + keep_indices : bool, default False Whether to keep the rows and columns that are equal, or drop them. - keep_values: bool, default False + keep_values : bool, default False Whether to keep the values that are equal, or show as NaNs. Returns diff --git a/pandas/core/series.py b/pandas/core/series.py index 96a015253080d..9f74ffec27197 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2585,10 +2585,10 @@ def differences(self, other, axis=1, keep_indices=False, keep_values=False): * 0, or 'index' : Stack differences on neighbouring indices. * 1, or 'columns' : Stack differences on neighbouring columns. 
- keep_indices: bool, default False + keep_indices : bool, default False Whether to keep the indices that are equal, or drop them. - keep_values: bool, default False + keep_values : bool, default False Whether to keep the values that are equal, or show as NaNs. Returns From c5b793a2de275cfbc385d2e8fab17c25e0cd4a25 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 10 Jan 2020 22:16:02 +0800 Subject: [PATCH 03/44] ENH: Extracted differences() from DataFrame and Series into NDFrame --- pandas/core/frame.py | 39 ++--------------------------- pandas/core/generic.py | 56 ++++++++++++++++++++++++++++++++++++++++++ pandas/core/series.py | 44 ++------------------------------- 3 files changed, 60 insertions(+), 79 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1cf6a38f48363..5edd11a4ba889 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5406,44 +5406,9 @@ def differences(self, other, axis=1, keep_indices=False, keep_values=False): 3 b b NaN NaN 4.0 4.0 4 a a 5.0 5.0 5.0 5.0 """ - from pandas.core.reshape.concat import concat - - mask = ~((self == other) | (self.isna() & other.isna())) - keys = ["self", "other"] - - if not keep_values: - self = self.where(mask) - other = other.where(mask) - - if not keep_indices: - cmask = mask.any() - rmask = mask.any(axis=1) - self = self.loc[rmask, cmask] - other = other.loc[rmask, cmask] - - axis = self._get_axis_number(axis) - diff = concat([self, other], axis=axis, keys=keys) - - ax = diff._get_axis(axis) - ax_names = np.array(ax.names) - - # set index names to positions to avoid confusion - ax.names = np.arange(len(ax_names)) - - # bring self-other to inner level - order = list(range(1, ax.nlevels)) + [0] - diff = diff.reorder_levels(order, axis=axis) - - # restore the index names in order - diff._get_axis(axis=axis).names = ax_names[order] - - # reorder axis to keep things organized - indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + return 
super().differences( + other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values ) - diff = diff.take(indices, axis=axis) - - return diff def combine( self, other: "DataFrame", func, fill_value=None, overwrite=True diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 22655bf9889c7..6bb965655ee6f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8189,6 +8189,62 @@ def ranker(data): return ranker(data) + def differences(self, other, axis=1, keep_indices=False, keep_values=False): + from pandas.core.reshape.concat import concat + + mask = ~((self == other) | (self.isna() & other.isna())) + keys = ["self", "other"] + + if not keep_values: + self = self.where(mask) + other = other.where(mask) + + if not keep_indices: + if isinstance(self, ABCDataFrame): + cmask = mask.any() + rmask = mask.any(axis=1) + self = self.loc[rmask, cmask] + other = other.loc[rmask, cmask] + else: + self = self[mask] + other = other[mask] + + if axis in (1, "columns"): # This is needed for Series + axis = 1 + else: + axis = self._get_axis_number(axis) + + diff = concat([self, other], axis=axis, keys=keys) + + if axis >= self.ndim: + # No need to reorganize data if stacking on new axis + # This currently applies for stacking two Series on columns + return diff + + ax = diff._get_axis(axis) + ax_names = np.array(ax.names) + + # set index names to positions to avoid confusion + ax.names = np.arange(len(ax_names)) + + # bring self-other to inner level + order = list(range(1, ax.nlevels)) + [0] + if isinstance(diff, ABCDataFrame): + diff = diff.reorder_levels(order, axis=axis) + else: + diff = diff.reorder_levels(order) + + # restore the index names in order + diff._get_axis(axis=axis).names = ax_names[order] + + # reorder axis to keep things organized + indices = ( + np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + ) + diff = diff.take(indices, axis=axis) + + return diff + _shared_docs[ "align" ] = """ diff --git 
a/pandas/core/series.py b/pandas/core/series.py index 9f74ffec27197..791c2ab4839de 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2642,49 +2642,9 @@ def differences(self, other, axis=1, keep_indices=False, keep_values=False): 3 d b 4 e e """ - from pandas.core.reshape.concat import concat - - mask = ~((self == other) | (self.isna() & other.isna())) - keys = ["self", "other"] - - if not keep_values: - self = self.where(mask) - other = other.where(mask) - - if not keep_indices: - self = self[mask] - other = other[mask] - - if axis in (1, "columns"): - axis = 1 - else: - axis = self._get_axis_number(axis) - - diff = concat([self, other], axis=axis, keys=keys) - - if axis == 1: - return diff - - ax = diff._get_axis(axis) - ax_names = np.array(ax.names) - - # set index names to positions to avoid confusion - ax.names = np.arange(len(ax_names)) - - # bring self-other to inner level - order = list(range(1, ax.nlevels)) + [0] - diff = diff.reorder_levels(order) - - # restore the index names in order - diff._get_axis(axis=axis).names = ax_names[order] - - # reorder axis to keep things organized - indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + return super().differences( + other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values ) - diff = diff.take(indices, axis=axis) - - return diff def combine(self, other, func, fill_value=None): """ From d22e21a11752c97a4cd797867b7f3ca8f7f073e9 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 12:09:15 +0800 Subject: [PATCH 04/44] ENH: organized docstring using _shared_doc and reduced duplicates (GH30429) --- pandas/core/frame.py | 180 +++++++++++++++++++---------------------- pandas/core/generic.py | 32 +++++++- pandas/core/series.py | 130 +++++++++++++---------------- 3 files changed, 168 insertions(+), 174 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f66ae8e75638..936b089b7ea82 100644 --- 
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5365,108 +5365,90 @@ def _construct_result(self, result) -> "DataFrame": out.columns = self.columns return out - def differences(self, other, axis=1, keep_indices=False, keep_values=False): + @Appender( """ - Compare to another DataFrame and show the differences. - - The axis on which to stack results and how much information to - preserve can be customized. - - Note that NaNs are considered not different from other NaNs. - - Parameters - ---------- - other : DataFrame - Object to compare with. - - axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine how the differences are stacked. - * 0, or 'index' : Stack differences on neighbouring rows. - * 1, or 'columns' : Stack differences on neighbouring columns. - - keep_indices : bool, default False - Whether to keep the rows and columns that are equal, or drop them. - - keep_values : bool, default False - Whether to keep the values that are equal, or show as NaNs. - - Returns - ------- - DataFrame - DataFrame that shows the differences stacked side by side. +Returns +------- +DataFrame + DataFrame that shows the differences stacked side by side. - See Also - -------- - Series.differences: Show differences. +See Also +-------- +Series.differences: Show differences. - Examples - -------- - >>> df = pd.DataFrame( - ... { - ... "col1": ["a", "a", "b", "b", "a"], - ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], - ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] - ... }, - ... columns=["col1", "col2", "col3"], - ... 
) - >>> df - col1 col2 col3 - 0 a 1.0 1.0 - 1 a 2.0 2.0 - 2 b 3.0 3.0 - 3 b NaN 4.0 - 4 a 5.0 5.0 - - >>> df2 = df.copy() - >>> df2.loc[0, 'col1'] = 'c' - >>> df2.loc[2, 'col3'] = 4.0 - >>> df2 - col1 col2 col3 - 0 c 1.0 1.0 - 1 a 2.0 2.0 - 2 b 3.0 4.0 - 3 b NaN 4.0 - 4 a 5.0 5.0 - - Stack the differences on columns - - >>> df.differences(df2) - col1 col3 - self other self other - 0 a c NaN NaN - 2 NaN NaN 3.0 4.0 - - Stack the differences on rows - - >>> df.differences(df2, axis=0) - col1 col3 - 0 self a NaN - other c NaN - 2 self NaN 3.0 - other NaN 4.0 - - Keep all the original indices (rows and columns) - - >>> df.differences(df2, keep_indices=True) - col1 col2 col3 - self other self other self other - 0 a c NaN NaN NaN NaN - 1 NaN NaN NaN NaN NaN NaN - 2 NaN NaN NaN NaN 3.0 4.0 - 3 NaN NaN NaN NaN NaN NaN - 4 NaN NaN NaN NaN NaN NaN - - Keep all original indices and data - - >>> df.differences(df2, keep_indices=True, keep_values=True) - col1 col2 col3 - self other self other self other - 0 a c 1.0 1.0 1.0 1.0 - 1 a a 2.0 2.0 2.0 2.0 - 2 b b 3.0 3.0 3.0 4.0 - 3 b b NaN NaN 4.0 4.0 - 4 a a 5.0 5.0 5.0 5.0 - """ +Examples +-------- +>>> df = pd.DataFrame( +... { +... "col1": ["a", "a", "b", "b", "a"], +... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], +... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] +... }, +... columns=["col1", "col2", "col3"], +... 
) +>>> df + col1 col2 col3 +0 a 1.0 1.0 +1 a 2.0 2.0 +2 b 3.0 3.0 +3 b NaN 4.0 +4 a 5.0 5.0 + +>>> df2 = df.copy() +>>> df2.loc[0, 'col1'] = 'c' +>>> df2.loc[2, 'col3'] = 4.0 +>>> df2 + col1 col2 col3 +0 c 1.0 1.0 +1 a 2.0 2.0 +2 b 3.0 4.0 +3 b NaN 4.0 +4 a 5.0 5.0 + +Stack the differences on columns + +>>> df.differences(df2) + col1 col3 + self other self other +0 a c NaN NaN +2 NaN NaN 3.0 4.0 + +Stack the differences on rows + +>>> df.differences(df2, axis=0) + col1 col3 +0 self a NaN + other c NaN +2 self NaN 3.0 + other NaN 4.0 + +Keep all the original indices (rows and columns) + +>>> df.differences(df2, keep_indices=True) + col1 col2 col3 + self other self other self other +0 a c NaN NaN NaN NaN +1 NaN NaN NaN NaN NaN NaN +2 NaN NaN NaN NaN 3.0 4.0 +3 NaN NaN NaN NaN NaN NaN +4 NaN NaN NaN NaN NaN NaN + +Keep all original indices and data + +>>> df.differences(df2, keep_indices=True, keep_values=True) + col1 col2 col3 + self other self other self other +0 a c 1.0 1.0 1.0 1.0 +1 a a 2.0 2.0 2.0 2.0 +2 b b 3.0 3.0 3.0 4.0 +3 b b NaN NaN 4.0 4.0 +4 a a 5.0 5.0 5.0 5.0 +""" + ) + @Appender(_shared_docs["differences"] % _shared_doc_kwargs) + def differences( + self, other, axis=1, keep_indices=False, keep_values=False + ) -> "DataFrame": return super().differences( other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 58e8b813163c6..537f4fd8d3c9f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8103,7 +8103,37 @@ def ranker(data): return ranker(data) - def differences(self, other, axis=1, keep_indices=False, keep_values=False): + _shared_docs[ + "differences" + ] = """ + Compare to another %(klass)s and show the differences. + + The axis on which to stack results and how much information to + preserve can be customized. + + Note that NaNs are considered not different from other NaNs. 
+ + Parameters + ---------- + other : %(klass)s + Object to compare with. + + axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine how the differences are stacked. + * 0, or 'index' : Stack differences on neighbouring rows. + * 1, or 'columns' : Stack differences on neighbouring columns. + + keep_indices : bool, default False + Whether to keep the rows and columns that are equal, or drop them. + + keep_values : bool, default False + Whether to keep the values that are equal, or show as NaNs. + """ + + @Appender(_shared_docs["differences"] % _shared_doc_kwargs) + def differences( + self, other, axis=1, keep_indices=False, keep_values=False + ) -> FrameOrSeries: from pandas.core.reshape.concat import concat mask = ~((self == other) | (self.isna() & other.isna())) diff --git a/pandas/core/series.py b/pandas/core/series.py index d5904103fc758..08caf5b0c62af 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,7 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs -from pandas._typing import Label +from pandas._typing import Label, FrameOrSeries from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -2555,82 +2555,64 @@ def _binop(self, other, func, level=None, fill_value=None): ret = ops._construct_result(self, result, new_index, name) return ret - def differences(self, other, axis=1, keep_indices=False, keep_values=False): + @Appender( """ - Compare to another Series and show the differences. - - The axis on which to stack results and how much information to - preserve can be customized. - - Note that NaNs are considered not different from other NaNs. - - Parameters - ---------- - other : Series - Object to compare with. - - axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine how the differences are stacked. 
- * 0, or 'index' : Stack differences on neighbouring indices. - * 1, or 'columns' : Stack differences on neighbouring columns. - - keep_indices : bool, default False - Whether to keep the indices that are equal, or drop them. - - keep_values : bool, default False - Whether to keep the values that are equal, or show as NaNs. - - Returns - ------- - Series or DataFrame - If axis is 0 or 'index' the result will be a Series. - If axis is 1 or 'columns' the result will be a DataFrame. - - See Also - -------- - DataFrame.differences: Show differences. +Returns +------- +Series or DataFrame + If axis is 0 or 'index' the result will be a Series. + If axis is 1 or 'columns' the result will be a DataFrame. - Examples - -------- - >>> s1 = pd.Series(["a", "b", "c", "d", "e"]) - >>> s2 = pd.Series(["a", "a", "c", "b", "e"]) - - Stack the differences on columns - - >>> s1.differences(s2) - self other - 1 b a - 3 d b - - Stack the differences on indices - - >>> s1.differences(s2, axis=0) - 1 self b - other a - 3 self d - other b - dtype: object - - Keep all the original indices - - >>> s1.differences(s2, keep_indices=True) - self other - 0 NaN NaN - 1 b a - 2 NaN NaN - 3 d b - 4 NaN NaN - - Keep all original indices and data +See Also +-------- +DataFrame.differences: Show differences. 
- >>> s1.differences(s2, keep_indices=True, keep_values=True) - self other - 0 a a - 1 b a - 2 c c - 3 d b - 4 e e - """ +Examples +-------- +>>> s1 = pd.Series(["a", "b", "c", "d", "e"]) +>>> s2 = pd.Series(["a", "a", "c", "b", "e"]) + +Stack the differences on columns + +>>> s1.differences(s2) + self other +1 b a +3 d b + +Stack the differences on indices + +>>> s1.differences(s2, axis=0) +1 self b + other a +3 self d + other b +dtype: object + +Keep all the original indices + +>>> s1.differences(s2, keep_indices=True) + self other +0 NaN NaN +1 b a +2 NaN NaN +3 d b +4 NaN NaN + +Keep all original indices and data + +>>> s1.differences(s2, keep_indices=True, keep_values=True) + self other +0 a a +1 b a +2 c c +3 d b +4 e e +""" + ) + @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) + def differences( + self, other, axis=1, keep_indices=False, keep_values=False + ) -> FrameOrSeries: return super().differences( other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values ) From 83f31df165040a948706dc21518f4f4c30677794 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 12:12:32 +0800 Subject: [PATCH 05/44] ENH: added argument type indication (GH30429) --- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 936b089b7ea82..31f601c0458f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5447,7 +5447,7 @@ def _construct_result(self, result) -> "DataFrame": ) @Appender(_shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other, axis=1, keep_indices=False, keep_values=False + self, other: "DataFrame", axis=1, keep_indices=False, keep_values=False ) -> "DataFrame": return super().differences( other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values diff --git a/pandas/core/series.py b/pandas/core/series.py index 08caf5b0c62af..c1afa9b6d01f1 100644 --- 
a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2611,7 +2611,7 @@ def _binop(self, other, func, level=None, fill_value=None): ) @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other, axis=1, keep_indices=False, keep_values=False + self, other: "Series", axis=1, keep_indices=False, keep_values=False ) -> FrameOrSeries: return super().differences( other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values From 488c8a89ca5494bb9d5dcdfc35f2e1b6cb9bd1c9 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 12:57:18 +0800 Subject: [PATCH 06/44] ENH: reordered imports (GH30429) --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c1afa9b6d01f1..3b03eebb2dfcf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,7 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs -from pandas._typing import Label, FrameOrSeries +from pandas._typing import FrameOrSeries, Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile From 322ff20fcd195b194e0ef8051741bad13ccccaa1 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 13:48:27 +0800 Subject: [PATCH 07/44] ENH: removed inconsistent type indication (GH30429) --- pandas/core/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 537f4fd8d3c9f..7e44d5dec5158 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8131,9 +8131,7 @@ def ranker(data): """ @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences( - self, other, axis=1, keep_indices=False, keep_values=False - ) -> FrameOrSeries: + def differences(self, other, axis=1, 
keep_indices=False, keep_values=False): from pandas.core.reshape.concat import concat mask = ~((self == other) | (self.isna() & other.isna())) From e50172c01c0d9941c6701aa11f4b237132bf336d Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 30 Jan 2020 23:11:59 +0800 Subject: [PATCH 08/44] ENH: Added whatsnew entry (GH30429) --- doc/source/whatsnew/v1.1.0.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..bb43508e2c10d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -36,6 +36,31 @@ For example: ser["2014"] ser.loc["May 2015"] +.. _whatsnew_100.differences: + +Comparing two `DataFrame` or two `Series` and summarizing the differences +^^^^^^^^^^^^^^^^^^^^^^ + +We've added :meth:`~DataFrame.differences` and :meth:`~Series.differences` for comparing two `DataFrame`s or two `Series`s (:issue:`30429`) + +.. ipython:: python + + df = pd.DataFrame( + { + "col1": ["a", "a", "b", "b", "a"], + "col2": [1.0, 2.0, 3.0, np.nan, 5.0], + "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + columns=["col1", "col2", "col3"], + ) + df + df2 = df.copy() + df2.loc[0, 'col1'] = 'c' + df2.loc[2, 'col3'] = 4.0 + df2 + df.differences(df2) + + .. _whatsnew_110.enhancements.other: Other enhancements From 4a82bec7a2ab602050d1f909c3f58176790f96b2 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 30 Jan 2020 23:12:58 +0800 Subject: [PATCH 09/44] ENH: Minor correction in whatsnew entry (GH30429) --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bb43508e2c10d..b794d7375ca86 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -36,7 +36,7 @@ For example: ser["2014"] ser.loc["May 2015"] -.. _whatsnew_100.differences: +.. 
_whatsnew_110.differences: Comparing two `DataFrame` or two `Series` and summarizing the differences ^^^^^^^^^^^^^^^^^^^^^^ From b2849ed41e757ffdbe63540113be921daaa70904 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 30 Jan 2020 23:34:48 +0800 Subject: [PATCH 10/44] ENH: Minor correction in whatsnew entry (GH30429) --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b794d7375ca86..2e17e8b941f5f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -41,7 +41,7 @@ For example: Comparing two `DataFrame` or two `Series` and summarizing the differences ^^^^^^^^^^^^^^^^^^^^^^ -We've added :meth:`~DataFrame.differences` and :meth:`~Series.differences` for comparing two `DataFrame`s or two `Series`s (:issue:`30429`) +We've added :meth:`~DataFrame.differences` and :meth:`~Series.differences` for comparing two `DataFrame` or two `Series` (:issue:`30429`) .. ipython:: python From ff7a57205fd3ab3021af75efdbc5819c8d0d8fc4 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 31 Jan 2020 09:49:16 +0800 Subject: [PATCH 11/44] ENH: Correction in whatsnew entry (GH30429) --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2e17e8b941f5f..2a85a55fcae91 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -39,9 +39,9 @@ For example: .. 
_whatsnew_110.differences: Comparing two `DataFrame` or two `Series` and summarizing the differences -^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :meth:`~DataFrame.differences` and :meth:`~Series.differences` for comparing two `DataFrame` or two `Series` (:issue:`30429`) +We've added :meth:`DataFrame.differences` and :meth:`Series.differences` for comparing two `DataFrame` or two `Series` (:issue:`30429`) .. ipython:: python From bc969e8ec787f3671003a494d116c290d8477d87 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Mon, 10 Feb 2020 11:50:09 +0800 Subject: [PATCH 12/44] ENH: updated whatsnew (GH31200) --- doc/source/whatsnew/v1.1.0.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2a85a55fcae91..02b9ea9fc13cd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -53,10 +53,13 @@ We've added :meth:`DataFrame.differences` and :meth:`Series.differences` for com }, columns=["col1", "col2", "col3"], ) - df df2 = df.copy() df2.loc[0, 'col1'] = 'c' df2.loc[2, 'col3'] = 4.0 + +.. ipython:: python + + df df2 df.differences(df2) From 26c6ca6633a2009d8ac38853e86a43e987d8ea37 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Mon, 10 Feb 2020 11:50:28 +0800 Subject: [PATCH 13/44] ENH: added doc references (GH31200) --- doc/source/reference/frame.rst | 3 ++- doc/source/reference/series.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index c7b1cc1c832be..3ef881868f0f6 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -239,7 +239,7 @@ Reshaping, sorting, transposing DataFrame.T DataFrame.transpose -Combining / joining / merging +Combining / comparing / joining / merging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autosummary:: :toctree: api/ @@ -249,6 +249,7 @@ Combining / joining / merging DataFrame.join DataFrame.merge DataFrame.update + DataFrame.differences Time series-related ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 1a69fa076dbf0..6ae2640ca7cbe 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -240,7 +240,7 @@ Reshaping, sorting Series.squeeze Series.view -Combining / joining / merging +Combining / comparing / joining / merging ----------------------------- .. autosummary:: :toctree: api/ @@ -248,6 +248,7 @@ Combining / joining / merging Series.append Series.replace Series.update + Series.differences Time series-related ------------------- From 5fb2edceda2967a3d578e93880c29d8d8a018d30 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Mon, 10 Feb 2020 12:26:20 +0800 Subject: [PATCH 14/44] DOC: fixed formatting issue in doc references --- doc/source/reference/frame.rst | 2 +- doc/source/reference/series.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 3ef881868f0f6..d0ad3e0a102e1 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -240,7 +240,7 @@ Reshaping, sorting, transposing DataFrame.transpose Combining / comparing / joining / merging -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 6ae2640ca7cbe..8ac2052b5c693 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -241,7 +241,7 @@ Reshaping, sorting Series.view Combining / comparing / joining / merging ------------------------------ +----------------------------------------- .. 
autosummary:: :toctree: api/ From 35ccb5f334bd798cd5ae9144f5b01c1e67df8889 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 25 Feb 2020 23:03:31 +0800 Subject: [PATCH 15/44] updated parameter names, docstring, and relevant tests (GH30429) --- pandas/core/frame.py | 11 +++++--- pandas/core/generic.py | 25 ++++++++----------- pandas/core/series.py | 15 ++++++++--- .../tests/frame/methods/test_differences.py | 10 ++++---- .../tests/series/methods/test_differences.py | 10 ++++---- 5 files changed, 40 insertions(+), 31 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e53cf06c7b54..81d9dbdc135db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5150,12 +5150,17 @@ def _construct_result(self, result) -> "DataFrame": Returns ------- DataFrame - DataFrame that shows the differences stacked side by side. + DataFrame that shows the differences stacked side by side. + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. See Also -------- Series.differences: Show differences. +Notes +----- +NaNs are considered equal to other NaNs. + Examples -------- >>> df = pd.DataFrame( @@ -5227,10 +5232,10 @@ def _construct_result(self, result) -> "DataFrame": ) @Appender(_shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "DataFrame", axis=1, keep_indices=False, keep_values=False + self, other: "DataFrame", axis=1, keep_shape=False, keep_equal=False ) -> "DataFrame": return super().differences( - other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values + other=other, axis=axis, keep_shape=keep_shape, keep_equal=keep_equal ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e572eb7382590..2f8d6eb5b033f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8153,11 +8153,6 @@ def ranker(data): ] = """ Compare to another %(klass)s and show the differences. 
- The axis on which to stack results and how much information to - preserve can be customized. - - Note that NaNs are considered not different from other NaNs. - Parameters ---------- other : %(klass)s @@ -8165,28 +8160,30 @@ def ranker(data): axis : {0 or 'index', 1 or 'columns'}, default 1 Determine how the differences are stacked. - * 0, or 'index' : Stack differences on neighbouring rows. - * 1, or 'columns' : Stack differences on neighbouring columns. + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are stacked horizontally + with columns drawn alternately from self and other. - keep_indices : bool, default False - Whether to keep the rows and columns that are equal, or drop them. + keep_shape : bool, default False + If true, all rows and columns are kept. Otherwise only the different ones are kept. - keep_values : bool, default False - Whether to keep the values that are equal, or show as NaNs. + keep_equal : bool, default False + If true, the result keeps values that are equal. Otherwise they are shown as NaNs. 
""" @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences(self, other, axis=1, keep_indices=False, keep_values=False): + def differences(self, other, axis=1, keep_shape=False, keep_equal=False): from pandas.core.reshape.concat import concat mask = ~((self == other) | (self.isna() & other.isna())) keys = ["self", "other"] - if not keep_values: + if not keep_equal: self = self.where(mask) other = other.where(mask) - if not keep_indices: + if not keep_shape: if isinstance(self, ABCDataFrame): cmask = mask.any() rmask = mask.any(axis=1) diff --git a/pandas/core/series.py b/pandas/core/series.py index e278ab4b318ca..304de8382debd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2579,13 +2579,20 @@ def _binop(self, other, func, level=None, fill_value=None): Returns ------- Series or DataFrame - If axis is 0 or 'index' the result will be a Series. - If axis is 1 or 'columns' the result will be a DataFrame. + If axis is 0 or 'index' the result will be a Series. + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. + + If axis is 1 or 'columns' the result will be a DataFrame. + Its will have two columns namely 'self' and 'other'. See Also -------- DataFrame.differences: Show differences. +Notes +----- +NaNs are considered equal to other NaNs. 
+ Examples -------- >>> s1 = pd.Series(["a", "b", "c", "d", "e"]) @@ -2630,10 +2637,10 @@ def _binop(self, other, func, level=None, fill_value=None): ) @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "Series", axis=1, keep_indices=False, keep_values=False + self, other: "Series", axis=1, keep_shape=False, keep_equal=False ) -> FrameOrSeries: return super().differences( - other=other, axis=axis, keep_indices=keep_indices, keep_values=keep_values + other=other, axis=axis, keep_shape=keep_shape, keep_equal=keep_equal ) def combine(self, other, func, fill_value=None) -> "Series": diff --git a/pandas/tests/frame/methods/test_differences.py b/pandas/tests/frame/methods/test_differences.py index 058888f8551e2..aa2f3d7b11a2a 100644 --- a/pandas/tests/frame/methods/test_differences.py +++ b/pandas/tests/frame/methods/test_differences.py @@ -37,7 +37,7 @@ def test_differences_axis(axis): @pytest.mark.parametrize( - "keep_indices, keep_values", + "keep_shape, keep_equal", [ (True, False), (False, True), @@ -45,7 +45,7 @@ def test_differences_axis(axis): # False, False case is already covered in test_differences_axis ], ) -def test_differences_various_formats(keep_indices, keep_values): +def test_differences_various_formats(keep_shape, keep_equal): df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, columns=["col1", "col2", "col3"], @@ -54,14 +54,14 @@ def test_differences_various_formats(keep_indices, keep_values): df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 - result = df.differences(df2, keep_indices=keep_indices, keep_values=keep_values) + result = df.differences(df2, keep_shape=keep_shape, keep_equal=keep_equal) - if keep_indices: + if keep_shape: indices = pd.Index([0, 1, 2]) columns = pd.MultiIndex.from_product( [["col1", "col2", "col3"], ["self", "other"]] ) - if keep_values: + if keep_equal: expected = pd.DataFrame( [ ["a", "c", 1.0, 1.0, 1.0, 1.0], diff --git 
a/pandas/tests/series/methods/test_differences.py b/pandas/tests/series/methods/test_differences.py index 58780b129905c..c6d1b3ed1a65c 100644 --- a/pandas/tests/series/methods/test_differences.py +++ b/pandas/tests/series/methods/test_differences.py @@ -26,7 +26,7 @@ def test_differences_axis(axis): @pytest.mark.parametrize( - "keep_indices, keep_values", + "keep_shape, keep_equal", [ (True, False), (False, True), @@ -34,16 +34,16 @@ def test_differences_axis(axis): # False, False case is already covered in test_differences_axis ], ) -def test_differences_various_formats(keep_indices, keep_values): +def test_differences_various_formats(keep_shape, keep_equal): s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", "z"]) - result = s1.differences(s2, keep_indices=keep_indices, keep_values=keep_values) + result = s1.differences(s2, keep_shape=keep_shape, keep_equal=keep_equal) - if keep_indices: + if keep_shape: indices = pd.Index([0, 1, 2]) columns = pd.Index(["self", "other"]) - if keep_values: + if keep_equal: expected = pd.DataFrame( [["a", "x"], ["b", "b"], ["c", "z"]], index=indices, columns=columns ) From 586e37c56b3b49c0c110dd054aa5353d828fc47a Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 25 Feb 2020 23:15:02 +0800 Subject: [PATCH 16/44] added doc-string tests (GH30429) --- ci/code_checks.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e2dc543360a62..10ff7085d5b81 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -259,17 +259,18 @@ fi if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests frame.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/frame.py + pytest -q --doctest-modules pandas/core/frame.py \ + -k"-differences" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests series.py' ; echo $MSG pytest -q --doctest-modules pandas/core/series.py \ - -k"-nonzero -reindex -searchsorted -to_dict" + -k"-differences -nonzero -reindex -searchsorted 
-to_dict" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests generic.py' ; echo $MSG pytest -q --doctest-modules pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" + -k"-_set_axis_name -_xs -describe -differences -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_clipboard -to_json -transpose -values -xs" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests groupby.py' ; echo $MSG From d13db2fe0e251f37fe8b23af6f191cc2fa56473b Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 25 Feb 2020 23:28:33 +0800 Subject: [PATCH 17/44] fixed some PEP8 issues in doc-strings (GH30429) --- pandas/core/frame.py | 6 ++++-- pandas/core/generic.py | 10 ++++++---- pandas/core/series.py | 10 ++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 81d9dbdc135db..2cdc9dcee5aba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5150,8 +5150,10 @@ def _construct_result(self, result) -> "DataFrame": Returns ------- DataFrame - DataFrame that shows the differences stacked side by side. - The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. + DataFrame that shows the differences stacked side by side. + + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. See Also -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2f8d6eb5b033f..5ac1f33d7f25b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8160,16 +8160,18 @@ def ranker(data): axis : {0 or 'index', 1 or 'columns'}, default 1 Determine how the differences are stacked. - * 0, or 'index' : Resulting differences are stacked vertically + * 0, or 'index' : Resulting differences are stacked vertically with rows drawn alternately from self and other. 
- * 1, or 'columns' : Resulting differences are stacked horizontally + * 1, or 'columns' : Resulting differences are stacked horizontally with columns drawn alternately from self and other. keep_shape : bool, default False - If true, all rows and columns are kept. Otherwise only the different ones are kept. + If true, all rows and columns are kept. + Otherwise, only the different ones are kept. keep_equal : bool, default False - If true, the result keeps values that are equal. Otherwise they are shown as NaNs. + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. """ @Appender(_shared_docs["differences"] % _shared_doc_kwargs) diff --git a/pandas/core/series.py b/pandas/core/series.py index 304de8382debd..9f8647022d528 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2579,10 +2579,12 @@ def _binop(self, other, func, level=None, fill_value=None): Returns ------- Series or DataFrame - If axis is 0 or 'index' the result will be a Series. - The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. - - If axis is 1 or 'columns' the result will be a DataFrame. + If axis is 0 or 'index' the result will be a Series. + + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. + + If axis is 1 or 'columns' the result will be a DataFrame. Its will have two columns namely 'self' and 'other'. 
See Also From 534220875f8c03ebd6e005e4c7c067099eafe62d Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 26 Feb 2020 09:24:03 +0800 Subject: [PATCH 18/44] removed trailing spaces in doc-strings (GH30429) --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2cdc9dcee5aba..858ee18c28a6e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5152,7 +5152,7 @@ def _construct_result(self, result) -> "DataFrame": DataFrame DataFrame that shows the differences stacked side by side. - The resulting index will be a MultiIndex with 'self' and 'other' + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. See Also diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5ac1f33d7f25b..0f463724716b6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8166,11 +8166,11 @@ def ranker(data): with columns drawn alternately from self and other. keep_shape : bool, default False - If true, all rows and columns are kept. + If true, all rows and columns are kept. Otherwise, only the different ones are kept. keep_equal : bool, default False - If true, the result keeps values that are equal. + If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs. """ From 77b1c9e217aff2cab0f67d6ba0b79d6fa8fdd9cf Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 26 Feb 2020 20:29:18 +0800 Subject: [PATCH 19/44] fixed sphinx identation issues in doc-strings (GH30429) --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f463724716b6..4bcbe8edf2805 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8167,11 +8167,11 @@ def ranker(data): keep_shape : bool, default False If true, all rows and columns are kept. 
- Otherwise, only the different ones are kept. + Otherwise, only the different ones are kept. keep_equal : bool, default False If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. + Otherwise, equal values are shown as NaNs. """ @Appender(_shared_docs["differences"] % _shared_doc_kwargs) From 51ffe0ed28a6511a50bf2f43d63d2eca16388b6b Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 26 Feb 2020 21:39:33 +0800 Subject: [PATCH 20/44] sphinx identation issues in doc-strings (GH30429) --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/series.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 858ee18c28a6e..91154f52315ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5151,7 +5151,7 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame DataFrame that shows the differences stacked side by side. - + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4bcbe8edf2805..0f463724716b6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8167,11 +8167,11 @@ def ranker(data): keep_shape : bool, default False If true, all rows and columns are kept. - Otherwise, only the different ones are kept. + Otherwise, only the different ones are kept. keep_equal : bool, default False If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. + Otherwise, equal values are shown as NaNs. 
""" @Appender(_shared_docs["differences"] % _shared_doc_kwargs) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9f8647022d528..259eabb67d914 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2580,12 +2580,12 @@ def _binop(self, other, func, level=None, fill_value=None): ------- Series or DataFrame If axis is 0 or 'index' the result will be a Series. - + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. - + If axis is 1 or 'columns' the result will be a DataFrame. - Its will have two columns namely 'self' and 'other'. + It will have two columns namely 'self' and 'other'. See Also -------- From 827b69cbccb07f825f8cc81a3cbc8eb70092c73f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 26 Feb 2020 22:27:40 +0800 Subject: [PATCH 21/44] sphinx identation issues in doc-strings (GH30429) --- pandas/core/frame.py | 5 ++--- pandas/core/series.py | 8 +++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6289cf703b55..f68edfaf9c6b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5251,9 +5251,8 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame DataFrame that shows the differences stacked side by side. - - The resulting index will be a MultiIndex with 'self' and 'other' - stacked alternately at the inner level. + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. See Also -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 259eabb67d914..c6ca89b75b279 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2580,12 +2580,10 @@ def _binop(self, other, func, level=None, fill_value=None): ------- Series or DataFrame If axis is 0 or 'index' the result will be a Series. - - The resulting index will be a MultiIndex with 'self' and 'other' - stacked alternately at the inner level. 
- + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. If axis is 1 or 'columns' the result will be a DataFrame. - It will have two columns namely 'self' and 'other'. + It will have two columns namely 'self' and 'other'. See Also -------- From 53918a571a2b1a172336c0f38bf6cb27e28cc14c Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Wed, 26 Feb 2020 23:45:56 +0800 Subject: [PATCH 22/44] Update pandas/core/frame.py minor changes in docstring Co-Authored-By: William Ayd --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f68edfaf9c6b1..a1ad127542002 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5256,7 +5256,7 @@ def _construct_result(self, result) -> "DataFrame": See Also -------- -Series.differences: Show differences. +Series.differences : Show differences. Notes ----- From 110f1387613b49b88e6e43a353fae067dc142264 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Wed, 26 Feb 2020 23:46:08 +0800 Subject: [PATCH 23/44] Update pandas/core/series.py minor changes in docstring Co-Authored-By: William Ayd --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c6ca89b75b279..9e7c7b53da8e9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2587,7 +2587,7 @@ def _binop(self, other, func, level=None, fill_value=None): See Also -------- -DataFrame.differences: Show differences. +DataFrame.differences : Show differences. 
Notes ----- From acd51e010f08638d89d6f2a47262d2b80bd9d7fa Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 27 Feb 2020 19:47:23 +0800 Subject: [PATCH 24/44] attempt to fix sphinx identation issues in doc-strings (GH30429) --- pandas/core/frame.py | 5 +++-- pandas/core/generic.py | 1 + pandas/core/series.py | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1ad127542002..a0d8729543a19 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5251,8 +5251,9 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame DataFrame that shows the differences stacked side by side. - The resulting index will be a MultiIndex with 'self' and 'other' - stacked alternately at the inner level. + + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. See Also -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f463724716b6..b955a11b6822b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8160,6 +8160,7 @@ def ranker(data): axis : {0 or 'index', 1 or 'columns'}, default 1 Determine how the differences are stacked. + * 0, or 'index' : Resulting differences are stacked vertically with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are stacked horizontally diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e7c7b53da8e9..3de8ebfaa91f5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2580,10 +2580,11 @@ def _binop(self, other, func, level=None, fill_value=None): ------- Series or DataFrame If axis is 0 or 'index' the result will be a Series. - The resulting index will be a MultiIndex with 'self' and 'other' - stacked alternately at the inner level. + The resulting index will be a MultiIndex with 'self' and 'other' + stacked alternately at the inner level. + If axis is 1 or 'columns' the result will be a DataFrame. 
- It will have two columns namely 'self' and 'other'. + It will have two columns namely 'self' and 'other'. See Also -------- From 1ef31c947156a334b6496514da72290dbb189ad5 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 27 Feb 2020 20:20:52 +0800 Subject: [PATCH 25/44] removed trailing spaces in doc-strings (GH30429) --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/series.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a0d8729543a19..9ea057dfb7d44 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5251,7 +5251,7 @@ def _construct_result(self, result) -> "DataFrame": ------- DataFrame DataFrame that shows the differences stacked side by side. - + The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b955a11b6822b..e6130f244ebe4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8160,7 +8160,7 @@ def ranker(data): axis : {0 or 'index', 1 or 'columns'}, default 1 Determine how the differences are stacked. - + * 0, or 'index' : Resulting differences are stacked vertically with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are stacked horizontally diff --git a/pandas/core/series.py b/pandas/core/series.py index 3de8ebfaa91f5..823d5d5d78284 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2582,7 +2582,7 @@ def _binop(self, other, func, level=None, fill_value=None): If axis is 0 or 'index' the result will be a Series. The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. - + If axis is 1 or 'columns' the result will be a DataFrame. It will have two columns namely 'self' and 'other'. 
From 3bc7485a9c4eb0bc470b815b135ddd8a600376e0 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 10 Mar 2020 10:47:29 +0800 Subject: [PATCH 26/44] removed unintended changes in ci/code_checks (GH30429) --- ci/code_checks.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 10ff7085d5b81..e2dc543360a62 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -259,18 +259,17 @@ fi if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests frame.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/frame.py \ - -k"-differences" + pytest -q --doctest-modules pandas/core/frame.py RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests series.py' ; echo $MSG pytest -q --doctest-modules pandas/core/series.py \ - -k"-differences -nonzero -reindex -searchsorted -to_dict" + -k"-nonzero -reindex -searchsorted -to_dict" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests generic.py' ; echo $MSG pytest -q --doctest-modules pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -differences -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_clipboard -to_json -transpose -values -xs" + -k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests groupby.py' ; echo $MSG From 06ed216570ea46afda2d69c7a2591632a6d5acaf Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 10 Mar 2020 11:10:45 +0800 Subject: [PATCH 27/44] corrected errors in docstring (GH30429) --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb4797aa39cbf..89b3e92405d4e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5312,7 +5312,7 @@ def _construct_result(self, result) -> "DataFrame": Keep all the original indices (rows and columns) ->>> 
df.differences(df2, keep_indices=True) +>>> df.differences(df2, keep_shape=True) col1 col2 col3 self other self other self other 0 a c NaN NaN NaN NaN @@ -5323,7 +5323,7 @@ def _construct_result(self, result) -> "DataFrame": Keep all original indices and data ->>> df.differences(df2, keep_indices=True, keep_values=True) +>>> df.differences(df2, keep_shape=True, keep_equal=True) col1 col2 col3 self other self other self other 0 a c 1.0 1.0 1.0 1.0 diff --git a/pandas/core/series.py b/pandas/core/series.py index 09e6e1f9d5eaa..39e659f599717 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2619,7 +2619,7 @@ def _binop(self, other, func, level=None, fill_value=None): Keep all the original indices ->>> s1.differences(s2, keep_indices=True) +>>> s1.differences(s2, keep_shape=True) self other 0 NaN NaN 1 b a @@ -2629,7 +2629,7 @@ def _binop(self, other, func, level=None, fill_value=None): Keep all original indices and data ->>> s1.differences(s2, keep_indices=True, keep_values=True) +>>> s1.differences(s2, keep_shape=True, keep_equal=True) self other 0 a a 1 b a From e4729ca19c45e009079693e38f711452cf06d424 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Sat, 14 Mar 2020 22:27:18 +0800 Subject: [PATCH 28/44] Update pandas/core/frame.py: slight semantic cleanup in docstring Co-Authored-By: William Ayd --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 89b3e92405d4e..766dc1898cb7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5262,7 +5262,7 @@ def _construct_result(self, result) -> "DataFrame": Notes ----- -NaNs are considered equal to other NaNs. +Matching NaNs will not appear as a difference. 
Examples -------- From b6c0f78549c82bc629b098522b6e3074b3c90a0f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 14 Mar 2020 23:18:42 +0800 Subject: [PATCH 29/44] renamed parameter axis to align_axis; added tests (GH30429) --- pandas/core/frame.py | 7 ++- pandas/core/generic.py | 30 +++++++------ pandas/core/series.py | 9 ++-- .../tests/frame/methods/test_differences.py | 43 +++++++++++++++++-- .../tests/series/methods/test_differences.py | 24 ++++++++--- 5 files changed, 85 insertions(+), 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 766dc1898cb7d..edea72fce22ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5335,10 +5335,13 @@ def _construct_result(self, result) -> "DataFrame": ) @Appender(_shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "DataFrame", axis=1, keep_shape=False, keep_equal=False + self, other: "DataFrame", align_axis=1, keep_shape=False, keep_equal=False ) -> "DataFrame": return super().differences( - other=other, axis=axis, keep_shape=keep_shape, keep_equal=keep_equal + other=other, + align_axis=align_axis, + keep_shape=keep_shape, + keep_equal=keep_equal, ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0427bf589c8be..f19d137bcb280 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8169,12 +8169,12 @@ def ranker(data): other : %(klass)s Object to compare with. - axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine how the differences are stacked. + align_axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine which axis to align the comparison on. * 0, or 'index' : Resulting differences are stacked vertically with rows drawn alternately from self and other. - * 1, or 'columns' : Resulting differences are stacked horizontally + * 1, or 'columns' : Resulting differences are aligned horizontally with columns drawn alternately from self and other. 
keep_shape : bool, default False @@ -8187,7 +8187,7 @@ def ranker(data): """ @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences(self, other, axis=1, keep_shape=False, keep_equal=False): + def differences(self, other, align_axis=1, keep_shape=False, keep_equal=False): from pandas.core.reshape.concat import concat mask = ~((self == other) | (self.isna() & other.isna())) @@ -8207,19 +8207,19 @@ def differences(self, other, axis=1, keep_shape=False, keep_equal=False): self = self[mask] other = other[mask] - if axis in (1, "columns"): # This is needed for Series - axis = 1 + if align_axis in (1, "columns"): # This is needed for Series + align_axis = 1 else: - axis = self._get_axis_number(axis) + align_axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=keys) + diff = concat([self, other], axis=align_axis, keys=keys) - if axis >= self.ndim: + if align_axis >= self.ndim: # No need to reorganize data if stacking on new axis # This currently applies for stacking two Series on columns return diff - ax = diff._get_axis(axis) + ax = diff._get_axis(align_axis) ax_names = np.array(ax.names) # set index names to positions to avoid confusion @@ -8228,18 +8228,20 @@ def differences(self, other, axis=1, keep_shape=False, keep_equal=False): # bring self-other to inner level order = list(range(1, ax.nlevels)) + [0] if isinstance(diff, ABCDataFrame): - diff = diff.reorder_levels(order, axis=axis) + diff = diff.reorder_levels(order, axis=align_axis) else: diff = diff.reorder_levels(order) # restore the index names in order - diff._get_axis(axis=axis).names = ax_names[order] + diff._get_axis(axis=align_axis).names = ax_names[order] # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[align_axis]) + .reshape([2, diff.shape[align_axis] // 2]) + .T.flatten() ) - diff = diff.take(indices, axis=axis) + diff = diff.take(indices, 
axis=align_axis) return diff diff --git a/pandas/core/series.py b/pandas/core/series.py index 39e659f599717..b7314a686d33e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2594,7 +2594,7 @@ def _binop(self, other, func, level=None, fill_value=None): Notes ----- -NaNs are considered equal to other NaNs. +Matching NaNs will not appear as a difference. Examples -------- @@ -2640,10 +2640,13 @@ def _binop(self, other, func, level=None, fill_value=None): ) @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "Series", axis=1, keep_shape=False, keep_equal=False + self, other: "Series", align_axis=1, keep_shape=False, keep_equal=False ) -> FrameOrSeries: return super().differences( - other=other, axis=axis, keep_shape=keep_shape, keep_equal=keep_equal + other=other, + align_axis=align_axis, + keep_shape=keep_shape, + keep_equal=keep_equal, ) def combine(self, other, func, fill_value=None) -> "Series": diff --git a/pandas/tests/frame/methods/test_differences.py b/pandas/tests/frame/methods/test_differences.py index aa2f3d7b11a2a..c4524ea9dda75 100644 --- a/pandas/tests/frame/methods/test_differences.py +++ b/pandas/tests/frame/methods/test_differences.py @@ -5,8 +5,8 @@ import pandas._testing as tm -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -def test_differences_axis(axis): +@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) +def test_differences_axis(align_axis): df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, columns=["col1", "col2", "col3"], @@ -15,9 +15,9 @@ def test_differences_axis(axis): df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 - result = df.differences(df2, axis=axis) + result = df.differences(df2, align_axis=align_axis) - if axis in (1, "columns"): + if align_axis in (1, "columns"): indices = pd.Index([0, 2]) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = 
pd.DataFrame( @@ -128,3 +128,38 @@ def test_differences_with_non_equal_nulls(): columns=columns, ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("align_axis", [0, 1]) +def test_differences_multi_index(align_axis): + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]} + ) + df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]]) + df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]]) + + df2 = df.copy() + df2.iloc[0, 0] = "c" + df2.iloc[2, 2] = 4.0 + + result = df.differences(df2, align_axis=align_axis) + + if align_axis == 0: + indices = pd.MultiIndex.from_arrays( + [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]] + ) + columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]]) + data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]] + else: + indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]]) + columns = pd.MultiIndex.from_arrays( + [ + ["a", "a", "b", "b"], + ["col1", "col1", "col3", "col3"], + ["self", "other", "self", "other"], + ] + ) + data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]] + + expected = pd.DataFrame(data=data, index=indices, columns=columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_differences.py b/pandas/tests/series/methods/test_differences.py index c6d1b3ed1a65c..30b972636cbf2 100644 --- a/pandas/tests/series/methods/test_differences.py +++ b/pandas/tests/series/methods/test_differences.py @@ -5,14 +5,14 @@ import pandas._testing as tm -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -def test_differences_axis(axis): +@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) +def test_differences_axis(align_axis): s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", "z"]) - result = s1.differences(s2, axis=axis) + result = s1.differences(s2, align_axis=align_axis) - if axis in (1, "columns"): + 
if align_axis in (1, "columns"): indices = pd.Index([0, 2]) columns = pd.Index(["self", "other"]) expected = pd.DataFrame( @@ -78,8 +78,22 @@ def test_differences_with_non_equal_nulls(): s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", np.nan]) - result = s1.differences(s2, axis=0) + result = s1.differences(s2, align_axis=0) indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) expected = pd.Series(["a", "x", "c", np.nan], index=indices) tm.assert_series_equal(result, expected) + + +def test_differences_multi_index(): + index = pd.MultiIndex.from_arrays([[0, 0, 1], [0, 1, 2]]) + s1 = pd.Series(["a", "b", "c"], index=index) + s2 = pd.Series(["x", "b", "z"], index=index) + + result = s1.differences(s2, align_axis=0) + + indices = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], [0, 0, 2, 2], ["self", "other", "self", "other"]] + ) + expected = pd.Series(["a", "x", "c", "z"], index=indices) + tm.assert_series_equal(result, expected) From 08504204a3a6c0628467e3684a8b8518d7cf0071 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 14 Mar 2020 23:29:38 +0800 Subject: [PATCH 30/44] minor correction in docstring --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 513823ae81283..d7ade19197a2e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5295,7 +5295,7 @@ def _construct_result(self, result) -> "DataFrame": 3 b NaN 4.0 4 a 5.0 5.0 -Stack the differences on columns +Align the differences on columns >>> df.differences(df2) col1 col3 @@ -5305,7 +5305,7 @@ def _construct_result(self, result) -> "DataFrame": Stack the differences on rows ->>> df.differences(df2, axis=0) +>>> df.differences(df2, align_axis=0) col1 col3 0 self a NaN other c NaN diff --git a/pandas/core/series.py b/pandas/core/series.py index 1606473cf88d1..3a0816a78e626 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2602,7 +2602,7 @@ def 
_binop(self, other, func, level=None, fill_value=None): >>> s1 = pd.Series(["a", "b", "c", "d", "e"]) >>> s2 = pd.Series(["a", "a", "c", "b", "e"]) -Stack the differences on columns +Align the differences on columns >>> s1.differences(s2) self other @@ -2611,7 +2611,7 @@ def _binop(self, other, func, level=None, fill_value=None): Stack the differences on indices ->>> s1.differences(s2, axis=0) +>>> s1.differences(s2, align_axis=0) 1 self b other a 3 self d From a709db7f0b959d45cbe8860a80e23b1ead975263 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 14 Mar 2020 23:39:17 +0800 Subject: [PATCH 31/44] some semantic cleanup in docstrings --- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 2 +- pandas/core/series.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d7ade19197a2e..a0305c74488a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5312,7 +5312,7 @@ def _construct_result(self, result) -> "DataFrame": 2 self NaN 3.0 other NaN 4.0 -Keep all the original indices (rows and columns) +Keep all original rows and columns >>> df.differences(df2, keep_shape=True) col1 col2 col3 @@ -5323,7 +5323,7 @@ def _construct_result(self, result) -> "DataFrame": 3 NaN NaN NaN NaN NaN NaN 4 NaN NaN NaN NaN NaN NaN -Keep all original indices and data +Keep all original rows and columns and also all original values >>> df.differences(df2, keep_shape=True, keep_equal=True) col1 col2 col3 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d606e05b38161..2443f31faef99 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8288,7 +8288,7 @@ def ranker(data): keep_shape : bool, default False If true, all rows and columns are kept. - Otherwise, only the different ones are kept. + Otherwise, only the ones with different values are kept. keep_equal : bool, default False If true, the result keeps values that are equal. 
diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a0816a78e626..e5aa9ba1bff38 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2618,7 +2618,7 @@ def _binop(self, other, func, level=None, fill_value=None): other b dtype: object -Keep all the original indices +Keep all original rows >>> s1.differences(s2, keep_shape=True) self other @@ -2628,7 +2628,7 @@ def _binop(self, other, func, level=None, fill_value=None): 3 d b 4 NaN NaN -Keep all original indices and data +Keep all original rows and also all original values >>> s1.differences(s2, keep_shape=True, keep_equal=True) self other From 9509604137a21a50870d7b31544e349a31cfadd3 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 17 Mar 2020 22:23:53 +0800 Subject: [PATCH 32/44] added type indicator for method arguments --- pandas/core/frame.py | 6 +++++- pandas/core/generic.py | 8 +++++++- pandas/core/series.py | 6 +++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a0305c74488a8..150cd2ea8df82 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5337,7 +5337,11 @@ def _construct_result(self, result) -> "DataFrame": ) @Appender(_shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "DataFrame", align_axis=1, keep_shape=False, keep_equal=False + self, + other: "DataFrame", + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, ) -> "DataFrame": return super().differences( other=other, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2443f31faef99..05409bcdde90a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8296,7 +8296,13 @@ def ranker(data): """ @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences(self, other, align_axis=1, keep_shape=False, keep_equal=False): + def differences( + self, + other: FrameOrSeries, + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, 
+ ): from pandas.core.reshape.concat import concat mask = ~((self == other) | (self.isna() & other.isna())) diff --git a/pandas/core/series.py b/pandas/core/series.py index e5aa9ba1bff38..1afa04ad62940 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2641,7 +2641,11 @@ def _binop(self, other, func, level=None, fill_value=None): ) @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) def differences( - self, other: "Series", align_axis=1, keep_shape=False, keep_equal=False + self, + other: "Series", + align_axis: Axis = 1, + keep_shape: bool = False, + keep_equal: bool = False, ) -> FrameOrSeries: return super().differences( other=other, From e1a1c49bb5de3019c179444050bddbcb3b01762f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 20 Mar 2020 17:57:21 +0800 Subject: [PATCH 33/44] updated type hints --- pandas/core/generic.py | 26 +++++++++++++------------- pandas/core/series.py | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 05409bcdde90a..1f7254ffb3ee8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8298,10 +8298,10 @@ def ranker(data): @Appender(_shared_docs["differences"] % _shared_doc_kwargs) def differences( self, - other: FrameOrSeries, + other, align_axis: Axis = 1, - keep_shape: bool = False, - keep_equal: bool = False, + keep_shape: bool_t = False, + keep_equal: bool_t = False, ): from pandas.core.reshape.concat import concat @@ -8323,18 +8323,18 @@ def differences( other = other[mask] if align_axis in (1, "columns"): # This is needed for Series - align_axis = 1 + axis = 1 else: - align_axis = self._get_axis_number(align_axis) + axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=align_axis, keys=keys) + diff = concat([self, other], axis=axis, keys=keys) - if align_axis >= self.ndim: + if axis >= self.ndim: # No need to reorganize data if stacking on new axis # This currently applies for stacking 
two Series on columns return diff - ax = diff._get_axis(align_axis) + ax = diff._get_axis(axis) ax_names = np.array(ax.names) # set index names to positions to avoid confusion @@ -8343,20 +8343,20 @@ def differences( # bring self-other to inner level order = list(range(1, ax.nlevels)) + [0] if isinstance(diff, ABCDataFrame): - diff = diff.reorder_levels(order, axis=align_axis) + diff = diff.reorder_levels(order, axis=axis) else: diff = diff.reorder_levels(order) # restore the index names in order - diff._get_axis(axis=align_axis).names = ax_names[order] + diff._get_axis(axis=axis).names = ax_names[order] # reorder axis to keep things organized indices = ( - np.arange(diff.shape[align_axis]) - .reshape([2, diff.shape[align_axis] // 2]) + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) .T.flatten() ) - diff = diff.take(indices, axis=align_axis) + diff = diff.take(indices, axis=axis) return diff diff --git a/pandas/core/series.py b/pandas/core/series.py index 1afa04ad62940..ccc9103b4a1dd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -22,7 +22,7 @@ from pandas._config import get_option from pandas._libs import lib, properties, reshape, tslibs -from pandas._typing import Axis, DtypeObj, FrameOrSeries, Label +from pandas._typing import Axis, DtypeObj, FrameOrSeriesUnion, Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -2646,7 +2646,7 @@ def differences( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> FrameOrSeries: + ) -> FrameOrSeriesUnion: return super().differences( other=other, align_axis=align_axis, From a8caa539b071a1a62082ccab54042cde8774b342 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 20 Mar 2020 19:00:32 +0800 Subject: [PATCH 34/44] added NDFrame in FrameOrSeriesUnion type --- pandas/_typing.py | 2 +- pandas/core/generic.py | 4 +--- 2 
files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 3b7392f781525..7387e100bcfad 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -52,7 +52,7 @@ # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series # is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed # in, either a DataFrame or a Series is returned. -FrameOrSeriesUnion = Union["DataFrame", "Series"] +FrameOrSeriesUnion = Union["DataFrame", "NDFrame", "Series"] # FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is # used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1f7254ffb3ee8..eeb761418a059 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8352,9 +8352,7 @@ def differences( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]) - .reshape([2, diff.shape[axis] // 2]) - .T.flatten() + np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() ) diff = diff.take(indices, axis=axis) From 4056f90a21c17a620e513fab4051ba0efb8b199d Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 21 Mar 2020 09:21:12 +0800 Subject: [PATCH 35/44] fixed type hints of concat function --- pandas/_typing.py | 2 +- pandas/core/reshape/concat.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 7387e100bcfad..3b7392f781525 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -52,7 +52,7 @@ # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series # is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed # in, either a DataFrame or a Series is returned. 
-FrameOrSeriesUnion = Union["DataFrame", "NDFrame", "Series"] +FrameOrSeriesUnion = Union["DataFrame", "Series"] # FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is # used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 091129707228f..eb195f836ebed 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -6,7 +6,7 @@ import numpy as np -from pandas._typing import FrameOrSeriesUnion, Label +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -48,7 +48,7 @@ def concat( @overload def concat( - objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -63,7 +63,7 @@ def concat( def concat( - objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join="outer", ignore_index: bool = False, From 39f857ebdbf5a11003b6e8125ddb249a9b26534c Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 9 Apr 2020 17:16:34 +0800 Subject: [PATCH 36/44] renamed `differences` method to `compare` --- doc/source/reference/frame.rst | 2 +- doc/source/reference/series.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 4 ++-- pandas/core/frame.py | 16 +++++++------- pandas/core/generic.py | 6 ++--- pandas/core/series.py | 16 +++++++------- .../{test_differences.py => test_compare.py} | 22 +++++++++---------- .../{test_differences.py => test_compare.py} | 22 +++++++++---------- 8 files changed, 45 insertions(+), 45 deletions(-) rename pandas/tests/frame/methods/{test_differences.py => test_compare.py} (89%) rename pandas/tests/series/methods/{test_differences.py => test_compare.py} (82%) diff --git 
a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 925e48875356b..825035d427259 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -247,10 +247,10 @@ Combining / comparing / joining / merging DataFrame.append DataFrame.assign + DataFrame.compare DataFrame.join DataFrame.merge DataFrame.update - DataFrame.differences Time series-related ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 8ac2052b5c693..6e81da64fb53a 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -246,9 +246,9 @@ Combining / comparing / joining / merging :toctree: api/ Series.append + Series.differences Series.replace Series.update - Series.differences Time series-related ------------------- diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 57e7bdb05dfe6..983ab17bea9ef 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -39,7 +39,7 @@ For example: Comparing two `DataFrame` or two `Series` and summarizing the differences ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :meth:`DataFrame.differences` and :meth:`Series.differences` for comparing two `DataFrame` or two `Series` (:issue:`30429`) +We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing two `DataFrame` or two `Series` (:issue:`30429`) .. ipython:: python @@ -59,7 +59,7 @@ We've added :meth:`DataFrame.differences` and :meth:`Series.differences` for com df df2 - df.differences(df2) + df.compare(df2) .. _whatsnew_110.timestamp_fold_support: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2fa2e00193c48..d028f5215870b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5338,7 +5338,7 @@ def _construct_result(self, result) -> "DataFrame": See Also -------- -Series.differences : Show differences. 
+Series.compare : Compare with another Series and show differences. Notes ----- @@ -5375,7 +5375,7 @@ def _construct_result(self, result) -> "DataFrame": Align the differences on columns ->>> df.differences(df2) +>>> df.compare(df2) col1 col3 self other self other 0 a c NaN NaN @@ -5383,7 +5383,7 @@ def _construct_result(self, result) -> "DataFrame": Stack the differences on rows ->>> df.differences(df2, align_axis=0) +>>> df.compare(df2, align_axis=0) col1 col3 0 self a NaN other c NaN @@ -5392,7 +5392,7 @@ def _construct_result(self, result) -> "DataFrame": Keep all original rows and columns ->>> df.differences(df2, keep_shape=True) +>>> df.compare(df2, keep_shape=True) col1 col2 col3 self other self other self other 0 a c NaN NaN NaN NaN @@ -5403,7 +5403,7 @@ def _construct_result(self, result) -> "DataFrame": Keep all original rows and columns and also all original values ->>> df.differences(df2, keep_shape=True, keep_equal=True) +>>> df.compare(df2, keep_shape=True, keep_equal=True) col1 col2 col3 self other self other self other 0 a c 1.0 1.0 1.0 1.0 @@ -5413,15 +5413,15 @@ def _construct_result(self, result) -> "DataFrame": 4 a a 5.0 5.0 5.0 5.0 """ ) - @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences( + @Appender(_shared_docs["compare"] % _shared_doc_kwargs) + def compare( self, other: "DataFrame", align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, ) -> "DataFrame": - return super().differences( + return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 29efec97ea34a..bfe9c7a750258 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8253,7 +8253,7 @@ def ranker(data): return ranker(data) _shared_docs[ - "differences" + "compare" ] = """ Compare to another %(klass)s and show the differences. @@ -8279,8 +8279,8 @@ def ranker(data): Otherwise, equal values are shown as NaNs. 
""" - @Appender(_shared_docs["differences"] % _shared_doc_kwargs) - def differences( + @Appender(_shared_docs["compare"] % _shared_doc_kwargs) + def compare( self, other, align_axis: Axis = 1, diff --git a/pandas/core/series.py b/pandas/core/series.py index 9a8e9ec544aaa..e36bc13982643 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2640,7 +2640,7 @@ def _binop(self, other, func, level=None, fill_value=None): See Also -------- -DataFrame.differences : Show differences. +DataFrame.compare : Compare with another DataFrame and show differences. Notes ----- @@ -2653,14 +2653,14 @@ def _binop(self, other, func, level=None, fill_value=None): Align the differences on columns ->>> s1.differences(s2) +>>> s1.compare(s2) self other 1 b a 3 d b Stack the differences on indices ->>> s1.differences(s2, align_axis=0) +>>> s1.compare(s2, align_axis=0) 1 self b other a 3 self d @@ -2669,7 +2669,7 @@ def _binop(self, other, func, level=None, fill_value=None): Keep all original rows ->>> s1.differences(s2, keep_shape=True) +>>> s1.compare(s2, keep_shape=True) self other 0 NaN NaN 1 b a @@ -2679,7 +2679,7 @@ def _binop(self, other, func, level=None, fill_value=None): Keep all original rows and also all original values ->>> s1.differences(s2, keep_shape=True, keep_equal=True) +>>> s1.compare(s2, keep_shape=True, keep_equal=True) self other 0 a a 1 b a @@ -2688,15 +2688,15 @@ def _binop(self, other, func, level=None, fill_value=None): 4 e e """ ) - @Appender(generic._shared_docs["differences"] % _shared_doc_kwargs) - def differences( + @Appender(generic._shared_docs["compare"] % _shared_doc_kwargs) + def compare( self, other: "Series", align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, ) -> FrameOrSeriesUnion: - return super().differences( + return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, diff --git a/pandas/tests/frame/methods/test_differences.py b/pandas/tests/frame/methods/test_compare.py similarity index 
89% rename from pandas/tests/frame/methods/test_differences.py rename to pandas/tests/frame/methods/test_compare.py index c4524ea9dda75..147e4eae4c0f2 100644 --- a/pandas/tests/frame/methods/test_differences.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) -def test_differences_axis(align_axis): +def test_compare_axis(align_axis): df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, columns=["col1", "col2", "col3"], @@ -15,7 +15,7 @@ def test_differences_axis(align_axis): df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 - result = df.differences(df2, align_axis=align_axis) + result = df.compare(df2, align_axis=align_axis) if align_axis in (1, "columns"): indices = pd.Index([0, 2]) @@ -42,10 +42,10 @@ def test_differences_axis(align_axis): (True, False), (False, True), (True, True), - # False, False case is already covered in test_differences_axis + # False, False case is already covered in test_compare_axis ], ) -def test_differences_various_formats(keep_shape, keep_equal): +def test_compare_various_formats(keep_shape, keep_equal): df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, columns=["col1", "col2", "col3"], @@ -54,7 +54,7 @@ def test_differences_various_formats(keep_shape, keep_equal): df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 - result = df.differences(df2, keep_shape=keep_shape, keep_equal=keep_equal) + result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal) if keep_shape: indices = pd.Index([0, 1, 2]) @@ -90,7 +90,7 @@ def test_differences_various_formats(keep_shape, keep_equal): tm.assert_frame_equal(result, expected) -def test_differences_with_equal_nulls(): +def test_compare_with_equal_nulls(): # We want to make sure two NaNs are considered the same # and dropped where applicable df = pd.DataFrame( @@ -100,14 +100,14 @@ def 
test_differences_with_equal_nulls(): df2 = df.copy() df2.loc[0, "col1"] = "c" - result = df.differences(df2) + result = df.compare(df2) indices = pd.Index([0]) columns = pd.MultiIndex.from_product([["col1"], ["self", "other"]]) expected = pd.DataFrame([["a", "c"]], index=indices, columns=columns) tm.assert_frame_equal(result, expected) -def test_differences_with_non_equal_nulls(): +def test_compare_with_non_equal_nulls(): # We want to make sure the relevant NaNs do not get dropped # even if the entire row or column are NaNs df = pd.DataFrame( @@ -118,7 +118,7 @@ def test_differences_with_non_equal_nulls(): df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = np.nan - result = df.differences(df2) + result = df.compare(df2) indices = pd.Index([0, 2]) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) @@ -131,7 +131,7 @@ def test_differences_with_non_equal_nulls(): @pytest.mark.parametrize("align_axis", [0, 1]) -def test_differences_multi_index(align_axis): +def test_compare_multi_index(align_axis): df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]} ) @@ -142,7 +142,7 @@ def test_differences_multi_index(align_axis): df2.iloc[0, 0] = "c" df2.iloc[2, 2] = 4.0 - result = df.differences(df2, align_axis=align_axis) + result = df.compare(df2, align_axis=align_axis) if align_axis == 0: indices = pd.MultiIndex.from_arrays( diff --git a/pandas/tests/series/methods/test_differences.py b/pandas/tests/series/methods/test_compare.py similarity index 82% rename from pandas/tests/series/methods/test_differences.py rename to pandas/tests/series/methods/test_compare.py index 30b972636cbf2..2a5a1fed30226 100644 --- a/pandas/tests/series/methods/test_differences.py +++ b/pandas/tests/series/methods/test_compare.py @@ -6,11 +6,11 @@ @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) -def test_differences_axis(align_axis): +def test_compare_axis(align_axis): s1 = pd.Series(["a", "b", "c"]) s2 = 
pd.Series(["x", "b", "z"]) - result = s1.differences(s2, align_axis=align_axis) + result = s1.compare(s2, align_axis=align_axis) if align_axis in (1, "columns"): indices = pd.Index([0, 2]) @@ -31,14 +31,14 @@ def test_differences_axis(align_axis): (True, False), (False, True), (True, True), - # False, False case is already covered in test_differences_axis + # False, False case is already covered in test_compare_axis ], ) -def test_differences_various_formats(keep_shape, keep_equal): +def test_compare_various_formats(keep_shape, keep_equal): s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", "z"]) - result = s1.differences(s2, keep_shape=keep_shape, keep_equal=keep_equal) + result = s1.compare(s2, keep_shape=keep_shape, keep_equal=keep_equal) if keep_shape: indices = pd.Index([0, 1, 2]) @@ -62,35 +62,35 @@ def test_differences_various_formats(keep_shape, keep_equal): tm.assert_frame_equal(result, expected) -def test_differences_with_equal_nulls(): +def test_compare_with_equal_nulls(): # We want to make sure two NaNs are considered the same # and dropped where applicable s1 = pd.Series(["a", "b", np.nan]) s2 = pd.Series(["x", "b", np.nan]) - result = s1.differences(s2) + result = s1.compare(s2) expected = pd.DataFrame([["a", "x"]], columns=["self", "other"]) tm.assert_frame_equal(result, expected) -def test_differences_with_non_equal_nulls(): +def test_compare_with_non_equal_nulls(): # We want to make sure the relevant NaNs do not get dropped s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", np.nan]) - result = s1.differences(s2, align_axis=0) + result = s1.compare(s2, align_axis=0) indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) expected = pd.Series(["a", "x", "c", np.nan], index=indices) tm.assert_series_equal(result, expected) -def test_differences_multi_index(): +def test_compare_multi_index(): index = pd.MultiIndex.from_arrays([[0, 0, 1], [0, 1, 2]]) s1 = pd.Series(["a", "b", "c"], index=index) s2 = pd.Series(["x", "b", "z"], 
index=index) - result = s1.differences(s2, align_axis=0) + result = s1.compare(s2, align_axis=0) indices = pd.MultiIndex.from_arrays( [[0, 0, 1, 1], [0, 0, 2, 2], ["self", "other", "self", "other"]] From 6c62b0e94d67d9f6e0de88c7e4a14b122f588aab Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 9 Apr 2020 17:50:43 +0800 Subject: [PATCH 37/44] correction of method name in doc/source/reference/series.rst --- doc/source/reference/series.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 66447157b4cb1..797ade9594c7d 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -246,7 +246,7 @@ Combining / comparing / joining / merging :toctree: api/ Series.append - Series.differences + Series.compare Series.replace Series.update From 098d40cf831f3d2635ce15f55d98c459e42acccd Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 11 Apr 2020 11:22:36 +0800 Subject: [PATCH 38/44] added type checking in `compare` method and reformatted whatsnew a bit --- doc/source/whatsnew/v1.1.0.rst | 7 +++++-- pandas/core/generic.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 91941a5dec586..6491efff90780 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -51,14 +51,17 @@ We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing t }, columns=["col1", "col2", "col3"], ) + df + +.. ipython:: python + df2 = df.copy() df2.loc[0, 'col1'] = 'c' df2.loc[2, 'col3'] = 4.0 + df2 .. ipython:: python - df - df2 df.compare(df2) .. 
_whatsnew_110.timestamp_fold_support: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 79227d29b8c3f..9fe1d0407879c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8283,6 +8283,12 @@ def compare( ): from pandas.core.reshape.concat import concat + if type(self) is not type(other): + cls_self, cls_other = type(self).__name__, type(other).__name__ + raise TypeError( + f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'" + ) + mask = ~((self == other) | (self.isna() & other.isna())) keys = ["self", "other"] From 4223eb4ce768145ddf43a0c54468c9fbcd408e1f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 11 Apr 2020 11:45:31 +0800 Subject: [PATCH 39/44] removed unintended line break --- pandas/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 213d5681f5340..ec5383835a939 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2738,7 +2738,6 @@ def compare( ) def combine(self, other, func, fill_value=None) -> "Series": - """ Combine the Series with a Series or scalar according to `func`. 
From 91758c8c6a03238fc4204187bacb1072ff3a5b74 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 29 Apr 2020 17:10:48 +0800 Subject: [PATCH 40/44] resolved a linting issue --- pandas/core/series.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7213ee0862580..4278e434a4c10 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,7 +23,15 @@ from pandas._config import get_option from pandas._libs import lib, properties, reshape, tslibs -from pandas._typing import ArrayLike, Axis, DtypeObj, FrameOrSeriesUnion, IndexKeyFunc, Label, ValueKeyFunc +from pandas._typing import ( + ArrayLike, + Axis, + DtypeObj, + FrameOrSeriesUnion, + IndexKeyFunc, + Label, + ValueKeyFunc, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc from pandas.util._validators import validate_bool_kwarg, validate_percentile From 774ff5dd1a305c54e5017966872002d95d7d9522 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 30 Apr 2020 11:01:50 +0800 Subject: [PATCH 41/44] updated whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1ecffebdd6b44..b92e728d9708f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -36,6 +36,8 @@ For example: ser["2014"] ser.loc["May 2015"] +.. 
_whatsnew_110.dataframe_or_series_comparing: + Comparing two `DataFrame` or two `Series` and summarizing the differences ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 0189623e5d4a7d4d946a06985d91d7965cb46cc7 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 15 May 2020 10:50:27 +0800 Subject: [PATCH 42/44] added doc in user guide merging.rst and more tests --- doc/source/user_guide/merging.rst | 67 ++++++++++++++++++++- doc/source/whatsnew/v1.1.0.rst | 2 + pandas/core/generic.py | 2 + pandas/tests/frame/methods/test_compare.py | 15 +++++ pandas/tests/series/methods/test_compare.py | 15 +++++ 5 files changed, 98 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 0450c81958a51..56ff8c1fc7c9b 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -10,15 +10,18 @@ p = doctools.TablePlotter() -**************************** -Merge, join, and concatenate -**************************** +************************************ +Merge, join, concatenate and compare +************************************ pandas provides various facilities for easily combining together Series or DataFrame with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. +In addition, pandas also provides utilities to compare two Series or DataFrame +and summarize their differences. + .. _merging.concat: Concatenating objects @@ -1477,3 +1480,61 @@ exclude exact matches on time. Note that though we exclude the exact matches by='ticker', tolerance=pd.Timedelta('10ms'), allow_exact_matches=False) + +.. _merging.compare: + +Comparing objects +----------------- + +The :meth:`~Series.compare` and :meth:`~DataFrame.compare` methods allow you to +compare two Series or DataFrame, respectively, and summarize their differences. + +This feature was added in :ref:`V1.1.0 <whatsnew_110.dataframe_or_series_comparing>`.
+ +For example, you might want to compare two `DataFrame` and stack their differences +side by side. + +.. ipython:: python + + df = pd.DataFrame( + { + "col1": ["a", "a", "b", "b", "a"], + "col2": [1.0, 2.0, 3.0, np.nan, 5.0], + "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + columns=["col1", "col2", "col3"], + ) + df + +.. ipython:: python + + df2 = df.copy() + df2.loc[0, 'col1'] = 'c' + df2.loc[2, 'col3'] = 4.0 + df2 + +.. ipython:: python + + df.compare(df2) + +By default, if two corresponding values are equal, they will be shown as ``NaN``. +Furthermore, if all values in an entire row / column are equal, the row / column will be +omitted from the result. The remaining differences will be aligned on columns. + +If you wish, you may choose to stack the differences on rows. + +.. ipython:: python + + df.compare(df2, align_axis=0) + +If you wish to keep all original rows and columns, set the `keep_shape` argument +to ``True``. + +.. ipython:: python + + df.compare(df2, keep_shape=True) + +You may also keep all the original values even if they are equal. + +.. ipython:: python + df.compare(df2, keep_shape=True, keep_equal=True) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6fff71ed18b2d..db6b1ec5cf416 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -67,6 +67,8 @@ We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing t df.compare(df2) +See :ref:`User Guide <merging.compare>` for more details. + .. _whatsnew_110.groupby_key: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ad076f401cb07..2c33b474fc9a7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8410,6 +8410,8 @@ def ranker(data): "compare" ] = """ Compare to another %(klass)s and show the differences. + + .. 
versionadded:: 1.1.0 Parameters ---------- diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 147e4eae4c0f2..3a89364b6c5be 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) def test_compare_axis(align_axis): + # GH#30429 df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, columns=["col1", "col2", "col3"], @@ -163,3 +164,17 @@ def test_compare_multi_index(align_axis): expected = pd.DataFrame(data=data, index=indices, columns=columns) tm.assert_frame_equal(result, expected) + + +def test_compare_unaligned_objects(): + # test DataFrames with different indices + with pytest.raises(ValueError, match='Can only compare identically-labeled DataFrame objects'): + df1 = pd.DataFrame([1, 2, 3], index=['a', 'b', 'c']) + df2 = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']) + df1.compare(df2) + + # test DataFrames with different shapes + with pytest.raises(ValueError, match='Can only compare identically-labeled DataFrame objects'): + df1 = pd.DataFrame(np.ones((3, 3))) + df2 = pd.DataFrame(np.zeros((2, 1))) + df1.compare(df2) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 2a5a1fed30226..5222da9330b38 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) def test_compare_axis(align_axis): + # GH#30429 s1 = pd.Series(["a", "b", "c"]) s2 = pd.Series(["x", "b", "z"]) @@ -97,3 +98,17 @@ def test_compare_multi_index(): ) expected = pd.Series(["a", "x", "c", "z"], index=indices) tm.assert_series_equal(result, expected) + + +def test_compare_unaligned_objects(): + # test Series with different indices + with pytest.raises(ValueError, match='Can only compare 
identically-labeled Series objects'): + ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + ser2 = pd.Series([1, 2, 3], index=['a', 'b', 'd']) + ser1.compare(ser2) + + # test Series with different lengths + with pytest.raises(ValueError, match='Can only compare identically-labeled Series objects'): + ser1 = pd.Series([1, 2, 3]) + ser2 = pd.Series([1, 2, 3, 4]) + ser1.compare(ser2) From b0b3e24c4815e0c17ebddbdaba389f0696f0a97e Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 15 May 2020 11:33:17 +0800 Subject: [PATCH 43/44] removed trailing space in docstring and blackified code --- pandas/core/generic.py | 2 +- pandas/tests/frame/methods/test_compare.py | 10 ++++++---- pandas/tests/series/methods/test_compare.py | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2c33b474fc9a7..14389093a0a85 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8410,7 +8410,7 @@ def ranker(data): "compare" ] = """ Compare to another %(klass)s and show the differences. - + .. 
versionadded:: 1.1.0 Parameters diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 3a89364b6c5be..468811eba0d39 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -168,13 +168,15 @@ def test_compare_multi_index(align_axis): def test_compare_unaligned_objects(): # test DataFrames with different indices - with pytest.raises(ValueError, match='Can only compare identically-labeled DataFrame objects'): - df1 = pd.DataFrame([1, 2, 3], index=['a', 'b', 'c']) - df2 = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']) + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"]) + df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"]) df1.compare(df2) # test DataFrames with different shapes - with pytest.raises(ValueError, match='Can only compare identically-labeled DataFrame objects'): + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): df1 = pd.DataFrame(np.ones((3, 3))) df2 = pd.DataFrame(np.zeros((2, 1))) df1.compare(df2) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 5222da9330b38..8570800048898 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -102,13 +102,15 @@ def test_compare_multi_index(): def test_compare_unaligned_objects(): # test Series with different indices - with pytest.raises(ValueError, match='Can only compare identically-labeled Series objects'): - ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - ser2 = pd.Series([1, 2, 3], index=['a', 'b', 'd']) + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"]) 
ser1.compare(ser2) # test Series with different lengths - with pytest.raises(ValueError, match='Can only compare identically-labeled Series objects'): + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): ser1 = pd.Series([1, 2, 3]) ser2 = pd.Series([1, 2, 3, 4]) ser1.compare(ser2) From 007eeb71ffecca19d5b54ce4ea81d4b2517bcefe Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 27 May 2020 11:35:16 +0800 Subject: [PATCH 44/44] added one more example in docstring of DataFrame.compare --- pandas/core/frame.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b46bdb1393b86..4911617b8eed4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5819,6 +5819,14 @@ def _construct_result(self, result) -> "DataFrame": 2 self NaN 3.0 other NaN 4.0 +Keep the equal values + +>>> df.compare(df2, keep_equal=True) + col1 col3 + self other self other +0 a c 1.0 1.0 +2 b b 3.0 4.0 + Keep all original rows and columns >>> df.compare(df2, keep_shape=True)