From d3670d0845039333ca171c88877c0b3cc660dc0b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 09:08:16 -0400 Subject: [PATCH 01/10] DOC/TST: add pd.unique doc-string & buggy return of Categorical closes #9346 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 55 ++++++++++++++++++++++++++++++--- pandas/tests/test_algos.py | 45 +++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7664688ffa4f4..17e30df59d6a2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1148,6 +1148,7 @@ Conversion - Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) - Bug in ``DataFrame.fillna()`` with tz-aware datetimes (:issue:`15855`) - Bug in ``is_string_dtype``, ``is_timedelta64_ns_dtype``, and ``is_string_like_dtype`` in which an error was raised when ``None`` was passed in (:issue:`15941`) +- Bug in the return type of ``pd.unique`` on a ``Categorical``, which was returning an ndarray and not a ``Categorical`` (:issue:`15903`) Indexing ^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9b88ea23483bd..50d7f9db53306 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -267,11 +267,58 @@ def match(to_match, values, na_sentinel=-1): return result -def unique1d(values): +def unique(values): """ - Hash table-based unique + Hash table-based unique. uniques are returned in order + of appearance. This does NOT sort. + + Parameters + ---------- + values : 1d array-like + + Returns + ------- + unique values. The returned type will be a pandas Index + if a pandas type is input, otherwise ndarray + + Examples + -------- + pd.unique(pd.Series([2, 1, 3, 3])) + array([2, 1, 3]) + + >>> pd.unique(pd.Series([2] + [1] * 5)) + array([2, 1]) + + >>> pd.unique(Series([pd.Timestamp('20160101'), + ... pd.Timestamp('20160101')])) + array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + + >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])) + DatetimeIndex(['2016-01-01 00:00:00-05:00'], + ... 
dtype='datetime64[ns, US/Eastern]', freq=None) + + >>> pd.unique(list('aabc')) + array(['a', 'b', 'c'], dtype=object) + + >>> pd.unique(Series(pd.Categorical(list('aabc')))) + 0 a + 1 b + 2 c + dtype: category + Categories (3, object): [a, b, c] + """ values = _ensure_arraylike(values) + + # categorical is a fast-path + if is_categorical_dtype(values): + + if isinstance(values, ABCSeries): + from pandas import Series + return Series(values.values.unique(), name=values.name) + return values.unique() + original = values htable, _, values, dtype, ndtype = _get_hashtable_algo(values) @@ -282,7 +329,7 @@ def unique1d(values): return uniques -unique = unique1d +unique1d = unique def isin(comps, values): @@ -651,7 +698,7 @@ def mode(values): if is_categorical_dtype(values): if isinstance(values, Series): - return Series(values.values.mode()) + return Series(values.values.mode(), name=values.name) return values.mode() values, dtype, ndtype = _ensure_data(values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d893183dae0ed..30fb69bb7dc77 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -384,6 +384,51 @@ def test_uint64_overflow(self): exp = np.array([1, 2, 2**63], dtype=np.uint64) tm.assert_numpy_array_equal(algos.unique(s), exp) + def test_categorical(self): + c = pd.Categorical(list('aabc')) + result = c.unique() + expected = pd.Categorical(list('abc')) + tm.assert_categorical_equal(result, expected) + + result = algos.unique(c) + tm.assert_categorical_equal(result, expected) + + result = algos.unique(Series(c, name='foo')) + expected = Series(expected, name='foo') + tm.assert_series_equal(result, expected) + + def test_order_of_appearance(self): + # 9346 + # light testing of guarantee of order of appearance + # these also are the doc-examples + result = pd.unique(pd.Series([2, 1, 3, 3])) + tm.assert_numpy_array_equal(result, np.array([2, 1, 3])) + + result = pd.unique(pd.Series([2] + [1] * 5)) + tm.assert_numpy_array_equal(result, np.array([2, 1])) + + result = pd.unique(Series([pd.Timestamp('20160101'), + pd.Timestamp('20160101')])) + expected = np.array(['2016-01-01T00:00:00.000000000'], + dtype='datetime64[ns]') + tm.assert_numpy_array_equal(result, expected) + + result = pd.unique(pd.Index( + [pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])) + expected = pd.DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', + freq=None) + tm.assert_index_equal(result, expected) + + result = pd.unique(list('aabc')) + expected = np.array(['a', 'b', 'c'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = pd.unique(Series(pd.Categorical(list('aabc')))) + expected = Series(pd.Categorical(list('abc'))) + tm.assert_series_equal(result, expected) + class TestIsin(tm.TestCase): From c5b227103d20d2c236a8f34606ddf0738cb2bf8b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 10:19:11 -0400 Subject: [PATCH 02/10] more tests --- pandas/tests/test_algos.py | 158 ++++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 64 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 30fb69bb7dc77..cf3db1519f539 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -6,7 +6,8 @@ from numpy import nan from datetime import datetime from itertools import permutations -from pandas import Series, Categorical, CategoricalIndex, Index +from pandas import (Series, Categorical, CategoricalIndex, Index, + 
Timestamp, DatetimeIndex) import pandas as pd from pandas import compat @@ -34,7 +35,7 @@ def test_ints(self): expected = Series(np.array([0, 2, 1, 1, 0, 2, np.nan, 0])) tm.assert_series_equal(result, expected) - s = pd.Series(np.arange(5), dtype=np.float32) + s = Series(np.arange(5), dtype=np.float32) result = algos.match(s, [2, 4]) expected = np.array([-1, -1, 0, -1, 1], dtype=np.int64) self.assert_numpy_array_equal(result, expected) @@ -204,20 +205,20 @@ def test_mixed(self): def test_datelike(self): # M8 - v1 = pd.Timestamp('20130101 09:00:00.00004') - v2 = pd.Timestamp('20130101') + v1 = Timestamp('20130101 09:00:00.00004') + v2 = Timestamp('20130101') x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) - exp = pd.DatetimeIndex([v1, v2]) + exp = DatetimeIndex([v1, v2]) self.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) - exp = pd.DatetimeIndex([v2, v1]) + exp = DatetimeIndex([v2, v1]) self.assert_index_equal(uniques, exp) # period @@ -350,7 +351,7 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) - s = pd.Series(dt_index) + s = Series(dt_index) result = algos.unique(s) tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) @@ -369,7 +370,7 @@ def test_timedelta64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) - s = pd.Series(td_index) + s = Series(td_index) result = algos.unique(s) tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) @@ -380,11 +381,12 @@ def test_timedelta64_dtype_array_returned(self): self.assertEqual(result.dtype, expected.dtype) def test_uint64_overflow(self): - s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64) + s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) exp = np.array([1, 2, 2**63], dtype=np.uint64) tm.assert_numpy_array_equal(algos.unique(s), exp) def test_categorical(self): + # GH 15939 c = pd.Categorical(list('aabc')) result = c.unique() expected = pd.Categorical(list('abc')) @@ -397,28 +399,56 @@ def test_categorical(self): expected = Series(expected, name='foo') tm.assert_series_equal(result, expected) + def test_datetime64tz_aware(self): + # GH 15939 + result = Series( + pd.Index([Timestamp('20160101', tz='US/Eastern'), + Timestamp('20160101', tz='US/Eastern')])).unique() + expected = np.array([Timestamp('2016-01-01 00:00:00-0500', + tz='US/Eastern')], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Index([Timestamp('20160101', tz='US/Eastern'), + Timestamp('20160101', tz='US/Eastern')]).unique() + expected = DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', freq=None) + tm.assert_index_equal(result, expected) + + result = pd.unique( + Series(pd.Index([Timestamp('20160101', tz='US/Eastern'), + Timestamp('20160101', tz='US/Eastern')]))) + expected = DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', freq=None) + tm.assert_index_equal(result, expected) + + result = pd.unique(pd.Index([Timestamp('20160101', tz='US/Eastern'), + Timestamp('20160101', tz='US/Eastern')])) + expected = DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', freq=None) + 
tm.assert_index_equal(result, expected) + def test_order_of_appearance(self): # 9346 # light testing of guarantee of order of appearance # these also are the doc-examples - result = pd.unique(pd.Series([2, 1, 3, 3])) + result = pd.unique(Series([2, 1, 3, 3])) tm.assert_numpy_array_equal(result, np.array([2, 1, 3])) - result = pd.unique(pd.Series([2] + [1] * 5)) + result = pd.unique(Series([2] + [1] * 5)) tm.assert_numpy_array_equal(result, np.array([2, 1])) - result = pd.unique(Series([pd.Timestamp('20160101'), - pd.Timestamp('20160101')])) + result = pd.unique(Series([Timestamp('20160101'), + Timestamp('20160101')])) expected = np.array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') tm.assert_numpy_array_equal(result, expected) result = pd.unique(pd.Index( - [pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) - expected = pd.DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) + [Timestamp('20160101', tz='US/Eastern'), + Timestamp('20160101', tz='US/Eastern')])) + expected = DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', + freq=None) tm.assert_index_equal(result, expected) result = pd.unique(list('aabc')) @@ -448,15 +478,15 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(pd.Series([1, 2]), [1]) + result = algos.isin(Series([1, 2]), [1]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(pd.Series([1, 2]), pd.Series([1])) + result = algos.isin(Series([1, 2]), Series([1])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(pd.Series([1, 2]), set([1])) + result = algos.isin(Series([1, 2]), set([1])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -464,11 +494,11 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(pd.Series(['a', 'b']), pd.Series(['a'])) + result = algos.isin(Series(['a', 'b']), Series(['a'])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(pd.Series(['a', 'b']), set(['a'])) + result = algos.isin(Series(['a', 'b']), set(['a'])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -565,33 +595,33 @@ def test_value_counts_nat(self): self.assertEqual(len(vc), 1) self.assertEqual(len(vc_with_na), 2) - exp_dt = pd.Series({pd.Timestamp('2014-01-01 00:00:00'): 1}) + exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1}) tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) def test_value_counts_datetime_outofbounds(self): # GH 13663 - s = pd.Series([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1), - datetime(3000, 1, 1), datetime(3000, 1, 1)]) + s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1), + datetime(3000, 1, 1), datetime(3000, 1, 1)]) res = s.value_counts() exp_index = pd.Index([datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], dtype=object) - exp = pd.Series([3, 2, 1], index=exp_index) + exp = Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) # GH 12424 - res = pd.to_datetime(pd.Series(['2362-01-01', np.nan]), + res = pd.to_datetime(Series(['2362-01-01', np.nan]), errors='ignore') - exp = pd.Series(['2362-01-01', np.nan], 
dtype=object) + exp = Series(['2362-01-01', np.nan], dtype=object) tm.assert_series_equal(res, exp) def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() - expected = pd.Series([3, 2, 1], - index=pd.CategoricalIndex(['a', 'b', 'c'])) + expected = Series([3, 2, 1], + index=pd.CategoricalIndex(['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) # preserve order? @@ -604,11 +634,11 @@ def test_categorical_nans(self): s = Series(pd.Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) s.iloc[1] = np.nan result = s.value_counts() - expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex( + expected = Series([4, 3, 2], index=pd.CategoricalIndex( ['a', 'b', 'c'], categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = pd.Series([ + expected = Series([ 4, 3, 2, 1 ], index=pd.CategoricalIndex(['a', 'b', 'c', np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) @@ -618,12 +648,12 @@ def test_categorical_nans(self): list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c'])) s.iloc[1] = np.nan result = s.value_counts() - expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex( + expected = Series([4, 3, 2], index=pd.CategoricalIndex( ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True)) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex( + expected = Series([4, 3, 2, 1], index=pd.CategoricalIndex( ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) tm.assert_series_equal(result, expected, check_index_type=True) @@ -640,33 +670,33 @@ def test_dropna(self): # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328 tm.assert_series_equal( - pd.Series([True, True, False]).value_counts(dropna=True), - pd.Series([2, 1], index=[True, False])) + Series([True, True, False]).value_counts(dropna=True), + Series([2, 1], index=[True, False])) tm.assert_series_equal( - pd.Series([True, True, False]).value_counts(dropna=False), - pd.Series([2, 1], index=[True, False])) + Series([True, True, False]).value_counts(dropna=False), + Series([2, 1], index=[True, False])) tm.assert_series_equal( - pd.Series([True, True, False, None]).value_counts(dropna=True), - pd.Series([2, 1], index=[True, False])) + Series([True, True, False, None]).value_counts(dropna=True), + Series([2, 1], index=[True, False])) tm.assert_series_equal( - pd.Series([True, True, False, None]).value_counts(dropna=False), - pd.Series([2, 1, 1], index=[True, False, np.nan])) + Series([True, True, False, None]).value_counts(dropna=False), + Series([2, 1, 1], index=[True, False, np.nan])) tm.assert_series_equal( - pd.Series([10.3, 5., 5.]).value_counts(dropna=True), - pd.Series([2, 1], index=[5., 10.3])) + Series([10.3, 5., 5.]).value_counts(dropna=True), + Series([2, 1], index=[5., 10.3])) tm.assert_series_equal( - pd.Series([10.3, 5., 5.]).value_counts(dropna=False), - pd.Series([2, 1], index=[5., 10.3])) + Series([10.3, 5., 5.]).value_counts(dropna=False), + Series([2, 1], index=[5., 10.3])) tm.assert_series_equal( - pd.Series([10.3, 5., 5., None]).value_counts(dropna=True), - pd.Series([2, 1], index=[5., 10.3])) + Series([10.3, 5., 5., None]).value_counts(dropna=True), + Series([2, 1], index=[5., 10.3])) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): - result = pd.Series([10.3, 5., 5., 
None]).value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[5., 10.3, np.nan]) + result = Series([10.3, 5., 5., None]).value_counts(dropna=False) + expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): @@ -781,15 +811,15 @@ def test_numeric_object_likes(self): tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [pd.Series(case), pd.Series(case, dtype='category')]: + for s in [Series(case), Series(case, dtype='category')]: res_first = s.duplicated(keep='first') - tm.assert_series_equal(res_first, pd.Series(exp_first)) + tm.assert_series_equal(res_first, Series(exp_first)) res_last = s.duplicated(keep='last') - tm.assert_series_equal(res_last, pd.Series(exp_last)) + tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) - tm.assert_series_equal(res_false, pd.Series(exp_false)) + tm.assert_series_equal(res_false, Series(exp_false)) def test_datetime_likes(self): @@ -798,8 +828,8 @@ def test_datetime_likes(self): td = ['1 days', '2 days', '1 days', 'NaT', '3 days', '2 days', '4 days', '1 days', 'NaT', '6 days'] - cases = [np.array([pd.Timestamp(d) for d in dt]), - np.array([pd.Timestamp(d, tz='US/Eastern') for d in dt]), + cases = [np.array([Timestamp(d) for d in dt]), + np.array([Timestamp(d, tz='US/Eastern') for d in dt]), np.array([pd.Period(d, freq='D') for d in dt]), np.array([np.datetime64(d) for d in dt]), np.array([pd.Timedelta(d) for d in td])] @@ -833,16 +863,16 @@ def test_datetime_likes(self): tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [pd.Series(case), pd.Series(case, dtype='category'), - pd.Series(case, dtype=object)]: + for s in [Series(case), Series(case, dtype='category'), + Series(case, dtype=object)]: res_first = s.duplicated(keep='first') - tm.assert_series_equal(res_first, pd.Series(exp_first)) + tm.assert_series_equal(res_first, Series(exp_first)) res_last = s.duplicated(keep='last') - tm.assert_series_equal(res_last, pd.Series(exp_last)) + tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) - tm.assert_series_equal(res_false, pd.Series(exp_false)) + tm.assert_series_equal(res_false, Series(exp_false)) def test_unique_index(self): cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)] @@ -984,7 +1014,7 @@ def test_lookup_overflow(self): np.arange(len(xs), dtype=np.int64)) def test_get_unique(self): - s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64) + s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) exp = np.array([1, 2, 2**63], dtype=np.uint64) self.assert_numpy_array_equal(s.unique(), exp) From 41c2d1bf2fbb34f400c67bc3be4ae5d5cc1a1314 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 10:41:58 -0400 Subject: [PATCH 03/10] fix tz-aware unique --- doc/source/whatsnew/v0.20.0.txt | 52 +++++++++++++++++++++++++++++++++ pandas/core/algorithms.py | 19 ++++++++++-- pandas/core/base.py | 1 + pandas/core/series.py | 10 +++++-- pandas/tests/test_algos.py | 7 +++-- 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 17e30df59d6a2..babe63e623dcd 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -593,6 +593,58 @@ result. On the other hand, this might have backward incompatibilities: e.g. compared to numpy arrays, ``Index`` objects are not mutable. To get the original ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``. +.. 
_whatsnew_0200.api_breaking.unique: + +pd.unique will now be consistent with extension types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. + +Previous behaviour: + +Datetime tz-aware + +.. code-block:: ipython + + In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])).unique() + Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) + + In [7]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]))) + Out[7]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') + +Categoricals + +.. code-block:: ipython + + + In [1]: pd.Series(pd.Categorical(list('aabc'))).unique() + Out[1]: + [a, b, c] + Categories (3, object): [a, b, c] + + In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) + Out[2]: array(['a', 'b', 'c'], dtype=object) + +New Behavior: + +Datetime tz-aware + +.. ipython:: python + + Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])).unique() + pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]))) + +Categoricals + +.. ipython:: python + + pd.Series(pd.Categorical(list('aabc'))).unique() + pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) + .. _whatsnew_0200.api_breaking.s3: S3 File Handling diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 50d7f9db53306..9d0630cde9edd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -278,8 +278,10 @@ def unique(values): Returns ------- - unique values. The returned type will be a pandas Index - if a pandas type is input, otherwise ndarray + unique values. + - If the input is a Categorical dtype, the return is a Categorical + - If the input is an Index, the return is an Index + - If the input is a Series/ndarray, the return will be an ndarray Examples -------- @@ -293,6 +295,11 @@ def unique(values): ... pd.Timestamp('20160101')])) array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])) + array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], + dtype=object) + >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), ... 
pd.Timestamp('20160101', tz='US/Eastern')])) DatetimeIndex(['2016-01-01 00:00:00-05:00'], @@ -309,6 +316,7 @@ def unique(values): Categories (3, object): [a, b, c] """ + values = _ensure_arraylike(values) # categorical is a fast-path @@ -326,6 +334,13 @@ def unique(values): uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) + if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): + # we are special casing datetime64tz_dtype + # to return an object array of tz-aware Timestamps + + # TODO: it must return DatetimeArray with tz in pandas 2.0 + uniques = uniques.asobject.values + return uniques diff --git a/pandas/core/base.py b/pandas/core/base.py index 3401c7c59cb56..b723c066fe764 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -873,6 +873,7 @@ def unique(self): else: from pandas.core.algorithms import unique1d result = unique1d(values) + return result def nunique(self, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index 760abc20351cf..5ee3ca73742ae 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1204,10 +1204,14 @@ def mode(self): @Appender(base._shared_docs['unique'] % _shared_doc_kwargs) def unique(self): result = super(Series, self).unique() + if is_datetime64tz_dtype(self.dtype): - # to return array of Timestamp with tz - # ToDo: it must return DatetimeArray with tz in pandas 2.0 - return result.asobject.values + # we are special casing datetime64tz_dtype + # to return an object array of tz-aware Timestamps + + # TODO: it must return DatetimeArray with tz in pandas 2.0 + result = result.asobject.values + return result @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cf3db1519f539..4ee1b3af4dabd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -401,6 +401,7 @@ def test_categorical(self): def test_datetime64tz_aware(self): # GH 15939 + result = Series( pd.Index([Timestamp('20160101', tz='US/Eastern'), Timestamp('20160101', tz='US/Eastern')])).unique() @@ -417,9 +418,9 @@ def test_datetime64tz_aware(self): result = pd.unique( Series(pd.Index([Timestamp('20160101', tz='US/Eastern'), Timestamp('20160101', tz='US/Eastern')]))) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) - tm.assert_index_equal(result, expected) + expected = np.array([Timestamp('2016-01-01 00:00:00-0500', + tz='US/Eastern')], dtype=object) + tm.assert_numpy_array_equal(result, expected) result = pd.unique(pd.Index([Timestamp('20160101', tz='US/Eastern'), Timestamp('20160101', tz='US/Eastern')])) From 3e2ae12789790ec15542882a137d0762928c767f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 10:55:17 -0400 Subject: [PATCH 04/10] docs --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index babe63e623dcd..89739b43755d5 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -598,7 +598,7 @@ ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``. pd.unique will now be consistent with extension types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. 
+In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. Note that the behavior of ``Index.unique()`` and ``pd.unique(Index)`` has not changed. (:issue:`15939`) Previous behaviour: From e2b3867dd994e248e1ccbd81a27481495432d71e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 11:07:43 -0400 Subject: [PATCH 05/10] tests for CI --- doc/source/whatsnew/v0.20.0.txt | 74 ++++++++++++++++++++------------- pandas/tests/test_algos.py | 9 ++++ 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 89739b43755d5..26a67014b73e2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -598,52 +598,68 @@ ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``. pd.unique will now be consistent with extension types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. Note that the behavior of ``Index.unique()`` and ``pd.unique(Index)`` has not changed. (:issue:`15939`) +In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. -Previous behaviour: +- Datetime tz-aware -Datetime tz-aware + Previous behaviour: -.. code-block:: ipython + .. code-block:: ipython - In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])).unique() - Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) + # Series + In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])).unique() + Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) - In [7]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]))) - Out[7]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') + In [6]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]))) + Out[6]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') -Categoricals + # Index + In [7]: pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]).unique() + Out[7]: DatetimeIndex(['2016-01-01 00:00:00-05:00'], dtype='datetime64[ns, US/Eastern]', freq=None) -.. code-block:: ipython + In [8]: pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])) + Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') + New Behavior: - In [1]: pd.Series(pd.Categorical(list('aabc'))).unique() - Out[1]: - [a, b, c] - Categories (3, object): [a, b, c] + .. 
ipython:: python - In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) - Out[2]: array(['a', 'b', 'c'], dtype=object) + # Series + Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])).unique() + pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]))) -New Behavior: + # Index + pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]) + pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])) -Datetime tz-aware +- Categoricals -.. ipython:: python + Previous behaviour: + + .. code-block:: ipython - Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])).unique() - pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]))) + In [1]: pd.Series(pd.Categorical(list('aabc'))).unique() + Out[1]: + [a, b, c] + Categories (3, object): [a, b, c] -Categoricals + In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) + Out[2]: array(['a', 'b', 'c'], dtype=object) -.. ipython:: python + New Behavior: + + .. ipython:: python - pd.Series(pd.Categorical(list('aabc'))).unique() - pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) + pd.Series(pd.Categorical(list('aabc'))).unique() + pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) .. _whatsnew_0200.api_breaking.s3: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4ee1b3af4dabd..f2ae82099d05b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -399,6 +399,15 @@ def test_categorical(self): expected = Series(expected, name='foo') tm.assert_series_equal(result, expected) + # CI + ci = pd.CategoricalIndex(pd.Categorical(list('aabc'))) + result = ci.unique() + expected = pd.CategoricalIndex(pd.Categorical(list('abc'))) + tm.assert_index_equal(result, expected) + + result = pd.unique(ci) + tm.assert_index_equal(result, expected) + def test_datetime64tz_aware(self): # GH 15939 From c725c7f239233654bc7384c306a8266f7042a9f7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 13:08:12 -0400 Subject: [PATCH 06/10] TST: fix test on windows --- pandas/tests/test_algos.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f2ae82099d05b..abd097cc1e30d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -442,10 +442,12 @@ def test_order_of_appearance(self): # light testing of guarantee of order of appearance # these also are the doc-examples result = pd.unique(Series([2, 1, 3, 3])) - tm.assert_numpy_array_equal(result, np.array([2, 1, 3])) + tm.assert_numpy_array_equal(result, + np.array([2, 1, 3], dtype='int64')) result = pd.unique(Series([2] + [1] * 5)) - tm.assert_numpy_array_equal(result, np.array([2, 1])) + tm.assert_numpy_array_equal(result, + np.array([2, 1], dtype='int64')) result = pd.unique(Series([Timestamp('20160101'), Timestamp('20160101')])) From a046ddee5cc533a233d6819c63f172db367a80cb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 16:31:17 -0400 Subject: [PATCH 07/10] doc fixes --- doc/source/whatsnew/v0.20.0.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 26a67014b73e2..531a919baac9d 100644 --- 
a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -598,7 +598,8 @@ ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``. pd.unique will now be consistent with extension types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware datatypes would yield different return types. These are now consistent to return the extension type. +In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware +datatypes would yield different return types. These are now made consistent. - Datetime tz-aware @@ -628,13 +629,13 @@ In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Cate .. ipython:: python - # Series + # Series, returns an array of Timestamp tz-aware Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), pd.Timestamp('20160101', tz='US/Eastern')])).unique() pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), pd.Timestamp('20160101', tz='US/Eastern')]))) - # Index + # Index, returns a DatetimeIndex pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), pd.Timestamp('20160101', tz='US/Eastern')]) pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), @@ -658,6 +659,7 @@ In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Cate .. ipython:: python + # returns a Categorical pd.Series(pd.Categorical(list('aabc'))).unique() pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) From 220a86b6515ed72ecca41bb823d87c77fcce49ac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 18:24:56 -0400 Subject: [PATCH 08/10] more doc fixes --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 531a919baac9d..7a61a768bcbc6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -599,7 +599,7 @@ pd.unique will now be consistent with extension types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In prior versions, using ``Series.unique()`` and ``pd.unique(Series)`` on ``Categorical`` and tz-aware -datatypes would yield different return types. These are now made consistent. +datatypes would yield different return types. These are now made consistent. (:issue:`15903`) - Datetime tz-aware From 7d05b50e42b27593be66c38b93bc6c9f53b872da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 9 Apr 2017 09:39:32 -0400 Subject: [PATCH 09/10] fix docs as per review .unique of Categorical/Series now returns Categorical --- doc/source/whatsnew/v0.20.0.txt | 36 +++++++++++------------ pandas/core/algorithms.py | 51 ++++++++++++++++++++++----------- pandas/core/base.py | 19 +++++++++--- pandas/core/categorical.py | 27 +++++++++++++++++ pandas/tests/test_algos.py | 43 ++++++++++++++++++++------- 5 files changed, 128 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7a61a768bcbc6..4c0594c024774 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -608,12 +608,12 @@ datatypes would yield different return types. These are now made consistent. (:i .. 
code-block:: ipython # Series - In [5]: Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])).unique() + In [5]: pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]).unique() Out[5]: array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], dtype=object) - In [6]: pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]))) + In [6]: pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])) Out[6]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') # Index @@ -621,8 +621,8 @@ datatypes would yield different return types. These are now made consistent. (:i pd.Timestamp('20160101', tz='US/Eastern')]).unique() Out[7]: DatetimeIndex(['2016-01-01 00:00:00-05:00'], dtype='datetime64[ns, US/Eastern]', freq=None) - In [8]: pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])) + In [8]: pd.unique([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]) Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') New Behavior: @@ -630,14 +630,14 @@ datatypes would yield different return types. These are now made consistent. (:i .. ipython:: python # Series, returns an array of Timestamp tz-aware - Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')])).unique() - pd.unique(Series(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]))) + pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')]).unique() + pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + pd.Timestamp('20160101', tz='US/Eastern')])) # Index, returns a DatetimeIndex pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - pd.Timestamp('20160101', tz='US/Eastern')]) + pd.Timestamp('20160101', tz='US/Eastern')]).unique() pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), pd.Timestamp('20160101', tz='US/Eastern')])) @@ -647,21 +647,21 @@ datatypes would yield different return types. These are now made consistent. (:i .. code-block:: ipython - In [1]: pd.Series(pd.Categorical(list('aabc'))).unique() + In [1]: pd.Series(pd.Categorical(list('baabc'))).unique() Out[1]: - [a, b, c] - Categories (3, object): [a, b, c] + [b, a, c] + Categories (3, object): [b, a, c] - In [2]: pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) - Out[2]: array(['a', 'b', 'c'], dtype=object) + In [2]: pd.unique(pd.Series(pd.Categorical(list('baabc')))) + Out[2]: array(['b', 'a', 'c'], dtype=object) New Behavior: .. ipython:: python # returns a Categorical - pd.Series(pd.Categorical(list('aabc'))).unique() - pd.unique(pd.Series(pd.Categorical(list('aabc'))).unique()) + pd.Series(pd.Categorical(list('baabc'))).unique() + pd.unique(pd.Series(pd.Categorical(list('baabc'))).unique()) .. _whatsnew_0200.api_breaking.s3: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9d0630cde9edd..c39d0216e9073 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -269,9 +269,11 @@ def match(to_match, values, na_sentinel=-1): def unique(values): """ - Hash table-based unique. uniques are returned in order + Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. + Significantly faster than numpy.unique. 
Includes NA values. + Parameters ---------- values : 1d array-like @@ -279,9 +281,9 @@ def unique(values): Returns ------- unique values. - - If the input is a Categorical dtype, the return is a Categorical - - If the input is an Index, the return is an Index - - If the input is a Series/ndarray, the return will be an ndarray + - If the input is an Index, the return is an Index + - If the input is a Categorical dtype, the return is a Categorical + - If the input is a Series/ndarray, the return will be an ndarray Examples -------- @@ -305,26 +307,43 @@ def unique(values): DatetimeIndex(['2016-01-01 00:00:00-05:00'], ... dtype='datetime64[ns, US/Eastern]', freq=None) - >>> pd.unique(list('aabc')) - array(['a', 'b', 'c'], dtype=object) + >>> pd.unique(list('baabc')) + array(['b', 'a', 'c'], dtype=object) + + An unordered Categorical will return categories in the + order of appearance. + + >>> pd.unique(Series(pd.Categorical(list('baabc')))) + [b, a, c] + Categories (3, object): [b, a, c] + + >>> pd.unique(Series(pd.Categorical(list('baabc'), + ... categories=list('abc')))) + [b, a, c] + Categories (3, object): [b, a, c] - >>> pd.unique(Series(pd.Categorical(list('aabc')))) - 0 a - 1 b - 2 c - dtype: category - Categories (3, object): [a, b, c] + An ordered Categorical preserves the category ordering. + + >>> pd.unique(Series(pd.Categorical(list('baabc'), + ... categories=list('abc'), + ... ordered=True))) + [b, a, c] + Categories (3, object): [a < b < c] + + See Also + -------- + pd.Index.unique + pd.Series.unique """ values = _ensure_arraylike(values) # categorical is a fast-path + # this will coerce Categorical, CategoricalIndex, + # and category dtypes Series to same return of Category if is_categorical_dtype(values): - - if isinstance(values, ABCSeries): - from pandas import Series - return Series(values.values.unique(), name=values.name) + values = getattr(values, '.values', values) return values.unique() original = values diff --git a/pandas/core/base.py b/pandas/core/base.py index b723c066fe764..d0680d939ba3f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -855,13 +855,24 @@ def value_counts(self, normalize=False, sort=True, ascending=False, _shared_docs['unique'] = ( """ - Return %(unique)s of unique values in the object. - Significantly faster than numpy.unique. Includes NA values. - The order of the original is preserved. + Hash table-based unique. Uniques are returned in order + of appearance. This does NOT sort. + + Parameters + ---------- + values : 1d array-like Returns ------- - uniques : %(unique)s + unique values. + - If the input is an Index, the return is an Index + - If the input is a Categorical dtype, the return is a Categorical + - If the input is a Series/ndarray, the return will be an ndarray + + See Also + -------- + pd.unique + pd.Categorical.unique """) @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0fcf8664e755d..b7b9bd2ec4fca 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1895,6 +1895,33 @@ def unique(self): Returns ------- unique values : ``Categorical`` + + Examples + -------- + An unordered Categorical will return categories in the + order of appearance. + + >>> pd.Categorical(list('baabc')) + [b, a, c] + Categories (3, object): [b, a, c] + + >>> pd.Categorical(list('baabc'), categories=list('abc')) + [b, a, c] + Categories (3, object): [b, a, c] + + An ordered Categorical preserves the category ordering. 
+ + >>> pd.Categorical(list('baabc'), + ... categories=list('abc'), + ... ordered=True) + [b, a, c] + Categories (3, object): [a < b < c] + + See Also + -------- + pd.unique + pd.CategoricalIndex.unique + """ # unlike np.unique, unique1d does not sort diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index abd097cc1e30d..d9f81968c684d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -386,23 +386,46 @@ def test_uint64_overflow(self): tm.assert_numpy_array_equal(algos.unique(s), exp) def test_categorical(self): + + # we are expecting to return in the order + # of appearance + expected = pd.Categorical(list('bac'), + categories=list('bac')) + + # we are expecting to return in the order + # of the categories + expected_o = pd.Categorical(list('bac'), + categories=list('abc'), + ordered=True) + # GH 15939 - c = pd.Categorical(list('aabc')) + c = pd.Categorical(list('baabc')) result = c.unique() - expected = pd.Categorical(list('abc')) tm.assert_categorical_equal(result, expected) result = algos.unique(c) tm.assert_categorical_equal(result, expected) - result = algos.unique(Series(c, name='foo')) - expected = Series(expected, name='foo') - tm.assert_series_equal(result, expected) + c = pd.Categorical(list('baabc'), ordered=True) + result = c.unique() + tm.assert_categorical_equal(result, expected_o) + + result = algos.unique(c) + tm.assert_categorical_equal(result, expected_o) - # CI - ci = pd.CategoricalIndex(pd.Categorical(list('aabc'))) + # Series of categorical dtype + s = Series(pd.Categorical(list('baabc')), name='foo') + result = s.unique() + tm.assert_categorical_equal(result, expected) + + result = pd.unique(s) + tm.assert_categorical_equal(result, expected) + + # CI -> return CI + ci = pd.CategoricalIndex(pd.Categorical(list('baabc'), + categories=list('bac'))) + expected = pd.CategoricalIndex(expected) result = ci.unique() - expected = pd.CategoricalIndex(pd.Categorical(list('abc'))) tm.assert_index_equal(result, expected) result = pd.unique(ci) @@ -468,8 +491,8 @@ def test_order_of_appearance(self): tm.assert_numpy_array_equal(result, expected) result = pd.unique(Series(pd.Categorical(list('aabc')))) - expected = Series(pd.Categorical(list('abc'))) - tm.assert_series_equal(result, expected) + expected = pd.Categorical(list('abc')) + tm.assert_categorical_equal(result, expected) class TestIsin(tm.TestCase): From dc81286bf0b3510d4fe8f87522402951c98d1db2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 9 Apr 2017 17:27:04 +0200 Subject: [PATCH 10/10] small doc fixes --- pandas/core/algorithms.py | 4 ++-- pandas/core/base.py | 8 ++++---- pandas/core/categorical.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c39d0216e9073..654e38e43b6c0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -332,8 +332,8 @@ def unique(values): See Also -------- - pd.Index.unique - pd.Series.unique + pandas.Index.unique + pandas.Series.unique """ diff --git a/pandas/core/base.py b/pandas/core/base.py index d0680d939ba3f..56bdeee6982d5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -855,8 +855,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, _shared_docs['unique'] = ( """ - Hash table-based unique. Uniques are returned in order - of appearance. This does NOT sort. + Return unique values in the object. Uniques are returned in order + of appearance, this does NOT sort. Hash table-based unique. 
Parameters ---------- @@ -871,8 +871,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, See Also -------- - pd.unique - pd.Categorical.unique + pandas.unique + pandas.Categorical.unique """) @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b7b9bd2ec4fca..e3d6792604c4c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1919,8 +1919,8 @@ def unique(self): See Also -------- - pd.unique - pd.CategoricalIndex.unique + pandas.unique + pandas.CategoricalIndex.unique """
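
Taken together, these patches make the return type of ``pd.unique`` depend only on the input container and dtype. The sketch below is illustrative only (it is not part of the patch set) and assumes a pandas build that already contains this series, i.e. 0.20.0 or later; the assertions restate the doc-examples and the cases exercised in ``pandas/tests/test_algos.py``, and names like ``u``, ``ci``, ``idx`` and ``arr`` are just local illustration variables.

    import pandas as pd

    # uniques come back in order of appearance; nothing is sorted
    assert list(pd.unique(pd.Series([2, 1, 3, 3]))) == [2, 1, 3]
    assert list(pd.unique(list('baabc'))) == ['b', 'a', 'c']

    # category-dtype Series -> Categorical, also in order of appearance
    u = pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    assert isinstance(u, pd.Categorical) and list(u) == ['b', 'a', 'c']

    # an ordered Categorical keeps its category ordering in the result
    u = pd.unique(pd.Categorical(list('baabc'), ordered=True))
    assert u.ordered and list(u.categories) == ['a', 'b', 'c']

    # Index in, Index out: a CategoricalIndex stays a CategoricalIndex
    ci = pd.CategoricalIndex(pd.Categorical(list('baabc')))
    assert isinstance(pd.unique(ci), pd.CategoricalIndex)

    # tz-aware datetimes: a Series gives an object ndarray of tz-aware
    # Timestamps, while an Index gives a DatetimeIndex that keeps the tz
    idx = pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
                    pd.Timestamp('20160101', tz='US/Eastern')])
    arr = pd.unique(pd.Series(idx))
    assert arr.dtype == object
    assert arr[0] == pd.Timestamp('20160101', tz='US/Eastern')
    assert isinstance(pd.unique(idx), pd.DatetimeIndex)

The object-ndarray return for a tz-aware Series (rather than a tz-aware array type) matches the TODO notes left in ``series.py`` and ``algorithms.py``: it is a stopgap until a ``DatetimeArray`` with tz exists.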