From a020c1081f86948a3f9332ad05ab35d8b6bf46d3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 19 Mar 2017 17:58:08 -0700 Subject: [PATCH] PERF: Improve drop_duplicates for bool columns (#12963) Add whatsnew Add dtype label and reorg logic --- asv_bench/benchmarks/reindex.py | 5 +++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 6fe6c32a96df9..537d275e7c727 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -132,6 +132,9 @@ def setup(self): self.K = 10000 self.key1 = np.random.randint(0, self.K, size=self.N) self.df_int = DataFrame({'key1': self.key1}) + self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K, + dtype=bool) + for i in range(10)}) def time_frame_drop_dups(self): self.df.drop_duplicates(['key1', 'key2']) @@ -154,6 +157,8 @@ def time_series_drop_dups_string(self): def time_frame_drop_dups_int(self): self.df_int.drop_duplicates() + def time_frame_drop_dups_bool(self): + self.df_bool.drop_duplicates() #---------------------------------------------------------------------- # blog "pandas escaped the zoo" diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 680aefc4041fb..02e80dd77aa0a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -788,6 +788,7 @@ Performance Improvements - Improved performance of ``.rank()`` for categorical data (:issue:`15498`) - Improved performance when using ``.unstack()`` (:issue:`15503`) - Improved performance of merge/join on ``category`` columns (:issue:`10409`) +- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`) .. _whatsnew_0200.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6937675603c10..b6f496f417a74 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,6 +19,7 @@ is_period_dtype, is_period_arraylike, is_float_dtype, + is_bool_dtype, needs_i8_conversion, is_categorical, is_datetime64_dtype, @@ -341,6 +342,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # numpy dtype dtype = values.dtype vals = values.view(np.int64) + elif is_bool_dtype(values): + dtype = bool + # transform to int dtype to avoid object path + vals = np.asarray(values).view('uint8') else: vals = np.asarray(values)