From 04dcc725a207035fffc03c6815d734e8504f6b32 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 30 Dec 2017 23:02:28 -0800 Subject: [PATCH 1/4] CLN: ASV indexing --- asv_bench/benchmarks/indexing.py | 275 +++++++++++++++---------------- 1 file changed, 137 insertions(+), 138 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 5b12f6ea89614..6d2a8d55846c9 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,71 +1,90 @@ -from .pandas_vb_common import * +import string +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, + IntervalIndex, IndexSlice) +from .pandas_vb_common import setup, Panel # noqa + + +class NumericSeriesIndexing(object): -class Int64Indexing(object): goal_time = 0.2 + params = [Int64Index, Float64Index] + param = ['index'] - def setup(self): - self.s = Series(np.random.rand(1000000)) + def setup(self, index): + N = 10**6 + idx = index(range(N)) + self.data = Series(np.random.rand(N), index=idx) + self.array = np.arange(10000) + self.array_list = self.array.tolist() - def time_getitem_scalar(self): - self.s[800000] + def time_getitem_scalar(self, index): + self.data[800000] - def time_getitem_slice(self): - self.s[:800000] + def time_getitem_slice(self, index): + self.data[:800000] + + def time_getitem_list_like(self, index): + self.data[[800000]] - def time_getitem_list_like(self): - self.s[[800000]] + def time_getitem_array(self, index): + self.data[self.array] - def time_getitem_array(self): - self.s[np.arange(10000)] + def time_getitem_lists(self, index): + self.data[self.array_list] - def time_getitem_lists(self): - self.s[np.arange(10000).tolist()] + def time_iloc_array(self, index): + self.data.iloc[self.array] - def time_iloc_array(self): - self.s.iloc[np.arange(10000)] + def time_iloc_list_like(self, index): + self.data.iloc[[800000]] - def time_iloc_list_like(self): - self.s.iloc[[800000]] + def time_iloc_scalar(self, index): + self.data.iloc[800000] - def time_iloc_scalar(self): - self.s.iloc[800000] + def time_iloc_slice(self, index): + self.data.iloc[:800000] - def time_iloc_slice(self): - self.s.iloc[:800000] + def time_ix_array(self, index): + self.data.ix[self.array] - def time_ix_array(self): - self.s.ix[np.arange(10000)] + def time_ix_list_like(self, index): + self.data.ix[[800000]] - def time_ix_list_like(self): - self.s.ix[[800000]] + def time_ix_scalar(self, index): + self.data.ix[800000] - def time_ix_scalar(self): - self.s.ix[800000] + def time_ix_slice(self, index): + self.data.ix[:800000] - def time_ix_slice(self): - self.s.ix[:800000] + def time_loc_array(self, index): + self.data.loc[self.array] - def time_loc_array(self): - self.s.loc[np.arange(10000)] + def time_loc_list_like(self, index): + self.data.loc[[800000]] - def time_loc_list_like(self): - self.s.loc[[800000]] + def time_loc_scalar(self, index): + self.data.loc[800000] - def time_loc_scalar(self): - self.s.loc[800000] + def time_loc_slice(self, index): + self.data.loc[:800000] - def time_loc_slice(self): - self.s.loc[:800000] +class NonNumericSeriesIndexing(object): -class StringIndexing(object): goal_time = 0.2 + params = ['string', 'datetime'] + param_names = ['index'] - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - self.lbl = self.s.index[800000] + def setup(self, index): + N = 10**6 + indexes = {'string': tm.makeStringIndex(N), + 
'datetime': tm.makeTimeSeries(N)} + index = indexes[index] + self.s = Series(np.random.rand(N), index=index) + self.lbl = index[800000] def time_getitem_label_slice(self): self.s[:self.lbl] @@ -76,41 +95,33 @@ def time_getitem_pos_slice(self): def time_get_value(self): self.s.get_value(self.lbl) - -class DatetimeIndexing(object): - goal_time = 0.2 - - def setup(self): - tm.N = 1000 - self.ts = tm.makeTimeSeries() - self.dt = self.ts.index[500] - - def time_getitem_scalar(self): - self.ts[self.dt] + def time_getitem_scalar(self, index): + self.s[self.lbl] class DataFrameIndexing(object): + goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, - columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] + index = tm.makeStringIndex(1000) + columns = tm.makeStringIndex(30) + self.df = DataFrame(np.random.randn(1000, 30), index=index, + columns=columns) + self.idx = index[100] + self.col = columns[10] self.df2 = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df2['B'] > 0) + self.indexer = self.df2['B'] > 0 self.obj_indexer = self.indexer.astype('O') - # duptes - self.idx_dupe = (np.array(range(30)) * 99) - self.df3 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000),}) - self.df3 = concat([self.df3, (2 * self.df3), (3 * self.df3)]) + # dupes + self.idx_dupe = np.array(range(30)) * 99 + self.df3 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) + self.df3 = concat([self.df3, 2 * self.df3, 3 * self.df3]) - self.df_big = DataFrame(dict(A=(['foo'] * 1000000))) + self.df_big = DataFrame(dict(A=['foo'] * 1000000)) def time_get_value(self): self.df.get_value(self.idx, self.col) @@ -142,13 +153,14 @@ class IndexingMethods(object): goal_time = 0.2 def setup(self): - a = np.arange(100000) - self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + N = 100000 + a = np.arange(N) + self.ind = Float64Index(a * 4.8000000418824129e-08) - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), - index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) + self.s = Series(np.random.rand(N)) + self.ts = Series(np.random.rand(N), + index=date_range('2011-01-01', freq='S', periods=N)) + self.indexer = [True, False, True, True, False] * 20000 def time_get_loc_float(self): self.ind.get_loc(0) @@ -161,38 +173,32 @@ def time_take_intindex(self): class MultiIndexing(object): + goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) + self.mi = MultiIndex.from_product([range(1000), range(1000)]) self.s = Series(np.random.randn(1000000), index=self.mi) self.df = DataFrame(self.s) # slicers - np.random.seed(1234) - self.idx = pd.IndexSlice - self.n = 100000 - self.mdt = pandas.DataFrame() - self.mdt['A'] = np.random.choice(range(10000, 45000, 1000), self.n) - self.mdt['B'] = np.random.choice(range(10, 400), self.n) - self.mdt['C'] = np.random.choice(range(1, 150), self.n) - self.mdt['D'] = np.random.choice(range(10000, 45000), self.n) - self.mdt['x'] = np.random.choice(range(400), self.n) - self.mdt['y'] = np.random.choice(range(25), self.n) - self.test_A = 25000 - self.test_B = 25 - self.test_C = 40 - self.test_D = 35000 - self.eps_A = 5000 - self.eps_B = 5 - self.eps_C = 5 - self.eps_D = 5000 + n = 100000 + self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), + n), + 
'B': np.random.choice(range(10, 400), n), + 'C': np.random.choice(range(1, 150), n), + 'D': np.random.choice(range(10000, 45000), n), + 'x': np.random.choice(range(400), n), + 'y': np.random.choice(range(25), n)}) + self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - self.miint = MultiIndex.from_product( - [np.arange(1000), - np.arange(1000)], names=['one', 'two']) - - import string + self.miint = MultiIndex.from_product([np.arange(1000), + np.arange(1000)], + names=['one', 'two']) + self.obj_index = np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object) self.mi_large = MultiIndex.from_product( [np.arange(1000), np.arange(20), list(string.ascii_letters)], @@ -204,11 +210,10 @@ def setup(self): [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three']) - rng = np.random.RandomState(4) - size = 1 << 16 + size = 65536 self.mi_unused_levels = pd.MultiIndex.from_arrays([ - rng.randint(0, 1 << 13, size), - rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1] + rng.randint(0, 8192, size), + rng.randint(0, 1024, size)])[rng.random.rand(size) < 0.1] def time_series_xs_mi_ix(self): self.s.ix[999] @@ -217,18 +222,10 @@ def time_frame_xs_mi_ix(self): self.df.ix[999] def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[ - (self.test_A - self.eps_A):(self.test_A + self.eps_A), - (self.test_B - self.eps_B):(self.test_B + self.eps_B), - (self.test_C - self.eps_C):(self.test_C + self.eps_C), - (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + self.mdt2.loc[self.idx, :] def time_multiindex_get_indexer(self): - self.miint.get_indexer( - np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object)) + self.miint.get_indexer(self.obj_index) def time_multiindex_large_get_loc(self): self.mi_large.get_loc((999, 19, 'Z')) @@ -259,26 +256,29 @@ def time_remove_unused_levels(self): class IntervalIndexing(object): + goal_time = 0.2 - def setup(self): - self.monotonic = Series(np.arange(1000000), - index=IntervalIndex.from_breaks(np.arange(1000001))) + def setup_cache(self): + idx = IntervalIndex.from_breaks(np.arange(1000001)) + monotonic = Series(np.arange(1000000), index=idx) + return monotonic - def time_getitem_scalar(self): - self.monotonic[80000] + def time_getitem_scalar(self, monotonic): + monotonic[80000] - def time_loc_scalar(self): - self.monotonic.loc[80000] + def time_loc_scalar(self, monotonic): + monotonic.loc[80000] - def time_getitem_list(self): - self.monotonic[80000:] + def time_getitem_list(self, monotonic): + monotonic[80000:] - def time_loc_list(self): - self.monotonic.loc[80000:] + def time_loc_list(self, monotonic): + monotonic.loc[80000:] class PanelIndexing(object): + goal_time = 0.2 def setup(self): @@ -289,20 +289,22 @@ def time_subset(self): self.p.ix[(self.inds, self.inds, self.inds)] -class IndexerLookup(object): +class MethodLookup(object): + goal_time = 0.2 - def setup(self): - self.s = Series(range(10)) + def setup_cache(self): + s = Series() + return s - def time_lookup_iloc(self): - self.s.iloc + def time_lookup_iloc(self, s): + s.iloc - def time_lookup_ix(self): - self.s.ix + def time_lookup_ix(self, s): + s.ix - def time_lookup_loc(self): - self.s.loc + def time_lookup_loc(self, s): + s.loc class BooleanRowSelect(object): @@ -311,7 +313,6 @@ class BooleanRowSelect(object): def setup(self): N = 10000 - np.random.seed(1234) self.df = 
DataFrame(np.random.randn(N, 100)) self.bool_arr = np.zeros(N, dtype=bool) self.bool_arr[:1000] = True @@ -325,15 +326,14 @@ class GetItemSingleColumn(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) - self.df2 = DataFrame(np.random.randn(3000, 1), columns=['A']) - self.df3 = DataFrame(np.random.randn(3000, 1)) + self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df_int_col = DataFrame(np.random.randn(3000, 1)) def time_frame_getitem_single_column_label(self): - self.df2['A'] + self.df_string_col['A'] def time_frame_getitem_single_column_int(self): - self.df3[0] + self.df_int_col[0] class AssignTimeseriesIndex(object): @@ -342,7 +342,6 @@ class AssignTimeseriesIndex(object): def setup(self): N = 100000 - np.random.seed(1234) dx = date_range('1/1/2000', periods=N, freq='H') self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) @@ -356,7 +355,7 @@ class InsertColumns(object): def setup(self): self.N = 10**3 - self.df = DataFrame(index=range(N)) + self.df = DataFrame(index=range(self.N)) def time_insert(self): np.random.seed(1234) From 82315d5b97aee4277833c7e9cc8454998a9411f2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 31 Dec 2017 21:43:21 -0800 Subject: [PATCH 2/4] CLN: ASV indexing --- asv_bench/benchmarks/index_object.py | 90 ++++++++++++- asv_bench/benchmarks/indexing.py | 189 +++++++++------------------ ci/lint.sh | 2 +- 3 files changed, 152 insertions(+), 129 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index d73b216478ad5..6108d4b9c0deb 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,7 +1,9 @@ +import string + import numpy as np import pandas.util.testing as tm from pandas import (Series, date_range, DatetimeIndex, Index, MultiIndex, - RangeIndex) + RangeIndex, Float64Index) from .pandas_vb_common import setup # noqa @@ -222,3 +224,89 @@ def time_slice(self, dtype): def time_slice_step(self, dtype): self.idx[::2] + + +class Float64IndexMethod(object): + # GH 13166 + goal_time = 0.2 + + def setup(self): + N = 100000 + a = np.arange(N) + self.ind = Float64Index(a * 4.8000000418824129e-08) + + def time_get_loc(self): + self.ind.get_loc(0) + + +class MultiIndexGet(object): + + goal_time = 0.2 + + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], + names=['one', 'two', 'three']) + + def time_multiindex_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_multiindex_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_multiindex_med_get_loc(self): + self.mi_med.get_loc((999, 9, 'A')) + + def time_multiindex_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + + def time_multiindex_string_get_loc(self): + self.mi_small.get_loc((99, 'A', 'A')) + + def time_multiindex_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + + +class MultiIndexDuplicates(object): + + goal_time = 0.2 + + def setup(self): + size = 65536 + arrays = [np.random.randint(0, 8192, size), + np.random.randint(0, 1024, size)] + mask = np.random.rand(size) < 0.1 + self.mi_unused_levels = 
MultiIndex.from_arrays(arrays) + self.mi_unused_levels = self.mi_unused_levels[mask] + + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + + +class MultiIndexInteger(object): + + goal_time = 0.2 + + def setup(self): + self.mi_int = MultiIndex.from_product([np.arange(1000), + np.arange(1000)], + names=['one', 'two']) + self.obj_index = np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object) + + def time_get_indexer(self): + self.mi_int.get_indexer(self.obj_index) + + def time_is_monotonic(self): + self.mi_int.is_monotonic diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 6d2a8d55846c9..b35f00db2b054 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,9 +1,7 @@ -import string - import numpy as np import pandas.util.testing as tm from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, - IntervalIndex, IndexSlice) + IntervalIndex, IndexSlice, concat, date_range) from .pandas_vb_common import setup, Panel # noqa @@ -79,27 +77,27 @@ class NonNumericSeriesIndexing(object): param_names = ['index'] def setup(self, index): - N = 10**6 + N = 10**5 indexes = {'string': tm.makeStringIndex(N), - 'datetime': tm.makeTimeSeries(N)} + 'datetime': date_range('1900', periods=N, freq='s')} index = indexes[index] self.s = Series(np.random.rand(N), index=index) - self.lbl = index[800000] + self.lbl = index[80000] - def time_getitem_label_slice(self): + def time_getitem_label_slice(self, index): self.s[:self.lbl] - def time_getitem_pos_slice(self): - self.s[:800000] + def time_getitem_pos_slice(self, index): + self.s[:80000] - def time_get_value(self): + def time_get_value(self, index): self.s.get_value(self.lbl) def time_getitem_scalar(self, index): self.s[self.lbl] -class DataFrameIndexing(object): +class DataFrameStringIndexing(object): goal_time = 0.2 @@ -108,67 +106,71 @@ def setup(self): columns = tm.makeStringIndex(30) self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) - self.idx = index[100] - self.col = columns[10] - - self.df2 = DataFrame(np.random.randn(10000, 4), - columns=['A', 'B', 'C', 'D']) - self.indexer = self.df2['B'] > 0 - self.obj_indexer = self.indexer.astype('O') - - # dupes - self.idx_dupe = np.array(range(30)) * 99 - self.df3 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) - self.df3 = concat([self.df3, 2 * self.df3, 3 * self.df3]) - - self.df_big = DataFrame(dict(A=['foo'] * 1000000)) + self.idx_scalar = index[100] + self.col_scalar = columns[10] + self.bool_indexer = self.df[self.col_scalar] > 0 + self.bool_obj_indexer = self.bool_indexer.astype(object) def time_get_value(self): - self.df.get_value(self.idx, self.col) + self.df.get_value(self.idx_scalar, self.col_scalar) + + def time_ix(self): + self.df.ix[self.idx_scalar, self.col_scalar] - def time_get_value_ix(self): - self.df.ix[(self.idx, self.col)] + def time_loc(self): + self.df.loc[self.idx_scalar, self.col_scalar] def time_getitem_scalar(self): - self.df[self.col][self.idx] + self.df[self.col_scalar][self.idx_scalar] def time_boolean_rows(self): - self.df2[self.indexer] + self.df[self.bool_indexer] def time_boolean_rows_object(self): - self.df2[self.obj_indexer] + self.df[self.bool_obj_indexer] + + +class DataFrameNumericIndexing(object): + + goal_time = 0.2 + + def setup(self): + self.idx_dupe = np.array(range(30)) * 99 + self.df = DataFrame(np.random.randn(10000, 5)) + self.df_dup = concat([self.df, 2 * 
self.df, 3 * self.df]) + self.bool_indexer = [True] * 5000 + [False] * 5000 def time_iloc_dups(self): - self.df3.iloc[self.idx_dupe] + self.df_dup.iloc[self.idx_dupe] def time_loc_dups(self): - self.df3.loc[self.idx_dupe] + self.df_dup.loc[self.idx_dupe] - def time_iloc_big(self): - self.df_big.iloc[:100, 0] + def time_iloc(self): + self.df.iloc[:100, 0] + def time_loc(self): + self.df.loc[:100, 0] -class IndexingMethods(object): - # GH 13166 - goal_time = 0.2 + def time_bool_indexer(self): + self.df[self.bool_indexer] - def setup(self): - N = 100000 - a = np.arange(N) - self.ind = Float64Index(a * 4.8000000418824129e-08) - self.s = Series(np.random.rand(N)) - self.ts = Series(np.random.rand(N), - index=date_range('2011-01-01', freq='S', periods=N)) - self.indexer = [True, False, True, True, False] * 20000 +class Take(object): - def time_get_loc_float(self): - self.ind.get_loc(0) + goal_time = 0.2 + params = ['int', 'datetime'] + param_names = ['index'] - def time_take_dtindex(self): - self.ts.take(self.indexer) + def setup(self, index): + N = 100000 + indexes = {'int': Int64Index(np.arange(N)), + 'datetime': date_range('2011-01-01', freq='S', periods=N)} + index = indexes[index] + self.s = Series(np.random.rand(N), index=index) + self.indexer = [True, False, True, True, False] * 20000 - def time_take_intindex(self): + def time_take(self, index): self.s.take(self.indexer) @@ -177,11 +179,10 @@ class MultiIndexing(object): goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_product([range(1000), range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) + mi = MultiIndex.from_product([range(1000), range(1000)]) + self.s = Series(np.random.randn(1000000), index=mi) self.df = DataFrame(self.s) - # slicers n = 100000 self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), n), @@ -191,68 +192,16 @@ def setup(self): 'x': np.random.choice(range(400), n), 'y': np.random.choice(range(25), n)}) self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] - self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - self.miint = MultiIndex.from_product([np.arange(1000), - np.arange(1000)], - names=['one', 'two']) - self.obj_index = np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object) - - self.mi_large = MultiIndex.from_product( - [np.arange(1000), np.arange(20), list(string.ascii_letters)], - names=['one', 'two', 'three']) - self.mi_med = MultiIndex.from_product( - [np.arange(1000), np.arange(10), list('A')], - names=['one', 'two', 'three']) - self.mi_small = MultiIndex.from_product( - [np.arange(100), list('A'), list('A')], - names=['one', 'two', 'three']) - - size = 65536 - self.mi_unused_levels = pd.MultiIndex.from_arrays([ - rng.randint(0, 8192, size), - rng.randint(0, 1024, size)])[rng.random.rand(size) < 0.1] - - def time_series_xs_mi_ix(self): + self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index() + + def time_series_ix(self): self.s.ix[999] - def time_frame_xs_mi_ix(self): + def time_frame_ix(self): self.df.ix[999] - def time_multiindex_slicers(self): - self.mdt2.loc[self.idx, :] - - def time_multiindex_get_indexer(self): - self.miint.get_indexer(self.obj_index) - - def time_multiindex_large_get_loc(self): - self.mi_large.get_loc((999, 19, 'Z')) - - def time_multiindex_large_get_loc_warm(self): - for _ in range(1000): - self.mi_large.get_loc((999, 19, 'Z')) - - def time_multiindex_med_get_loc(self): - self.mi_med.get_loc((999, 9, 'A')) - - def 
time_multiindex_med_get_loc_warm(self): - for _ in range(1000): - self.mi_med.get_loc((999, 9, 'A')) - - def time_multiindex_string_get_loc(self): - self.mi_small.get_loc((99, 'A', 'A')) - - def time_multiindex_small_get_loc_warm(self): - for _ in range(1000): - self.mi_small.get_loc((99, 'A', 'A')) - - def time_is_monotonic(self): - self.miint.is_monotonic - - def time_remove_unused_levels(self): - self.mi_unused_levels.remove_unused_levels() + def time_index_slice(self): + self.mdt.loc[self.idx, :] class IntervalIndexing(object): @@ -307,20 +256,6 @@ def time_lookup_loc(self, s): s.loc -class BooleanRowSelect(object): - - goal_time = 0.2 - - def setup(self): - N = 10000 - self.df = DataFrame(np.random.randn(N, 100)) - self.bool_arr = np.zeros(N, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - - class GetItemSingleColumn(object): goal_time = 0.2 @@ -342,7 +277,7 @@ class AssignTimeseriesIndex(object): def setup(self): N = 100000 - dx = date_range('1/1/2000', periods=N, freq='H') + idx = date_range('1/1/2000', periods=N, freq='H') self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) def time_frame_assign_timeseries_index(self): diff --git a/ci/lint.sh b/ci/lint.sh index 5380c91831cec..b4eafcaf28e39 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -24,7 +24,7 @@ if [ "$LINT" ]; then echo "Linting setup.py DONE" echo "Linting asv_bench/benchmarks/" - flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/[ips]*.py --ignore=F811 + flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/[ps]*.py --ignore=F811 if [ $? -ne "0" ]; then RET=1 fi From 361a91b19e91685cc8f08f356d847d5f1f363343 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 2 Jan 2018 20:26:45 -0800 Subject: [PATCH 3/4] Create new multiindex_object.py --- asv_bench/benchmarks/index_object.py | 139 +-------------------- asv_bench/benchmarks/multiindex_object.py | 140 ++++++++++++++++++++++ 2 files changed, 142 insertions(+), 137 deletions(-) create mode 100644 asv_bench/benchmarks/multiindex_object.py diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 6108d4b9c0deb..970760373632a 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,9 +1,7 @@ -import string - import numpy as np import pandas.util.testing as tm -from pandas import (Series, date_range, DatetimeIndex, Index, MultiIndex, - RangeIndex, Float64Index) +from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, + Float64Index) from .pandas_vb_common import setup # noqa @@ -86,66 +84,6 @@ def time_modulo(self, dtype): self.index % 2 -class Duplicated(object): - - goal_time = 0.2 - - def setup(self): - n, k = 200, 5000 - levels = [np.arange(n), - tm.makeStringIndex(n).values, - 1000 + np.arange(n)] - labels = [np.random.choice(n, (k * n)) for lev in levels] - self.mi = MultiIndex(levels=levels, labels=labels) - - def time_duplicated(self): - self.mi.duplicated() - - -class Sortlevel(object): - - goal_time = 0.2 - - def setup(self): - n = 1182720 - low, high = -4096, 4096 - arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) - for k in [11, 7, 5, 3, 1]] - self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] - - a = np.repeat(np.arange(100), 1000) - b = np.tile(np.arange(1000), 100) - self.mi = MultiIndex.from_arrays([a, b]) - self.mi = self.mi.take(np.random.permutation(np.arange(100000))) - - def time_sortlevel_int64(self): - self.mi_int.sortlevel() - - def 
time_sortlevel_zero(self): - self.mi.sortlevel(0) - - def time_sortlevel_one(self): - self.mi.sortlevel(1) - - -class MultiIndexValues(object): - - goal_time = 0.2 - - def setup_cache(self): - - level1 = range(1000) - level2 = date_range(start='1/1/2012', periods=100) - mi = MultiIndex.from_product([level1, level2]) - return mi - - def time_datetime_level_values_copy(self, mi): - mi.copy().values - - def time_datetime_level_values_sliced(self, mi): - mi[:10].values - - class Range(object): goal_time = 0.2 @@ -237,76 +175,3 @@ def setup(self): def time_get_loc(self): self.ind.get_loc(0) - - -class MultiIndexGet(object): - - goal_time = 0.2 - - def setup(self): - self.mi_large = MultiIndex.from_product( - [np.arange(1000), np.arange(20), list(string.ascii_letters)], - names=['one', 'two', 'three']) - self.mi_med = MultiIndex.from_product( - [np.arange(1000), np.arange(10), list('A')], - names=['one', 'two', 'three']) - self.mi_small = MultiIndex.from_product( - [np.arange(100), list('A'), list('A')], - names=['one', 'two', 'three']) - - def time_multiindex_large_get_loc(self): - self.mi_large.get_loc((999, 19, 'Z')) - - def time_multiindex_large_get_loc_warm(self): - for _ in range(1000): - self.mi_large.get_loc((999, 19, 'Z')) - - def time_multiindex_med_get_loc(self): - self.mi_med.get_loc((999, 9, 'A')) - - def time_multiindex_med_get_loc_warm(self): - for _ in range(1000): - self.mi_med.get_loc((999, 9, 'A')) - - def time_multiindex_string_get_loc(self): - self.mi_small.get_loc((99, 'A', 'A')) - - def time_multiindex_small_get_loc_warm(self): - for _ in range(1000): - self.mi_small.get_loc((99, 'A', 'A')) - - -class MultiIndexDuplicates(object): - - goal_time = 0.2 - - def setup(self): - size = 65536 - arrays = [np.random.randint(0, 8192, size), - np.random.randint(0, 1024, size)] - mask = np.random.rand(size) < 0.1 - self.mi_unused_levels = MultiIndex.from_arrays(arrays) - self.mi_unused_levels = self.mi_unused_levels[mask] - - def time_remove_unused_levels(self): - self.mi_unused_levels.remove_unused_levels() - - -class MultiIndexInteger(object): - - goal_time = 0.2 - - def setup(self): - self.mi_int = MultiIndex.from_product([np.arange(1000), - np.arange(1000)], - names=['one', 'two']) - self.obj_index = np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object) - - def time_get_indexer(self): - self.mi_int.get_indexer(self.obj_index) - - def time_is_monotonic(self): - self.mi_int.is_monotonic diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py new file mode 100644 index 0000000000000..0c92214795557 --- /dev/null +++ b/asv_bench/benchmarks/multiindex_object.py @@ -0,0 +1,140 @@ +import string + +import numpy as np +import pandas.util.testing as tm +from pandas import date_range, MultiIndex + +from .pandas_vb_common import setup # noqa + + +class GetLoc(object): + + goal_time = 0.2 + + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], + names=['one', 'two', 'three']) + + def time_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_med_get_loc(self): 
+ self.mi_med.get_loc((999, 9, 'A')) + + def time_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + + def time_string_get_loc(self): + self.mi_small.get_loc((99, 'A', 'A')) + + def time_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + + +class Duplicates(object): + + goal_time = 0.2 + + def setup(self): + size = 65536 + arrays = [np.random.randint(0, 8192, size), + np.random.randint(0, 1024, size)] + mask = np.random.rand(size) < 0.1 + self.mi_unused_levels = MultiIndex.from_arrays(arrays) + self.mi_unused_levels = self.mi_unused_levels[mask] + + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + + +class Integer(object): + + goal_time = 0.2 + + def setup(self): + self.mi_int = MultiIndex.from_product([np.arange(1000), + np.arange(1000)], + names=['one', 'two']) + self.obj_index = np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object) + + def time_get_indexer(self): + self.mi_int.get_indexer(self.obj_index) + + def time_is_monotonic(self): + self.mi_int.is_monotonic + + +class Duplicated(object): + + goal_time = 0.2 + + def setup(self): + n, k = 200, 5000 + levels = [np.arange(n), + tm.makeStringIndex(n).values, + 1000 + np.arange(n)] + labels = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = MultiIndex(levels=levels, labels=labels) + + def time_duplicated(self): + self.mi.duplicated() + + +class Sortlevel(object): + + goal_time = 0.2 + + def setup(self): + n = 1182720 + low, high = -4096, 4096 + arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 3, 1]] + self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] + + a = np.repeat(np.arange(100), 1000) + b = np.tile(np.arange(1000), 100) + self.mi = MultiIndex.from_arrays([a, b]) + self.mi = self.mi.take(np.random.permutation(np.arange(100000))) + + def time_sortlevel_int64(self): + self.mi_int.sortlevel() + + def time_sortlevel_zero(self): + self.mi.sortlevel(0) + + def time_sortlevel_one(self): + self.mi.sortlevel(1) + + +class Values(object): + + goal_time = 0.2 + + def setup_cache(self): + + level1 = range(1000) + level2 = date_range(start='1/1/2012', periods=100) + mi = MultiIndex.from_product([level1, level2]) + return mi + + def time_datetime_level_values_copy(self, mi): + mi.copy().values + + def time_datetime_level_values_sliced(self, mi): + mi[:10].values From 207c79746d0f5d7991fd8459cbe20e14b30c919a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 2 Jan 2018 20:29:52 -0800 Subject: [PATCH 4/4] Fix lint error --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index bc4599436111f..3b7fdc6e2d78c 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -132,7 +132,7 @@ def setup(self, compression, engine): # The Python 2 C parser can't read bz2 from open files. raise NotImplementedError try: - import s3fs + import s3fs # noqa except ImportError: # Skip these benchmarks if `boto` is not installed. raise NotImplementedError
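A recurring change across the patches above is converting one-off benchmark classes to ASV's parameterized form (`params`/`param_names` class attributes, with the parameter value passed into `setup` and every `time_*` method) and moving expensive fixtures into `setup_cache`. As a rough illustration of that pattern only — not part of the patch itself, with the class names below invented for the example — a benchmark written this way looks roughly like:

```python
import numpy as np
from pandas import Series, Int64Index, Float64Index


class ExampleSeriesIndexing(object):
    # asv runs the benchmark once per entry in `params`; the chosen
    # value is passed as the extra argument (named by `param_names`).
    goal_time = 0.2
    params = [Int64Index, Float64Index]
    param_names = ['index']

    def setup(self, index):
        N = 10**6
        self.data = Series(np.random.rand(N), index=index(range(N)))

    def time_getitem_scalar(self, index):
        self.data[800000]


class ExampleCached(object):
    # setup_cache() runs only once per benchmark; its return value is
    # passed to every time_* method instead of living on the instance.
    goal_time = 0.2

    def setup_cache(self):
        return Series(np.arange(10**6))

    def time_iloc_scalar(self, s):
        s.iloc[800000]
```

Building the fixture once in `setup_cache` keeps repeated runs from paying the construction cost each time, which is why the `IntervalIndexing` and `MethodLookup` classes in the patch switch to it.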