diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 40cfec1bcd4c7..45d62163ae80b 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,7 +1,6 @@ from importlib import import_module import numpy as np - import pandas as pd from pandas.util import testing as tm @@ -12,113 +11,116 @@ except: pass -class Algorithms(object): +from .pandas_vb_common import setup # noqa + + +class Factorize(object): + goal_time = 0.2 - def setup(self): - N = 100000 - np.random.seed(1234) + params = [True, False] + param_names = ['sort'] - self.int_unique = pd.Int64Index(np.arange(N * 5)) - # cache is_unique - self.int_unique.is_unique + def setup(self, sort): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - self.int = pd.Int64Index(np.arange(N).repeat(5)) - self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) - # Convenience naming. - self.checked_add = pd.core.algorithms.checked_add_with_arr + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) - self.arr = np.arange(1000000) - self.arrpos = np.arange(1000000) - self.arrneg = np.arange(-1000000, 0) - self.arrmixed = np.array([1, -1]).repeat(500000) - self.strings = tm.makeStringIndex(100000) + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) - self.arr_nan = np.random.choice([True, False], size=1000000) - self.arrmixed_nan = np.random.choice([True, False], size=1000000) - # match - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) +class Duplicated(object): - def time_factorize_string(self): - self.strings.factorize() + goal_time = 0.2 - def time_factorize_int(self): - self.int.factorize() + params = ['first', 'last', False] + param_names = ['keep'] - def time_factorize_float(self): - self.int.factorize() + def setup(self, keep): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - def time_duplicated_int_unique(self): - self.int_unique.duplicated() + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) - def time_duplicated_int(self): - self.int.duplicated() + def time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) - def time_duplicated_float(self): - self.float.duplicated() + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) - def time_match_strings(self): - pd.match(self.all, self.uniques) - def time_add_overflow_pos_scalar(self): - self.checked_add(self.arr, 1) +class DuplicatedUniqueIndex(object): - def time_add_overflow_neg_scalar(self): - self.checked_add(self.arr, -1) + goal_time = 0.2 - def time_add_overflow_zero_scalar(self): - self.checked_add(self.arr, 0) + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.idx_int_dup.is_unique - def time_add_overflow_pos_arr(self): - self.checked_add(self.arr, self.arrpos) + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() - def time_add_overflow_neg_arr(self): - self.checked_add(self.arr, self.arrneg) - def time_add_overflow_mixed_arr(self): - self.checked_add(self.arr, self.arrmixed) +class Match(object): - def time_add_overflow_first_arg_nan(self): - 
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + goal_time = 0.2 - def time_add_overflow_second_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + def setup(self): + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) - def time_add_overflow_both_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, - b_mask=self.arrmixed_nan) + def time_match_string(self): + pd.match(self.all, self.uniques) class Hashing(object): + goal_time = 0.2 - def setup(self): - N = 100000 - - self.df = pd.DataFrame( - {'A': pd.Series(tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=N))), - 'B': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'D': np.random.randn(N), - 'E': np.arange(N), - 'F': pd.date_range('20110101', freq='s', periods=N), - 'G': pd.timedelta_range('1 day', freq='s', periods=N), - }) - self.df['C'] = self.df['B'].astype('category') - self.df.iloc[10:20] = np.nan - - def time_frame(self): - hashing.hash_pandas_object(self.df) - - def time_series_int(self): - hashing.hash_pandas_object(self.df.E) - - def time_series_string(self): - hashing.hash_pandas_object(self.df.B) - - def time_series_categorical(self): - hashing.hash_pandas_object(self.df.C) + def setup_cache(self): + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index b7610037bed4d..3c091be7a8424 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame try: from pandas.util import cache_readonly @@ -7,9 +8,11 @@ class DataFrameAttributes(object): + goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index @@ -21,6 +24,7 @@ def time_set_index(self): class CacheReadonly(object): + goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 0ca21b929ea17..cc8766e1fa39c 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,11 +1,16 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr try: import pandas.core.computation.expressions as expr except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Ops(object): + goal_time = 
0.2 params = [[True, False], ['default', 1]] @@ -20,18 +25,17 @@ def setup(self, use_numexpr, threads): if not use_numexpr: expr.set_use_numexpr(False) - def time_frame_add(self, use_numexpr, threads): - (self.df + self.df2) + self.df + self.df2 def time_frame_mult(self, use_numexpr, threads): - (self.df * self.df2) + self.df * self.df2 def time_frame_multi_and(self, use_numexpr, threads): - self.df[((self.df > 0) & (self.df2 > 0))] + self.df[(self.df > 0) & (self.df2 > 0)] def time_frame_comparison(self, use_numexpr, threads): - (self.df > self.df2) + self.df > self.df2 def teardown(self, use_numexpr, threads): expr.set_use_numexpr(True) @@ -39,75 +43,109 @@ def teardown(self, use_numexpr, threads): class Ops2(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) + N = 10**3 + self.df = DataFrame(np.random.randn(N, N)) + self.df2 = DataFrame(np.random.randn(N, N)) - self.df_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) - self.df2_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) + self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) + self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) - ## Division + # Division def time_frame_float_div(self): - (self.df // self.df2) + self.df // self.df2 def time_frame_float_div_by_zero(self): - (self.df / 0) + self.df / 0 def time_frame_float_floor_by_zero(self): - (self.df // 0) + self.df // 0 def time_frame_int_div_by_zero(self): - (self.df_int / 0) + self.df_int / 0 - ## Modulo + # Modulo def time_frame_int_mod(self): - (self.df / self.df2) + self.df_int % self.df2_int def time_frame_float_mod(self): - (self.df / self.df2) + self.df % self.df2 class Timeseries(object): + goal_time = 0.2 - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T')) - self.ts = self.s[self.halfway] + params = [None, 'US/Eastern'] + param_names = ['tz'] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s')) + def setup(self, tz): + N = 10**6 + halfway = (N // 2) - 1 + self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.ts = self.s[halfway] - def time_series_timestamp_compare(self): - (self.s <= self.ts) + self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) - def time_timestamp_series_compare(self): - (self.ts >= self.s) + def time_series_timestamp_compare(self, tz): + self.s <= self.ts - def time_timestamp_ops_diff1(self): + def time_timestamp_series_compare(self, tz): + self.ts >= self.s + + def time_timestamp_ops_diff(self, tz): self.s2.diff() - def time_timestamp_ops_diff2(self): - (self.s - self.s.shift()) + def time_timestamp_ops_diff_with_shift(self, tz): + self.s - self.s.shift() + +class AddOverflowScalar(object): + goal_time = 0.2 -class TimeseriesTZ(Timeseries): + params = [1, -1, 0] + param_names = ['scalar'] + + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) + + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) - self.ts = self.s[self.halfway] - self.s2 = 
Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) +class AddOverflowArray(object): + + goal_time = 0.2 + + def setup(self): + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N // 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) + + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) + + def time_add_overflow_arr_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) + + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) + + def time_add_overflow_both_arg_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a5bb5e790dec1..1613ca1b97f4b 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,6 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd +import pandas.util.testing as tm try: from pandas.api.types import union_categoricals except ImportError: @@ -7,108 +9,136 @@ except ImportError: pass +from .pandas_vb_common import setup # noqa + + +class Concat(object): -class Categoricals(object): goal_time = 0.2 def setup(self): - N = 100000 - self.s = pd.Series((list('aabbcd') * N)).astype('category') + N = 10**5 + self.s = pd.Series(list('aabbcd') * N).astype('category') + + self.a = pd.Categorical(list('aabbcd') * N) + self.b = pd.Categorical(list('bbcdjk') * N) + + def time_concat(self): + pd.concat([self.s, self.s]) + + def time_union(self): + union_categoricals([self.a, self.b]) - self.a = pd.Categorical((list('aabbcd') * N)) - self.b = pd.Categorical((list('bbcdjk') * N)) +class Constructor(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 self.categories = list('abcde') - self.cat_idx = Index(self.categories) + self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) self.codes = np.tile(range(len(self.categories)), N) - self.datetimes = pd.Series(pd.date_range( - '1995-01-01 00:00:00', periods=10000, freq='s')) + self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', + periods=N // 10, + freq='s')) + self.datetimes_with_nat = self.datetimes.copy() + self.datetimes_with_nat.iloc[-1] = pd.NaT self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) - def time_concat(self): - concat([self.s, self.s]) - - def time_union(self): - union_categoricals([self.a, self.b]) + def time_regular(self): + pd.Categorical(self.values, self.categories) - def time_constructor_regular(self): - Categorical(self.values, self.categories) + def time_fastpath(self): + pd.Categorical(self.codes, self.cat_idx, fastpath=True) - def time_constructor_fastpath(self): - Categorical(self.codes, self.cat_idx, fastpath=True) + def time_datetimes(self): + pd.Categorical(self.datetimes) - def time_constructor_datetimes(self): - Categorical(self.datetimes) + def time_datetimes_with_nat(self): + pd.Categorical(self.datetimes_with_nat) - def time_constructor_datetimes_with_nat(self): - t = self.datetimes - t.iloc[-1] = pd.NaT - Categorical(t) + def time_with_nan(self): + pd.Categorical(self.values_some_nan) - def time_constructor_with_nan(self): - Categorical(self.values_some_nan) + def time_all_nan(self): +
pd.Categorical(self.values_all_nan) - def time_constructor_all_nan(self): - Categorical(self.values_all_nan) +class ValueCounts(object): -class Categoricals2(object): goal_time = 0.2 - def setup(self): - n = 500000 - np.random.seed(2718281) + params = [True, False] + param_names = ['dropna'] + + def setup(self, dropna): + n = 5 * 10**5 arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') + self.ts = pd.Series(arr).astype('category') + + def time_value_counts(self, dropna): + self.ts.value_counts(dropna=dropna) - self.sel = self.ts.loc[[0]] - def time_value_counts(self): - self.ts.value_counts(dropna=False) +class Repr(object): - def time_value_counts_dropna(self): - self.ts.value_counts(dropna=True) + goal_time = 0.2 + + def setup(self): + self.sel = pd.Series(['s1234']).astype('category') def time_rendering(self): str(self.sel) + +class SetCategories(object): + + goal_time = 0.2 + + def setup(self): + n = 5 * 10**5 + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype('category') + def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) -class Categoricals3(object): +class Rank(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 ncats = 100 - self.s1 = Series(np.array(tm.makeCategoricalIndex(N, ncats))) - self.s1_cat = self.s1.astype('category') - self.s1_cat_ordered = self.s1.astype('category', ordered=True) + self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str_cat = self.s_str.astype('category') + self.s_str_cat_ordered = self.s_str.astype('category', ordered=True) - self.s2 = Series(np.random.randint(0, ncats, size=N)) - self.s2_cat = self.s2.astype('category') - self.s2_cat_ordered = self.s2.astype('category', ordered=True) + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) + self.s_int_cat = self.s_int.astype('category') + self.s_int_cat_ordered = self.s_int.astype('category', ordered=True) def time_rank_string(self): - self.s1.rank() + self.s_str.rank() def time_rank_string_cat(self): - self.s1_cat.rank() + self.s_str_cat.rank() def time_rank_string_cat_ordered(self): - self.s1_cat_ordered.rank() + self.s_str_cat_ordered.rank() def time_rank_int(self): - self.s2.rank() + self.s_int.rank() def time_rank_int_cat(self): - self.s2_cat.rank() + self.s_int_cat.rank() def time_rank_int_cat_ordered(self): - self.s2_cat_ordered.rank() + self.s_int_cat_ordered.rank() diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index b5694a3a21502..6276dc324ca0d 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,24 +1,29 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp + +from .pandas_vb_common import setup # noqa class Constructors(object): + goal_time = 0.2 def setup(self): - self.arr = np.random.randn(100, 100) + N = 10**2 + self.arr = np.random.randn(N, N) self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - self.data = np.random.randn(100) - self.index = Index(np.arange(100)) + self.data = np.random.randn(N) + self.index = Index(np.arange(N)) - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')] * 1000)) + self.s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * N * 10) def time_frame_from_ndarray(self): DataFrame(self.arr) def time_series_from_ndarray(self): - pd.Series(self.data, 
index=self.index) + Series(self.data, index=self.index) def time_index_from_array_string(self): Index(self.arr_str) @@ -26,5 +31,5 @@ def time_index_from_array_string(self): def time_dtindex_from_series(self): DatetimeIndex(self.s) - def time_dtindex_from_series2(self): + def time_dtindex_from_index_with_series(self): Index(self.s) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 6f33590ee9e33..8e581dcf22b4c 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,70 +1,67 @@ -from .pandas_vb_common import * +import numpy as np import pandas as pd try: import pandas.core.computation.expressions as expr except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Eval(object): + goal_time = 0.2 params = [['numexpr', 'python'], [1, 'all']] param_names = ['engine', 'threads'] def setup(self, engine, threads): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) + self.df = pd.DataFrame(np.random.randn(20000, 100)) + self.df2 = pd.DataFrame(np.random.randn(20000, 100)) + self.df3 = pd.DataFrame(np.random.randn(20000, 100)) + self.df4 = pd.DataFrame(np.random.randn(20000, 100)) if threads == 1: expr.set_numexpr_threads(1) def time_add(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df + df2 + df3 + df4', engine=engine) + pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) def time_and(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine) + pd.eval('(self.df > 0) & (self.df2 > 0) & ' + '(self.df3 > 0) & (self.df4 > 0)', engine=engine) def time_chained_cmp(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df < df2 < df3 < df4', engine=engine) + pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) def time_mult(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df * df2 * df3 * df4', engine=engine) + pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine) def teardown(self, engine, threads): expr.set_numexpr_threads() class Query(object): + goal_time = 0.2 def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) - self.df2 = DataFrame({'dates': self.s.values,}) - - self.df3 = DataFrame({'a': np.random.randn(self.N),}) - self.min_val = self.df3['a'].min() - self.max_val = self.df3['a'].max() + N = 10**6 + halfway = (N // 2) - 1 + index = pd.date_range('20010101', periods=N, freq='T') + s = pd.Series(index) + self.ts = s.iloc[halfway] + self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': s}, + index=index) + data = np.random.randn(N) + self.min_val = data.min() + self.max_val = data.max() def time_query_datetime_index(self): - ts = self.ts - self.df.query('index < @ts') + self.df.query('index < @self.ts') - def time_query_datetime_series(self): - ts = self.ts - self.df2.query('dates < @ts') + def time_query_datetime_column(self): + self.df.query('dates < @self.ts') def time_query_with_boolean_selection(self): - min_val, max_val = 
self.min_val, self.max_val - self.df.query('(a >= @min_val) & (a <= @max_val)') + self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 7f95e8d06eb72..5f465a91d38d3 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,31 +1,28 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: - from pandas.tseries.offsets import * + from pandas.tseries import offsets except: - from pandas.core.datetools import * + from pandas.core.datetools import * # noqa +from .pandas_vb_common import setup # noqa -#---------------------------------------------------------------------- -# Creation from nested dict class FromDicts(object): + goal_time = 0.2 def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() + N, K = 5000, 50 + index = tm.makeStringIndex(N) + columns = tm.makeStringIndex(K) + frame = DataFrame(np.random.randn(N, K), index=index, columns=columns) + self.data = frame.to_dict() self.some_dict = list(self.data.values())[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - - self.data2 = dict( - ((i, dict(((j, float(j)) for j in range(100)))) for i in - range(2000))) + self.dict_list = frame.to_dict(orient='records') + self.data2 = {i: {j: float(j) for j in range(100)} + for i in range(2000)} def time_frame_ctor_list_of_dict(self): DataFrame(self.dict_list) @@ -38,38 +35,20 @@ def time_series_ctor_from_dict(self): def time_frame_ctor_nested_dict_int64(self): # nested dict, integer indexes, regression described in #621 - DataFrame(self.data) + DataFrame(self.data2) -# from a mi-series +class FromSeries(object): -class frame_from_series(object): goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) - self.s = Series(randn(10000), index=self.mi) + mi = MultiIndex.from_product([range(100), range(100)]) + self.s = Series(np.random.randn(10000), index=mi) def time_frame_from_mi_series(self): DataFrame(self.s) - -#---------------------------------------------------------------------- -# get_numeric_data - -class frame_get_numeric_data(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' - self.df = self.df.consolidate() - - def time_frame_get_numeric_data(self): - self.df._get_numeric_data() - - # ---------------------------------------------------------------------- # From dict with DatetimeIndex with all offsets @@ -84,13 +63,15 @@ def get_period_count(start_date, off): if (ten_offsets_in_days == 0): return 1000 else: - return min((9 * ((Timestamp.max - start_date).days // ten_offsets_in_days)), 1000) + periods = 9 * (Timestamp.max - start_date).days // ten_offsets_in_days + return min(periods, 1000) def get_index_for_offset(off): start_date = Timestamp('1/1/1900') - return date_range(start_date, periods=min(1000, get_period_count( - start_date, off)), freq=off) + return date_range(start_date, + periods=get_period_count(start_date, off), + freq=off) all_offsets = offsets.__all__ @@ -100,7 +81,7 @@ def get_index_for_offset(off): all_offsets.extend([off + 
'_1', off + '_2']) -class FrameConstructorDTIndexFromOffsets(object): +class FromDictwithTimestampOffsets(object): params = [all_offsets, [1, 2]] param_names = ['offset', 'n_steps'] @@ -108,13 +89,15 @@ class FrameConstructorDTIndexFromOffsets(object): offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, 'FY5253': {'startingMonth': 1, 'weekday': 1}, - 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + 'FY5253Quarter': {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 1}} offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, 'FY5253Quarter': {'variation': ['nearest', 'last']}} def setup(self, offset, n_steps): - + np.random.seed(1234) extra = False if offset.endswith("_", None, -1): extra = int(offset[-1]) @@ -127,12 +110,27 @@ def setup(self, offset, n_steps): if extra: extras = self.offset_extra_cases[offset] for extra_arg in extras: - kwargs[extra_arg] = extras[extra_arg][extra -1] + kwargs[extra_arg] = extras[extra_arg][extra - 1] offset = getattr(offsets, offset) self.idx = get_index_for_offset(offset(n_steps, **kwargs)) self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict(self.df.items()) + self.d = self.df.to_dict() def time_frame_ctor(self, offset, n_steps): DataFrame(self.d) + + +class FromRecords(object): + + goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] + + def setup(self, nrows): + N = 100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) + + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index af72ca1e9a6ab..2b48168238ee8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,20 +1,40 @@ -from .pandas_vb_common import * import string +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + isnull, NaT) +from .pandas_vb_common import setup # noqa -#---------------------------------------------------------------------- -# lookup -class frame_fancy_lookup(object): +class GetNumericData(object): + + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(10000, 25)) + self.df['foo'] = 'bar' + self.df['bar'] = 'baz' + self.df = self.df.consolidate() + + def time_frame_get_numeric_data(self): + self.df._get_numeric_data() + + +class Lookup(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df = DataFrame(np.random.randn(10000, 8), + columns=list('abcdefgh')) self.df['foo'] = 'bar' self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') + self.col_labels = list(self.df.columns) * 100 + self.row_labels_all = np.array( + list(self.df.index) * len(self.df.columns), dtype='object') + self.col_labels_all = np.array( + list(self.df.columns) * len(self.df.index), dtype='object') def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) @@ -23,25 +43,20 @@ def time_frame_fancy_lookup_all(self): self.df.lookup(self.row_labels_all, self.col_labels_all) 
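[reviewer note, not part of the patch] The rewritten classes above lean on two airspeed velocity (asv) conventions: values from params/param_names are passed positionally to setup and to every time_* method (e.g. time_frame_from_records_generator(self, nrows) above), and whatever setup_cache returns is handed to those same methods as their first argument. For orientation only, here is a minimal sketch of the parameter-driving loop under those assumptions; run_params is a hypothetical stand-in for asv's real runner:

import itertools

def run_params(bench_cls):
    # hypothetical illustration of asv's behaviour, not asv itself:
    # a flat params list is one parameter axis, a list of lists is
    # the cross product of several axes
    axes = list(bench_cls.params)
    if axes and not isinstance(axes[0], (list, tuple)):
        axes = [axes]
    for combo in itertools.product(*axes):
        bench = bench_cls()
        bench.setup(*combo)  # setup runs once per parameter combination
        for name in dir(bench):
            if name.startswith('time_'):
                getattr(bench, name)(*combo)  # asv times repeated calls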
-#---------------------------------------------------------------------- -# reindex - class Reindex(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.idx = np.arange(4000, 7000) - + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), - 1: randint(0, 1000, 1000).astype( - np.int16), - 2: randint(0, 1000, 1000).astype( - np.int32), - 3: randint(0, 1000, 1000).astype( - np.int64),}[randint(0, 4)]) for c in - range(1000)])) + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -53,81 +68,86 @@ def time_reindex_both_axes(self): self.df.reindex(index=self.idx, columns=self.idx) def time_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] + self.df.ix[self.idx, self.idx] def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) -#---------------------------------------------------------------------- -# iteritems (monitor no-copying behaviour) - class Iteration(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(np.random.randn(50000, 10)) - self.df3 = pd.DataFrame(np.random.randn(1000,5000), - columns=['C'+str(c) for c in range(5000)]) + N = 1000 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.df2 = DataFrame(np.random.randn(N * 50, 10)) + self.df3 = DataFrame(np.random.randn(N, 5 * N), + columns=['C' + str(c) for c in range(N * 5)]) - def f(self): + def time_iteritems(self): + # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): + for name, col in self.df.iteritems(): pass - def g(self): - for (name, col) in self.df.iteritems(): - pass - - def time_iteritems(self): - self.f() - def time_iteritems_cached(self): - self.g() + for name, col in self.df.iteritems(): + pass def time_iteritems_indexing(self): - df = self.df3 - for col in df: - df[col] + for col in self.df3: + self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass + def time_iterrows(self): + for row in self.df.iterrows(): + pass + -#---------------------------------------------------------------------- -# to_string, to_html, repr +class ToString(object): -class Formatting(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100, 10)) + self.df = DataFrame(np.random.randn(100, 10)) - self.nrows = 500 - self.df2 = DataFrame(randn(self.nrows, 10)) - self.df2[0] = period_range('2000', '2010', self.nrows) - self.df2[1] = range(self.nrows) + def time_to_string_floats(self): + self.df.to_string() - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) - self.df3 = DataFrame(self.data, index=self.idx) - self.idx = randn(self.nrows) - self.df4 = DataFrame(self.data, index=self.idx) - self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) +class ToHTML(object): - self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + goal_time = 0.2 - def time_to_string_floats(self): - self.df.to_string() + def setup(self): + nrows = 500 + self.df2 = DataFrame(np.random.randn(nrows, 10)) + self.df2[0] = period_range('2000', 
'2010', nrows) + self.df2[1] = range(nrows) def time_to_html_mixed(self): self.df2.to_html() + +class Repr(object): + + goal_time = 0.2 + + def setup(self): + nrows = 10000 + data = np.random.randn(nrows, 10) + idx = MultiIndex.from_arrays(np.tile(np.random.randn(3, nrows // 100), + 100)) + self.df3 = DataFrame(data, index=idx) + self.df4 = DataFrame(data, index=np.random.randn(nrows)) + self.df_tall = DataFrame(np.random.randn(nrows, 10)) + self.df_wide = DataFrame(np.random.randn(10, nrows)) + def time_html_repr_trunc_mi(self): self.df3._repr_html_() @@ -141,21 +161,16 @@ def time_frame_repr_wide(self): repr(self.df_wide) -#---------------------------------------------------------------------- -# nulls/masking +class MaskBool(object): - -## masking - -class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + data = np.random.randn(1000, 500) + df = DataFrame(data) + df = df.where(df > 0) + self.bools = df > 0 + self.mask = isnull(df) def time_frame_mask_bools(self): self.bools.mask(self.mask) @@ -164,31 +179,26 @@ def time_frame_mask_floats(self): self.bools.astype(float).mask(self.mask) -## isnull +class Isnull(object): -class FrameIsnull(object): goal_time = 0.2 def setup(self): - self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - - np.random.seed(1234) - self.sample = np.array([np.nan, 1.0]) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array(list(string.ascii_lowercase) + - list(string.ascii_uppercase) + - list(string.whitespace)) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_strings= DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_obj = DataFrame(self.data) + N = 10**3 + self.df_no_null = DataFrame(np.random.randn(N, N)) + + sample = np.array([np.nan, 1.0]) + data = np.random.choice(sample, (N, N)) + self.df = DataFrame(data) + + sample = np.array(list(string.ascii_letters + string.whitespace)) + data = np.random.choice(sample, (N, N)) + self.df_strings = DataFrame(data) + + sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), + np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + data = np.random.choice(sample, (N, N)) + self.df_obj = DataFrame(data) def time_isnull_floats_no_null(self): isnull(self.df_no_null) @@ -203,92 +213,74 @@ def time_isnull_obj(self): isnull(self.df_obj) -# ---------------------------------------------------------------------- -# fillna in place - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - +class Fillna(object): - -class frame_fillna_many_columns_pad(object): goal_time = 0.2 + params = ([True, False], ['pad', 'bfill']) + param_names = ['inplace', 'method'] - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') + def setup(self, inplace, method): + values = np.random.randn(10000, 100) + values[::2] = np.nan + self.df = DataFrame(values) + def
time_frame_fillna(self, inplace, method): + self.df.fillna(inplace=inplace, method=method) class Dropna(object): + goal_time = 0.2 + params = (['all', 'any'], [0, 1]) + param_names = ['how', 'axis'] - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + def setup(self, how, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed['foo'] = 'bar' - self.df_mi = self.df.copy() - self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) - self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) - - self.df_mixed_mi = self.df_mixed.copy() - self.df_mixed_mi.index = MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) - self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) - - def time_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) + def time_dropna(self, how, axis): + self.df.dropna(how=how, axis=axis) - def time_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) + def time_dropna_axis_mixed_dtypes(self, how, axis): + self.df_mixed.dropna(how=how, axis=axis) - def time_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) - def time_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) +class Count(object): - def time_dropna_axis0_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=0) - - def time_dropna_axis0_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=0) - - def time_dropna_axis1_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=1) + goal_time = 0.2 - def time_dropna_axis1_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=1) + params = [0, 1] + param_names = ['axis'] - def time_count_level_axis0_multi(self): - self.df_mi.count(axis=0, level=1) + def setup(self, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def time_count_level_axis1_multi(self): - self.df_mi.count(axis=1, level=1) + self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, + self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, + self.df_mixed.index]) + self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, + self.df_mixed.columns]) - def time_count_level_axis0_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=0, level=1) + def time_count_level_multi(self, axis): + self.df.count(axis=axis, level=1) - def time_count_level_axis1_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=1, level=1) + def time_count_level_mixed_dtypes_multi(self, axis): + self.df_mixed.count(axis=axis, level=1) class Apply(object): + goal_time = 0.2 def setup(self): @@ -296,32 +288,29 @@ def setup(self): self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) - self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) def time_apply_user_func(self): - self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) + self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) def time_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) + self.df.apply(lambda x: x + 1, axis=1) def 
time_apply_lambda_mean(self): - self.df.apply((lambda x: x.mean())) + self.df.apply(lambda x: x.mean()) def time_apply_np_mean(self): self.df.apply(np.mean) def time_apply_pass_thru(self): - self.df.apply((lambda x: x)) + self.df.apply(lambda x: x) def time_apply_ref_by_name(self): - self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1) + self.df3.apply(lambda x: x['A'] + x['B'], axis=1) -#---------------------------------------------------------------------- -# dtypes +class Dtypes(object): -class frame_dtypes(object): goal_time = 0.2 def setup(self): @@ -330,316 +319,170 @@ def setup(self): def time_frame_dtypes(self): self.df.dtypes -#---------------------------------------------------------------------- -# equals class Equals(object): + goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in ( - ('float_df', self.float_df), ('object_df', self.object_df), - ('nonunique_cols', self.nonunique_cols))]) + N = 10**3 + self.float_df = DataFrame(np.random.randn(N, N)) + self.float_df_nan = self.float_df.copy() + self.float_df_nan.iloc[-1, -1] = np.nan - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df_nan = self.object_df.copy() + self.object_df_nan.iloc[-1, -1] = np.nan - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) - - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols_nan = self.nonunique_cols.copy() + self.nonunique_cols_nan.iloc[-1, -1] = np.nan def time_frame_float_equal(self): - self.test_equal('float_df') + self.float_df.equals(self.float_df) def time_frame_float_unequal(self): - self.test_unequal('float_df') + self.float_df.equals(self.float_df_nan) def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols) def time_frame_nonunique_unequal(self): - self.test_unequal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols_nan) def time_frame_object_equal(self): - self.test_equal('object_df') + self.object_df.equals(self.object_df) def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.object_df.equals(self.object_df_nan) class Interpolate(object): + goal_time = 0.2 + params = [None, 'infer'] + param_names = ['downcast'] - def setup(self): + def setup(self, downcast): + N = 10000 # this is the worst case, where every column has NaNs. 
- self.df = DataFrame(randn(10000, 100)) + self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame( - {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), - 'C': randn(10000), 'D': randn(10000),}) + self.df2 = DataFrame({'A': np.arange(0, N), + 'B': np.random.randint(0, 100, N), + 'C': np.random.randn(N), + 'D': np.random.randn(N)}) self.df2.loc[1::5, 'A'] = np.nan self.df2.loc[1::5, 'C'] = np.nan - def time_interpolate(self): - self.df.interpolate() + def time_interpolate(self, downcast): + self.df.interpolate(downcast=downcast) - def time_interpolate_some_good(self): - self.df2.interpolate() - - def time_interpolate_some_good_infer(self): - self.df2.interpolate(downcast='infer') + def time_interpolate_some_good(self, downcast): + self.df2.interpolate(downcast=downcast) class Shift(object): # frame shift speedup issue-5609 goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): + def setup(self, axis): self.df = DataFrame(np.random.rand(10000, 500)) - def time_shift_axis0(self): - self.df.shift(1, axis=0) + def time_shift(self, axis): + self.df.shift(1, axis=axis) - def time_shift_axis_1(self): - self.df.shift(1, axis=1) - - -#----------------------------------------------------------------------------- -# from_records issue-6700 - -class frame_from_records_generator(object): - goal_time = 0.2 - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) - - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) - - - -#----------------------------------------------------------------------------- -# nunique - -class frame_nunique(object): +class Nunique(object): def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + self.df = DataFrame(np.random.randn(10000, 1000)) def time_frame_nunique(self): self.df.nunique() +class Duplicated(object): -#----------------------------------------------------------------------------- -# duplicated - -class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + n = (1 << 20) + t = date_range('2015-01-01', freq='S', periods=(n // 64)) + xs = np.random.randn(n // 64).round(2) + self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), + 'b': np.random.choice(t, n), + 'c': np.random.choice(xs, n)}) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): self.df.duplicated() def time_frame_duplicated_wide(self): - self.df2.T.duplicated() - - - - - - - - - - - + self.df2.duplicated() +class XS(object): - - - -class frame_xs_col(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(1, 100000)) - - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) + def setup(self, axis): + self.N = 10**4 + self.df = DataFrame(np.random.randn(self.N, self.N)) + def time_frame_xs(self, axis): + self.df.xs(self.N // 2, axis=axis) -class frame_xs_row(object): - goal_time = 0.2 - - def setup(self): -
self.df = DataFrame(randn(100000, 1)) - def time_frame_xs_row(self): - self.df.xs(50000) +class SortValues(object): - -class frame_sort_index(object): goal_time = 0.2 + params = [True, False] + param_names = ['ascending'] - def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + def setup(self, ascending): + self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) - def time_frame_sort_index(self): - self.df.sort_index() + def time_frame_sort_values(self, ascending): + self.df.sort_values(by='A', ascending=ascending) -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) - +class SortIndexByColumns(object): -class frame_quantile_axis1(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + N = 10000 + K = 10 + self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), + 'key2': tm.makeStringIndex(N).values.repeat(K), + 'value': np.random.randn(N * K)}) - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) + def time_frame_sort_values_by_columns(self): + self.df.sort_values(by=['key1', 'key2']) -#---------------------------------------------------------------------- -# boolean indexing +class Quantile(object): -class frame_boolean_row_select(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] + def setup(self, axis): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) -class frame_getitem_single_column(object): - goal_time = 0.2 + def time_frame_quantile(self, axis): + self.df.quantile([0.1, 0.5], axis=axis) - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - def time_frame_getitem_single_column(self): - self.h() - - def time_frame_getitem_single_column2(self): - self.j() - - -#---------------------------------------------------------------------- -# assignment - -class frame_assign_timeseries_index(object): - goal_time = 0.2 - - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='H') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) - - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index - - - -# insert many columns - -class frame_insert_100_columns_begin(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000 - - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) - - def g(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col - - def 
time_frame_insert_100_columns_begin(self): - self.f() - - def time_frame_insert_500_columns_end(self): - self.g() - - - -#---------------------------------------------------------------------- -# strings methods, #2602 - -class series_string_vector_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series((['abcdefg', np.nan] * 500000)) - - def time_series_string_vector_slice(self): - self.s.str[:5] - - -#---------------------------------------------------------------------- -# df.info() and get_dtype_counts() # 2807 - -class frame_get_dtype_counts(object): +class GetDtypeCounts(object): + # 2807 goal_time = 0.2 def setup(self): @@ -648,13 +491,21 @@ def setup(self): def time_frame_get_dtype_counts(self): self.df.get_dtype_counts() + def time_info(self): + self.df.info() + + +class NSort(object): -class frame_nlargest(object): goal_time = 0.2 + params = ['first', 'last'] + param_names = ['keep'] - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + def setup(self, keep): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + + def time_nlargest(self, keep): + self.df.nlargest(100, 'A', keep=keep) - def time_frame_nlargest(self): - self.df.nlargest(100, 'A') + def time_nsmallest(self, keep): + self.df.nsmallest(100, 'A', keep=keep) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 13b5cd2b06032..3abf2338e1d94 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,85 +1,108 @@ -from .pandas_vb_common import * from string import ascii_letters, digits from itertools import product +from functools import partial +import numpy as np +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + TimeGrouper, Categorical) +import pandas.util.testing as tm -class groupby_agg_builtins(object): +from .pandas_vb_common import setup # noqa + + +class ApplyDictReturn(object): goal_time = 0.2 def setup(self): - np.random.seed(27182) - self.n = 100000 - self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie']) + self.labels = np.arange(1000).repeat(10) + self.data = Series(np.random.randn(len(self.labels))) + self.f = lambda x: {'first': x.values[0], 'last': x.values[(-1)]} - def time_groupby_agg_builtins1(self): - self.df.groupby('jim').agg([sum, min, max]) + def time_groupby_apply_dict_return(self): + self.data.groupby(self.labels).apply(self.f) - def time_groupby_agg_builtins2(self): - self.df.groupby(['jim', 'joe']).agg([sum, min, max]) -#---------------------------------------------------------------------- -# dict return values +class Apply(object): -class groupby_apply_dict_return(object): goal_time = 0.2 def setup(self): - self.labels = np.arange(1000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], }) + N = 10**4 + labels = np.random.randint(0, 2000, size=N) + labels2 = np.random.randint(0, 3, size=N) + self.df = DataFrame({'key': labels, + 'key2': labels2, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4), + }) + self.scalar_function = lambda x: 1 + + def time_scalar_function_multi_col(self): + self.df.groupby(['key', 'key2']).apply(self.scalar_function) - def time_groupby_apply_dict_return(self): - self.data.groupby(self.labels).apply(self.f) + def time_scalar_function_single_col(self): + self.df.groupby('key').apply(self.scalar_function) + @staticmethod + def df_copy_function(g): + # 
ensure that the group name is available (see GH #15062) + g.name + return g.copy() + + def time_copy_function_multi_col(self): + self.df.groupby(['key', 'key2']).apply(self.df_copy_function) + + def time_copy_overhead_single_col(self): + self.df.groupby('key').apply(self.df_copy_function) -#---------------------------------------------------------------------- -# groups class Groups(object): - goal_time = 0.1 - size = 2 ** 22 - data = { - 'int64_small': Series(np.random.randint(0, 100, size=size)), - 'int64_large' : Series(np.random.randint(0, 10000, size=size)), - 'object_small': Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))), - 'object_large': Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) - } + goal_time = 0.2 - param_names = ['df'] + param_names = ['key'] params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def setup(self, df): - self.df = self.data[df] + def setup_cache(self): + size = 10**6 + data = {'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large': Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series( + tm.makeStringIndex(100).take( + np.random.randint(0, 100, size=size))), + 'object_large': Series( + tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=size)))} + return data - def time_groupby_groups(self, df): - self.df.groupby(self.df).groups + def setup(self, data, key): + self.ser = data[key] + def time_series_groups(self, data, key): + self.ser.groupby(self.ser).groups -#---------------------------------------------------------------------- -# First / last functions class FirstLast(object): + goal_time = 0.2 param_names = ['dtype'] params = ['float32', 'float64', 'datetime', 'object'] - # with datetimes (GH7555) - def setup(self, dtype): - + N = 10**5 + # with datetimes (GH7555) if dtype == 'datetime': - self.df = DataFrame( - {'values': date_range('1/1/2011', periods=100000, freq='s'), - 'key': range(100000),}) + self.df = DataFrame({'values': date_range('1/1/2011', + periods=N, + freq='s'), + 'key': range(N)}) elif dtype == 'object': - self.df = DataFrame( - {'values': (['foo'] * 100000), - 'key': range(100000)}) + self.df = DataFrame({'values': ['foo'] * N, + 'key': range(N)}) else: - labels = np.arange(10000).repeat(10) - data = Series(randn(len(labels)), dtype=dtype) + labels = np.arange(N / 10).repeat(10) + data = Series(np.random.randn(len(labels)), dtype=dtype) data[::3] = np.nan data[1::3] = np.nan labels = labels.take(np.random.permutation(len(labels))) @@ -91,313 +114,249 @@ def time_groupby_first(self, dtype): def time_groupby_last(self, dtype): self.df.groupby('key').last() - def time_groupby_nth_any(self, dtype): + def time_groupby_nth_all(self, dtype): self.df.groupby('key').nth(0, dropna='all') def time_groupby_nth_none(self, dtype): self.df.groupby('key').nth(0) -#---------------------------------------------------------------------- -# DataFrame Apply overhead +class GroupManyLabels(object): -class groupby_frame_apply(object): goal_time = 0.2 + params = [1, 1000] + param_names = ['ncols'] - def setup(self): - self.N = 10000 - self.labels = np.random.randint(0, 2000, size=self.N) - self.labels2 = np.random.randint(0, 3, size=self.N) - self.df = DataFrame({ - 'key': self.labels, - 'key2': self.labels2, - 'value1': np.random.randn(self.N), - 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)), - }) - - @staticmethod - def scalar_function(g): - return 1 + def setup(self, ncols): + N = 1000 + data = np.random.randn(N, 
ncols) + self.labels = np.random.randint(0, 100, size=N) + self.df = DataFrame(data) - def time_groupby_frame_apply_scalar_function(self): - self.df.groupby(['key', 'key2']).apply(self.scalar_function) - - def time_groupby_frame_apply_scalar_function_overhead(self): - self.df.groupby('key').apply(self.scalar_function) + def time_sum(self, ncols): + self.df.groupby(self.labels).sum() - @staticmethod - def df_copy_function(g): - # ensure that the group name is available (see GH #15062) - g.name - return g.copy() - def time_groupby_frame_df_copy_function(self): - self.df.groupby(['key', 'key2']).apply(self.df_copy_function) +class Nth(object): - def time_groupby_frame_apply_df_copy_overhead(self): - self.df.groupby('key').apply(self.df_copy_function) + goal_time = 0.2 + def setup_cache(self): + df = DataFrame(np.random.randint(1, 100, (10000, 2))) + df.iloc[1, 1] = np.nan + return df -#---------------------------------------------------------------------- -# 2d grouping, aggregate many columns + def time_frame_nth_any(self, df): + df.groupby(0).nth(0, dropna='any') -class groupby_frame_cython_many_columns(object): - goal_time = 0.2 + def time_frame_nth(self, df): + df.groupby(0).nth(0) - def setup(self): - self.labels = np.random.randint(0, 100, size=1000) - self.df = DataFrame(randn(1000, 1000)) + def time_series_nth_any(self, df): + df[1].groupby(df[0]).nth(0, dropna='any') - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_series_nth(self, df): + df[1].groupby(df[0]).nth(0) -#---------------------------------------------------------------------- -# single key, long, integer key +class DateAttributes(object): -class groupby_frame_singlekey_integer(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100000, 1) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) + rng = date_range('1/1/2000', '12/31/2005', freq='H') + self.year, self.month, self.day = rng.year, rng.month, rng.day + self.ts = Series(np.random.randn(len(rng)), index=rng) - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_len_groupby_object(self): + len(self.ts.groupby([self.year, self.month, self.day])) -#---------------------------------------------------------------------- -# DataFrame nth +class Int64(object): -class groupby_nth(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) - - def time_groupby_frame_nth_any(self): - self.df.groupby(0).nth(0, dropna='any') - - def time_groupby_frame_nth_none(self): - self.df.groupby(0).nth(0) + arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) + i = np.random.choice(len(arr), len(arr) * 5) + arr = np.vstack((arr, arr[i])) + i = np.random.permutation(len(arr)) + arr = arr[i] + self.cols = list('abcde') + self.df = DataFrame(arr, columns=self.cols) + self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 - def time_groupby_series_nth_any(self): - self.df[1].groupby(self.df[0]).nth(0, dropna='any') + def time_overflow(self): + self.df.groupby(self.cols).max() - def time_groupby_series_nth_none(self): - self.df[1].groupby(self.df[0]).nth(0) +class CountMultiDtype(object): -#---------------------------------------------------------------------- -# groupby_indices replacement, chop up Series - -class groupby_indices(object): goal_time = 0.2 - def setup(self): - try: - self.rng = date_range('1/1/2000', '12/31/2005', freq='H') - (self.year, self.month, self.day) = (self.rng.year, self.rng.month, self.rng.day) - except: - 
self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) - self.year = self.rng.map((lambda x: x.year)) - self.month = self.rng.map((lambda x: x.month)) - self.day = self.rng.map((lambda x: x.day)) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_groupby_indices(self): - len(self.ts.groupby([self.year, self.month, self.day])) - + def setup_cache(self): + n = 10000 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64('nat') + offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + value2 = np.random.randn(n) + value2[np.random.rand(n) > 0.5] = np.nan + obj = np.random.choice(list('ab'), size=n).astype(object) + obj[np.random.randn(n) > 0.5] = np.nan + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'dates': dates, + 'value2': value2, + 'value3': np.random.randn(n), + 'ints': np.random.randint(0, 1000, size=n), + 'obj': obj, + 'offsets': offsets}) + return df + + def time_multi_count(self, df): + df.groupby(['key1', 'key2']).count() + + +class CountInt(object): -class groupby_int64_overflow(object): goal_time = 0.2 - def setup(self): - self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5)) - self.i = np.random.choice(len(self.arr), (len(self.arr) * 5)) - self.arr = np.vstack((self.arr, self.arr[self.i])) - self.i = np.random.permutation(len(self.arr)) - self.arr = self.arr[self.i] - self.df = DataFrame(self.arr, columns=list('abcde')) - (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10) + def setup_cache(self): + n = 10000 + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'ints': np.random.randint(0, 1000, size=n), + 'ints2': np.random.randint(0, 1000, size=n)}) + return df - def time_groupby_int64_overflow(self): - self.df.groupby(list('abcde')).max() + def time_int_count(self, df): + df.groupby(['key1', 'key2']).count() + def time_int_nunique(self, df): + df.groupby(['key1', 'key2']).nunique() -#---------------------------------------------------------------------- -# count() speed -class groupby_multi_count(object): - goal_time = 0.2 +class AggFunctions(object): - def setup(self): - self.n = 10000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat') - self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat') - self.value2 = np.random.randn(self.n) - self.value2[(np.random.rand(self.n) > 0.5)] = np.nan - self.obj = np.random.choice(list('ab'), size=self.n).astype(object) - self.obj[(np.random.randn(self.n) > 0.5)] = np.nan - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), - 'key2': np.random.randint(0, 100, size=self.n), - 'dates': self.dates, - 'value2': self.value2, - 'value3': np.random.randn(self.n), - 'ints': np.random.randint(0, 1000, size=self.n), - 'obj': self.obj, - 'offsets': self.offsets, }) - - def time_groupby_multi_count(self): - self.df.groupby(['key1', 'key2']).count() - - -class groupby_int_count(object): goal_time = 0.2 - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), - 'key2': randint(0, 100, size=self.n), - 'ints': randint(0, 1000, size=self.n), - 'ints2': randint(0, 1000, size=self.n), }) - - def time_groupby_int_count(self): - 
self.df.groupby(['key1', 'key2']).count() + def setup_cache(self): + N = 10**5 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)), + 'key2': fac2.take(np.random.randint(0, 2, size=N)), + 'value1': np.random.randn(N), + 'value2': np.random.randn(N), + 'value3': np.random.randn(N)}) + return df + def time_different_str_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': 'mean', + 'value2': 'var', + 'value3': 'sum'}) -#---------------------------------------------------------------------- -# nunique() speed + def time_different_numpy_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': np.mean, + 'value2': np.var, + 'value3': np.sum}) -class groupby_nunique(object): + def time_different_python_functions_multicol(self, df): + df.groupby(['key1', 'key2']).agg([sum, min, max]) - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), - 'key2': randint(0, 100, size=self.n), - 'ints': randint(0, 1000, size=self.n), - 'ints2': randint(0, 1000, size=self.n), }) + def time_different_python_functions_singlecol(self, df): + df.groupby('key1').agg([sum, min, max]) - def time_groupby_nunique(self): - self.df.groupby(['key1', 'key2']).nunique() +class GroupStrings(object): -#---------------------------------------------------------------------- -# group with different functions per column - -class groupby_agg_multi(object): - goal_time = 0.2 - - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) - - def time_groupby_multi_different_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum'}) - - def time_groupby_multi_different_numpy_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum}) - - -class groupby_multi_index(object): goal_time = 0.2 def setup(self): - self.n = (((5 * 7) * 11) * (1 << 9)) - self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4))) - self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k)) - self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), }) + n = (5 * 7 * 11) * (1 << 9) + alpha = list(map(''.join, product((ascii_letters + digits), repeat=4))) + f = lambda k: np.repeat(np.random.choice(alpha, (n // k)), k) + self.df = DataFrame({'a': f(11), + 'b': f(7), + 'c': f(5), + 'd': f(1)}) self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) - self.i = np.random.permutation(len(self.df)) - self.df = self.df.iloc[self.i].reset_index(drop=True).copy() + i = np.random.permutation(len(self.df)) + self.df = self.df.iloc[i].reset_index(drop=True) - def time_groupby_multi_index(self): + def time_multi_columns(self): self.df.groupby(list('abcd')).max() -class groupby_multi(object): +class MultiColumn(object): + goal_time = 0.2 def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = 
Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] - - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, int(n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr - - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) - - def time_groupby_multi_cython(self): + N = 10**5 + key1 = np.tile(np.arange(100, dtype=object), 1000) + key2 = key1.copy() + np.random.shuffle(key1) + np.random.shuffle(key2) + self.df = DataFrame({'key1': key1, + 'key2': key2, + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + self.f = lambda x: x.values.sum() + + def time_lambda_sum(self): + self.df.groupby(['key1', 'key2']).agg(self.f) + + def time_cython_sum(self): self.df.groupby(['key1', 'key2']).sum() - def time_groupby_multi_python(self): - self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum())) - - def time_groupby_multi_series_op(self): - self.df.groupby(['key1', 'key2'])['data1'].agg(np.std) - - def time_groupby_series_simple_cython(self): - self.simple_series.groupby(self.key1).sum() + def time_col_select_lambda_sum(self): + self.df.groupby(['key1', 'key2'])['data1'].agg(self.f) - def time_groupby_series_simple_rank(self): - self.df.groupby('key1').rank(pct=True) + def time_col_select_numpy_sum(self): + self.df.groupby(['key1', 'key2'])['data1'].agg(np.sum) -#---------------------------------------------------------------------- -# size() speed +class Size(object): -class groupby_size(object): goal_time = 0.2 def setup(self): - self.n = 100000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) - - N = 1000000 - self.draws = pd.Series(np.random.randn(N)) - labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + n = 10**5 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + self.df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'value1': np.random.randn(n), + 'value2': np.random.randn(n), + 'value3': np.random.randn(n), + 'dates': dates}) + self.draws = Series(np.random.randn(n)) + labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4)) self.cats = labels.astype('category') - def time_groupby_multi_size(self): + def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_groupby_dt_size(self): + def time_dt_size(self): self.df.groupby(['dates']).size() - def time_groupby_dt_timegrouper_size(self): + def time_dt_timegrouper_size(self): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() - def time_groupby_size(self): + def time_category_size(self): self.draws.groupby(self.cats).size() +class GroupByMethods(object): -#---------------------------------------------------------------------- -# groupby with a variable value for ngroups - -class GroupBySuite(object): goal_time = 0.2 param_names = ['dtype', 'ngroups'] params = [['int', 'float'], [100, 10000]] def setup(self, dtype, ngroups): - np.random.seed(1234) size = ngroups * 2 rng = 
np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) @@ -479,6 +438,9 @@ def time_rank(self, dtype, ngroups): def time_sem(self, dtype, ngroups): self.df.groupby('key')['values'].sem() + def time_shift(self, dtype, ngroups): + self.df.groupby('key')['values'].shift() + def time_size(self, dtype, ngroups): self.df.groupby('key')['values'].size() @@ -504,7 +466,7 @@ def time_var(self, dtype, ngroups): self.df.groupby('key')['values'].var() -class groupby_float32(object): +class Float32(object): # GH 13335 goal_time = 0.2 @@ -515,27 +477,28 @@ def setup(self): arr = np.repeat(tmp, 10) self.df = DataFrame(dict(a=arr, b=arr)) - def time_groupby_sum(self): + def time_sum(self): self.df.groupby(['a'])['b'].sum() -class groupby_categorical(object): +class Categories(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 arr = np.random.random(N) - - self.df = DataFrame(dict( - a=Categorical(np.random.randint(10000, size=N)), - b=arr)) - self.df_ordered = DataFrame(dict( - a=Categorical(np.random.randint(10000, size=N), ordered=True), - b=arr)) - self.df_extra_cat = DataFrame(dict( - a=Categorical(np.random.randint(100, size=N), - categories=np.arange(10000)), - b=arr)) + data = {'a': Categorical(np.random.randint(10000, size=N)), + 'b': arr} + self.df = DataFrame(data) + data = {'a': Categorical(np.random.randint(10000, size=N), + ordered=True), + 'b': arr} + self.df_ordered = DataFrame(data) + data = {'a': Categorical(np.random.randint(100, size=N), + categories=np.arange(10000)), + 'b': arr} + self.df_extra_cat = DataFrame(data) def time_groupby_sort(self): self.df.groupby('a')['b'].count() @@ -556,130 +519,71 @@ def time_groupby_extra_cat_nosort(self): self.df_extra_cat.groupby('a', sort=False)['b'].count() -class groupby_period(object): +class Datelike(object): # GH 14338 goal_time = 0.2 - - def make_grouper(self, N): - return pd.period_range('1900-01-01', freq='D', periods=N) - - def setup(self): - N = 10000 - self.grouper = self.make_grouper(N) - self.df = pd.DataFrame(np.random.randn(N, 2)) - - def time_groupby_sum(self): + params = ['period_range', 'date_range', 'date_range_tz'] + param_names = ['grouper'] + + def setup(self, grouper): + N = 10**4 + rng_map = {'period_range': period_range, + 'date_range': date_range, + 'date_range_tz': partial(date_range, tz='US/Central')} + self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N) + self.df = DataFrame(np.random.randn(10**4, 2)) + + def time_sum(self, grouper): self.df.groupby(self.grouper).sum() -class groupby_datetime(groupby_period): - def make_grouper(self, N): - return pd.date_range('1900-01-01', freq='D', periods=N) - - -class groupby_datetimetz(groupby_period): - def make_grouper(self, N): - return pd.date_range('1900-01-01', freq='D', periods=N, - tz='US/Central') - -#---------------------------------------------------------------------- -# Series.value_counts - -class series_value_counts(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=100000)) - self.s2 = self.s.astype(float) - - self.K = 1000 - self.N = 100000 - self.uniques = tm.makeStringIndex(self.K).values - self.s3 = Series(np.tile(self.uniques, (self.N // self.K))) - - def time_value_counts_int64(self): - self.s.value_counts() - - def time_value_counts_float64(self): - self.s2.value_counts() - - def time_value_counts_strings(self): - self.s.value_counts() - - -#---------------------------------------------------------------------- -# pivot_table - -class 
groupby_pivot_table(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
-        self.fac2 = np.array(['one', 'two'], dtype='O')
-        self.ind1 = np.random.randint(0, 3, size=100000)
-        self.ind2 = np.random.randint(0, 2, size=100000)
-        self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
-
-    def time_groupby_pivot_table(self):
-        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
-
-
-#----------------------------------------------------------------------
-# Sum booleans #2692
-
-class groupby_sum_booleans(object):
+class SumBools(object):
+    # GH 2692
     goal_time = 0.2

     def setup(self):
-        self.N = 500
-        self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], })
+        N = 500
+        self.df = DataFrame({'ii': range(N),
+                             'bb': [True] * N})

     def time_groupby_sum_booleans(self):
         self.df.groupby('ii').sum()


-#----------------------------------------------------------------------
-# multi-indexed group sum #9049
-
-class groupby_sum_multiindex(object):
+class SumMultiLevel(object):
+    # GH 9049
     goal_time = 0.2
+    timeout = 120.0

     def setup(self):
-        self.N = 50
-        self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
+        N = 50
+        self.df = DataFrame({'A': list(range(N)) * 2,
+                             'B': list(range(N * 2)),
+                             'C': 1}).set_index(['A', 'B'])

     def time_groupby_sum_multiindex(self):
         self.df.groupby(level=[0, 1]).sum()


-#-------------------------------------------------------------------------------
-# Transform testing
-
 class Transform(object):
+
     goal_time = 0.2

     def setup(self):
         n1 = 400
         n2 = 250
-
-        index = MultiIndex(
-            levels=[np.arange(n1), pd.util.testing.makeStringIndex(n2)],
-            labels=[[i for i in range(n1) for _ in range(n2)],
-                    (list(range(n2)) * n1)],
-            names=['lev1', 'lev2'])
-
-        data = DataFrame(np.random.randn(n1 * n2, 3),
-                         index=index, columns=['col1', 'col20', 'col3'])
-        step = int((n1 * n2 * 0.1))
-        for col in range(len(data.columns)):
-            idx = col
-            while (idx < len(data)):
-                data.set_value(data.index[idx], data.columns[col], np.nan)
-                idx += step
+        index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
+                           labels=[np.repeat(range(n1), n2).tolist(),
+                                   list(range(n2)) * n1],
+                           names=['lev1', 'lev2'])
+        arr = np.random.randn(n1 * n2, 3)
+        arr[::10000, 0] = np.nan
+        arr[1::10000, 1] = np.nan
+        arr[2::10000, 2] = np.nan
+        data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
         self.df = data
-        self.f_fillna = (lambda x: x.fillna(method='pad'))
+        self.f_max = lambda x: max(x)

-        np.random.seed(2718281)
         n = 20000
         self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
                              columns=['jim', 'joe', 'jolie'])
@@ -691,10 +595,10 @@ def setup(self):
         self.df4 = self.df3.copy()
         self.df4['jim'] = self.df4['joe']

-    def time_transform_func(self):
-        self.df.groupby(level='lev2').transform(self.f_fillna)
+    def time_transform_lambda_max(self):
+        self.df.groupby(level='lev1').transform(self.f_max)

-    def time_transform_ufunc(self):
+    def time_transform_ufunc_max(self):
         self.df.groupby(level='lev1').transform(np.max)

     def time_transform_multi_key1(self):
@@ -710,63 +614,31 @@ def time_transform_multi_key4(self):
         self.df4.groupby(['jim', 'joe'])['jolie'].transform('max')
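For reference, groupby-`transform` broadcasts each group's aggregate back over that group's rows and returns an object shaped like the input, which is what makes the lambda and ufunc variants above directly comparable. A minimal standalone illustration (not part of the benchmark suite):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 2.0, 30.0]})
    # each row receives its group's max back: [2.0, 2.0, 30.0]
    out = df.groupby('key')['val'].transform(np.max)


+class TransformBools(object):

-
-np.random.seed(0)
-N = 120000
-N_TRANSITIONS = 1400
-transition_points = 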
np.random.permutation(np.arange(N))[:N_TRANSITIONS] -transition_points.sort() -transitions = np.zeros((N,), dtype=np.bool) -transitions[transition_points] = True -g = transitions.cumsum() -df = DataFrame({'signal': np.random.rand(N), }) - - - - - -class groupby_transform_series(object): goal_time = 0.2 def setup(self): - np.random.seed(0) N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) - transitions = np.zeros((N,), dtype=np.bool) + transitions = np.zeros(N, dtype=np.bool) transitions[transition_points] = True self.g = transitions.cumsum() self.df = DataFrame({'signal': np.random.rand(N)}) - def time_groupby_transform_series(self): + def time_transform_mean(self): self.df['signal'].groupby(self.g).transform(np.mean) -class groupby_transform_series2(object): +class TransformNaN(object): + # GH 12737 goal_time = 0.2 def setup(self): - np.random.seed(0) - self.df = DataFrame({'key': (np.arange(100000) // 3), - 'val': np.random.randn(100000)}) - - self.df_nans = pd.DataFrame({'key': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df_nans.ix[4::10, 'B':'C'] = 5 - - def time_transform_series2(self): - self.df.groupby('key')['val'].transform(np.mean) - - def time_cumprod(self): - self.df.groupby('key').cumprod() - - def time_cumsum(self): - self.df.groupby('key').cumsum() - - def time_shift(self): - self.df.groupby('key').shift() + self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df_nans.loc[4::10, 'B':'C'] = 5 - def time_transform_dataframe(self): - # GH 12737 + def time_first(self): self.df_nans.groupby('key').transform('first') diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 7d490180e8af6..5aa8f76917797 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -40,10 +40,11 @@ def setup(self): def teardown(self): self.store.close() + self.remove(self.f) def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass @@ -115,10 +116,11 @@ def setup(self): def teardown(self): self.store.close() + self.remove(self.f) def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 7697c3b9d3840..a607168ea0457 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -12,7 +12,7 @@ def setup(self): if (self.rng.dtype == object): self.idx_rng = self.rng.view(Index) else: - self.idx_rng = self.rng.asobject + self.idx_rng = self.rng.astype(object) self.idx_rng2 = self.idx_rng[:(-1)] # other datetime diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index f3e7ebbbd33e8..f271b82c758ee 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -303,3 +303,69 @@ def time_lookup_ix(self): def time_lookup_loc(self): self.s.loc + + +class BooleanRowSelect(object): + + goal_time = 0.2 + + def setup(self): + N = 10000 + np.random.seed(1234) + self.df = DataFrame(np.random.randn(N, 100)) + self.bool_arr = np.zeros(N, dtype=bool) + self.bool_arr[:1000] = True + + def time_frame_boolean_row_select(self): + self.df[self.bool_arr] + + +class GetItemSingleColumn(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + self.df2 = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df3 = DataFrame(np.random.randn(3000, 1)) + + def time_frame_getitem_single_column_label(self): + self.df2['A'] 
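
A note on the conventions used throughout this patch: asv collects every `time_*` method on these classes, runs `setup` before timing, and sweeps `params` as extra arguments; the classes rewritten to use `setup_cache` (`Groups`, `Nth`, `CountMultiDtype`, `AggFunctions` above) build their fixture once and receive it as the first argument instead of rebuilding it per benchmark. A minimal sketch of that pattern, assuming asv's documented discovery rules (the class and its data are illustrative only, not part of this patch):

    import numpy as np
    from pandas import DataFrame


    class ExampleCachedBenchmark(object):
        # asv times each time_* method; goal_time tunes the sampling budget
        goal_time = 0.2
        params = ['first', 'last']
        param_names = ['keep']

        def setup_cache(self):
            # run once per environment; the return value is passed as the
            # first argument to setup and to every time_* method below
            return DataFrame(np.random.randn(10000, 2))

        def time_duplicated(self, df, keep):
            df.duplicated(keep=keep)
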
+
+    def time_frame_getitem_single_column_int(self):
+        self.df3[0]
+
+
+class AssignTimeseriesIndex(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        np.random.seed(1234)
+        idx = date_range('1/1/2000', periods=N, freq='H')
+        self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
+
+    def time_frame_assign_timeseries_index(self):
+        self.df['date'] = self.df.index
+
+
+class InsertColumns(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10**3
+        self.df = DataFrame(index=range(self.N))
+
+    def time_insert(self):
+        np.random.seed(1234)
+        for i in range(100):
+            self.df.insert(0, i, np.random.randn(self.N))
+
+    def time_assign_with_setitem(self):
+        np.random.seed(1234)
+        for i in range(100):
+            self.df[i] = np.random.randn(self.N)
diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py
index d5eedf63dfe8a..c718b13912e73 100644
--- a/asv_bench/benchmarks/io_bench.py
+++ b/asv_bench/benchmarks/io_bench.py
@@ -8,18 +8,36 @@
 import timeit


-class frame_to_csv(object):
+class _BenchTeardown(object):
+    """
+    base class for teardown method implementation
+    """
+    fname = None
+
+    def remove(self, f):
+        try:
+            os.remove(f)
+        except:
+            pass
+
+    def teardown(self):
+        self.remove(self.fname)
+
+
+class frame_to_csv(_BenchTeardown):
     goal_time = 0.2
+    fname = '__test__.csv'

     def setup(self):
         self.df = DataFrame(np.random.randn(3000, 30))

     def time_frame_to_csv(self):
-        self.df.to_csv('__test__.csv')
+        self.df.to_csv(self.fname)


-class frame_to_csv2(object):
+class frame_to_csv2(_BenchTeardown):
     goal_time = 0.2
+    fname = '__test__.csv'

     def setup(self):
         self.df = DataFrame({'A': range(50000), })
@@ -28,22 +46,24 @@ def setup(self):
         self.df['D'] = (self.df.A + 3.0)

     def time_frame_to_csv2(self):
-        self.df.to_csv('__test__.csv')
+        self.df.to_csv(self.fname)


-class frame_to_csv_date_formatting(object):
+class frame_to_csv_date_formatting(_BenchTeardown):
     goal_time = 0.2
+    fname = '__test__.csv'

     def setup(self):
         self.rng = date_range('1/1/2000', periods=1000)
         self.data = DataFrame(self.rng, index=self.rng)

     def time_frame_to_csv_date_formatting(self):
-        self.data.to_csv('__test__.csv', date_format='%Y%m%d')
+        self.data.to_csv(self.fname, date_format='%Y%m%d')


-class frame_to_csv_mixed(object):
+class frame_to_csv_mixed(_BenchTeardown):
     goal_time = 0.2
+    fname = '__test__.csv'

     def setup(self):
         self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=self.create_cols('float'))
@@ -55,7 +75,7 @@ def setup(self):
         self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1)

     def time_frame_to_csv_mixed(self):
-        self.df.to_csv('__test__.csv')
+        self.df.to_csv(self.fname)

     def create_cols(self, name):
         return [('%s%03d' % (name, i)) for i in range(5)]
@@ -94,28 +114,30 @@ def time_read_csv_infer_datetime_format_ymd(self):
         read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)


-class read_csv_skiprows(object):
+class read_csv_skiprows(_BenchTeardown):
     goal_time = 0.2
+    fname = '__test__.csv'

     def setup(self):
         self.index = tm.makeStringIndex(20000)
         self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index)
-        self.df.to_csv('__test__.csv')
+        self.df.to_csv(self.fname)

     def time_read_csv_skiprows(self):
-        read_csv('__test__.csv', skiprows=10000)
+        read_csv(self.fname, skiprows=10000)
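
With `_BenchTeardown` factored out above, a new I/O benchmark only has to declare its `fname` and write to it; asv calls `teardown` after each benchmark, so the temporary file is removed automatically. A sketch of how a new benchmark would plug in (the class below is illustrative only, not part of this patch):

    class frame_to_json(_BenchTeardown):
        # hypothetical example; _BenchTeardown.teardown removes fname
        goal_time = 0.2
        fname = '__test__.json'

        def setup(self):
            self.df = DataFrame(np.random.randn(1000, 10))

        def time_frame_to_json(self):
            self.df.to_json(self.fname)


-class read_csv_standard(object):
+class 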
read_csv_standard(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.index = tm.makeStringIndex(10000) self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) def time_read_csv_standard(self): - read_csv('__test__.csv') + read_csv(self.fname) class read_parse_dates_iso8601(object): @@ -152,15 +174,16 @@ def time_read_uint64_na_values(self): read_csv(StringIO(self.data1), header=None, na_values=self.na_values) -class write_csv_standard(object): +class write_csv_standard(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.index = tm.makeStringIndex(10000) self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) def time_write_csv_standard(self): - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) class read_csv_from_s3(object): @@ -195,7 +218,7 @@ def time_read_nrows(self, compression, engine): compression=compression, engine=engine) -class read_json_lines(object): +class read_json_lines(_BenchTeardown): goal_time = 0.2 fname = "__test__.json" @@ -205,12 +228,6 @@ def setup(self): self.df = DataFrame({('float{0}'.format(i), randn(self.N)) for i in range(self.C)}) self.df.to_json(self.fname,orient="records",lines=True) - def teardown(self): - try: - os.remove(self.fname) - except: - pass - def time_read_json_lines(self): pd.read_json(self.fname, lines=True) diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py new file mode 100644 index 0000000000000..849776bf9a591 --- /dev/null +++ b/asv_bench/benchmarks/offset.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import numpy as np + +import pandas as pd +from pandas import date_range + +try: + import pandas.tseries.holiday +except ImportError: + pass + +hcal = pd.tseries.holiday.USFederalHolidayCalendar() + + +class ApplyIndex(object): + goal_time = 0.2 + + params = [pd.offsets.YearEnd(), pd.offsets.YearBegin(), + pd.offsets.BYearEnd(), pd.offsets.BYearBegin(), + pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(), + pd.offsets.BQuarterEnd(), pd.offsets.BQuarterBegin(), + pd.offsets.MonthEnd(), pd.offsets.MonthBegin(), + pd.offsets.BMonthEnd(), pd.offsets.BMonthBegin()] + + def setup(self, param): + self.offset = param + + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + self.ser = pd.Series(self.rng) + + def time_apply_index(self, param): + self.rng + self.offset + + def time_apply_series(self, param): + self.ser + self.offset + + +class OnOffset(object): + goal_time = 0.2 + + params = [pd.offsets.QuarterBegin(), pd.offsets.QuarterEnd(), + pd.offsets.BQuarterBegin(), pd.offsets.BQuarterEnd()] + param_names = ['offset'] + + def setup(self, offset): + self.offset = offset + self.dates = [datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31)] + + def time_on_offset(self, offset): + for date in self.dates: + self.offset.onOffset(date) + + +class DatetimeIndexArithmetic(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + self.day_offset = pd.offsets.Day() + self.relativedelta_offset = pd.offsets.DateOffset(months=2, 
days=2) + self.busday_offset = pd.offsets.BusinessDay() + + def time_add_offset_delta(self): + self.rng + self.day_offset + + def time_add_offset_fast(self): + self.rng + self.relativedelta_offset + + def time_add_offset_slow(self): + self.rng + self.busday_offset + + +class SeriesArithmetic(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + rng = date_range(start='20140101', freq='T', periods=self.N) + self.ser = pd.Series(rng) + self.day_offset = pd.offsets.Day() + self.relativedelta_offset = pd.offsets.DateOffset(months=2, days=2) + self.busday_offset = pd.offsets.BusinessDay() + + def time_add_offset_delta(self): + self.ser + self.day_offset + + def time_add_offset_fast(self): + self.ser + self.relativedelta_offset + + def time_add_offset_slow(self): + self.ser + self.busday_offset + + +class YearBegin(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.year = pd.offsets.YearBegin() + + def time_timeseries_year_apply(self): + self.year.apply(self.date) + + def time_timeseries_year_incr(self): + self.date + self.year + + +class Day(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.day = pd.offsets.Day() + + def time_timeseries_day_apply(self): + self.day.apply(self.date) + + def time_timeseries_day_incr(self): + self.date + self.day + + +class CBDay(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.cday = pd.offsets.CustomBusinessDay() + + def time_custom_bday_decr(self): + self.date - self.cday + + def time_custom_bday_incr(self): + self.date + self.cday + + def time_custom_bday_apply(self): + self.cday.apply(self.date) + + def time_custom_bday_apply_dt64(self): + self.cday.apply(self.dt64) + + +class CBDayHolidays(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=hcal) + + def time_custom_bday_cal_incr(self): + self.date + 1 * self.cdayh + + def time_custom_bday_cal_decr(self): + self.date - 1 * self.cdayh + + def time_custom_bday_cal_incr_n(self): + self.date + 10 * self.cdayh + + def time_custom_bday_cal_incr_neg_n(self): + self.date - 10 * self.cdayh + + +class CBMonthBegin(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal) + + def time_custom_bmonthbegin_decr_n(self): + self.date - (10 * self.cmb) + + def time_custom_bmonthbegin_incr_n(self): + self.date + (10 * self.cmb) + + +class CBMonthEnd(object): + goal_time = 0.2 + + def setup(self): + self.date = datetime(2011, 1, 1) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal) + + def time_custom_bmonthend_incr(self): + self.date + self.cme + + def time_custom_bmonthend_incr_n(self): + self.date + (10 * self.cme) + + def time_custom_bmonthend_decr_n(self): + self.date - (10 * self.cme) + + +class SemiMonthOffset(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + # date is not on an offset which will be slowest case + self.date = datetime(2011, 1, 2) + self.semi_month_end = pd.offsets.SemiMonthEnd() + self.semi_month_begin = pd.offsets.SemiMonthBegin() + + def time_end_apply(self): + self.semi_month_end.apply(self.date) + + def time_end_incr(self): + self.date + self.semi_month_end + + def time_end_incr_n(self): + self.date + 10 * self.semi_month_end + + def time_end_decr(self): + 
self.date - self.semi_month_end + + def time_end_decr_n(self): + self.date - 10 * self.semi_month_end + + def time_end_apply_index(self): + self.semi_month_end.apply_index(self.rng) + + def time_end_incr_rng(self): + self.rng + self.semi_month_end + + def time_end_decr_rng(self): + self.rng - self.semi_month_end + + def time_begin_apply(self): + self.semi_month_begin.apply(self.date) + + def time_begin_incr(self): + self.date + self.semi_month_begin + + def time_begin_incr_n(self): + self.date + 10 * self.semi_month_begin + + def time_begin_decr(self): + self.date - self.semi_month_begin + + def time_begin_decr_n(self): + self.date - 10 * self.semi_month_begin + + def time_begin_apply_index(self): + self.semi_month_begin.apply_index(self.rng) + + def time_begin_incr_rng(self): + self.rng + self.semi_month_begin + + def time_begin_decr_rng(self): + self.rng - self.semi_month_begin diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index e3d95aa3586c5..758162f000e8d 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -9,6 +9,7 @@ import numpy as np from random import randrange + class _Packers(object): goal_time = 0.2 @@ -17,19 +18,22 @@ def _setup(self): self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict(('float{0}'.format(i), randn(self.N)) for i in range(self.C)), index=self.index) + self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)}, index=self.index) self.df2 = self.df.copy() self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass + def teardown(self): + self.remove(self.f) + + class Packers(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -38,8 +42,8 @@ def setup(self): def time_packers_read_csv(self): pd.read_csv(self.f) + class packers_read_excel(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -54,7 +58,6 @@ def time_packers_read_excel(self): class packers_read_hdf_store(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -115,6 +118,7 @@ def setup(self): def time_packers_read_pickle(self): pd.read_pickle(self.f) + class packers_read_sql(_Packers): def setup(self): @@ -177,9 +181,6 @@ def setup(self): def time_write_csv(self): self.df.to_csv(self.f) - def teardown(self): - self.remove(self.f) - class Excel(_Packers): @@ -217,8 +218,6 @@ def time_write_hdf_store(self): def time_write_hdf_table(self): self.df2.to_hdf(self.f, 'df', table=True) - def teardown(self): - self.remove(self.f) class JSON(_Packers): @@ -259,9 +258,6 @@ def time_write_json_mixed_float_int_str(self): def time_write_json_lines(self): self.df.to_json(self.f, orient="records", lines=True) - def teardown(self): - self.remove(self.f) - class MsgPack(_Packers): @@ -271,9 +267,6 @@ def setup(self): def time_write_msgpack(self): self.df2.to_msgpack(self.f) - def teardown(self): - self.remove(self.f) - class Pickle(_Packers): @@ -283,9 +276,6 @@ def setup(self): def time_write_pickle(self): self.df2.to_pickle(self.f) - def teardown(self): - self.remove(self.f) - class SQL(_Packers): @@ -313,6 +303,3 @@ def time_write_stata(self): def time_write_stata_with_validation(self): self.df3.to_stata(self.f, {'index': 'tc', }) - - def teardown(self): - self.remove(self.f) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index b1a58e49fe86c..62eb826418030 100644 --- 
a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -13,7 +13,11 @@
 except ImportError:
     pass

-np.random.seed(1234)
+# This function just needs to be imported into each benchmark file in order
+# to set up the random seed before each benchmark runs.
+# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
+def setup(*args, **kwargs):
+    np.random.seed(1234)

 # try em until it works!
 for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 63562f90eab2b..157d5fe1e3948 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -23,7 +23,7 @@ class replace_large_dict(object):
     def setup(self):
         self.n = (10 ** 6)
         self.start_value = (10 ** 5)
-        self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n)))
+        self.to_rep = {i: self.start_value + i for i in range(self.n)}
         self.s = Series(np.random.randint(self.n, size=(10 ** 3)))

     def time_replace_large_dict(self):
@@ -35,8 +35,8 @@ class replace_convert(object):

     def setup(self):
         self.n = (10 ** 3)
-        self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n)))
-        self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n)))
+        self.to_ts = {i: pd.Timestamp(i) for i in range(self.n)}
+        self.to_td = {i: pd.Timedelta(i) for i in range(self.n)}
         self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
         self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)),
                              'B': np.random.randint(self.n, size=(10 ** 3))})
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 177e3e7cb87fa..951f718257170 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -117,3 +117,23 @@ def setup(self):
     def time_wide_to_long_big(self):
         self.df['id'] = self.df.index
         wide_to_long(self.df, list(self.vars), i='id', j='year')
+
+
+class PivotTable(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        fac2 = np.array(['one', 'two'], dtype='O')
+        ind1 = np.random.randint(0, 3, size=N)
+        ind2 = np.random.randint(0, 2, size=N)
+        self.df = DataFrame({'key1': fac1.take(ind1),
+                             'key2': fac2.take(ind2),
+                             'key3': fac2.take(ind2),
+                             'value1': np.random.randn(N),
+                             'value2': np.random.randn(N),
+                             'value3': np.random.randn(N)})
+
+    def time_pivot_table(self):
+        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 3c0e2869357ae..81c43f7bc975f 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -123,6 +123,30 @@ def time_series_dropna_datetime(self):
         self.s.dropna()


+class series_map_dict(object):
+    goal_time = 0.2
+
+    def setup(self):
+        map_size = 1000
+        self.s = Series(np.random.randint(0, map_size, 10000))
+        self.map_dict = {i: map_size - i for i in range(map_size)}
+
+    def time_series_map_dict(self):
+        self.s.map(self.map_dict)
+
+
+class series_map_series(object):
+    goal_time = 0.2
+
+    def setup(self):
+        map_size = 1000
+        self.s = Series(np.random.randint(0, map_size, 10000))
+        self.map_series = Series(map_size - np.arange(map_size))
+
+    def time_series_map_series(self):
+        self.s.map(self.map_series)
+
+
 class series_clip(object):
     goal_time = 0.2

@@ -131,3 +155,25 @@ def setup(self):

     def time_series_dropna_datetime(self):
         self.s.clip(0, 1)
+
+
+class series_value_counts(object):
+    goal_time = 0.2
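
The module-level `setup` hook added to pandas_vb_common above works purely by import: per the asv docs linked in its comment, a module-level `setup` function runs before every benchmark in that module, so importing it re-seeds NumPy's RNG and keeps the generated data deterministic across runs. A sketch of how a benchmark module would consume it (the module and class here are hypothetical; the mechanism is the one the patch relies on):

    import numpy as np
    from pandas import Series

    from .pandas_vb_common import setup  # noqa: re-seeds np.random before each benchmark


    class ExampleSeeded(object):
        goal_time = 0.2

        def setup(self):
            # np.random was already seeded by the imported module-level
            # hook, so this fixture is identical on every run
            self.s = Series(np.random.randint(0, 1000, size=10**5))

        def time_value_counts(self):
            self.s.value_counts()

+
+    def setup(self):
+        self.s = 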
Series(np.random.randint(0, 1000, size=100000)) + self.s2 = self.s.astype(float) + + self.K = 1000 + self.N = 100000 + self.uniques = tm.makeStringIndex(self.K).values + self.s3 = Series(np.tile(self.uniques, (self.N // self.K))) + + def time_value_counts_int64(self): + self.s.value_counts() + + def time_value_counts_float64(self): + self.s2.value_counts() + + def time_value_counts_strings(self): + self.s.value_counts() diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c1600d4e07f58..948d4b92a5a57 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -105,3 +105,15 @@ def setup(self): def time_encode_decode(self): self.ser.str.encode('utf-8').str.decode('utf-8') + + +class StringSlice(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(['abcdefg', np.nan] * 500000) + + def time_series_string_vector_slice(self): + # GH 2602 + self.s.str[:5] diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 2ca2416f58b57..b3996739e33f7 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -2,13 +2,11 @@ from pandas.plotting._converter import DatetimeConverter except ImportError: from pandas.tseries.converter import DatetimeConverter -from .pandas_vb_common import * + import pandas as pd +from pandas import to_datetime, date_range, Series, DataFrame, period_range + import datetime as dt -try: - import pandas.tseries.holiday -except ImportError: - pass from pandas.tseries.frequencies import infer_freq import numpy as np @@ -22,32 +20,38 @@ class DatetimeIndex(object): def setup(self): self.N = 100000 self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') + self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, + freq='S', tz='US/Eastern') - self.index_repeated = date_range(start='1/1/2000', periods=1000, freq='T').repeat(10) + self.index_repeated = date_range(start='1/1/2000', + periods=1000, freq='T').repeat(10) self.rng3 = date_range(start='1/1/2000', periods=1000, freq='H') self.df = DataFrame(np.random.randn(len(self.rng3), 2), self.rng3) - self.rng4 = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') - self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), index=self.rng4) + self.rng4 = date_range(start='1/1/2000', periods=1000, + freq='H', tz='US/Eastern') + self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), + index=self.rng4) N = 100000 self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, tz='Asia/Tokyo').repeat(5) - self.rng5 = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') + self.rng5 = date_range(start='1/1/2000', + end='3/1/2000', tz='US/Eastern') - self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') + self.dst_rng = date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', + end='10/29/2000 00:59:59', freq='S') self.index = self.index.append(self.dst_rng) self.index = self.index.append(self.dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 
3:00:00', freq='S')) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', + end='10/29/2000 3:00:00', + freq='S')) self.N = 10000 self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B') @@ -62,15 +66,6 @@ def setup(self): def time_add_timedelta(self): (self.rng + dt.timedelta(minutes=2)) - def time_add_offset_delta(self): - (self.rng + self.delta_offset) - - def time_add_offset_fast(self): - (self.rng + self.fast_offset) - - def time_add_offset_slow(self): - (self.rng + self.slow_offset) - def time_normalize(self): self.rng2.normalize() @@ -116,6 +111,7 @@ def time_to_date(self): def time_to_pydatetime(self): self.rng.to_pydatetime() + class TimeDatetimeConverter(object): goal_time = 0.2 @@ -156,7 +152,7 @@ def time_iter_periodindex_preexit(self): self.iter_n(self.idx2, self.M) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Resampling class ResampleDataFrame(object): @@ -195,7 +191,8 @@ def setup(self): self.rng2 = date_range(start='1/1/2000', end='1/1/2001', freq='T') self.ts2 = Series(np.random.randn(len(self.rng2)), index=self.rng2) - self.rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') + self.rng3 = date_range(start='2000-01-01 00:00:00', + end='2000-01-01 10:00:00', freq='555000U') self.int_ts = Series(5, self.rng3, dtype='int64') self.dt_ts = self.int_ts.astype('datetime64[ns]') @@ -223,7 +220,8 @@ def setup(self): self.N = 10000 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.dates = date_range(start='1/1/1990', + periods=(self.N * 10), freq='5s') self.ts2 = self.ts.copy() self.ts2[250:5000] = np.nan self.ts3 = self.ts.copy() @@ -261,7 +259,8 @@ def setup(self): self.N = 10000 self.M = 100 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.dates = date_range(start='1/1/1990', + periods=(self.N * 10), freq='5s') self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng) self.ts2 = self.ts.copy() self.ts2.iloc[250:5000] = np.nan @@ -306,8 +305,10 @@ def setup(self): self.lindex = np.random.permutation(self.N)[:(self.N // 2)] self.rindex = np.random.permutation(self.N)[:(self.N // 2)] - self.left = Series(self.ts2.values.take(self.lindex), index=self.ts2.index.take(self.lindex)) - self.right = Series(self.ts2.values.take(self.rindex), index=self.ts2.index.take(self.rindex)) + self.left = Series(self.ts2.values.take(self.lindex), + index=self.ts2.index.take(self.lindex)) + self.right = Series(self.ts2.values.take(self.rindex), + index=self.ts2.index.take(self.rindex)) self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') self.ts3 = Series(1, index=self.rng3) @@ -329,26 +330,6 @@ def time_large_lookup_value(self): self.ts3.index._cleanup() -class SeriesArithmetic(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_add_offset_delta(self): - (self.s + self.delta_offset) - - def time_add_offset_fast(self): - (self.s + self.fast_offset) - - def time_add_offset_slow(self): - (self.s + 
self.slow_offset) - - class ToDatetime(object): goal_time = 0.2 @@ -425,136 +406,6 @@ def time_cache_false_with_dup_string_tzoffset_dates(self): to_datetime(self.dup_string_with_tz, cache=False) -class Offsets(object): - goal_time = 0.2 - - def setup(self): - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_apply(self): - self.day.apply(self.date) - - def time_timeseries_day_incr(self): - (self.date + self.day) - - def time_timeseries_year_apply(self): - self.year.apply(self.date) - - def time_timeseries_year_incr(self): - (self.date + self.year) - - # custom business offsets - - def time_custom_bday_decr(self): - (self.date - self.cday) - - def time_custom_bday_incr(self): - (self.date + self.cday) - - def time_custom_bday_apply(self): - self.cday.apply(self.date) - - def time_custom_bday_apply_dt64(self): - self.cday.apply(self.dt64) - - def time_custom_bday_cal_incr(self): - self.date + 1 * self.cdayh - - def time_custom_bday_cal_decr(self): - self.date - 1 * self.cdayh - - def time_custom_bday_cal_incr_n(self): - self.date + 10 * self.cdayh - - def time_custom_bday_cal_incr_neg_n(self): - self.date - 10 * self.cdayh - - # Increment custom business month - - def time_custom_bmonthend_incr(self): - (self.date + self.cme) - - def time_custom_bmonthend_incr_n(self): - (self.date + (10 * self.cme)) - - def time_custom_bmonthend_decr_n(self): - (self.date - (10 * self.cme)) - - def time_custom_bmonthbegin_decr_n(self): - (self.date - (10 * self.cmb)) - - def time_custom_bmonthbegin_incr_n(self): - (self.date + (10 * self.cmb)) - - -class SemiMonthOffset(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - # date is not on an offset which will be slowest case - self.date = dt.datetime(2011, 1, 2) - self.semi_month_end = pd.offsets.SemiMonthEnd() - self.semi_month_begin = pd.offsets.SemiMonthBegin() - - def time_end_apply(self): - self.semi_month_end.apply(self.date) - - def time_end_incr(self): - self.date + self.semi_month_end - - def time_end_incr_n(self): - self.date + 10 * self.semi_month_end - - def time_end_decr(self): - self.date - self.semi_month_end - - def time_end_decr_n(self): - self.date - 10 * self.semi_month_end - - def time_end_apply_index(self): - self.semi_month_end.apply_index(self.rng) - - def time_end_incr_rng(self): - self.rng + self.semi_month_end - - def time_end_decr_rng(self): - self.rng - self.semi_month_end - - def time_begin_apply(self): - self.semi_month_begin.apply(self.date) - - def time_begin_incr(self): - self.date + self.semi_month_begin - - def time_begin_incr_n(self): - self.date + 10 * self.semi_month_begin - - def time_begin_decr(self): - self.date - self.semi_month_begin - - def time_begin_decr_n(self): - self.date - 10 * self.semi_month_begin - - def time_begin_apply_index(self): - self.semi_month_begin.apply_index(self.rng) - - def time_begin_incr_rng(self): - self.rng + self.semi_month_begin - - def time_begin_decr_rng(self): - self.rng - self.semi_month_begin - - class DatetimeAccessor(object): def setup(self): self.N = 100000 diff --git 
a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index b8ef309e6a464..9d7d6d2998a8b 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -1,4 +1,3 @@ -from .pandas_vb_common import * from pandas import to_timedelta, Timestamp import pytz import datetime @@ -7,61 +6,64 @@ class TimestampProperties(object): goal_time = 0.2 - params = [None, pytz.timezone('Europe/Amsterdam')] - param_names = ['tz'] + params = [(None, None), + (pytz.timezone('Europe/Amsterdam'), None), + (None, 'B'), + (pytz.timezone('Europe/Amsterdam'), 'B')] + param_names = ['tz', 'freq'] - def setup(self, tz): - self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz) + def setup(self, tz, freq): + self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) - def time_tz(self): + def time_tz(self, tz, freq): self.ts.tz - def time_offset(self): + def time_offset(self, tz, freq): self.ts.offset - def time_dayofweek(self): + def time_dayofweek(self, tz, freq): self.ts.dayofweek - def time_weekday_name(self): + def time_weekday_name(self, tz, freq): self.ts.weekday_name - def time_dayofyear(self): + def time_dayofyear(self, tz, freq): self.ts.dayofyear - def time_week(self): + def time_week(self, tz, freq): self.ts.week - def time_quarter(self): + def time_quarter(self, tz, freq): self.ts.quarter - def time_days_in_month(self): + def time_days_in_month(self, tz, freq): self.ts.days_in_month - def time_freqstr(self): + def time_freqstr(self, tz, freq): self.ts.freqstr - def time_is_month_start(self): + def time_is_month_start(self, tz, freq): self.ts.is_month_start - def time_is_month_end(self): + def time_is_month_end(self, tz, freq): self.ts.is_month_end - def time_is_quarter_start(self): + def time_is_quarter_start(self, tz, freq): self.ts.is_quarter_start - def time_is_quarter_end(self): + def time_is_quarter_end(self, tz, freq): self.ts.is_quarter_end - def time_is_year_start(self): + def time_is_year_start(self, tz, freq): self.ts.is_quarter_end - def time_is_year_end(self): + def time_is_year_end(self, tz, freq): self.ts.is_quarter_end - def time_is_leap_year(self): + def time_is_leap_year(self, tz, freq): self.ts.is_quarter_end - def time_microsecond(self): + def time_microsecond(self, tz, freq): self.ts.microsecond @@ -74,13 +76,13 @@ class TimestampOps(object): def setup(self, tz): self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) - def time_replace_tz(self): + def time_replace_tz(self, tz): self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) - def time_replace_None(self): + def time_replace_None(self, tz): self.ts.replace(tzinfo=None) - def time_to_pydatetime(self): + def time_to_pydatetime(self, tz): self.ts.to_pydatetime() diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index c3d3d59f895c6..57748fef1a2e5 100644 --- a/ci/environment-dev.yaml +++ b/ci/environment-dev.yaml @@ -6,8 +6,8 @@ dependencies: - Cython - NumPy - moto - - pytest - - python-dateutil + - pytest>=3.1 + - python-dateutil>=2.5.0 - python=3 - pytz - setuptools diff --git a/ci/lint.sh b/ci/lint.sh index 4027737900bf9..5d9fafe6c9064 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -90,13 +90,26 @@ if [ "$LINT" ]; then # # Check the following functions: # any(), all(), sum(), max(), min(), list(), dict(), set(), frozenset(), tuple(), str.join() - grep -R --include="*.py*" -E "[^_](any|all|sum|max|min|list|dict|set|frozenset|tuple|join)\(\[.* for .* in .*\]\)" + grep -R --include="*.py*" -E "[^_](any|all|sum|max|min|list|dict|set|frozenset|tuple|join)\(\[.* for .* in .*\]\)" * if [ 
$? = "0" ]; then RET=1 fi echo "Check for use of lists instead of generators in built-in Python functions DONE" + echo "Check for incorrect sphinx directives" + SPHINX_DIRECTIVES=$(echo \ + "autosummary|contents|currentmodule|deprecated|function|image|"\ + "important|include|ipython|literalinclude|math|module|note|raw|"\ + "seealso|toctree|versionadded|versionchanged|warning" | tr -d "[:space:]") + for path in './pandas' './doc/source' + do + grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. ($SPHINX_DIRECTIVES):[^:]" $path + if [ $? = "0" ]; then + RET=1 + fi + done + echo "Check for incorrect sphinx directives DONE" else echo "NOT Linting" fi diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index 415df13179fcf..e24baa98d956e 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,6 +1,6 @@ python=2.7* -python-dateutil=2.4.1 +python-dateutil=2.5.0 pytz=2013b nomkl numpy -cython=0.23 +cython=0.24 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index a68e1d256058d..7c10b98fb6e14 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -1,11 +1,11 @@ -python-dateutil=2.4.1 +python-dateutil=2.5.0 pytz=2013b numpy xlwt=0.7.5 numexpr pytables matplotlib -openpyxl=1.6.2 +openpyxl=2.4.0 xlrd=0.9.2 sqlalchemy=0.9.6 lxml diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build index d9c932daa110b..0a83a7346e8b5 100644 --- a/ci/requirements-2.7_COMPAT.build +++ b/ci/requirements-2.7_COMPAT.build @@ -1,5 +1,5 @@ python=2.7* numpy=1.9.2 -cython=0.23 -dateutil=1.5 +cython=0.24 +python-dateutil=2.5.0 pytz=2013b diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run index 39bf720140733..c3daed6e6e1da 100644 --- a/ci/requirements-2.7_COMPAT.run +++ b/ci/requirements-2.7_COMPAT.run @@ -1,5 +1,5 @@ numpy=1.9.2 -dateutil=1.5 +python-dateutil=2.5.0 pytz=2013b scipy=0.14.0 xlwt=0.7.5 diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build index 96cb184ec2665..a6f2e25387910 100644 --- a/ci/requirements-2.7_LOCALE.build +++ b/ci/requirements-2.7_LOCALE.build @@ -2,4 +2,4 @@ python=2.7* python-dateutil pytz=2013b numpy=1.9.2 -cython=0.23 +cython=0.24 diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run index 978bbf6a051c5..0a809a7dd6e5d 100644 --- a/ci/requirements-2.7_LOCALE.run +++ b/ci/requirements-2.7_LOCALE.run @@ -1,8 +1,8 @@ python-dateutil -pytz=2013b +pytz numpy=1.9.2 xlwt=0.7.5 -openpyxl=1.6.2 +openpyxl=2.4.0 xlsxwriter=0.5.2 xlrd=0.9.2 bottleneck=1.0.0 diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 6e4f7b65f9728..c9565f2173070 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,2 +1,2 @@ xarray==0.9.1 -pandas-gbq +pandas_gbq diff --git a/ci/requirements-3.6_BUILD_TEST.pip b/ci/requirements-3.6_BUILD_TEST.pip index a0fc77c40bc00..f4617133cad5b 100644 --- a/ci/requirements-3.6_BUILD_TEST.pip +++ b/ci/requirements-3.6_BUILD_TEST.pip @@ -1,7 +1,6 @@ xarray geopandas seaborn -pandas_gbq pandas_datareader statsmodels scikit-learn diff --git a/ci/requirements-3.6_BUILD_TEST.sh b/ci/requirements-3.6_BUILD_TEST.sh index 84dd27c50d587..2a3adeff836ee 100644 --- a/ci/requirements-3.6_BUILD_TEST.sh +++ b/ci/requirements-3.6_BUILD_TEST.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 36 BUILD_TEST" -conda install -n pandas -c conda-forge pyarrow dask +conda install -n pandas -c conda-forge pyarrow dask pyqt qtpy diff --git a/ci/requirements-optional-pip.txt 
b/ci/requirements-optional-pip.txt index 06b22bd8f2c63..8d4421ba2b681 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -1,11 +1,13 @@ # This file was autogenerated by scripts/convert_deps.py -# Do not modify directlybeautifulsoup4 +# Do not modify directly +beautifulsoup4 blosc bottleneck fastparquet feather-format html5lib ipython +ipykernel jinja2 lxml matplotlib diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 2fb36b7cd70d8..e9840388203b1 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -3,8 +3,8 @@ Cython NumPy moto -pytest -python-dateutil +pytest>=3.1 +python-dateutil>=2.5.0 pytz setuptools sphinx \ No newline at end of file diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 0e46d28ab6fc4..435985bd42148 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -5,5 +5,5 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas -echo "pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" +pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/script_multi.sh b/ci/script_multi.sh index e03d60360c800..58742552628c8 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -38,17 +38,17 @@ elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas elif [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" - echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas else - echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/script_single.sh b/ci/script_single.sh index 375e9879e950f..963ce00b4a094 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -23,12 +23,12 @@ elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml 
$TEST_ARGS pandas + pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 2aee11772896f..8152af84228b8 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -16,13 +16,11 @@ requirements: - cython - numpy x.x - setuptools - - pytz - - python-dateutil run: - python - numpy x.x - - python-dateutil + - python-dateutil >=2.5.0 - pytz test: diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 2a358900e340d..66e16808f6af9 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -344,7 +344,9 @@ The following methods are available: :meth:`~Window.sum`, Sum of values :meth:`~Window.mean`, Mean of values -The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are: +The weights used in the window are specified by the ``win_type`` keyword. +The list of recognized types are the `scipy.signal window functions + `__: - ``boxcar`` - ``triang`` diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index f7d1edff15cfb..8ed647c2a19bc 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -53,6 +53,18 @@ the latest web technologies. Its goal is to provide elegant, concise constructio graphics in the style of Protovis/D3, while delivering high-performance interactivity over large data to thin clients. +`seaborn `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Seaborn is a Python visualization library based on `matplotlib +`__. It provides a high-level, dataset-oriented +interface for creating attractive statistical graphics. The plotting functions +in seaborn understand pandas objects and leverage pandas grouping operations +internally to support concise specification of complex visualizations. Seaborn +also goes beyond matplotlib and pandas with the option to perform statistical +estimation while plotting, aggregating across observations and visualizing the +fit of statistical models to emphasize patterns in a dataset. + `yhat/ggplot `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,15 +76,6 @@ but a faithful implementation for python users has long been missing. Although s (as of Jan-2014), the `yhat/ggplot `__ project has been progressing quickly in that direction. -`Seaborn `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Although pandas has quite a bit of "just plot it" functionality built-in, visualization and -in particular statistical graphics is a vast field with a long tradition and lots of ground -to cover. The `Seaborn `__ project builds on top of pandas -and `matplotlib `__ to provide easy plotting of data which extends to -more advanced types of plots then those offered by pandas. - `Vincent `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 264bd1de1fc77..cbe945e0cf2cf 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -94,8 +94,7 @@ hence we'll concentrate our efforts cythonizing these two functions. 
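For reference, the two functions referred to here look roughly like the following sketch. The names ``f`` and ``integrate_f`` come from the surrounding docs; the exact bodies shown are an assumption for illustration, not copied from this diff:

.. code-block:: python

   def f(x):
       # simple scalar kernel evaluated at each sample point
       return x * (x - 1)

   def integrate_f(a, b, N):
       # naive Riemann-sum integration of f over [a, b] with N steps
       s = 0
       dx = (b - a) / N
       for i in range(N):
           s += f(a + i * dx)
       return s * dx
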
Plain cython ~~~~~~~~~~~~ -First we're going to need to import the cython magic function to ipython (for -cython versions < 0.21 you can use ``%load_ext cythonmagic``): +First we're going to need to import the cython magic function to ipython: .. ipython:: python :okwarning: diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index f5c65e175b0db..7c7457df8ea93 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -137,7 +137,6 @@ See the package overview for more detail about what's in the library. visualization style io - remote_data enhancingperf sparse gotchas diff --git a/doc/source/install.rst b/doc/source/install.rst index c805f84d0faaa..aeb1abbadabb3 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -200,8 +200,8 @@ Dependencies * `setuptools `__ * `NumPy `__: 1.9.0 or higher -* `python-dateutil `__: 1.5 or higher -* `pytz `__: Needed for time zone support +* `python-dateutil `__: 2.5.0 or higher +* `pytz `__ .. _install.recommended_dependencies: @@ -228,7 +228,7 @@ Optional Dependencies ~~~~~~~~~~~~~~~~~~~~~ * `Cython `__: Only necessary to build development - version. Version 0.23 or higher. + version. Version 0.24 or higher. * `SciPy `__: miscellaneous statistical functions, Version 0.14.0 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. @@ -244,21 +244,23 @@ Optional Dependencies * For Excel I/O: * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) - * `openpyxl `__: openpyxl version 1.6.1 - or higher (but lower than 2.0.0), or version 2.2 or higher, for writing .xlsx files (xlrd >= 0.9.0) + * `openpyxl `__: openpyxl version 2.4.0 + for writing .xlsx files (xlrd >= 0.9.0) * `XlsxWriter `__: Alternative Excel writer * `Jinja2 `__: Template engine for conditional HTML formatting. * `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). * `blosc `__: for msgpack compression using ``blosc`` -* One of `PyQt4 - `__, `PySide - `__, `pygtk - `__, `xsel - `__, or `xclip - `__: necessary to use +* One of + `qtpy `__ (requires PyQt or PySide), + `PyQt5 `__, + `PyQt4 `__, + `pygtk `__, + `xsel `__, or + `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* For Google BigQuery I/O - see `here `__ +* `pandas-gbq `__: for Google BigQuery I/O. + * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the diff --git a/doc/source/io.rst b/doc/source/io.rst index c94d5bc75d4fc..f96e33dbf9882 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -103,15 +103,20 @@ Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ header : int or list of ints, default ``'infer'`` - Row number(s) to use as the column names, and the start of the data. Default - behavior is as if ``header=0`` if no ``names`` passed, otherwise as if - ``header=None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of ints that specify row locations for a - multi-index on the columns e.g. ``[0,1,3]``. Intervening rows that are not - specified will be skipped (e.g. 
2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first line of the file, if column names are + passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to replace + existing names. + + The header can be a list of ints that specify row locations + for a multi-index on the columns e.g. ``[0,1,3]``. Intervening rows + that are not specified will be skipped (e.g. 2 in this example is + skipped). Note that this parameter ignores commented lines and empty + lines if ``skip_blank_lines=True``, so header=0 denotes the first + line of data rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. Duplicates in this list will cause @@ -553,6 +558,14 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. note:: + + Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first nonblank line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. + .. _io.dupe_names: Duplicate names parsing @@ -2922,7 +2935,7 @@ Writing Excel Files to Memory +++++++++++++++++++++++++++++ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or -``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. Pandas also supports Openpyxl >= 2.2. +``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -2978,9 +2991,7 @@ files if `Xlsxwriter`_ is not available. To specify which writer you want to use, you can pass an engine keyword argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: -- ``openpyxl``: This includes stable support for Openpyxl from 1.6.1. However, - it is advised to use version 2.2 and higher, especially when working with - styles. +- ``openpyxl``: version 2.4 or higher is required - ``xlsxwriter`` - ``xlwt`` @@ -3053,7 +3064,7 @@ We can see that we got the same content back, which we had earlier written to th .. note:: - You may need to install xclip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. + You may need to install xclip or xsel (with gtk, PyQt5, PyQt4 or qtpy) on Linux to use these methods. .. _io.pickle: diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 7d981b815d01b..86d2ec2254057 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -518,14 +518,16 @@ standard database join operations between DataFrame objects: - ``left``: A DataFrame object - ``right``: Another DataFrame object -- ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and +- ``on``: Column or index level names to join on. Must be found in both the left + and right DataFrame objects. 
If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the DataFrames will be inferred to be the join keys -- ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame -- ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame +- ``left_on``: Columns or index levels from the left DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame +- ``right_on``: Columns or index levels from the right DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame - ``left_index``: If ``True``, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys @@ -563,6 +565,10 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.21.0 +.. note:: + + Support for specifying index levels as the ``on``, ``left_on``, and + ``right_on`` parameters was added in version 0.22.0. The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -1121,6 +1127,56 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using labels=['left', 'right'], vertical=False); plt.close('all'); +.. _merging.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.22 + +Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters +may refer to either column names or index level names. This enables merging +``DataFrame`` instances on a combination of index levels and columns without +resetting indexes. + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + result = left.merge(right, on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merge_on_index_and_column.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. note:: + + When DataFrames are merged on a string that matches an index level in both + frames, the index level is preserved as an index level in the resulting + DataFrame. + +.. note:: + + If a string matches both a column name and an index level name, then a + warning is issued and the column takes precedence. This will result in an + ambiguity error in a future version. + Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst deleted file mode 100644 index 5054bb7bcd12e..0000000000000 --- a/doc/source/remote_data.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _remote_data: - -.. currentmodule:: pandas - -****************** -Remote Data Access -****************** - -.. 
_remote_data.pandas_datareader: - -DataReader ---------- - -The sub-package ``pandas.io.data`` was removed in -`v.0.19 `__. -Instead there has been created a separately installable -`pandas-datareader package `__. -This will allow the data modules to be independently updated on your pandas installation. - -For code older than < 0.19 you should replace the imports of the following: - -.. code-block:: python - - from pandas.io import data, wb - -With: - -.. code-block:: python - - from pandas_datareader import data, wb diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 89efa7b4be3ee..2e224f103a95e 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -85,15 +85,6 @@ can be converted back to a regular ndarray by calling ``to_dense``: sparr.to_dense() -.. _sparse.list: - -SparseList ---------- - -The ``SparseList`` class has been deprecated and will be removed in a future version. -See the `docs of a previous version `__ -for documentation on ``SparseList``. - SparseIndex objects ------------------- diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 5829481cdb731..73962efad61e7 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- +- :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). - .. _whatsnew_0211.deprecations: @@ -56,6 +56,8 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- + Conversion ^^^^^^^^^^ @@ -63,8 +65,9 @@ Conversion - Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`) -- -- +- Bug in :func:`DataFrame.to_dict` where tz-aware datetime columns were not converted to the required arrays when used with ``orient='records'``, raising ``TypeError`` (:issue:`18372`) +- Bug in :class:`DatetimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an error if ``end.tzinfo`` is None (:issue:`18431`) +- Bug in :meth:`Series.fillna` which raised when passed a long integer on Python 2 (:issue:`18159`). - Indexing @@ -73,7 +76,8 @@ Indexing - Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`) - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`) -- +- Bug in ``Index.putmask`` when an invalid mask is passed (:issue:`18368`) +- Bug in masked assignment of a ``timedelta64[ns]`` dtype ``Series``, which was incorrectly coerced to float (:issue:`18493`) - I/O @@ -87,7 +91,9 @@ I/O - :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows specifying kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
- +- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) +- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`) +- Bug in :func:`pandas.io.json.json_normalize` where the ``meta`` argument was modified in place (:issue:`18610`) Plotting ^^^^^^^^ @@ -102,7 +108,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) - Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) -- +- Bug in ``rolling.var`` where the calculation was inaccurate with a zero-valued array (:issue:`18430`) - - @@ -119,7 +125,7 @@ Reshaping - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) - Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`) -- +- Bug when merging empty DataFrames when ``np.seterr(divide='raise')`` is set (:issue:`17776`) Numeric ^^^^^^^ @@ -135,10 +141,16 @@ Categorical - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`) - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) +- Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +- Bug in ``DataFrame.groupby(axis=1)`` with a ``CategoricalIndex`` (:issue:`18432`) + +String +^^^^^^ + +- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) Other ^^^^^ - - -- diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a4d60b4dfbb2..495d0beaf3faa 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -32,6 +32,37 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp pd.get_dummies(df, columns=['c'], dtype=bool).dtypes +.. _whatsnew_0220.enhancements.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` +parameters may now refer to either column names or index level names. +This enables merging ``DataFrame`` instances on a combination of index levels +and columns without resetting indexes. See the :ref:`Merge on columns and +levels ` documentation section. +(:issue:`14355`) + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + left.merge(right, on=['key1', 'key2']) + + .. 
_whatsnew_0220.enhancements.other: Other Enhancements @@ -44,19 +75,29 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) +- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) .. _whatsnew_0220.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- -- - +.. _whatsnew_0220.api_breaking.deps: +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +We have updated our minimum supported versions of dependencies (:issue:`15184`). +If installed, we now require: + +-----------------+-----------------+----------+ + | Package | Minimum Version | Required | + +=================+=================+==========+ + | python-dateutil | 2.5.0 | X | + +-----------------+-----------------+----------+ + | openpyxl | 2.4.0 | | + +-----------------+-----------------+----------+ .. _whatsnew_0220.api: @@ -66,6 +107,7 @@ Other API Changes - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) - ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) +- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) @@ -73,8 +115,18 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) +- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. 
(:issue:`12815`) - +- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) +- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) +- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). +- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) +- When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) +- Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) +- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) +- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) +- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) +- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`) .. _whatsnew_0220.deprecations: @@ -82,8 +134,8 @@ Deprecations ~~~~~~~~~~~~ - ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). -- -- +- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). +- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimedeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) .. _whatsnew_0220.prior_deprecations: @@ -94,6 +146,10 @@ Removal of prior version deprecations/changes - The ``levels`` and ``labels`` attributes of a ``MultiIndex`` can no longer be set directly (:issue:`4039`). - ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) - ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) +- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) +- The ``SparseList`` class has been removed (:issue:`14007`) +- The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`) +- ``Categorical.from_array`` has been removed (:issue:`13854`) .. _whatsnew_0220.performance: @@ -104,10 +160,12 @@ Performance Improvements - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - :class:`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... 
sped up through vectorization of underlying methods (:issue:`18092`) +- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) - The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) -- +- Improved performance of ``IntervalIndex.symmetric_difference()`` (:issue:`18475`) +- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) .. _whatsnew_0220.docs: @@ -127,7 +185,7 @@ Bug Fixes Conversion ^^^^^^^^^^ -- +- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - - @@ -137,7 +195,12 @@ Indexing - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`MultiIndex.remove_unused_levels` which would fill NaN values (:issue:`18417`) -- +- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in Python 3 (:issue:`18434`) +- Bug in :class:`Index` construction from a list of mixed type tuples (:issue:`18505`) +- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) +- Bug in ``IntervalIndex.symmetric_difference()`` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) +- Bug in indexing a datetimelike ``Index`` that raised ``ValueError`` instead of ``IndexError`` (:issue:`18386`). 
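A quick illustration of the zipped-tuples fix noted above; this is a sketch of the post-fix behavior, not taken from this PR's tests:

.. code-block:: python

   import pandas as pd

   # zip() returns a lazy iterator on Python 3; previously
   # MultiIndex.from_tuples could fail on it unless the iterator was
   # first materialized with list(...). After the fix, both forms work:
   mi = pd.MultiIndex.from_tuples(zip(['a', 'b'], [1, 2]))
   mi_list = pd.MultiIndex.from_tuples(list(zip(['a', 'b'], [1, 2])))
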
+ I/O ^^^ @@ -159,7 +222,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) - - @@ -173,7 +236,8 @@ Sparse Reshaping ^^^^^^^^^ -- +- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) + - - @@ -195,5 +259,7 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) -- +- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) +- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - diff --git a/doc/sphinxext/numpydoc/phantom_import.py b/doc/sphinxext/numpydoc/phantom_import.py index e0bd645f5db76..f33dd838e8bb3 100755 --- a/doc/sphinxext/numpydoc/phantom_import.py +++ b/doc/sphinxext/numpydoc/phantom_import.py @@ -60,8 +60,8 @@ def import_phantom_module(xml_file): # Sort items so that # - Base classes come before classes inherited from them # - Modules come before their contents - all_nodes = dict((n.attrib['id'], n) for n in root) - + all_nodes = {n.attrib['id']: n for n in root} + def _get_bases(node, recurse=False): bases = [x.attrib['ref'] for x in node.findall('base')] if recurse: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a5aae6d6af656..df8f7bab51dbe 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -27,7 +27,7 @@ from numpy cimport (ndarray, cdef double NaN = np.NaN cdef double nan = NaN -from libc.math cimport sqrt, fabs +from libc.math cimport fabs, sqrt # this is our util.pxd from util cimport numeric, get_nat @@ -212,51 +212,6 @@ cpdef numeric median(numeric[:] arr): kth_smallest(arr, n // 2 - 1)) / 2 -# -------------- Min, Max subsequence - -@cython.boundscheck(False) -@cython.wraparound(False) -def max_subseq(ndarray[double_t] arr): - cdef: - Py_ssize_t i=0, s=0, e=0, T, n - double m, S - - n = len(arr) - - if len(arr) == 0: - return (-1, -1, None) - - m = arr[0] - S = m - T = 0 - - with nogil: - for i in range(1, n): - # S = max { S + A[i], A[i] ) - if (S > 0): - S = S + arr[i] - else: - S = arr[i] - T = i - if S > m: - s = T - e = i - m = S - - return (s, e, m) - - -@cython.boundscheck(False) -@cython.wraparound(False) -def min_subseq(ndarray[double_t] arr): - cdef: - Py_ssize_t s, e - double m - - (s, e, m) = max_subseq(-arr) - - return (s, e, -m) - # ---------------------------------------------------------------------- # Pairwise correlation/covariance diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 336dd77ea9a89..0d3f6664da9e3 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -552,8 +552,8 @@ cpdef ensure_object(object arr): return arr else: return arr.astype(np.object_) - elif hasattr(arr, 'asobject'): - return arr.asobject + elif hasattr(arr, '_box_values_as_index'): + return arr._box_values_as_index() else: return np.array(arr, dtype=np.object_) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index dc0fdcf123c32..9d9ac2ef2f5b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -75,57 +75,6 @@ def group_nth_object(ndarray[object, 
ndim=2] out, out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_bin_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, b - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[float64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.float64) - resx = np.empty(( out).shape, dtype=object) - - if len(bins) == 0: - return - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - @cython.boundscheck(False) @cython.wraparound(False) def group_last_object(ndarray[object, ndim=2] out, @@ -169,56 +118,6 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last_bin_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] bins): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, b - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[float64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.float64) - resx = np.empty(( out).shape, dtype=object) - - if len(bins) == 0: - return - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t _median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 53203dd30daee..4c4449fb3e291 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -105,11 +105,6 @@ cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil: p[3] = (v >> 24) -cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil: - u32to8_le(p, v) - u32to8_le(p + 4, (v >> 32)) - - cdef inline uint64_t u8to64_le(uint8_t* p) nogil: return (p[0] | p[1] << 8 | diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 03596d7d091e0..fa2e1271f4649 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,7 +17,8 @@ from tslibs.conversion cimport maybe_datetimelike_to_i8 from hashtable cimport HashTable -from pandas._libs import algos, period as periodlib, hashtable as _hash +from pandas._libs import algos, hashtable as _hash +from pandas._libs.tslibs import period as periodlib from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta, date diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index c09642511207a..822df1ce2b968 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -14,30 +14,46 @@ import numbers _VALID_CLOSED = 
frozenset(['left', 'right', 'both', 'neither']) -cdef class IntervalMixin: - property closed_left: - def __get__(self): - return self.closed == 'left' or self.closed == 'both' - - property closed_right: - def __get__(self): - return self.closed == 'right' or self.closed == 'both' - - property open_left: - def __get__(self): - return not self.closed_left - - property open_right: - def __get__(self): - return not self.closed_right - - property mid: - def __get__(self): - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * (self.right - self.left) +cdef class IntervalMixin(object): + + @property + def closed_left(self): + """ + Return True if the Interval is closed on the left-side, else False + """ + return self.closed in ('left', 'both') + + @property + def closed_right(self): + """ + Return True if the Interval is closed on the right-side, else False + """ + return self.closed in ('right', 'both') + + @property + def open_left(self): + """ + Return True if the Interval is open on the left-side, else False + """ + return not self.closed_left + + @property + def open_right(self): + """ + Return True if the Interval is open on the right-side, else False + """ + return not self.closed_right + + @property + def mid(self): + """ + Return the midpoint of the Interval + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * (self.right - self.left) cdef _interval_like(other): @@ -55,12 +71,12 @@ cdef class Interval(IntervalMixin): Parameters ---------- left : value - Left bound for interval. + Left bound for the interval right : value - Right bound for interval. - closed : {'left', 'right', 'both', 'neither'} + Right bound for the interval + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the interval is closed on the left-side, right-side, both or - neither. Defaults to 'right'. + neither Examples -------- @@ -77,20 +93,30 @@ cdef class Interval(IntervalMixin): See Also -------- - IntervalIndex : an Index of ``interval`` s that are all closed on the same - side. - cut, qcut : convert arrays of continuous data into categoricals/series of - ``Interval``. + IntervalIndex : An Index of Interval objects that are all closed on the + same side. + cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Interval. 
""" - cdef readonly object left, right + cdef readonly object left + """Left bound for the interval""" + + cdef readonly object right + """Right bound for the interval""" + cdef readonly str closed + """ + Whether the interval is closed on the left-side, right-side, both or + neither + """ def __init__(self, left, right, str closed='right'): # note: it is faster to just do these checks than to use a special # constructor (__cinit__/__new__) to avoid them if closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': %s" % closed) + msg = "invalid option for 'closed': {closed}".format(closed=closed) + raise ValueError(msg) if not left <= right: raise ValueError('left side of interval must be <= right side') self.left = left @@ -122,10 +148,11 @@ cdef class Interval(IntervalMixin): if op == Py_EQ or op == Py_NE: return NotImplemented else: + name = type(self).__name__ + other = type(other).__name__ op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError( - 'unorderable types: %s() %s %s()' % - (type(self).__name__, op_str, type(other).__name__)) + raise TypeError('unorderable types: {name}() {op} {other}()' + .format(name=name, op=op_str, other=other)) def __reduce__(self): args = (self.left, self.right, self.closed) @@ -145,15 +172,18 @@ cdef class Interval(IntervalMixin): def __repr__(self): left, right = self._repr_base() - return ('%s(%r, %r, closed=%r)' % - (type(self).__name__, left, right, self.closed)) + name = type(self).__name__ + repr_str = '{name}({left!r}, {right!r}, closed={closed!r})'.format( + name=name, left=left, right=right, closed=self.closed) + return repr_str def __str__(self): left, right = self._repr_base() start_symbol = '[' if self.closed_left else '(' end_symbol = ']' if self.closed_right else ')' - return '%s%s, %s%s' % (start_symbol, left, right, end_symbol) + return '{start}{left}, {right}{end}'.format( + start=start_symbol, left=left, right=right, end=end_symbol) def __add__(self, y): if isinstance(y, numbers.Number): @@ -211,8 +241,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals): int64_t n = len(intervals) ndarray left, right - left = np.empty(n, dtype=object) - right = np.empty(n, dtype=object) + left = np.empty(n, dtype=intervals.dtype) + right = np.empty(n, dtype=intervals.dtype) for i in range(len(intervals)): interval = intervals[i] @@ -222,8 +252,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals): continue if not isinstance(interval, Interval): - raise TypeError("type {} with value {} is not an interval".format( - type(interval), interval)) + raise TypeError("type {typ} with value {iv} is not an interval" + .format(typ=type(interval), iv=interval)) left[i] = interval.left right[i] = interval.right diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 6befc5e60f5f6..344c5d25d0c3d 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -240,28 +240,4 @@ def ffill_indexer(ndarray[int64_t] indexer): return result -def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, - int64_t max_group): - cdef: - Py_ssize_t i, n = len(indexer) - ndarray[int64_t] result, last_obs - int64_t gid, val - - result = np.empty(n, dtype=np.int64) - - last_obs = np.empty(max_group, dtype=np.int64) - last_obs.fill(-1) - - for i in range(n): - gid = group_ids[i] - val = indexer[i] - if val == -1: - result[i] = last_obs[gid] - else: - result[i] = val - last_obs[gid] = val - - return result - - include "join_helper.pxi" diff --git a/pandas/_libs/src/khash.pxd b/pandas/_libs/khash.pxd 
similarity index 98% rename from pandas/_libs/src/khash.pxd rename to pandas/_libs/khash.pxd index ba9a3c70097b2..b1d965c3618cd 100644 --- a/pandas/_libs/src/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# cython: profile=False from cpython cimport PyObject from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 956aeaf39b021..a39f83d5261c0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,13 +1,18 @@ # cython: profile=False -cimport numpy as np -cimport cython -import numpy as np -import sys +import operator -cdef bint PY3 = (sys.version_info[0] >= 3) - -from numpy cimport * +cimport cython +from cython cimport Py_ssize_t +import numpy as np +cimport numpy as np +from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, PyArray_SETITEM, + PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + flatiter, NPY_OBJECT, + int64_t, + float32_t, float64_t, + uint8_t, uint64_t, + complex128_t) # initialize numpy np.import_array() np.import_ufunc() @@ -57,12 +62,12 @@ from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull -cdef int64_t NPY_NAT = util.get_nat() cimport util +cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull -from libc.math cimport sqrt, fabs +from libc.math cimport fabs, sqrt def values_from_object(object o): @@ -76,27 +81,6 @@ def values_from_object(object o): return o -cpdef map_indices_list(list index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i from 0 <= i < length: - result[index[i]] = i - - return result - - @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(ndarray[object, ndim=1] arr): @@ -515,7 +499,6 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): @cython.wraparound(False) @cython.boundscheck(False) def scalar_compare(ndarray[object] values, object val, object op): - import operator cdef: Py_ssize_t i, n = len(values) ndarray[uint8_t, cast=True] result @@ -550,7 +533,7 @@ def scalar_compare(ndarray[object] values, object val, object op): result[i] = True else: try: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) except (TypeError): result[i] = True elif flag == cpython.Py_EQ: @@ -562,7 +545,7 @@ def scalar_compare(ndarray[object] values, object val, object op): result[i] = False else: try: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) except (TypeError): result[i] = False @@ -574,7 +557,7 @@ def scalar_compare(ndarray[object] values, object val, object op): elif isnull_val: result[i] = False else: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) return result.view(bool) @@ -603,7 +586,6 @@ cpdef bint array_equivalent_object(object[:] left, object[:] right): @cython.wraparound(False) @cython.boundscheck(False) def vec_compare(ndarray[object] left, ndarray[object] right, object op): - import operator cdef: Py_ssize_t i, n = len(left) ndarray[uint8_t, cast=True] result @@ -638,7 +620,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): if checknull(x) or checknull(y): result[i] = True else: - result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + result[i] = PyObject_RichCompareBool(x, y, flag) else: for i in range(n): x = left[i] @@ -647,7 +629,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): if checknull(x) or checknull(y): result[i] = False else: - result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + result[i] = PyObject_RichCompareBool(x, y, flag) return result.view(bool) @@ -929,19 +911,6 @@ def write_csv_rows(list data, ndarray data_index, # ------------------------------------------------------------------------------ # Groupby-related functions -@cython.boundscheck(False) -def arrmap(ndarray[object] index, object func): - cdef int length = index.shape[0] - cdef int i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - for i from 0 <= i < length: - result[i] = func(index[i]) - - return result - - @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): @@ -1107,27 +1076,6 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, return out -def group_count(ndarray[int64_t] values, Py_ssize_t size): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] counts - - counts = np.zeros(size, dtype=np.int64) - for i in range(n): - counts[values[i]] += 1 - return counts - - -def lookup_values(ndarray[object] values, dict mapping): - cdef: - Py_ssize_t i, n = len(values) - - result = np.empty(n, dtype='O') - for i in range(n): - result[i] = mapping[values[i]] - return maybe_convert_objects(result) - - @cython.boundscheck(False) @cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, @@ -1158,70 +1106,6 @@ def count_level_2d(ndarray[uint8_t, ndim=2, 
cast=True] mask, return counts -cdef class _PandasNull: - - def __richcmp__(_PandasNull self, object other, int op): - if op == 2: # == - return isinstance(other, _PandasNull) - elif op == 3: # != - return not isinstance(other, _PandasNull) - else: - return False - - def __hash__(self): - return 0 - -pandas_null = _PandasNull() - - -def fast_zip_fillna(list ndarrays, fill_value=pandas_null): - """ - For zipping multiple ndarrays into an ndarray of tuples - """ - cdef: - Py_ssize_t i, j, k, n - ndarray[object] result - flatiter it - object val, tup - - k = len(ndarrays) - n = len(ndarrays[0]) - - result = np.empty(n, dtype=object) - - # initialize tuples on first pass - arr = ndarrays[0] - it = PyArray_IterNew(arr) - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - tup = PyTuple_New(k) - - if val != val: - val = fill_value - - PyTuple_SET_ITEM(tup, 0, val) - Py_INCREF(val) - result[i] = tup - PyArray_ITER_NEXT(it) - - for j in range(1, k): - arr = ndarrays[j] - it = PyArray_IterNew(arr) - if len(arr) != n: - raise ValueError('all arrays must be same length') - - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - if val != val: - val = fill_value - - PyTuple_SET_ITEM(result[i], j, val) - Py_INCREF(val) - PyArray_ITER_NEXT(it) - - return result - - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/_libs/skiplist.pxd b/pandas/_libs/skiplist.pxd new file mode 100644 index 0000000000000..82a0862112199 --- /dev/null +++ b/pandas/_libs/skiplist.pxd @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +from cython cimport Py_ssize_t + +from numpy cimport double_t + + +cdef extern from "src/skiplist.h": + ctypedef struct node_t: + node_t **next + int *width + double value + int is_nil + int levels + int ref_count + + ctypedef struct skiplist_t: + node_t *head + node_t **tmp_chain + int *tmp_steps + int size + int maxlevels + + skiplist_t* skiplist_init(int) nogil + void skiplist_destroy(skiplist_t*) nogil + double skiplist_get(skiplist_t*, int, int*) nogil + int skiplist_insert(skiplist_t*, double) nogil + int skiplist_remove(skiplist_t*, double) nogil + + +# Note: Node is declared here so that IndexableSkiplist can be exposed; +# Node itself not intended to be exposed. +cdef class Node: + cdef public: + double_t value + list next + list width + + +cdef class IndexableSkiplist: + cdef: + Py_ssize_t size, maxlevels + Node head + + cpdef get(self, Py_ssize_t i) + cpdef insert(self, double value) + cpdef remove(self, double value) diff --git a/pandas/_libs/src/skiplist.pyx b/pandas/_libs/skiplist.pyx similarity index 95% rename from pandas/_libs/src/skiplist.pyx rename to pandas/_libs/skiplist.pyx index 1524dca38d0e0..c96413edfb0f2 100644 --- a/pandas/_libs/src/skiplist.pyx +++ b/pandas/_libs/skiplist.pyx @@ -6,8 +6,7 @@ # Cython version: Wes McKinney -cdef extern from "math.h": - double log(double x) +from libc.math cimport log # MSVC does not have log2! 
@@ -16,6 +15,7 @@ cdef double Log2(double x): cimport numpy as np import numpy as np +from numpy cimport double_t from random import random @@ -25,10 +25,10 @@ np.import_array() # TODO: optimize this, make less messy cdef class Node: - cdef public: - double_t value - list next - list width + # cdef public: + # double_t value + # list next + # list width def __init__(self, double_t value, list next, list width): self.value = value @@ -43,9 +43,9 @@ cdef class IndexableSkiplist: Sorted collection supporting O(lg n) insertion, removal, and lookup by rank. """ - cdef: - Py_ssize_t size, maxlevels - Node head + # cdef: + # Py_ssize_t size, maxlevels + # Node head def __init__(self, expected_size=100): self.size = 0 diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 3fc3625a06634..6e5d8b82c118f 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -5,25 +5,12 @@ from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString cdef extern from "numpy/ndarrayobject.h": - ctypedef int64_t npy_timedelta ctypedef int64_t npy_datetime - ctypedef enum NPY_CASTING: - NPY_NO_CASTING - NPY_EQUIV_CASTING - NPY_SAFE_CASTING - NPY_SAME_KIND_CASTING - NPY_UNSAFE_CASTING - -cdef extern from "numpy_helper.h": - npy_datetime get_datetime64_value(object o) - npy_timedelta get_timedelta64_value(object o) - cdef extern from "numpy/npy_common.h": ctypedef unsigned char npy_bool cdef extern from "datetime/np_datetime.h": - ctypedef enum PANDAS_DATETIMEUNIT: PANDAS_FR_Y PANDAS_FR_M @@ -44,22 +31,13 @@ cdef extern from "datetime/np_datetime.h": npy_int64 year npy_int32 month, day, hour, min, sec, us, ps, as - npy_datetime pandas_datetimestruct_to_datetime( - PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) nogil - void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil - int days_per_month_table[2][12] - int dayofweek(int y, int m, int d) nogil - int is_leapyear(int64_t year) nogil - PANDAS_DATETIMEUNIT get_datetime64_unit(object o) cdef extern from "datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, int *out_local, int *out_tzoffset, PANDAS_DATETIMEUNIT *out_bestunit, @@ -89,7 +67,6 @@ cdef inline int _cstring_to_dts(char *val, int length, int result result = parse_iso_8601_datetime(val, length, PANDAS_FR_ns, - NPY_UNSAFE_CASTING, dts, out_local, out_tzoffset, &out_bestunit, &special) return result diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index 7278cbaff86ca..edc9c0f8f903d 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -24,20 +24,7 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include "np_datetime.h"

 #if PY_MAJOR_VERSION >= 3
-#define PyIntObject PyLongObject
-#define PyInt_Type PyLong_Type
-#define PyInt_Check(op) PyLong_Check(op)
-#define PyInt_CheckExact(op) PyLong_CheckExact(op)
-#define PyInt_FromString PyLong_FromString
-#define PyInt_FromUnicode PyLong_FromUnicode
-#define PyInt_FromLong PyLong_FromLong
-#define PyInt_FromSize_t PyLong_FromSize_t
-#define PyInt_FromSsize_t PyLong_FromSsize_t
 #define PyInt_AsLong PyLong_AsLong
-#define PyInt_AS_LONG PyLong_AS_LONG
-#define PyInt_AsSsize_t PyLong_AsSsize_t
-#define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask
-#define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
 #endif

 const pandas_datetimestruct _NS_MIN_DTS = {
@@ -331,26 +318,19 @@ int cmp_pandas_datetimestruct(const pandas_datetimestruct *a,
 /*
  *
  * Tests for and converts a Python datetime.datetime or datetime.date
- * object into a NumPy pandas_datetimestruct.
+ * object into a NumPy pandas_datetimestruct.  Uses tzinfo (if present)
+ * to convert to UTC time.
  *
  * While the C API has PyDate_* and PyDateTime_* functions, the following
  * implementation just asks for attributes, and thus supports
  * datetime duck typing. The tzinfo time zone conversion would require
  * this style of access anyway.
  *
- * 'out_bestunit' gives a suggested unit based on whether the object
- *      was a datetime.date or datetime.datetime object.
- *
- * If 'apply_tzinfo' is 1, this function uses the tzinfo to convert
- * to UTC time, otherwise it returns the struct with the local time.
- *
  * Returns -1 on error, 0 on success, and 1 (with no error set)
 * if obj doesn't have the needed date or datetime attributes.
 */
int convert_pydatetime_to_datetimestruct(PyObject *obj,
-                                         pandas_datetimestruct *out,
-                                         PANDAS_DATETIMEUNIT *out_bestunit,
-                                         int apply_tzinfo) {
+                                         pandas_datetimestruct *out) {
     PyObject *tmp;
     int isleap;

@@ -417,10 +397,6 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
         !PyObject_HasAttrString(obj, "minute") ||
         !PyObject_HasAttrString(obj, "second") ||
         !PyObject_HasAttrString(obj, "microsecond")) {
-        /* The best unit for date is 'D' */
-        if (out_bestunit != NULL) {
-            *out_bestunit = PANDAS_FR_D;
-        }
         return 0;
     }

@@ -478,7 +454,7 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
     }

     /* Apply the time zone offset if it exists */
-    if (apply_tzinfo && PyObject_HasAttrString(obj, "tzinfo")) {
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
         tmp = PyObject_GetAttrString(obj, "tzinfo");
         if (tmp == NULL) {
             return -1;
         }
@@ -519,11 +495,6 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
         }
     }

-    /* The resolution of Python's datetime is 'us' */
-    if (out_bestunit != NULL) {
-        *out_bestunit = PANDAS_FR_us;
-    }
-
     return 0;

 invalid_date:
@@ -542,54 +513,34 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,

 npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr,
                                                pandas_datetimestruct *d) {
-    pandas_datetime_metadata meta;
     npy_datetime result = PANDAS_DATETIME_NAT;

-    meta.base = fr;
-    meta.num = 1;
-
-    convert_datetimestruct_to_datetime(&meta, d, &result);
+    convert_datetimestruct_to_datetime(fr, d, &result);
     return result;
 }

 void pandas_datetime_to_datetimestruct(npy_datetime val,
                                        PANDAS_DATETIMEUNIT fr,
                                        pandas_datetimestruct *result) {
-    pandas_datetime_metadata meta;
-
-    meta.base = fr;
-    meta.num = 1;
-
-    convert_datetime_to_datetimestruct(&meta, val, result);
+    convert_datetime_to_datetimestruct(fr, val, result);
 }

 void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
                                          PANDAS_DATETIMEUNIT fr,
-                                         pandas_timedeltastruct *result) {
-    pandas_datetime_metadata meta;
-
-    meta.base = fr;
-    meta.num = 1;
-
-    convert_timedelta_to_timedeltastruct(&meta, val, result);
+                                         pandas_timedeltastruct *result) {
+    convert_timedelta_to_timedeltastruct(fr, val, result);
 }

-PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) {
-    return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *)obj)->obmeta.base;
-}

 /*
  * Converts a datetime from a datetimestruct to a datetime based
- * on some metadata. The date is assumed to be valid.
- *
- * TODO: If meta->num is really big, there could be overflow
+ * on a metadata unit. The date is assumed to be valid.
  *
  * Returns 0 on success, -1 on failure.
  */
-int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta,
+int convert_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT base,
                                        const pandas_datetimestruct *dts,
                                        npy_datetime *out) {
     npy_datetime ret;
-    PANDAS_DATETIMEUNIT base = meta->base;

     if (base == PANDAS_FR_Y) {
         /* Truncate to the year */
@@ -681,100 +632,15 @@ int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta,
         }
     }

-    /* Divide by the multiplier */
-    if (meta->num > 1) {
-        if (ret >= 0) {
-            ret /= meta->num;
-        } else {
-            ret = (ret - meta->num + 1) / meta->num;
-        }
-    }
-
     *out = ret;
     return 0;
 }

-/*
- * This provides the casting rules for the TIMEDELTA data type units.
- *
- * Notably, there is a barrier between the nonlinear years and
- * months units, and all the other units.
- */
-npy_bool can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit,
-                                    PANDAS_DATETIMEUNIT dst_unit,
-                                    NPY_CASTING casting) {
-    switch (casting) {
-        /* Allow anything with unsafe casting */
-        case NPY_UNSAFE_CASTING:
-            return 1;
-
-        /*
-         * Only enforce the 'date units' vs 'time units' barrier with
-         * 'same_kind' casting.
-         */
-        case NPY_SAME_KIND_CASTING:
-            return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) ||
-                   (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M);
-
-        /*
-         * Enforce the 'date units' vs 'time units' barrier and that
-         * casting is only allowed towards more precise units with
-         * 'safe' casting.
-         */
-        case NPY_SAFE_CASTING:
-            return (src_unit <= dst_unit) &&
-                   ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) ||
-                    (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M));
-
-        /* Enforce equality with 'no' or 'equiv' casting */
-        default:
-            return src_unit == dst_unit;
-    }
-}
-
-/*
- * This provides the casting rules for the DATETIME data type units.
- *
- * Notably, there is a barrier between 'date units' and 'time units'
- * for all but 'unsafe' casting.
- */
-npy_bool can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit,
-                                   PANDAS_DATETIMEUNIT dst_unit,
-                                   NPY_CASTING casting) {
-    switch (casting) {
-        /* Allow anything with unsafe casting */
-        case NPY_UNSAFE_CASTING:
-            return 1;
-
-        /*
-         * Only enforce the 'date units' vs 'time units' barrier with
-         * 'same_kind' casting.
-         */
-        case NPY_SAME_KIND_CASTING:
-            return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) ||
-                   (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D);
-
-        /*
-         * Enforce the 'date units' vs 'time units' barrier and that
-         * casting is only allowed towards more precise units with
-         * 'safe' casting.
- */ - case NPY_SAFE_CASTING: - return (src_unit <= dst_unit) && - ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || - (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); - - /* Enforce equality with 'no' or 'equiv' casting */ - default: - return src_unit == dst_unit; - } -} - /* * Converts a datetime based on the given metadata into a datetimestruct */ -int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, +int convert_datetime_to_datetimestruct(PANDAS_DATETIMEUNIT base, npy_datetime dt, pandas_datetimestruct *out) { npy_int64 perday; @@ -785,14 +651,11 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, out->month = 1; out->day = 1; - /* TODO: Change to a mechanism that avoids the potential overflow */ - dt *= meta->num; - /* * Note that care must be taken with the / and % operators * for negative values. */ - switch (meta->base) { + switch (base) { case PANDAS_FR_Y: out->year = 1970 + dt; break; @@ -994,14 +857,13 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, /* * Converts a timedelta from a timedeltastruct to a timedelta based - * on some metadata. The timedelta is assumed to be valid. + * on a metadata unit. The timedelta is assumed to be valid. * * Returns 0 on success, -1 on failure. */ -int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, +int convert_timedelta_to_timedeltastruct(PANDAS_DATETIMEUNIT base, npy_timedelta td, pandas_timedeltastruct *out) { - npy_int64 perday; npy_int64 frac; npy_int64 sfrac; npy_int64 ifrac; @@ -1011,16 +873,16 @@ int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, /* Initialize the output to all zeros */ memset(out, 0, sizeof(pandas_timedeltastruct)); - switch (meta->base) { + switch (base) { case PANDAS_FR_ns: // put frac in seconds if (td < 0 && td % (1000LL * 1000LL * 1000LL) != 0) - frac = td / (1000LL * 1000LL * 1000LL) - 1; + frac = td / (1000LL * 1000LL * 1000LL) - 1; else frac = td / (1000LL * 1000LL * 1000LL); - if (frac < 0) { + if (frac < 0) { sign = -1; // even fraction @@ -1030,66 +892,66 @@ int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, } else { frac = -frac; } - } else { + } else { sign = 1; out->days = 0; - } + } - if (frac >= 86400) { + if (frac >= 86400) { out->days += frac / 86400LL; frac -= out->days * 86400LL; - } + } - if (frac >= 3600) { + if (frac >= 3600) { out->hrs = frac / 3600LL; frac -= out->hrs * 3600LL; - } else { + } else { out->hrs = 0; - } + } - if (frac >= 60) { + if (frac >= 60) { out->min = frac / 60LL; frac -= out->min * 60LL; - } else { + } else { out->min = 0; - } + } - if (frac >= 0) { + if (frac >= 0) { out->sec = frac; frac -= out->sec; - } else { + } else { out->sec = 0; - } + } - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * (1000LL * 1000LL * 1000LL); + sfrac = (out->hrs * 3600LL + out->min * 60LL + + out->sec) * (1000LL * 1000LL * 1000LL); - if (sign < 0) + if (sign < 0) out->days = -out->days; - ifrac = td - (out->days * DAY_NS + sfrac); + ifrac = td - (out->days * DAY_NS + sfrac); - if (ifrac != 0) { + if (ifrac != 0) { out->ms = ifrac / (1000LL * 1000LL); ifrac -= out->ms * 1000LL * 1000LL; out->us = ifrac / 1000LL; ifrac -= out->us * 1000LL; out->ns = ifrac; - } else { + } else { out->ms = 0; out->us = 0; out->ns = 0; - } + } - out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; - out->microseconds = out->ms * 1000 + out->us; - out->nanoseconds = out->ns; - break; + out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + 
out->microseconds = out->ms * 1000 + out->us; + out->nanoseconds = out->ns; + break; default: PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); + "NumPy timedelta metadata is corrupted with " + "invalid base unit"); return -1; } diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index c51a4bddac82f..b6c0852bfe764 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -40,8 +40,6 @@ typedef enum { #define PANDAS_DATETIME_NUMUNITS 13 -#define PANDAS_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) - #define PANDAS_DATETIME_NAT NPY_MIN_INT64 typedef struct { @@ -54,13 +52,6 @@ typedef struct { npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; } pandas_timedeltastruct; -typedef struct { - PANDAS_DATETIMEUNIT base; - int num; -} pandas_datetime_metadata; - -typedef pandas_datetime_metadata pandas_timedelta_metadata; - extern const pandas_datetimestruct _NS_MIN_DTS; extern const pandas_datetimestruct _NS_MAX_DTS; @@ -68,9 +59,7 @@ extern const pandas_datetimestruct _NS_MAX_DTS; // ---------------------------------------------------------------------------- int convert_pydatetime_to_datetimestruct(PyObject *obj, - pandas_datetimestruct *out, - PANDAS_DATETIMEUNIT *out_bestunit, - int apply_tzinfo); + pandas_datetimestruct *out); npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d); @@ -91,19 +80,6 @@ extern const int days_per_month_table[2][12]; int is_leapyear(npy_int64 year); -/* - * Converts a datetime from a datetimestruct to a datetime based - * on some metadata. The date is assumed to be valid. - * - * TODO: If meta->num is really big, there could be overflow - * - * Returns 0 on success, -1 on failure. - */ -int -convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, - const pandas_datetimestruct *dts, - npy_datetime *out); - /* * Calculates the days offset from the 1970 epoch. */ @@ -125,30 +101,10 @@ int cmp_pandas_datetimestruct(const pandas_datetimestruct *a, void add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); -/* - * This provides the casting rules for the TIMEDELTA data type units. - * - * Notably, there is a barrier between the nonlinear years and - * months units, and all the other units. - */ -npy_bool -can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, - PANDAS_DATETIMEUNIT dst_unit, - NPY_CASTING casting); - int -convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, +convert_datetime_to_datetimestruct(PANDAS_DATETIMEUNIT base, npy_datetime dt, pandas_datetimestruct *out); -int -convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, - npy_timedelta td, - pandas_timedeltastruct *out); - - -PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); - - #endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/_libs/src/datetime/np_datetime_strings.c b/pandas/_libs/src/datetime/np_datetime_strings.c index 5307d394423ff..92f030b5fea2b 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/datetime/np_datetime_strings.c @@ -32,22 +32,6 @@ This file implements string parsing and creation for NumPy datetime. 
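The `PANDAS_FR_ns` branch of `convert_timedelta_to_timedeltastruct` above normalizes a signed nanosecond count the way `datetime.timedelta` does: only `days` may go negative, while the sub-day fields stay in their usual non-negative ranges. A minimal Python model of that arithmetic follows; the function name and return layout are illustrative only, and Python's flooring `divmod` stands in for the explicit negative-value branches of the C code:

```python
def timedelta_components(td_ns):
    # Mirrors the PANDAS_FR_ns branch above: floor-divide so that only
    # `days` can be negative; all sub-day fields are non-negative.
    secs, subsec_ns = divmod(td_ns, 10**9)
    days, secs = divmod(secs, 86400)
    hrs, secs = divmod(secs, 3600)
    mins, sec = divmod(secs, 60)
    ms, subsec_ns = divmod(subsec_ns, 10**6)
    us, ns = divmod(subsec_ns, 10**3)
    return dict(days=days, hrs=hrs, min=mins, sec=sec, ms=ms, us=us, ns=ns)

# timedelta_components(-1000) -> {'days': -1, 'hrs': 23, 'min': 59,
#                                 'sec': 59, 'ms': 999, 'us': 999, 'ns': 0}
```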
#include "np_datetime.h" #include "np_datetime_strings.h" -NPY_NO_EXPORT const char *npy_casting_to_string(NPY_CASTING casting) { - switch (casting) { - case NPY_NO_CASTING: - return "'no'"; - case NPY_EQUIV_CASTING: - return "'equiv'"; - case NPY_SAFE_CASTING: - return "'safe'"; - case NPY_SAME_KIND_CASTING: - return "'same_kind'"; - case NPY_UNSAFE_CASTING: - return "'unsafe'"; - default: - return ""; - } -} /* Platform-specific time_t typedef */ typedef time_t NPY_TIME_T; @@ -115,51 +99,6 @@ static int get_localtime(NPY_TIME_T *ts, struct tm *tms) { return -1; } -#if 0 -/* - * Wraps `gmtime` functionality for multiple platforms. This - * converts a time value to a time structure in UTC. - * - * Returns 0 on success, -1 on failure. - */ -static int -get_gmtime(NPY_TIME_T *ts, struct tm *tms) { - char *func_name = ""; -#if defined(_WIN32) -#if defined(_MSC_VER) && (_MSC_VER >= 1400) - if (gmtime_s(tms, ts) != 0) { - func_name = "gmtime_s"; - goto fail; - } -#elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) - if (_gmtime64_s(tms, ts) != 0) { - func_name = "_gmtime64_s"; - goto fail; - } -#else - struct tm *tms_tmp; - gmtime_r(ts, tms_tmp); - if (tms_tmp == NULL) { - func_name = "gmtime"; - goto fail; - } - memcpy(tms, tms_tmp, sizeof(struct tm)); -#endif -#else - if (gmtime_r(ts, tms) == NULL) { - func_name = "gmtime_r"; - goto fail; - } -#endif - - return 0; - -fail: - PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " - "to a UTC time", func_name); - return -1; -} -#endif /* * Converts a datetimestruct in UTC to a datetimestruct in local time, @@ -226,115 +165,6 @@ static int convert_datetimestruct_utc_to_local( return 0; } -#if 0 -/* - * Converts a datetimestruct in local time to a datetimestruct in UTC. - * - * Returns 0 on success, -1 on failure. - */ -static int -convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, - const pandas_datetimestruct *dts_local) { - npy_int64 year_correction = 0; - - /* Make a copy of the input 'dts' to modify */ - *out_dts_utc = *dts_local; - - /* HACK: Use a year < 2038 for later years for small time_t */ - if (sizeof(NPY_TIME_T) == 4 && out_dts_utc->year >= 2038) { - if (is_leapyear(out_dts_utc->year)) { - /* 2036 is a leap year */ - year_correction = out_dts_utc->year - 2036; - out_dts_utc->year -= year_correction; - } else { - /* 2037 is not a leap year */ - year_correction = out_dts_utc->year - 2037; - out_dts_utc->year -= year_correction; - } - } - - /* - * ISO 8601 states to treat date-times without a timezone offset - * or 'Z' for UTC as local time. The C standard libary functions - * mktime and gmtime allow us to do this conversion. - * - * Only do this timezone adjustment for recent and future years. - * In this case, "recent" is defined to be 1970 and later, because - * on MS Windows, mktime raises an error when given an earlier date. 
- */ - if (out_dts_utc->year >= 1970) { - NPY_TIME_T rawtime = 0; - struct tm tm_; - - tm_.tm_sec = out_dts_utc->sec; - tm_.tm_min = out_dts_utc->min; - tm_.tm_hour = out_dts_utc->hour; - tm_.tm_mday = out_dts_utc->day; - tm_.tm_mon = out_dts_utc->month - 1; - tm_.tm_year = out_dts_utc->year - 1900; - tm_.tm_isdst = -1; - - /* mktime converts a local 'struct tm' into a time_t */ - rawtime = mktime(&tm_); - if (rawtime == -1) { - PyErr_SetString(PyExc_OSError, "Failed to use mktime to " - "convert local time to UTC"); - return -1; - } - - /* gmtime converts a 'time_t' into a UTC 'struct tm' */ - if (get_gmtime(&rawtime, &tm_) < 0) { - return -1; - } - out_dts_utc->sec = tm_.tm_sec; - out_dts_utc->min = tm_.tm_min; - out_dts_utc->hour = tm_.tm_hour; - out_dts_utc->day = tm_.tm_mday; - out_dts_utc->month = tm_.tm_mon + 1; - out_dts_utc->year = tm_.tm_year + 1900; - } - - /* Reapply the year 2038 year correction HACK */ - out_dts_utc->year += year_correction; - - return 0; -} -#endif - -/* int */ -/* parse_python_string(PyObject* obj, pandas_datetimestruct *dts) { */ -/* PyObject *bytes = NULL; */ -/* char *str = NULL; */ -/* Py_ssize_t len = 0; */ -/* PANDAS_DATETIMEUNIT bestunit = -1; */ - -/* /\* Convert to an ASCII string for the date parser *\/ */ -/* if (PyUnicode_Check(obj)) { */ -/* bytes = PyUnicode_AsASCIIString(obj); */ -/* if (bytes == NULL) { */ -/* return -1; */ -/* } */ -/* } */ -/* else { */ -/* bytes = obj; */ -/* Py_INCREF(bytes); */ -/* } */ -/* if (PyBytes_AsStringAndSize(bytes, &str, &len) == -1) { */ -/* Py_DECREF(bytes); */ -/* return -1; */ -/* } */ - -/* /\* Parse the ISO date *\/ */ -/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, - */ -/* dts, NULL, &bestunit, NULL) < 0) { */ -/* Py_DECREF(bytes); */ -/* return -1; */ -/* } */ -/* Py_DECREF(bytes); */ - -/* return 0; */ -/* } */ /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -354,8 +184,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * 'str' must be a NULL-terminated string, and 'len' must be its length. * 'unit' should contain -1 if the unit is unknown, or the unit * which will be used if it is. - * 'casting' controls how the detected unit from the string is allowed - * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. * 'out_local' gets set to 1 if the parsed time contains timezone, @@ -375,7 +203,7 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * Returns 0 on success, -1 on failure. 
*/ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, + pandas_datetimestruct *out, int *out_local, int *out_tzoffset, PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special) { @@ -444,16 +272,6 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_special = 1; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - return 0; } @@ -461,14 +279,9 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, if (len == 3 && tolower(str[0]) == 'n' && tolower(str[1]) == 'o' && tolower(str[2]) == 'w') { NPY_TIME_T rawtime = 0; - pandas_datetime_metadata meta; time(&rawtime); - /* Set up a dummy metadata for the conversion */ - meta.base = PANDAS_FR_s; - meta.num = 1; - bestunit = PANDAS_FR_s; /* @@ -486,17 +299,7 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_special = 1; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - - return convert_datetime_to_datetimestruct(&meta, rawtime, out); + return convert_datetime_to_datetimestruct(PANDAS_FR_s, rawtime, out); } /* Anything else isn't a special value */ @@ -941,16 +744,6 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_bestunit = bestunit; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - return 0; parse_error: @@ -1018,38 +811,6 @@ int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { return len; } -/* - * Finds the largest unit whose value is nonzero, and for which - * the remainder for the rest of the units is zero. - */ -static PANDAS_DATETIMEUNIT lossless_unit_from_datetimestruct( - pandas_datetimestruct *dts) { - if (dts->as % 1000 != 0) { - return PANDAS_FR_as; - } else if (dts->as != 0) { - return PANDAS_FR_fs; - } else if (dts->ps % 1000 != 0) { - return PANDAS_FR_ps; - } else if (dts->ps != 0) { - return PANDAS_FR_ns; - } else if (dts->us % 1000 != 0) { - return PANDAS_FR_us; - } else if (dts->us != 0) { - return PANDAS_FR_ms; - } else if (dts->sec != 0) { - return PANDAS_FR_s; - } else if (dts->min != 0) { - return PANDAS_FR_m; - } else if (dts->hour != 0) { - return PANDAS_FR_h; - } else if (dts->day != 1) { - return PANDAS_FR_D; - } else if (dts->month != 1) { - return PANDAS_FR_M; - } else { - return PANDAS_FR_Y; - } -} /* * Converts an pandas_datetimestruct to an (almost) ISO 8601 @@ -1069,17 +830,11 @@ static PANDAS_DATETIMEUNIT lossless_unit_from_datetimestruct( * set to a value other than -1. This is a manual override for * the local time zone to use, as an offset in minutes. * - * 'casting' controls whether data loss is allowed by truncating - * the data to a coarser unit. This interacts with 'local', slightly, - * in order to form a date unit string as a local time, the casting - * must be unsafe. 
- * * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting) { + int local, PANDAS_DATETIMEUNIT base, int tzoffset) { pandas_datetimestruct dts_local; int timezone_offset = 0; @@ -1121,38 +876,6 @@ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, add_minutes_to_datetimestruct(dts, timezone_offset); } - /* - * Now the datetimestruct data is in the final form for - * the string representation, so ensure that the data - * is being cast according to the casting rule. - */ - if (casting != NPY_UNSAFE_CASTING) { - /* Producing a date as a local time is always 'unsafe' */ - if (base <= PANDAS_FR_D && local) { - PyErr_SetString(PyExc_TypeError, - "Cannot create a local " - "timezone-based date string from a NumPy " - "datetime without forcing 'unsafe' casting"); - return -1; - } else { - /* Only 'unsafe' and 'same_kind' allow data loss */ - PANDAS_DATETIMEUNIT unitprec; - - unitprec = lossless_unit_from_datetimestruct(dts); - if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { - PyErr_Format(PyExc_TypeError, - "Cannot create a " - "string with unit precision '%s' " - "from the NumPy datetime, which has data at " - "unit precision '%s', " - "requires 'unsafe' or 'same_kind' casting", - _datetime_strings[base], - _datetime_strings[unitprec]); - return -1; - } - } - } - /* YEAR */ /* * Can't use PyOS_snprintf, because it always produces a '\0' diff --git a/pandas/_libs/src/datetime/np_datetime_strings.h b/pandas/_libs/src/datetime/np_datetime_strings.h index 833c1869c1664..4c248129b68c3 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/src/datetime/np_datetime_strings.h @@ -40,8 +40,6 @@ This file implements string parsing and creation for NumPy datetime. * 'str' must be a NULL-terminated string, and 'len' must be its length. * 'unit' should contain -1 if the unit is unknown, or the unit * which will be used if it is. - * 'casting' controls how the detected unit from the string is allowed - * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time. @@ -62,7 +60,6 @@ This file implements string parsing and creation for NumPy datetime. int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, int *out_local, int *out_tzoffset, @@ -90,17 +87,11 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base); * set to a value other than -1. This is a manual override for * the local time zone to use, as an offset in minutes. * - * 'casting' controls whether data loss is allowed by truncating - * the data to a coarser unit. This interacts with 'local', slightly, - * in order to form a date unit string as a local time, the casting - * must be unsafe. - * * Returns 0 on success, -1 on failure (for example if the output * string was too short). 
*/ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting); + int local, PANDAS_DATETIMEUNIT base, int tzoffset); #endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 066beb29c24ce..cb192fcced318 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -3,7 +3,7 @@ from decimal import Decimal cimport util cimport cython from tslibs.nattype import NaT -from tslib cimport convert_to_tsobject +from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone from datetime import datetime, timedelta @@ -464,7 +464,8 @@ cpdef object infer_datetimelike_array(object arr): - timedelta: we have *only* timedeltas and maybe strings, nulls - nat: we do not have *any* date, datetimes or timedeltas, but do have at least a NaT - - mixed: other objects (strings or actual objects) + - mixed: other objects (strings, a mix of tz-aware and tz-naive, or + actual objects) Parameters ---------- @@ -479,6 +480,7 @@ cpdef object infer_datetimelike_array(object arr): cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 + bint seen_tz_aware = 0, seen_tz_naive = 0 bint seen_nat = 0 list objs = [] object v @@ -496,8 +498,20 @@ cpdef object infer_datetimelike_array(object arr): pass elif v is NaT: seen_nat = 1 - elif is_datetime(v) or util.is_datetime64_object(v): - # datetime, or np.datetime64 + elif is_datetime(v): + # datetime + seen_datetime = 1 + + # disambiguate between tz-naive and tz-aware + if v.tzinfo is None: + seen_tz_naive = 1 + else: + seen_tz_aware = 1 + + if seen_tz_naive and seen_tz_aware: + return 'mixed' + elif util.is_datetime64_object(v): + # np.datetime64 seen_datetime = 1 elif is_date(v): seen_date = 1 diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index ad683459ad878..8a9a05723d9fe 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -18,14 +18,6 @@ The full license is in the LICENSE file, distributed with this software. 
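The `infer_datetimelike_array` change earlier in this hunk keeps two flags, `seen_tz_aware` and `seen_tz_naive`, and bails out with 'mixed' the moment both kinds of datetime have been observed. A pure-Python sketch of just that rule; the function name is illustrative, and the real routine additionally handles NaT, dates, timedeltas, and strings:

```python
from datetime import datetime, timezone

def tz_mixedness(values):
    # Short-circuit to 'mixed' as soon as tz-aware and tz-naive coexist,
    # matching the seen_tz_aware/seen_tz_naive bookkeeping in the diff.
    seen_tz_aware = seen_tz_naive = False
    for v in values:
        if isinstance(v, datetime):
            if v.tzinfo is None:
                seen_tz_naive = True
            else:
                seen_tz_aware = True
            if seen_tz_aware and seen_tz_naive:
                return 'mixed'
    return 'datetime' if (seen_tz_aware or seen_tz_naive) else 'empty'

# tz_mixedness([datetime(2017, 1, 1),
#               datetime(2017, 1, 1, tzinfo=timezone.utc)]) -> 'mixed'
```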
PANDAS_INLINE npy_int64 get_nat(void) { return NPY_MIN_INT64; } -PANDAS_INLINE npy_datetime get_datetime64_value(PyObject* obj) { - return ((PyDatetimeScalarObject*)obj)->obval; -} - -PANDAS_INLINE npy_timedelta get_timedelta64_value(PyObject* obj) { - return ((PyTimedeltaScalarObject*)obj)->obval; -} - PANDAS_INLINE int is_integer_object(PyObject* obj) { return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } diff --git a/pandas/_libs/src/skiplist.pxd b/pandas/_libs/src/skiplist.pxd deleted file mode 100644 index 214aa1c7aeaf0..0000000000000 --- a/pandas/_libs/src/skiplist.pxd +++ /dev/null @@ -1,22 +0,0 @@ -cdef extern from "skiplist.h": - ctypedef struct node_t: - node_t **next - int *width - double value - int is_nil - int levels - int ref_count - - ctypedef struct skiplist_t: - node_t *head - node_t **tmp_chain - int *tmp_steps - int size - int maxlevels - - skiplist_t* skiplist_init(int) nogil - void skiplist_destroy(skiplist_t*) nogil - double skiplist_get(skiplist_t*, int, int*) nogil - int skiplist_insert(skiplist_t*, double) nogil - int skiplist_remove(skiplist_t*, double) nogil - diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f799b7f6b4785..7c64db69f0c46 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -456,7 +456,7 @@ static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, } if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, - -1, NPY_UNSAFE_CASTING)) { + -1)) { PRINTMARK(); *_outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; @@ -493,7 +493,7 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, PRINTMARK(); - if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) { + if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { PRINTMARK(); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } else { diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index 61783ab47cb86..e5fe90aa81f7d 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -2,6 +2,7 @@ from numpy cimport ndarray cimport numpy as cnp cimport cpython + cdef extern from "numpy_helper.h": void set_array_not_contiguous(ndarray ao) diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd deleted file mode 100644 index b74cf5b79c4cb..0000000000000 --- a/pandas/_libs/tslib.pxd +++ /dev/null @@ -1,3 +0,0 @@ -from numpy cimport ndarray, int64_t - -from tslibs.conversion cimport convert_to_tsobject diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ea4f4728a0741..020ac812e1c20 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -5,8 +5,9 @@ # distutils: define_macros=CYTHON_TRACE_NOGIL=0 cimport numpy as np -from numpy cimport int64_t, import_array, ndarray, float64_t +from numpy cimport int64_t, ndarray, float64_t import numpy as np +np.import_array() from cpython cimport PyTypeObject, PyFloat_Check @@ -35,18 +36,15 @@ from tslibs.np_datetime cimport (check_dts_bounds, dayofweek, is_leapyear) from tslibs.np_datetime import OutOfBoundsDatetime -from .tslibs.parsing import parse_datetime_string +from tslibs.parsing import parse_datetime_string cimport cython +from cython cimport Py_ssize_t -import warnings import pytz UTC = pytz.utc -# initialize numpy -import_array() - from tslibs.timedeltas cimport cast_from_unit from tslibs.timedeltas import Timedelta @@ -60,7 +58,7 @@ from tslibs.conversion cimport (tz_convert_single, _TSObject, from 
tslibs.conversion import tz_convert_single from tslibs.nattype import NaT, nat_strings, iNaT -from tslibs.nattype cimport _checknull_with_nat, NPY_NAT +from tslibs.nattype cimport checknull_with_nat, NPY_NAT from tslibs.timestamps cimport (create_timestamp_from_ts, _NS_UPPER_BOUND, _NS_LOWER_BOUND) @@ -409,7 +407,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT elif is_integer_object(val) or is_float_object(val): @@ -475,7 +473,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): oresult[i] = NaT elif is_integer_object(val) or is_float_object(val): @@ -526,7 +524,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT elif PyDateTime_Check(val): @@ -686,7 +684,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', val = values[i] # set as nan except if its a NaT - if _checknull_with_nat(val): + if checknull_with_nat(val): if PyFloat_Check(val): oresult[i] = np.nan else: @@ -704,7 +702,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): oresult[i] = val elif is_string_object(val): @@ -730,29 +728,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -# ---------------------------------------------------------------------- -# Accessors - - -def get_time_micros(ndarray[int64_t] dtindex): - """ - Datetime as int64 representation to a structured array of fields - """ - cdef: - Py_ssize_t i, n = len(dtindex) - pandas_datetimestruct dts - ndarray[int64_t] micros - - micros = np.empty(n, dtype=np.int64) - - for i in range(n): - dt64_to_dtstruct(dtindex[i], &dts) - micros[i] = 1000000LL * (dts.hour * 60 * 60 + - 60 * dts.min + dts.sec) + dts.us - - return micros - - # ---------------------------------------------------------------------- # Some general helper functions diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 16e88bcaeea3e..7f3cc0a7e81dd 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -40,7 +40,7 @@ from timezones cimport ( from parsing import parse_datetime_string from nattype import nat_strings, NaT -from nattype cimport NPY_NAT, _checknull_with_nat +from nattype cimport NPY_NAT, checknull_with_nat # ---------------------------------------------------------------------- # Constants @@ -104,13 +104,16 @@ def ensure_datetime64ns(ndarray arr): return result unit = get_datetime64_unit(arr.flat[0]) - for i in range(n): - if ivalues[i] != NPY_NAT: - pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) - iresult[i] = dtstruct_to_dt64(&dts) - check_dts_bounds(&dts) - else: - iresult[i] = NPY_NAT + if unit == PANDAS_FR_ns: + result = arr + else: + for i in range(n): + if ivalues[i] != NPY_NAT: + pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) + iresult[i] = dtstruct_to_dt64(&dts) + check_dts_bounds(&dts) + else: + iresult[i] = NPY_NAT return result @@ -140,7 +143,7 @@ def datetime_to_datetime64(ndarray[object] values): iresult = result.view('i8') for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = 
NPY_NAT elif PyDateTime_Check(val): if val.tzinfo is not None: diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index e813fad1d3fa7..3de361c511fbf 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -23,6 +23,26 @@ from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, from nattype cimport NPY_NAT +def get_time_micros(ndarray[int64_t] dtindex): + """ + Return the number of microseconds in the time component of a + nanosecond timestamp. + + Parameters + ---------- + dtindex : ndarray[int64_t] + + Returns + ------- + micros : ndarray[int64_t] + """ + cdef: + ndarray[int64_t] micros + + micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL + return micros + + def build_field_sarray(ndarray[int64_t] dtindex): """ Datetime as int64 representation to a structured array of fields diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 34fa1e70305e7..96e02142d501b 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -6,4 +6,4 @@ cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] -cdef bint _checknull_with_nat(object val) +cdef bint checknull_with_nat(object val) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index d2f6006b41f65..2e7b861b24fa8 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -572,7 +572,7 @@ NaT = NaTType() # ---------------------------------------------------------------------- -cdef inline bint _checknull_with_nat(object val): +cdef inline bint checknull_with_nat(object val): """ utility to check if a value is a nat or not """ return val is None or ( PyFloat_Check(val) and val != val) or val is NaT diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 526595e3a2eda..251af50ab12ce 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -17,14 +17,12 @@ np.import_array() from util cimport is_string_object, is_integer_object -from pandas._libs.tslib import monthrange - from conversion cimport tz_convert_single, pydt_to_i8 from frequencies cimport get_freq_code from nattype cimport NPY_NAT from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct, - is_leapyear, days_per_month_table) + is_leapyear, days_per_month_table, dayofweek) # --------------------------------------------------------------------- # Constants @@ -33,7 +31,7 @@ from np_datetime cimport (pandas_datetimestruct, _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] _int_to_month = {(k + 1): v for k, v in enumerate(_MONTHS)} -_month_to_int = dict((v, k) for k, v in _int_to_month.items()) +_month_to_int = {v: k for k, v in _int_to_month.items()} class WeekDay(object): @@ -145,45 +143,44 @@ def apply_index_wraps(func): # --------------------------------------------------------------------- # Business Helpers -cpdef int get_lastbday(int wkday, int days_in_month): +cpdef int get_lastbday(int year, int month) nogil: """ Find the last day of the month that is a business day. 
- (wkday, days_in_month) is the output from monthrange(year, month) - Parameters ---------- - wkday : int - days_in_month : int + year : int + month : int Returns ------- last_bday : int """ + cdef: + int wkday, days_in_month + + wkday = dayofweek(year, month, 1) + days_in_month = get_days_in_month(year, month) return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) -cpdef int get_firstbday(int wkday, int days_in_month=0): +cpdef int get_firstbday(int year, int month) nogil: """ Find the first day of the month that is a business day. - (wkday, days_in_month) is the output from monthrange(year, month) - Parameters ---------- - wkday : int - days_in_month : int, default 0 + year : int + month : int Returns ------- first_bday : int - - Notes - ----- - `days_in_month` arg is a dummy so that this has the same signature as - `get_lastbday`. """ - cdef int first + cdef: + int first, wkday + + wkday = dayofweek(year, month, 1) first = 1 if wkday == 5: # on Saturday first = 3 @@ -261,7 +258,7 @@ def _validate_business_time(t_input): # --------------------------------------------------------------------- # Constructor Helpers -_rd_kwds = set([ +relativedelta_kwds = set([ 'years', 'months', 'weeks', 'days', 'year', 'month', 'week', 'day', 'weekday', 'hour', 'minute', 'second', 'microsecond', @@ -406,6 +403,33 @@ class _BaseOffset(object): # will raise NotImplementedError. return get_day_of_month(other, self._day_opt) + def _validate_n(self, n): + """ + Require that `n` be a nonzero integer. + + Parameters + ---------- + n : int + + Returns + ------- + nint : int + + Raises + ------ + TypeError if `int(n)` raises + ValueError if n != int(n) + """ + try: + nint = int(n) + except (ValueError, TypeError): + raise TypeError('`n` argument must be an integer, ' + 'got {ntype}'.format(ntype=type(n))) + if n != nint: + raise ValueError('`n` argument must be an integer, ' + 'got {n}'.format(n=n)) + return nint + class BaseOffset(_BaseOffset): # Here we add __rfoo__ methods that don't play well with cdef classes @@ -445,6 +469,160 @@ cdef inline int month_add_months(pandas_datetimestruct dts, int months) nogil: return 12 if new_month == 0 else new_month +@cython.wraparound(False) +@cython.boundscheck(False) +def shift_quarters(int64_t[:] dtindex, int quarters, + int q1start_month, object day, int modby=3): + """ + Given an int64 array representing nanosecond timestamps, shift all elements + by the specified number of quarters using DateOffset semantics. 
+ + Parameters + ---------- + dtindex : int64_t[:] timestamps for input dates + quarters : int number of quarters to shift + q1start_month : int month in which Q1 begins by convention + day : {'start', 'end', 'business_start', 'business_end'} + modby : int (3 for quarters, 12 for years) + + Returns + ------- + out : ndarray[int64_t] + """ + cdef: + Py_ssize_t i + pandas_datetimestruct dts + int count = len(dtindex) + int months_to_roll, months_since, n, compare_day + bint roll_check + int64_t[:] out = np.empty(count, dtype='int64') + + if day == 'start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + + # offset semantics - if on the anchor point and going backwards + # shift to next + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > 1)): + n += 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + dts.day = 1 + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + + if n <= 0 and months_since != 0: + # The general case of this condition would be + # `months_since != 0 or (months_since == 0 and + # dts.day > get_days_in_month(dts.year, dts.month))` + # but the get_days_in_month inequality would never hold. + n += 1 + elif n > 0 and (months_since == 0 and + dts.day < get_days_in_month(dts.year, + dts.month)): + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + dts.day = get_days_in_month(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + compare_month = dts.month - months_since + compare_month = compare_month or 12 + # compare_day is only relevant for comparison in the case + # where months_since == 0. + compare_day = get_firstbday(dts.year, compare_month) + + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate + n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + + dts.day = get_firstbday(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + compare_month = dts.month - months_since + compare_month = compare_month or 12 + # compare_day is only relevant for comparison in the case + # where months_since == 0. 
+ compare_day = get_lastbday(dts.year, compare_month) + + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate + n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + + dts.day = get_lastbday(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + else: + raise ValueError("day must be None, 'start', 'end', " + "'business_start', or 'business_end'") + + return np.asarray(out) + + @cython.wraparound(False) @cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): @@ -527,8 +705,56 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): dts.day = get_days_in_month(dts.year, dts.month) out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + compare_day = get_firstbday(dts.year, dts.month) + + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 + + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) + + dts.day = get_firstbday(dts.year, dts.month) + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + compare_day = get_lastbday(dts.year, dts.month) + + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 + + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) + + dts.day = get_lastbday(dts.year, dts.month) + out[i] = dtstruct_to_dt64(&dts) + else: - raise ValueError("day must be None, 'start' or 'end'") + raise ValueError("day must be None, 'start', 'end', " + "'business_start', or 'business_end'") return np.asarray(out) @@ -558,7 +784,7 @@ cpdef datetime shift_month(datetime stamp, int months, object day_opt=None): """ cdef: int year, month, day - int wkday, days_in_month, dy + int days_in_month, dy dy = (stamp.month + months) // 12 month = (stamp.month + months) % 12 @@ -568,20 +794,21 @@ cpdef datetime shift_month(datetime stamp, int months, object day_opt=None): dy -= 1 year = stamp.year + dy - wkday, days_in_month = monthrange(year, month) if day_opt is None: + days_in_month = get_days_in_month(year, month) day = min(stamp.day, days_in_month) elif day_opt == 'start': day = 1 elif day_opt == 'end': - day = days_in_month + day = get_days_in_month(year, month) elif day_opt == 'business_start': # first business day of month - day = get_firstbday(wkday, days_in_month) + day = get_firstbday(year, month) elif day_opt == 'business_end': # last business day of month - day = get_lastbday(wkday, days_in_month) + day = get_lastbday(year, month) elif is_integer_object(day_opt): + days_in_month = get_days_in_month(year, month) day = min(day_opt, days_in_month) else: raise ValueError(day_opt) @@ -614,22 +841,22 @@ cpdef int get_day_of_month(datetime other, day_opt) except? 
-1: """ cdef: - int wkday, days_in_month + int days_in_month if day_opt == 'start': return 1 - - wkday, days_in_month = monthrange(other.year, other.month) - if day_opt == 'end': + elif day_opt == 'end': + days_in_month = get_days_in_month(other.year, other.month) return days_in_month elif day_opt == 'business_start': # first business day of month - return get_firstbday(wkday, days_in_month) + return get_firstbday(other.year, other.month) elif day_opt == 'business_end': # last business day of month - return get_lastbday(wkday, days_in_month) + return get_lastbday(other.year, other.month) elif is_integer_object(day_opt): - day = min(day_opt, days_in_month) + days_in_month = get_days_in_month(other.year, other.month) + return min(day_opt, days_in_month) elif day_opt is None: # Note: unlike `shift_month`, get_day_of_month does not # allow day_opt = None diff --git a/pandas/_libs/period.pyx b/pandas/_libs/tslibs/period.pyx similarity index 96% rename from pandas/_libs/period.pyx rename to pandas/_libs/tslibs/period.pyx index d09459898321e..cf73257caf227 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -17,26 +17,27 @@ from pandas.compat import PY2 cimport cython -from tslibs.np_datetime cimport (pandas_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, - is_leapyear) +from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT +# import datetime C API +PyDateTime_IMPORT +from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, + dt64_to_dtstruct, is_leapyear) cimport util from util cimport is_period_object, is_string_object, INT32_MIN -from missing cimport is_null_datetimelike -from pandas._libs.tslib import Timestamp -from tslibs.timezones cimport ( - is_utc, is_tzlocal, get_utcoffset, get_dst_info) -from tslibs.timedeltas cimport delta_to_nanoseconds +from pandas._libs.missing cimport is_null_datetimelike +from timestamps import Timestamp +from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info +from timedeltas cimport delta_to_nanoseconds -from tslibs.parsing import (parse_time_string, NAT_SENTINEL, - _get_rule_month, _MONTH_NUMBERS) -from tslibs.frequencies cimport get_freq_code -from tslibs.resolution import resolution, Resolution -from tslibs.nattype import nat_strings, NaT, iNaT -from tslibs.nattype cimport _nat_scalar_rules, NPY_NAT +from parsing import (parse_time_string, NAT_SENTINEL, + _get_rule_month, _MONTH_NUMBERS) +from frequencies cimport get_freq_code +from resolution import resolution, Resolution +from nattype import nat_strings, NaT, iNaT +from nattype cimport _nat_scalar_rules, NPY_NAT from pandas.tseries import offsets from pandas.tseries import frequencies @@ -559,7 +560,6 @@ cdef class _Period(object): int64_t ordinal object freq - _comparables = ['name', 'freqstr'] _typ = 'period' def __cinit__(self, ordinal, freq): @@ -648,9 +648,19 @@ cdef class _Period(object): elif util.is_integer_object(other): ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) + elif (PyDateTime_Check(other) or + is_period_object(other) or util.is_datetime64_object(other)): + # can't add datetime-like + # GH#17983 + sname = type(self).__name__ + oname = type(other).__name__ + raise TypeError("unsupported operand type(s) for +: '{self}' " + "and '{other}'".format(self=sname, + other=oname)) else: # pragma: no cover return NotImplemented elif is_period_object(other): + # this can be reached via __radd__ because of cython rules return other + self else: return NotImplemented diff --git 
a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 388075903a8ba..1c20dbe7f8fc9 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -10,7 +10,7 @@ np.import_array() from util cimport is_string_object, get_nat -from khash cimport ( +from pandas._libs.khash cimport ( khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t, @@ -53,7 +53,7 @@ _ONE_HOUR = 60 * _ONE_MINUTE _ONE_DAY = 24 * _ONE_HOUR DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] -_weekday_rule_aliases = dict((k, v) for k, v in enumerate(DAYS)) +_weekday_rule_aliases = {k: v for k, v in enumerate(DAYS)} _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 439cc21a360c7..65594de586bac 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -38,7 +38,7 @@ from np_datetime cimport (check_dts_bounds, from util cimport is_string_object -from nattype cimport _checknull_with_nat, NPY_NAT +from nattype cimport checknull_with_nat, NPY_NAT from nattype import nat_strings @@ -142,7 +142,7 @@ def array_strptime(ndarray[object] values, object fmt, iresult[i] = NPY_NAT continue else: - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT continue else: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6ea30642625fe..b37e5dc620260 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -30,7 +30,7 @@ from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) from nattype import nat_strings, NaT -from nattype cimport _checknull_with_nat, NPY_NAT +from nattype cimport checknull_with_nat, NPY_NAT # ---------------------------------------------------------------------- # Constants @@ -111,7 +111,7 @@ cpdef convert_to_timedelta64(object ts, object unit): # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case """ - if _checknull_with_nat(ts): + if checknull_with_nat(ts): return np.timedelta64(NPY_NAT) elif isinstance(ts, Timedelta): # already in the proper format @@ -443,7 +443,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): cdef bint _validate_ops_compat(other): # return True if we are compat with operating - if _checknull_with_nat(other): + if checknull_with_nat(other): return True elif PyDelta_Check(other) or is_timedelta64_object(other): return True @@ -837,7 +837,7 @@ class Timedelta(_Timedelta): elif is_integer_object(value) or is_float_object(value): # unit=None is de-facto 'ns' value = convert_to_timedelta64(value, unit) - elif _checknull_with_nat(value): + elif checknull_with_nat(value): return NaT else: raise ValueError( diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8fdded0bcb07a..cf0c0e2c01d60 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -304,10 +304,12 @@ cdef class _Timestamp(datetime): out = get_date_field(np.array([val], dtype=np.int64), field) return int(out[0]) - cpdef _get_start_end_field(self, field): + cpdef bint _get_start_end_field(self, str field): cdef: int64_t val dict kwds + ndarray out + int month_kw freq = self.freq if freq: @@ -713,7 +715,7 @@ class Timestamp(_Timestamp): @property def quarter(self): - return self._get_field('q') + return ((self.month - 1) // 3) + 1 @property def 
days_in_month(self): @@ -727,26 +729,44 @@ class Timestamp(_Timestamp): @property def is_month_start(self): + if self.freq is None: + # fast-path for non-business frequencies + return self.day == 1 return self._get_start_end_field('is_month_start') @property def is_month_end(self): + if self.freq is None: + # fast-path for non-business frequencies + return self.day == self.days_in_month return self._get_start_end_field('is_month_end') @property def is_quarter_start(self): + if self.freq is None: + # fast-path for non-business frequencies + return self.day == 1 and self.month % 3 == 1 return self._get_start_end_field('is_quarter_start') @property def is_quarter_end(self): + if self.freq is None: + # fast-path for non-business frequencies + return (self.month % 3) == 0 and self.day == self.days_in_month return self._get_start_end_field('is_quarter_end') @property def is_year_start(self): + if self.freq is None: + # fast-path for non-business frequencies + return self.day == self.month == 1 return self._get_start_end_field('is_year_start') @property def is_year_end(self): + if self.freq is None: + # fast-path for non-business frequencies + return self.month == 12 and self.day == 31 return self._get_start_end_field('is_year_end') @property diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7fb48e7c66f47..d326f2cb68f24 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -284,10 +284,9 @@ cdef object get_dst_info(object tz): def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo - if end.tzinfo: - if not (get_timezone(tz) == get_timezone(end.tzinfo)): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + if not (get_timezone(tz) == get_timezone(end.tzinfo)): + msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' + raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 4d5ebdc0c581a..ecce45742afa7 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -14,9 +14,13 @@ cimport util from libc.stdlib cimport malloc, free - from numpy cimport ndarray, double_t, int64_t, float64_t +from skiplist cimport (IndexableSkiplist, + node_t, skiplist_t, + skiplist_init, skiplist_destroy, + skiplist_get, skiplist_insert, skiplist_remove) + cdef np.float32_t MINfloat32 = np.NINF cdef np.float64_t MINfloat64 = np.NINF @@ -30,19 +34,10 @@ cdef inline int int_min(int a, int b): return a if a <= b else b from util cimport numeric -from skiplist cimport ( - skiplist_t, - skiplist_init, - skiplist_destroy, - skiplist_get, - skiplist_insert, - skiplist_remove) - cdef extern from "../src/headers/math.h": - double sqrt(double x) nogil int signbit(double) nogil + double sqrt(double x) nogil -include "skiplist.pyx" # Cython implementations of rolling sum, mean, variance, skewness, # other statistical moment functions @@ -661,9 +656,11 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] 
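`add_var` above (and `remove_var`, continuing just below) each perform one step of Welford's online update: `mean_x` holds the running mean and `ssqdm_x` the running sum of squared differences from it. The new `(nobs - 1) * delta ** 2 / nobs` form is the classic `delta * (val - new_mean)` increment rewritten, since `val - new_mean == delta * (nobs - 1) / nobs`. A standalone sketch under that reading (names illustrative):

```python
def welford_add(val, nobs, mean, ssqdm):
    # Add one observation: update count, running mean, and the running
    # sum of squared differences from the mean.
    nobs += 1
    delta = val - mean
    mean += delta / nobs
    ssqdm += (nobs - 1) * delta ** 2 / nobs  # == delta * (val - new mean)
    return nobs, mean, ssqdm

def welford_remove(val, nobs, mean, ssqdm):
    # Exact inverse of welford_add, for when a value leaves the window.
    nobs -= 1
    if nobs:
        delta = val - mean
        mean -= delta / nobs
        ssqdm -= (nobs + 1) * delta ** 2 / nobs
    else:
        mean = ssqdm = 0.0
    return nobs, mean, ssqdm

# After adding x_1..x_n, ssqdm agrees with sum((x - mean(x))**2)
# up to floating-point error.
```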
cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -675,9 +672,11 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -689,7 +688,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, Numerically stable implementation using Welford's method. """ cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -749,6 +748,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: @@ -760,10 +762,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) diff --git a/pandas/_version.py b/pandas/_version.py index 4a469ebb8630e..624c7b5cd63a1 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -141,11 +141,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set(r.strip() for r in refnames.strip("()").split(",")) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set(r[len(TAG):] for r in refs if r.startswith(TAG)) + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -154,7 +154,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". 
- tags = set(r for r in refs if re.search(r'\d', r)) + tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '{}', no digits".format(",".join(refs - tags))) if verbose: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index a615e098135a9..2deb29dabe764 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -396,25 +396,13 @@ def raise_with_traceback(exc, traceback=Ellipsis): If traceback is not passed, uses sys.exc_info() to get traceback.""" -# http://stackoverflow.com/questions/4126348 -# Thanks to @martineau at SO - +# dateutil minimum version import dateutil -if PY2 and LooseVersion(dateutil.__version__) == '2.0': - # dateutil brokenness - raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' - 'install version 1.5 or 2.1+!') - +if LooseVersion(dateutil.__version__) < '2.5': + raise ImportError('dateutil 2.5.0 is the minimum required version') from dateutil import parser as _date_parser -if LooseVersion(dateutil.__version__) < '2.0': - - @functools.wraps(_date_parser.parse) - def parse_date(timestr, *args, **kwargs): - timestr = bytes(timestr) - return _date_parser.parse(timestr, *args, **kwargs) -else: - parse_date = _date_parser.parse +parse_date = _date_parser.parse # https://github.com/pandas-dev/pandas/pull/9123 diff --git a/pandas/compat/openpyxl_compat.py b/pandas/compat/openpyxl_compat.py deleted file mode 100644 index 87cf52cf00fef..0000000000000 --- a/pandas/compat/openpyxl_compat.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Detect incompatible version of OpenPyXL - -GH7169 -""" - -from distutils.version import LooseVersion - -start_ver = '1.6.1' -stop_ver = '2.0.0' - - -def is_compat(major_ver=1): - """Detect whether the installed version of openpyxl is supported - - Parameters - ---------- - ver : int - 1 requests compatibility status among the 1.x.y series - 2 requests compatibility status of 2.0.0 and later - Returns - ------- - compat : bool - ``True`` if openpyxl is installed and is a compatible version. - ``False`` otherwise. 
- """ - import openpyxl - ver = LooseVersion(openpyxl.__version__) - if major_ver == 1: - return LooseVersion(start_ver) <= ver < LooseVersion(stop_ver) - elif major_ver == 2: - return LooseVersion(stop_ver) <= ver - else: - raise ValueError('cannot test for openpyxl compatibility with ver {0}' - .format(major_ver)) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 8015642919611..07b34961ce25d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -74,7 +74,11 @@ def load_reduce(self): ('pandas._libs.sparse', 'BlockIndex'), ('pandas.tslib', 'Timestamp'): ('pandas._libs.tslib', 'Timestamp'), - ('pandas._period', 'Period'): ('pandas._libs.period', 'Period'), + + # 18543 moving period + ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'), + ('pandas._libs.period', 'Period'): + ('pandas._libs.tslibs.period', 'Period'), # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype ('pandas.tslib', '__nat_unpickle'): diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 7a2da9655cc4a..53ead5e8f74a3 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -10,7 +10,7 @@ class DirNamesMixin(object): _accessors = frozenset([]) - _deprecations = frozenset([]) + _deprecations = frozenset(['asobject']) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9f712a1cf039b..0ceb8966fd3c8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -369,7 +369,7 @@ def unique(values): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - uniques = uniques.asobject.values + uniques = uniques.astype(object).values return uniques diff --git a/pandas/core/api.py b/pandas/core/api.py index 1f46aaa40e9eb..8a624da362976 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,9 +23,9 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel, WidePanel from pandas.core.panel4d import Panel4D -from pandas.core.reshape.reshape import ( - pivot_simple as pivot, get_dummies) -from pandas.core.reshape.melt import lreshape, wide_to_long + +# TODO: Remove import when statsmodels updates #18264 +from pandas.core.reshape.reshape import get_dummies from pandas.core.indexing import IndexSlice from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/base.py b/pandas/core/base.py index 90fe350848bf7..72acd0052202b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -12,11 +12,12 @@ is_object_dtype, is_list_like, is_scalar, - is_datetimelike) + is_datetimelike, + is_extension_type) from pandas.util._validators import validate_bool_kwarg -from pandas.core import common as com +from pandas.core import common as com, algorithms import pandas.core.nanops as nanops import pandas._libs.lib as lib from pandas.compat.numpy import function as nv @@ -838,6 +839,78 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, klass=self.__class__.__name__, op=name)) return func(**kwds) + def _map_values(self, mapper, na_action=None): + """An internal function that maps values using the input + correspondence (which can be a dict, Series, or function). 
+ + Parameters + ---------- + mapper : function, dict, or Series + The input correspondence object + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function + + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the values. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + + """ + + # we can fastpath dict/Series to an efficient map + # as we know that we are not going to have to yield + # python types + if isinstance(mapper, dict): + if hasattr(mapper, '__missing__'): + # If a dictionary subclass defines a default value method, + # convert mapper to a lookup function (GH #15999). + dict_with_default = mapper + mapper = lambda x: dict_with_default[x] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to a Series for efficiency. + # we specify the keys here to handle the + # possibility that they are tuples + from pandas import Series + mapper = Series(mapper) + + if isinstance(mapper, ABCSeries): + # Since values were input this means we came from either + # a dict or a series and mapper should be an index + if is_extension_type(self.dtype): + values = self._values + else: + values = self.values + + indexer = mapper.index.get_indexer(values) + new_values = algorithms.take_1d(mapper._values, indexer) + + return new_values + + # we must convert to python types + if is_extension_type(self.dtype): + values = self._values + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) + else: + values = self.astype(object) + values = getattr(values, 'values', values) + if na_action == 'ignore': + def map_f(values, f): + return lib.map_infer_mask(values, f, + isna(values).view(np.uint8)) + else: + map_f = lib.map_infer + + # mapper is a function + new_values = map_f(values, mapper) + + return new_values + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 03bf09352862b..e34755e665f8d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -552,26 +552,6 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, return cls(codes, dtype=dtype, fastpath=True) - @classmethod - def from_array(cls, data, **kwargs): - """ - .. deprecated:: 0.19.0 - Use ``Categorical`` instead. - - Make a Categorical type from a single array-like object. - - For internal compatibility with numpy arrays. - - Parameters - ---------- - data : array-like - Can be an Index or array-like. The categories are assumed to be - the unique values of `data`.
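The `_map_values` helper above is the common path behind `Series.map` and `Index.map`; a short sketch of the three behaviors it unifies (the values are arbitrary examples):

    import pandas as pd
    from collections import defaultdict

    s = pd.Series(['cat', 'dog', None, 'rabbit'])

    # plain dict: converted to a Series and mapped via an indexer,
    # so missing keys (and NA) come back as NaN
    s.map({'cat': 'kitten', 'dog': 'puppy'})

    # dict subclass with __missing__: converted to a lookup function
    # instead, so the default value is honored (GH 15999)
    d = defaultdict(lambda: 'unknown', {'cat': 'kitten'})
    s.map(d)

    # na_action='ignore' propagates NA without calling the function on it
    s.map('I am a {}'.format, na_action='ignore')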
- """ - warn("Categorical.from_array is deprecated, use Categorical instead", - FutureWarning, stacklevel=2) - return cls(data, **kwargs) - @classmethod def from_codes(cls, codes, categories, ordered=False): """ @@ -2276,7 +2256,7 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls - return codes + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/core/common.py b/pandas/core/common.py index 8e12ce3647340..76a69030463ec 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -347,7 +347,7 @@ def map_indices_py(arr): Returns a dictionary with (element, index) pairs for each element in the given array/list """ - return dict((x, i) for i, x in enumerate(arr)) + return {x: i for i, x in enumerate(arr)} def union(*seqs): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bc8aacfe90170..a97b84ab9cc5b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1127,3 +1127,38 @@ def cast_scalar_to_array(shape, value, dtype=None): values.fill(fill_value) return values + + +def construct_1d_arraylike_from_scalar(value, length, dtype): + """ + create a np.ndarray / pandas type of specified shape and dtype + filled with values + + Parameters + ---------- + value : scalar value + length : int + dtype : pandas_dtype / np.dtype + + Returns + ------- + np.ndarray / pandas type of length, filled with value + + """ + if is_datetimetz(dtype): + from pandas import DatetimeIndex + subarr = DatetimeIndex([value] * length, dtype=dtype) + elif is_categorical_dtype(dtype): + from pandas import Categorical + subarr = Categorical([value] * length) + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + + # coerce if we have nan for an integer dtype + if is_integer_dtype(dtype) and isna(value): + dtype = np.float64 + subarr = np.empty(length, dtype=dtype) + subarr.fill(value) + + return subarr diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7f9245bb31530..cd98064dee86e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -401,7 +401,7 @@ def convert_to_pydatetime(x, axis): # if dtype is of datetimetz or timezone if x.dtype.kind == _NS_DTYPE.kind: if getattr(x, 'tz', None) is not None: - x = x.asobject.values + x = x.astype(object).values else: shape = x.shape x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), @@ -459,7 +459,7 @@ def _concat_datetimetz(to_concat, name=None): it is used in DatetimeIndex.append also """ # do not pass tz to set because tzlocal cannot be hashed - if len(set(str(x.dtype) for x in to_concat)) != 1: + if len({str(x.dtype) for x in to_concat}) != 1: raise ValueError('to_concat must have the same tz') tz = to_concat[0].tz # no need to localize because internal repr will not be changed @@ -479,7 +479,7 @@ def _concat_index_asobject(to_concat, name=None): """ klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex - to_concat = [x.asobject if isinstance(x, klasses) else x + to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] from pandas import Index @@ -525,7 +525,7 @@ def convert_sparse(x, axis): if len(typs) == 1: # concat input as it is if all inputs are sparse # and have the same fill_value - fill_values = set(c.fill_value for c in to_concat) + fill_values = {c.fill_value for c in 
to_concat} if len(fill_values) == 1: sp_values = [c.sp_values for c in to_concat] indexes = [c.sp_index.to_int_index() for c in to_concat] diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7cae536c5edd9..ce57b544d9d66 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -369,13 +369,14 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype): +def na_value_for_dtype(dtype, compat=True): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype + compat : boolean, default True Returns ------- @@ -389,7 +390,9 @@ def na_value_for_dtype(dtype): elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): - return 0 + if compat: + return 0 + return np.nan elif is_bool_dtype(dtype): return False return np.nan diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7145fa709c345..313c9ec872179 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -54,6 +54,7 @@ _ensure_int64, _ensure_platform_int, is_list_like, + is_nested_list_like, is_iterator, is_sequence, is_named_tuple) @@ -147,16 +148,17 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys on : label or list - Field names to join on. Must be found in both DataFrames. If on is - None and not merging on indexes, then it merges on the intersection of - the columns by default. + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. left_on : label or list, or array-like - Field names to join on in left DataFrame. Can be a vector or list of - vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index @@ -195,6 +197,11 @@ .. 
versionadded:: 0.21.0 +Notes +----- +Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.22.0 + Examples -------- @@ -347,7 +354,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: data_columns = list(data.dtype.names) - data = dict((k, data[k]) for k in data_columns) + data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns mgr = self._init_dict(data, index, columns, dtype=dtype) @@ -417,8 +424,7 @@ def _init_dict(self, data, index, columns, dtype=None): extract_index(list(data.values())) # prefilter if columns passed - data = dict((k, v) for k, v in compat.iteritems(data) - if k in columns) + data = {k: v for k, v in compat.iteritems(data) if k in columns} if index is None: index = extract_index(list(data.values())) @@ -993,7 +999,7 @@ def to_dict(self, orient='dict', into=dict): for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): return [into_c((k, _maybe_box_datetimelike(v)) - for k, v in zip(self.columns, row)) + for k, v in zip(self.columns, np.atleast_1d(row))) for row in self.values] elif orient.lower().startswith('i'): return into_c((k, v.to_dict(into)) for k, v in self.iterrows()) @@ -1272,16 +1278,34 @@ def from_items(cls, items, columns=None, orient='columns'): columns = _ensure_index(keys) arrays = values - return cls._from_arrays(arrays, columns, None) + # GH 17312 + # Provide more informative error msg when scalar values passed + try: + return cls._from_arrays(arrays, columns, None) + + except ValueError: + if not is_nested_list_like(values): + raise ValueError('The value in each (key, value) pair ' + 'must be an array, Series, or dict') + elif orient == 'index': if columns is None: raise TypeError("Must pass columns with orient='index'") keys = _ensure_index(keys) - arr = np.array(values, dtype=object).T - data = [lib.maybe_convert_objects(v) for v in arr] - return cls._from_arrays(data, columns, keys) + # GH 17312 + # Provide more informative error msg when scalar values passed + try: + arr = np.array(values, dtype=object).T + data = [lib.maybe_convert_objects(v) for v in arr] + return cls._from_arrays(data, columns, keys) + + except TypeError: + if not is_nested_list_like(values): + raise ValueError('The value in each (key, value) pair ' + 'must be an array, Series, or dict') + else: # pragma: no cover raise ValueError("'orient' must be either 'columns' or 'index'") @@ -3306,7 +3330,7 @@ class max type def _maybe_casted_values(index, labels=None): if isinstance(index, PeriodIndex): - values = index.asobject.values + values = index.astype(object).values elif isinstance(index, DatetimeIndex) and index.tz is not None: values = index else: @@ -3745,7 +3769,7 @@ def nlargest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -3780,7 +3804,7 @@ def nsmallest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. 
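With the `from_items` change above (GH 17312), passing scalars in the (key, value) pairs now fails with an explicit message rather than an opaque constructor error; roughly:

    import pandas as pd

    # array-like values work as before
    pd.DataFrame.from_items([('A', [1, 2]), ('B', [3, 4])])

    # scalar values now raise the informative ValueError added above
    try:
        pd.DataFrame.from_items([('A', 1), ('B', 2)])
    except ValueError as err:
        print(err)  # ... must be an array, Series, or dict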
@@ -3895,7 +3919,7 @@ def f(col): return self._constructor_sliced(r, index=new_index, dtype=r.dtype) - result = dict((col, f(col)) for col in this) + result = {col: f(col) for col in this} # non-unique else: @@ -3906,7 +3930,7 @@ def f(i): return self._constructor_sliced(r, index=new_index, dtype=r.dtype) - result = dict((i, f(i)) for i, col in enumerate(this.columns)) + result = {i: f(i) for i, col in enumerate(this.columns)} result = self._constructor(result, index=new_index, copy=False) result.columns = new_columns return result @@ -3984,7 +4008,7 @@ def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True): if self.columns.is_unique: def _compare(a, b): - return dict((col, func(a[col], b[col])) for col in a.columns) + return {col: func(a[col], b[col]) for col in a.columns} new_data = expressions.evaluate(_compare, str_rep, self, other) return self._constructor(data=new_data, index=self.index, @@ -3993,8 +4017,8 @@ def _compare(a, b): else: def _compare(a, b): - return dict((i, func(a.iloc[:, i], b.iloc[:, i])) - for i, col in enumerate(a.columns)) + return {i: func(a.iloc[:, i], b.iloc[:, i]) + for i, col in enumerate(a.columns)} new_data = expressions.evaluate(_compare, str_rep, self, other) result = self._constructor(data=new_data, index=self.index, @@ -4513,7 +4537,7 @@ def unstack(self, level=-1, fill_value=None): fill_value : replace NaN with this value if the unstack produces missing values - .. versionadded: 0.18.0 + .. versionadded:: 0.18.0 See also -------- @@ -4676,7 +4700,7 @@ def diff(self, periods=1, axis=0): axis : {0 or 'index', 1 or 'columns'}, default 0 Take difference over rows (0) or columns (1). - .. versionadded: 0.16.1 + .. versionadded:: 0.16.1 Returns ------- @@ -5053,7 +5077,7 @@ def applymap(self, func): def infer(x): if x.empty: return lib.map_infer(x, func) - return lib.map_infer(x.asobject, func) + return lib.map_infer(x.astype(object).values, func) return self.apply(infer) @@ -5196,12 +5220,12 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame - on : column name, tuple/list of column names, or array-like - Column(s) in the caller to join on the index in other, - otherwise joins index-on-index. If multiples - columns given, the passed DataFrame must have a MultiIndex. Can - pass an array as the join key if not already contained in the - calling DataFrame. Like an Excel VLOOKUP operation + on : name, tuple/list of names, or array-like + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects. 
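Per the reworded `merge`/`join` docstrings and the notes above, `on`, `left_on`, and `right_on` now also accept index level names; a small sketch (the frames are arbitrary):

    import pandas as pd

    left = pd.DataFrame({'A': [1, 2, 3]},
                        index=pd.Index(['x', 'y', 'z'], name='key'))
    right = pd.DataFrame({'key': ['x', 'y', 'z'], 'B': [10, 20, 30]})

    # 'key' is an index level on the left and a column on the right;
    # both now resolve as join keys
    pd.merge(left, right, on='key')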
@@ -5226,6 +5250,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', on, lsuffix, and rsuffix options are not supported when passing a list of DataFrame objects + Support for specifying index levels as the `on` parameter was added + in version 0.22.0 + Examples -------- >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 782971a742b54..83fd36f0a864f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_re_compilable, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame from pandas.core.common import (_count_not_none, @@ -36,7 +37,7 @@ from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, _ensure_index, - InvalidIndexError) + InvalidIndexError, RangeIndex) import pandas.core.indexing as indexing from pandas.core.indexing import maybe_convert_indices from pandas.core.indexes.datetimes import DatetimeIndex @@ -235,10 +236,10 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, """ cls._AXIS_ORDERS = axes - cls._AXIS_NUMBERS = dict((a, i) for i, a in enumerate(axes)) + cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)} cls._AXIS_LEN = len(axes) cls._AXIS_ALIASES = aliases or dict() - cls._AXIS_IALIASES = dict((v, k) for k, v in cls._AXIS_ALIASES.items()) + cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()} cls._AXIS_NAMES = dict(enumerate(axes)) cls._AXIS_SLICEMAP = slicers or None cls._AXIS_REVERSED = axes_are_reversed @@ -279,21 +280,21 @@ def set_axis(a, i): def _construct_axes_dict(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = dict((a, self._get_axis(a)) for a in (axes or self._AXIS_ORDERS)) + d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d @staticmethod def _construct_axes_dict_from(self, axes, **kwargs): """Return an axes dictionary for the passed axes.""" - d = dict((a, ax) for a, ax in zip(self._AXIS_ORDERS, axes)) + d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)} d.update(kwargs) return d def _construct_axes_dict_for_slice(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = dict((self._AXIS_SLICEMAP[a], self._get_axis(a)) - for a in (axes or self._AXIS_ORDERS)) + d = {self._AXIS_SLICEMAP[a]: self._get_axis(a) + for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d @@ -329,7 +330,7 @@ def _construct_axes_from_arguments(self, args, kwargs, require_all=False): raise TypeError("not enough/duplicate arguments " "specified!") - axes = dict((a, kwargs.pop(a, None)) for a in self._AXIS_ORDERS) + axes = {a: kwargs.pop(a, None) for a in self._AXIS_ORDERS} return axes, kwargs @classmethod @@ -1038,6 +1039,313 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. 
+ + def _is_level_reference(self, key, axis=0): + """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key: str + Potential level name for the given axis + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level: bool + """ + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_is_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + not self._is_label_reference(key, axis=axis)) + + def _is_label_reference(self, key, axis=0): + """ + Test whether a key is a label reference for a given axis. + + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key: str + Potential label name + axis: int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + any(key in self.axes[ax] for ax in other_axes)) + + def _is_label_or_level_reference(self, key, axis=0): + """ + Test whether a key is a label or level reference for a given axis. + + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key: str + Potential label or level name + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_label_or_level: bool + """ + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_or_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (self._is_level_reference(key, axis=axis) or + self._is_label_reference(key, axis=axis)) + + def _check_label_or_level_ambiguity(self, key, axis=0): + """ + Check whether `key` matches both a level of the input `axis` and a + label of the other axis and raise a ``FutureWarning`` if this is the + case. + + Note: This method will be altered to raise an ambiguity exception in + a future version. + + Parameters + ---------- + key: str or object + label or level name + + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + ambiguous: bool + + Raises + ------ + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(self))) + + if (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + any(key in self.axes[ax] for ax in other_axes)): + + # Build an informative and grammatical warning + level_article, level_type = (('an', 'index') + if axis == 0 else + ('a', 'column')) + + label_article, label_type = (('a', 'column') + if axis == 0 else + ('an', 'index')) + + msg = ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type) + + warnings.warn(msg, FutureWarning, stacklevel=2) + return True + else: + return False + + def _get_label_or_level_values(self, key, axis=0): + """ + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key: str + Label or level name. + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_get_label_or_level_values is not implemented for {type}" + .format(type=type(self))) + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + label_axis_name = 'column' if axis == 0 else 'index' + raise ValueError(("The {label_axis_name} label '{key}' " + "is not unique") + .format(key=key, + label_axis_name=label_axis_name)) + + return values + + def _drop_labels_or_levels(self, keys, axis=0): + """ + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. 
+ + Parameters + ---------- + keys: str or list of str + labels or levels to drop + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + dropped: DataFrame + + Raises + ------ + ValueError + if any `keys` match neither a label nor a level + """ + + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_drop_labels_or_levels is not implemented for {type}" + .format(type=type(self))) + + # Validate keys + keys = com._maybe_make_list(keys) + invalid_keys = [k for k in keys if not + self._is_label_or_level_reference(k, axis=axis)] + + if invalid_keys: + raise ValueError(("The following keys are not valid labels or " + "levels for axis {axis}: {invalid_keys}") + .format(axis=axis, + invalid_keys=invalid_keys)) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys + if self._is_level_reference(k, axis=axis)] + + labels_to_drop = [k for k in keys + if not self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy() + + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping column labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) + + return dropped + # ---------------------------------------------------------------------- # Iteration @@ -1172,7 +1480,7 @@ def to_dense(self): # Picklability def __getstate__(self): - meta = dict((k, getattr(self, k, None)) for k in self._metadata) + meta = {k: getattr(self, k, None) for k in self._metadata} return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta) @@ -3735,6 +4043,9 @@ def _get_bool_data(self): def as_matrix(self, columns=None): """ + DEPRECATED: as_matrix will be removed in a future version. + Use :meth:`DataFrame.values` instead. + Convert the frame to its Numpy-array representation. Parameters @@ -3770,10 +4081,11 @@ def as_matrix(self, columns=None): -------- pandas.DataFrame.values """ + warnings.warn("Method .as_matrix will be removed in a future version. " + "Use .values instead.", FutureWarning, stacklevel=2) self._consolidate_inplace() - if self._AXIS_REVERSED: - return self._data.as_matrix(columns).T - return self._data.as_matrix(columns) + return self._data.as_array(transpose=self._AXIS_REVERSED, + items=columns) @property def values(self): @@ -3791,7 +4103,8 @@ def values(self): int32. By numpy.find_common_type convention, mixing int64 and uint64 will result in a float64 dtype. 
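Assuming the deprecation above lands as written, the user-visible effect is a FutureWarning on `as_matrix` while `.values` keeps returning the same consolidated array:

    import warnings
    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

    df.values  # preferred; upcasts to the common dtype (float64 here)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        df.as_matrix()  # same array, but now warns
    assert issubclass(w[-1].category, FutureWarning)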
""" - return self.as_matrix() + self._consolidate_inplace() + return self._data.as_array(transpose=self._AXIS_REVERSED) @property def _values(self): @@ -3801,11 +4114,11 @@ def _values(self): @property def _get_values(self): # compat - return self.as_matrix() + return self.values def get_values(self): """same as values (but handles sparseness conversions)""" - return self.as_matrix() + return self.values def get_dtype_counts(self): """Return the counts of dtypes in this object.""" @@ -4277,8 +4590,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, elif self.ndim == 3: # fill in 2d chunks - result = dict((col, s.fillna(method=method, value=value)) - for col, s in self.iteritems()) + result = {col: s.fillna(method=method, value=value) + for col, s in self.iteritems()} new_obj = self._constructor.\ from_dict(result).__finalize__(self) new_data = new_obj._data @@ -5681,7 +5994,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means other is a DataFrame, and we need to broadcast # self cons = self._constructor_expanddim - df = cons(dict((c, self) for c in other.columns), + df = cons({c: self for c in other.columns}, **other._construct_axes_dict()) return df._align_frame(other, join=join, axis=axis, level=level, copy=copy, @@ -5691,7 +6004,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means self is a DataFrame, and we need to broadcast # other cons = other._constructor_expanddim - df = cons(dict((c, other) for c in self.columns), + df = cons({c: other for c in self.columns}, **self._construct_axes_dict()) return self._align_frame(df, join=join, axis=axis, level=level, copy=copy, fill_value=fill_value, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8338df33f5cde..a5d8cc254cd93 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -77,6 +77,119 @@ pandas.Panel.%(name)s """ +_apply_docs = dict( + template=""" + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a {input} as its first + argument and return a dataframe, a series or a scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. + + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods. + Pandas offers a wide range of method that will be much faster + than using ``apply`` for their specific purposes, so try to use them + before reaching for ``apply``. + + Parameters + ---------- + func : function + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to ``func`` + + Returns + ------- + applied : Series or DataFrame + + Notes + ----- + In the current implementation ``apply`` calls func twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + group. + + Examples + -------- + {examples} + + See also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. 
+ aggregate, transform + """, + dataframe_examples=""" + >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]}) + >>> g = df.groupby('A') + + From ``df`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: below the function passed to ``apply`` takes a dataframe as + its argument and returns a dataframe. ``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + Example 2: The function passed to ``apply`` takes a dataframe as + its argument and returns a series. ``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x.max() - x.min()) + B C + A + a 1 2 + b 0 0 + + Example 3: The function passed to ``apply`` takes a dataframe as + its argument and returns a scalar. ``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.C.max() - x.B.min()) + A + a 5 + b 2 + dtype: int64 + """, + series_examples=""" + >>> ser = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g = ser.groupby(ser.index) + + From ``ser`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: The function passed to ``apply`` takes a series as + its argument and returns a series. ``apply`` combines the result for + each group together into a new series: + + >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2) + 0 0.0 + 1 0.5 + 2 4.0 + dtype: float64 + + Example 2: The function passed to ``apply`` takes a series as + its argument and returns a scalar. 
``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + """) + _transform_template = """ Call function producing a like-indexed %(klass)s on each group and return a %(klass)s having the same indexes as the original object @@ -144,6 +257,7 @@ """ + # special case to prevent duplicate plots when catching exceptions when # forwarding methods from NDFrames _plotting_methods = frozenset(['plot', 'boxplot', 'hist']) @@ -206,12 +320,13 @@ class Grouper(object): sort : boolean, default to False whether to sort the resulting labels - additional kwargs to control time-like groupers (when freq is passed) + additional kwargs to control time-like groupers (when ``freq`` is passed) - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If grouper is PeriodIndex + base, loffset Returns ------- @@ -233,6 +348,7 @@ class Grouper(object): >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ + _attributes = ('key', 'level', 'freq', 'axis', 'sort') def __new__(cls, *args, **kwargs): if kwargs.get('freq') is not None: @@ -333,6 +449,14 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups + def __repr__(self): + attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None] + attrs = ", ".join(attrs_list) + cls_name = self.__class__.__name__ + return "{}({})".format(cls_name, attrs) + class GroupByPlot(PandasObject): """ @@ -653,50 +777,10 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) - @Substitution(name='groupby') + @Appender(_apply_docs['template'] + .format(input="dataframe", + examples=_apply_docs['dataframe_examples'])) def apply(self, func, *args, **kwargs): - """ - Apply function and combine results together in an intelligent way. - - The split-apply-combine combination rules attempt to be as common - sense based as possible. For example: - - case 1: - group DataFrame - apply aggregation function (f(chunk) -> Series) - yield DataFrame, with group axis having group labels - - case 2: - group DataFrame - apply transform function ((f(chunk) -> DataFrame with same indexes) - yield DataFrame with resulting chunks glued together - - case 3: - group Series - apply function with f(chunk) -> DataFrame - yield DataFrame with result of chunks glued together - - Parameters - ---------- - func : function - - Notes - ----- - See online documentation for full exposition on how to use apply. - - In the current implementation apply calls func twice on the - first group to decide whether it can take a fast or slow code - path. This can lead to unexpected behavior if func has - side-effects, as they will take effect twice for the first - group. - - - See also - -------- - pipe : Apply function to the full GroupBy object instead of to each - group. 
- aggregate, transform - """ func = self._is_builtin_func(func) @@ -2299,8 +2383,7 @@ def _aggregate_series_pure_python(self, obj, func): for label, group in splitter: res = func(group) if result is None: - if (isinstance(res, (Series, Index, np.ndarray)) or - isinstance(res, list)): + if (isinstance(res, (Series, Index, np.ndarray))): raise ValueError('Function does not reduce') result = np.empty(ngroups, dtype='O') @@ -2830,16 +2913,11 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if validate and gpr in obj.index.names: - warnings.warn( - ("'%s' is both a column name and an index level.\n" - "Defaulting to column but " - "this will raise an ambiguity error in a " - "future version") % gpr, - FutureWarning, stacklevel=5) + if validate: + obj._check_label_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) - elif gpr in obj.index.names: + elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) @@ -2850,9 +2928,11 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must " - "have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + ("Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length" + .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr @@ -3013,6 +3093,12 @@ def _selection_name(self): """) + @Appender(_apply_docs['template'] + .format(input='series', + examples=_apply_docs['series_examples'])) + def apply(self, func, *args, **kwargs): + return super(SeriesGroupBy, self).apply(func, *args, **kwargs) + @Appender(_agg_doc) @Appender(_shared_docs['aggregate'] % dict( klass='Series', @@ -3022,7 +3108,9 @@ def aggregate(self, func_or_funcs, *args, **kwargs): if isinstance(func_or_funcs, compat.string_types): return getattr(self, func_or_funcs)(*args, **kwargs) - if hasattr(func_or_funcs, '__iter__'): + if isinstance(func_or_funcs, collections.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) else: @@ -3840,7 +3928,7 @@ def first_not_none(values): # if all series have a consistent name. If the # series do not have a consistent name, do # nothing. 
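The groupby changes above route key validation through the new label/level helpers; a sketch of the user-visible behavior for an ambiguous key (`'a'` below is deliberately both a column and an index level):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 4, 5]})
    df = df.set_index('a', drop=False)

    # defaults to the column, but _check_label_or_level_ambiguity now
    # emits the FutureWarning about the clash
    df.groupby('a').sum()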
- names = set(v.name for v in values) + names = {v.name for v in values} if len(names) == 1: index.name = list(names)[0] diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2176338574304..27e1006c23174 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -14,7 +14,7 @@ from pandas.core.accessor import PandasDelegate from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex -from pandas._libs.period import IncompatibleFrequency # noqa +from pandas._libs.tslibs.period import IncompatibleFrequency # noqa from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.algorithms import take_1d diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a34e64724245..94e9947155c41 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,7 +13,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.core.dtypes.generic import ( ABCSeries, ABCMultiIndex, @@ -251,7 +250,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. try: return cls._try_convert_to_int_index( - data, copy, name) + data, copy, name, dtype) except ValueError: pass @@ -307,7 +306,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if inferred == 'integer': try: return cls._try_convert_to_int_index( - subarr, copy, name) + subarr, copy, name, dtype) except ValueError: pass @@ -354,22 +353,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif data is None or is_scalar(data): cls._scalar_data_error(data) else: - if (tupleize_cols and isinstance(data, list) and data and - isinstance(data[0], tuple)): - + if tupleize_cols and is_list_like(data) and data: + if is_iterator(data): + data = list(data) # we must be all tuples, otherwise don't construct # 10697 if all(isinstance(e, tuple) for e in data): - try: - # must be orderable in py3 - if compat.PY3: - sorted(data) - from .multi import MultiIndex - return MultiIndex.from_tuples( - data, names=name or kwargs.get('names')) - except (TypeError, KeyError): - # python2 - MultiIndex fails on mixed types - pass + from .multi import MultiIndex + return MultiIndex.from_tuples( + data, names=name or kwargs.get('names')) # other iterable of some kind subarr = _asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) @@ -664,7 +656,7 @@ def ravel(self, order='C'): # construction helpers @classmethod - def _try_convert_to_int_index(cls, data, copy, name): + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ Attempt to convert an array of data into an integer index. 
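With the constructor change above, any list-like of tuples (including iterators) now goes straight to `MultiIndex.from_tuples`, instead of depending on whether the tuples happen to sort under Python 3 (GH 10697); for example:

    import pandas as pd

    pd.Index([('a', 1), ('b', 2)])         # MultiIndex
    pd.Index(iter([('a', 1), ('b', 2)]))   # iterators are materialized first
    pd.Index([(1, 'a'), ('b', 2)])         # unsortable mixed tuples no longer
                                           # fall back to an object Index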
@@ -685,15 +677,18 @@ def _try_convert_to_int_index(cls, data, copy, name): """ from .numeric import Int64Index, UInt64Index - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - # Conversion to int64 failed (possibly due to - # overflow), so let's try now with uint64. + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. try: res = data.astype('u8', copy=False) if (res == data).all(): @@ -732,7 +727,7 @@ def _coerce_to_ndarray(cls, data): def _get_attributes_dict(self): """ return an attributes dict for my class """ - return dict((k, getattr(self, k, None)) for k in self._attributes) + return {k: getattr(self, k, None) for k in self._attributes} def view(self, cls=None): @@ -1784,7 +1779,7 @@ def append(self, other): if not isinstance(obj, Index): raise TypeError('all inputs must be Index') - names = set(obj.name for obj in to_concat) + names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name return self._concat(to_concat, name) @@ -1937,7 +1932,10 @@ def putmask(self, mask, value): try: np.putmask(values, mask, self._convert_for_op(value)) return self._shallow_copy(values) - except (ValueError, TypeError): + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err + + # coerces to object return self.astype(object).putmask(mask, value) @@ -2862,13 +2860,15 @@ def groupby(self, values): return result - def map(self, mapper): - """Apply mapper function to an index. + def map(self, mapper, na_action=None): + """Map values using input correspondence (a dict, Series, or function) Parameters ---------- - mapper : callable - Function to be applied. + mapper : function, dict, or Series + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function Returns ------- @@ -2878,15 +2878,30 @@ def map(self, mapper): a MultiIndex will be returned. 
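A sketch of what skipping the int64 attempt above changes when an unsigned dtype is requested explicitly:

    import numpy as np
    import pandas as pd

    pd.Index([1, 2, 3])                            # Int64Index, as before
    pd.Index(np.array([1, 2, 3]), dtype='uint64')  # now UInt64Index rather than
                                                   # an unwanted Int64Index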
""" + from .multi import MultiIndex - mapped_values = self._arrmap(self.values, mapper) + new_values = super(Index, self)._map_values( + mapper, na_action=na_action) + attributes = self._get_attributes_dict() - if mapped_values.size and isinstance(mapped_values[0], tuple): - return MultiIndex.from_tuples(mapped_values, - names=attributes.get('name')) + + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get('name'): + names = [attributes.get('name')] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, + names=names) attributes['copy'] = False - return Index(mapped_values, **attributes) + if not new_values.size: + # empty + attributes['dtype'] = self.dtype + + return Index(new_values, **attributes) def isin(self, values, level=None): """ @@ -3731,6 +3746,10 @@ def insert(self, loc, item): ------- new_index : Index """ + if is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value + _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._values idx = np.concatenate((_self[:loc], item, _self[loc:])) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d09e5447431ce..26ffb01b9577f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,7 +12,7 @@ is_scalar) from pandas.core.common import (_asarray_tuplesafe, _values_from_object) -from pandas.core.dtypes.missing import array_equivalent +from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core.algorithms import take_1d @@ -690,7 +690,7 @@ def insert(self, loc, item): """ code = self.categories.get_indexer([item]) - if (code == -1): + if (code == -1) and not (is_scalar(item) and isna(item)): raise TypeError("cannot insert an item into a CategoricalIndex " "that is not already an existing category") diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 4934ccb49b844..5c96e4eeff69d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -25,7 +25,7 @@ import pandas.io.formats.printing as printing from pandas._libs import lib, iNaT, NaT -from pandas._libs.period import Period +from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds from pandas.core.indexes.base import Index, _index_shared_docs @@ -136,7 +136,7 @@ def equals(self, other): elif not isinstance(other, type(self)): try: other = type(self)(other) - except: + except Exception: return False if not is_dtype_equal(self.dtype, other.dtype): @@ -242,6 +242,13 @@ def _box_values(self, values): """ return lib.map_infer(values, self._box_func) + def _box_values_as_index(self): + """ + return object Index which contains boxed values + """ + from pandas.core.index import Index + return Index(self._box_values(self.asi8), name=self.name, dtype=object) + def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) @@ -263,7 +270,9 @@ def __getitem__(self, key): is_int = is_integer(key) if is_scalar(key) and not is_int: - raise ValueError + raise IndexError("only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean " + "arrays are valid indices") getitem = self._data.__getitem__ if is_int: @@ -352,13 +361,13 @@ def map(self, f): # Try to use this result if we can if isinstance(result, np.ndarray): - self._shallow_copy(result) + result = 
Index(result) if not isinstance(result, Index): raise TypeError('The map function must return an Index object') return result except Exception: - return self.asobject.map(f) + return self.astype(object).map(f) def sort_values(self, return_indexer=False, ascending=True): """ @@ -422,13 +431,15 @@ def _isnan(self): @property def asobject(self): - """ + """DEPRECATED: Use ``astype(object)`` instead. + return object Index which contains boxed values *this is an internal non-public method* """ - from pandas.core.index import Index - return Index(self._box_values(self.asi8), name=self.name, dtype=object) + warnings.warn("'asobject' is deprecated. Use 'astype(object)'" + " instead", FutureWarning, stacklevel=2) + return self.astype(object) def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance, box=False)) @@ -466,7 +477,7 @@ def tolist(self): """ return a list of the underlying data """ - return list(self.asobject) + return list(self.astype(object)) def min(self, axis=None, *args, **kwargs): """ @@ -744,7 +755,7 @@ def isin(self, values): try: values = type(self)(values) except ValueError: - return self.asobject.isin(values) + return self.astype(object).isin(values) return algorithms.isin(self.asi8, values.asi8) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e08bf4a625bce..fb86d25625b6a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -43,8 +43,7 @@ DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin) from pandas.tseries.offsets import ( DateOffset, generate_range, Tick, CDay, prefix_mapping) -from pandas.core.tools.datetimes import ( - parse_time_string, normalize_date, to_time) + from pandas.core.tools.timedeltas import to_timedelta from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) @@ -54,8 +53,9 @@ from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, - Timestamp, period as libperiod) -from pandas._libs.tslibs import timezones, conversion, fields + Timestamp) +from pandas._libs.tslibs import (timezones, conversion, fields, parsing, + period as libperiod) # -------- some conversion wrapper functions @@ -523,14 +523,14 @@ def _generate(cls, start, end, periods, name, offset, if start is not None: if normalize: - start = normalize_date(start) + start = libts.normalize_date(start) _normalized = True else: _normalized = _normalized and start.time() == _midnight if end is not None: if normalize: - end = normalize_date(end) + end = libts.normalize_date(end) _normalized = True else: _normalized = _normalized and end.time() == _midnight @@ -906,7 +906,7 @@ def to_datetime(self, dayfirst=False): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') @@ -926,7 +926,7 @@ def _get_time_micros(self): values = self.asi8 if self.tz is not None and self.tz is not utc: values = self._local_timestamps() - return libts.get_time_micros(values) + return fields.get_time_micros(values) def to_series(self, keep_tz=False): """ @@ -1528,7 +1528,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): if isinstance(label, compat.string_types): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - _, parsed, reso = parse_time_string(label, freq) + _, parsed, reso = parsing.parse_time_string(label, 
freq) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) @@ -1545,7 +1545,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - _, parsed, reso = parse_time_string(key, freq) + _, parsed, reso = parsing.parse_time_string(key, freq) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc @@ -1678,7 +1678,7 @@ def time(self): Returns numpy array of datetime.time. The time part of the Timestamps. """ return self._maybe_mask_results(libalgos.arrmap_object( - self.asobject.values, + self.astype(object).values, lambda x: np.nan if x is libts.NaT else x.time())) @property @@ -1757,6 +1757,9 @@ def insert(self, loc, item): ------- new_index : Index """ + if is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value freq = None @@ -1773,6 +1776,7 @@ def insert(self, loc, item): elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = _to_m8(item, tz=self.tz) + try: new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) @@ -1780,12 +1784,11 @@ def insert(self, loc, item): new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) - except (AttributeError, TypeError): # fall back to object index if isinstance(item, compat.string_types): - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) raise TypeError( "cannot insert DatetimeIndex with incompatible label") @@ -1961,8 +1964,8 @@ def indexer_between_time(self, start_time, end_time, include_start=True, ------- values_between_time : TimeSeries """ - start_time = to_time(start_time) - end_time = to_time(end_time) + start_time = tools.to_time(start_time) + end_time = tools.to_time(end_time) time_micros = self._get_time_micros() start_micros = _time_to_micros(start_time) end_micros = _time_to_micros(end_time) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c9bb8748abe7b..02ac74e619fa4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.missing import notna, isna from pandas.core.dtypes.generic import ABCPeriodIndex from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, @@ -31,7 +32,9 @@ from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex from pandas.compat.numpy import function as nv -from pandas.core import common as com +from pandas.core.common import ( + _all_not_none, _any_none, _asarray_tuplesafe, _count_not_none, + is_bool_indexer, _maybe_box_datetimelike, _not_none) from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset @@ -58,8 +61,8 @@ def _get_next_label(label): elif is_float_dtype(dtype): return np.nextafter(label, np.infty) else: - raise TypeError('cannot determine next label for type %r' - % type(label)) + raise TypeError('cannot determine next label for type {typ!r}' + .format(typ=type(label))) def _get_prev_label(label): @@ -73,8 +76,8 @@ def _get_prev_label(label): elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) 
else: - raise TypeError('cannot determine next label for type %r' - % type(label)) + raise TypeError('cannot determine prev label for type {typ!r}' + .format(typ=type(label))) def _get_interval_closed_bounds(interval): @@ -91,17 +94,18 @@ def _new_IntervalIndex(cls, d): - """ This is called upon unpickling, - rather than the default which doesn't - have arguments and breaks __new__ """ - + """ + This is called upon unpickling, rather than the default which doesn't have + arguments and breaks __new__ + """ return cls.from_arrays(**d) class IntervalIndex(IntervalMixin, Index): """ Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of intervals that are all closed on the same side. + represents an Index of Interval objects that are all closed on the same + side. .. versionadded:: 0.20.0 @@ -114,9 +118,9 @@ class IntervalIndex(IntervalMixin, Index): ---------- left, right : array-like (1-dimensional) Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or - neither. Defaults to 'right'. + neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -143,7 +147,7 @@ class IntervalIndex(IntervalMixin, Index): closed='right', dtype='interval[int64]') It may also be constructed using one of the constructor - methods :meth:`IntervalIndex.from_arrays`, + methods: :meth:`IntervalIndex.from_arrays`, :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_intervals` and :meth:`IntervalIndex.from_tuples`. @@ -159,12 +163,10 @@ class IntervalIndex(IntervalMixin, Index): See Also -------- Index : The base pandas Index type - Interval : A bounded slice-like interval - interval_range : Function to create a fixed frequency - IntervalIndex, IntervalIndex.from_arrays, IntervalIndex.from_breaks, - IntervalIndex.from_intervals, IntervalIndex.from_tuples - cut, qcut : convert arrays of continuous data into categoricals/series of - ``Interval``.
+ Interval : A bounded slice-like interval; the elements of an IntervalIndex + interval_range : Function to create a fixed frequency IntervalIndex + cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Intervals """ _typ = 'intervalindex' _comparables = ['name'] @@ -176,7 +178,7 @@ class IntervalIndex(IntervalMixin, Index): _mask = None - def __new__(cls, data, closed='right', + def __new__(cls, data, closed=None, name=None, copy=False, dtype=None, fastpath=False, verify_integrity=True): @@ -197,8 +199,17 @@ def __new__(cls, data, closed='right', if is_scalar(data): cls._scalar_data_error(data) - data = IntervalIndex.from_intervals(data, name=name) - left, right, closed = data.left, data.right, data.closed + data = maybe_convert_platform(data) + left, right, infer_closed = intervals_to_interval_bounds(data) + + if _all_not_none(closed, infer_closed) and closed != infer_closed: + # GH 18421 + msg = ("conflicting values for closed: constructor got " + "'{closed}', inferred from data '{infer_closed}'" + .format(closed=closed, infer_closed=infer_closed)) + raise ValueError(msg) + + closed = closed or infer_closed return cls._simple_new(left, right, closed, name, copy=copy, verify_integrity=verify_integrity) @@ -220,9 +231,9 @@ def _simple_new(cls, left, right, closed=None, name=None, left = left.astype(right.dtype) if type(left) != type(right): - raise ValueError("must not have differing left [{}] " - "and right [{}] types".format( - type(left), type(right))) + raise ValueError("must not have differing left [{left}] " + "and right [{right}] types" + .format(left=type(left), right=type(right))) if isinstance(left, ABCPeriodIndex): raise ValueError("Period dtypes are not supported, " @@ -267,7 +278,8 @@ def _validate(self): Verify that the IntervalIndex is valid. """ if self.closed not in _VALID_CLOSED: - raise ValueError("invalid options for 'closed': %s" % self.closed) + raise ValueError("invalid options for 'closed': {closed}" + .format(closed=self.closed)) if len(self.left) != len(self.right): raise ValueError('left and right must have the same length') left_mask = notna(self.left) @@ -281,12 +293,15 @@ def _validate(self): @cache_readonly def hasnans(self): - """ return if I have any nans; enables various perf speedups """ + """ + Return if the IntervalIndex has any nans; enables various performance + speedups + """ return self._isnan.any() @cache_readonly def _isnan(self): - """ return if each value is nan""" + """Return a mask indicating if each value is NA""" if self._mask is None: self._mask = isna(self.left) return self._mask @@ -323,7 +338,7 @@ def __contains__(self, key): def contains(self, key): """ - return a boolean if this key is IN the index + Return a boolean indicating if the key is IN the index We accept / allow keys to be not *just* actual objects. @@ -351,9 +366,9 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): ---------- breaks : array-like (1-dimensional) Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. 
copy : boolean, default False @@ -376,7 +391,8 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - breaks = np.asarray(breaks) + breaks = maybe_convert_platform(breaks) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, name=name, copy=copy) @@ -391,9 +407,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): Left bounds for each interval. right : array-like (1-dimensional) Right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -416,8 +432,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - left = np.asarray(left) - right = np.asarray(right) + left = maybe_convert_platform(left) + right = maybe_convert_platform(right) + return cls._simple_new(left, right, closed, name=name, copy=copy, verify_integrity=True) @@ -460,8 +477,12 @@ def from_intervals(cls, data, name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) + if isinstance(data, IntervalIndex): + left, right, closed = data.left, data.right, data.closed + name = name or data.name + else: + data = maybe_convert_platform(data) + left, right, closed = intervals_to_interval_bounds(data) return cls.from_arrays(left, right, closed, name=name, copy=False) @classmethod @@ -473,9 +494,9 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): ---------- data : array-like (1-dimensional) Array of tuples - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. 
copy : boolean, default False @@ -497,18 +518,18 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): IntervalIndex.from_intervals : Construct an IntervalIndex from an array of Interval objects """ - left = [] - right = [] - for d in data: + if len(data): + left, right = [], [] + else: + left = right = data + for d in data: if isna(d): - left.append(np.nan) - right.append(np.nan) - continue - - l, r = d - left.append(l) - right.append(r) + lhs = rhs = np.nan + else: + lhs, rhs = d + left.append(lhs) + right.append(rhs) # TODO # if we have nulls and we previous had *only* @@ -517,7 +538,8 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): return cls.from_arrays(left, right, closed, name=name, copy=False) def to_tuples(self): - return Index(com._asarray_tuplesafe(zip(self.left, self.right))) + """Return an Index of tuples of the form (left, right)""" + return Index(_asarray_tuplesafe(zip(self.left, self.right))) @cache_readonly def _multiindex(self): @@ -526,14 +548,26 @@ def _multiindex(self): @property def left(self): + """ + Return the left endpoints of each Interval in the IntervalIndex as + an Index + """ return self._left @property def right(self): + """ + Return the right endpoints of each Interval in the IntervalIndex as + an Index + """ return self._right @property def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither + """ return self._closed def __len__(self): @@ -542,7 +576,7 @@ def __len__(self): @cache_readonly def values(self): """ - Returns the IntervalIndex's data as a numpy array of Interval + Return the IntervalIndex's data as a numpy array of Interval objects (with dtype='object') """ left = self.left @@ -594,14 +628,17 @@ def astype(self, dtype, copy=True): elif is_categorical_dtype(dtype): from pandas import Categorical return Categorical(self, ordered=True) - raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype) + raise ValueError('Cannot cast IntervalIndex to dtype {dtype}' + .format(dtype=dtype)) @cache_readonly def dtype(self): + """Return the dtype object of the underlying data""" return IntervalDtype.construct_from_string(str(self.left.dtype)) @property def inferred_type(self): + """Return a string of the type inferred from the values""" return 'interval' @Appender(Index.memory_usage.__doc__) @@ -613,7 +650,8 @@ def memory_usage(self, deep=False): @cache_readonly def mid(self): - """Returns the mid-point of each interval in the index as an array + """ + Return the midpoint of each Interval in the IntervalIndex as an Index """ try: return Index(0.5 * (self.left.values + self.right.values)) @@ -624,22 +662,42 @@ def mid(self): @cache_readonly def is_monotonic(self): + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ return self._multiindex.is_monotonic @cache_readonly def is_monotonic_increasing(self): + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ return self._multiindex.is_monotonic_increasing @cache_readonly def is_monotonic_decreasing(self): + """ + Return True if the IntervalIndex is monotonic decreasing (only equal or + decreasing values), else False + """ return self._multiindex.is_monotonic_decreasing @cache_readonly def is_unique(self): + """ + Return True if the IntervalIndex contains unique elements, else False + """ return self._multiindex.is_unique @cache_readonly def is_non_overlapping_monotonic(self): + """ + 
Return True if the IntervalIndex is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False + """ # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) # we already require left <= right @@ -704,9 +762,8 @@ def _check_method(self, method): return if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: - raise NotImplementedError( - 'method {} not yet implemented for ' - 'IntervalIndex'.format(method)) + msg = 'method {method} not yet implemented for IntervalIndex' + raise NotImplementedError(msg.format(method=method)) raise ValueError("Invalid fill method") @@ -838,24 +895,21 @@ def get_loc(self, key, method=None): return self._engine.get_loc(key) def get_value(self, series, key): - if com.is_bool_indexer(key): + if is_bool_indexer(key): loc = key elif is_list_like(key): loc = self.get_indexer(key) elif isinstance(key, slice): if not (key.step is None or key.step == 1): - raise ValueError("cannot support not-default " - "step in a slice") + raise ValueError("cannot support not-default step in a slice") try: loc = self.get_loc(key) except TypeError: - - # we didn't find exact intervals - # or are non-unique - raise ValueError("unable to slice with " - "this key: {}".format(key)) + # we didn't find exact intervals or are non-unique + msg = "unable to slice with this key: {key}".format(key=key) + raise ValueError(msg) else: loc = self.get_loc(key) @@ -908,31 +962,31 @@ def _get_reindexer(self, target): indexer = [] n = len(self) - for i, (l, r) in enumerate(zip(lindexer, rindexer)): + for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)): target_value = target[i] # matching on the lhs bound - if (l != -1 and + if (lhs != -1 and self.closed == 'right' and - target_value.left == self[l].right): - l += 1 + target_value.left == self[lhs].right): + lhs += 1 # matching on the rhs bound - if (r != -1 and + if (rhs != -1 and self.closed == 'left' and - target_value.right == self[r].left): - r -= 1 + target_value.right == self[rhs].left): + rhs -= 1 # not found - if l == -1 and r == -1: + if lhs == -1 and rhs == -1: indexer.append(np.array([-1])) - elif r == -1: + elif rhs == -1: - indexer.append(np.arange(l, n)) + indexer.append(np.arange(lhs, n)) - elif l == -1: + elif lhs == -1: # care about left/right closed here value = self[i] @@ -955,10 +1009,10 @@ def _get_reindexer(self, target): indexer.append(np.array([-1])) continue - indexer.append(np.arange(0, r + 1)) + indexer.append(np.arange(0, rhs + 1)) else: - indexer.append(np.arange(l, r + 1)) + indexer.append(np.arange(lhs, rhs + 1)) return np.concatenate(indexer) @@ -975,19 +1029,47 @@ def where(self, cond, other=None): return self._shallow_copy(values) def delete(self, loc): + """ + Return a new IntervalIndex with passed location(-s) deleted + + Returns + ------- + new_index : IntervalIndex + """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) return self._shallow_copy(new_left, new_right) def insert(self, loc, item): - if not isinstance(item, Interval): - raise ValueError('can only insert Interval objects into an ' - 'IntervalIndex') - if not item.closed == self.closed: - raise ValueError('inserted item must be closed on the same side ' - 'as the index') - new_left = self.left.insert(loc, item.left) - new_right = self.right.insert(loc, item.right) + """ + Return a new IntervalIndex inserting new item at location. Follows + Python list.append semantics for negative values.
Only Interval + objects and NA can be inserted into an IntervalIndex + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : IntervalIndex + """ + if isinstance(item, Interval): + if item.closed != self.closed: + raise ValueError('inserted item must be closed on the same ' + 'side as the index') + left_insert = item.left + right_insert = item.right + elif is_scalar(item) and isna(item): + # GH 18295 + left_insert = right_insert = item + else: + raise ValueError('can only insert Interval objects and NA into ' + 'an IntervalIndex') + + new_left = self.left.insert(loc, left_insert) + new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right) def _as_like_interval_index(self, other, error_msg): @@ -1003,7 +1085,7 @@ def _concat_same_dtype(self, to_concat, name): assert that we all have the same .closed we allow a 0-len index here as well """ - if not len(set(i.closed for i in to_concat if len(i))) == 1: + if not len({i.closed for i in to_concat if len(i)}) == 1: msg = ('can only append two IntervalIndex objects ' 'that are closed on the same side') raise ValueError(msg) @@ -1080,23 +1162,23 @@ def _format_data(self, name=None): summary = '[]' elif n == 1: first = formatter(self[0]) - summary = '[{}]'.format(first) + summary = '[{first}]'.format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{}, {}]'.format(first, last) + summary = '[{first}, {last}]'.format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{} ... {}]'.format(', '.join(head), - ', '.join(tail)) + summary = '[{head} ... {tail}]'.format( + head=', '.join(head), tail=', '.join(tail)) else: head = [] tail = [formatter(x) for x in self] - summary = '[{}]'.format(', '.join(tail)) + summary = '[{tail}]'.format(tail=', '.join(tail)) return summary + self._format_space() @@ -1104,17 +1186,20 @@ def _format_attrs(self): attrs = [('closed', repr(self.closed))] if self.name is not None: attrs.append(('name', default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype))) return attrs def _format_space(self): - return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + space = ' ' * (len(self.__class__.__name__) + 1) + return "\n{space}".format(space=space) def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) def equals(self, other): - + """ + Determines if two IntervalIndex objects contain the same elements + """ if self.is_(other): return True @@ -1143,7 +1228,7 @@ def func(self, other): union = _setop('union') intersection = _setop('intersection') difference = _setop('difference') - symmetric_differnce = _setop('symmetric_difference') + symmetric_difference = _setop('symmetric_difference') # TODO: arithmetic operations @@ -1166,7 +1251,7 @@ def _is_type_compatible(a, b): return ((is_number(a) and is_number(b)) or (is_ts_compat(a) and is_ts_compat(b)) or (is_td_compat(a) and is_td_compat(b)) or - com._any_none(a, b)) + _any_none(a, b)) def interval_range(start=None, end=None, periods=None, freq=None, @@ -1188,8 +1273,9 @@ def interval_range(start=None, end=None, periods=None, freq=None, for numeric and 'D' (calendar daily) for datetime-like. 
name : string, default None Name of the resulting IntervalIndex - closed : string, default 'right' - options are: 'left', 'right', 'both', 'neither' + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. Notes ----- @@ -1244,13 +1330,13 @@ def interval_range(start=None, end=None, periods=None, freq=None, -------- IntervalIndex : an Index of intervals that are all closed on the same side. """ - if com._count_not_none(start, end, periods) != 2: + if _count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and periods, ' 'exactly two must be specified') - start = com._maybe_box_datetimelike(start) - end = com._maybe_box_datetimelike(end) - endpoint = next(com._not_none(start, end)) + start = _maybe_box_datetimelike(start) + end = _maybe_box_datetimelike(end) + endpoint = next(_not_none(start, end)) if not _is_valid_endpoint(start): msg = 'start must be numeric or datetime-like, got {start}' diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cc99505b53bf5..0cbb87c65ccd7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -177,8 +177,8 @@ def _verify_integrity(self, labels=None, levels=None): Raises ------ ValueError - * if length of levels and labels don't match or any label would - exceed level bounds + If the lengths of levels and labels don't match, if any label would + exceed level bounds, or if there are any duplicate levels. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -198,6 +198,11 @@ def _verify_integrity(self, labels=None, levels=None): " level (%d). NOTE: this index is in an" " inconsistent state" % (i, label.max(), len(level))) + if not level.is_unique: + raise ValueError("Level values must be unique: {values} on " + "level {level}".format( + values=[value for value in level], + level=i)) @property def levels(self): @@ -1162,6 +1167,11 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays = list(arrays) + # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): @@ -1206,6 +1216,11 @@ def from_tuples(cls, tuples, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + if len(tuples) == 0: if names is None: msg = 'Cannot infer number of levels from empty list' @@ -1260,6 +1275,11 @@ def from_product(cls, iterables, sortorder=None, names=None): from pandas.core.categorical import _factorize_from_iterables from pandas.core.reshape.util import cartesian_product + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) @@ -1305,19 +1325,19 @@ def _sort_levels_monotonic(self): for lev, lab in zip(self.levels, self.labels): - if lev.is_monotonic: - new_levels.append(lev) -
new_labels.append(lab) - continue - - # indexer to reorder the levels - indexer = lev.argsort() - lev = lev.take(indexer) + if not lev.is_monotonic: + try: + # indexer to reorder the levels + indexer = lev.argsort() + except TypeError: + pass + else: + lev = lev.take(indexer) - # indexer to reorder the labels - indexer = _ensure_int64(indexer) - ri = lib.get_reverse_indexer(indexer, len(indexer)) - lab = algos.take_1d(ri, lab) + # indexer to reorder the labels + indexer = _ensure_int64(indexer) + ri = lib.get_reverse_indexer(indexer, len(indexer)) + lab = algos.take_1d(ri, lab) new_levels.append(lev) new_labels.append(lab) @@ -1365,31 +1385,29 @@ def remove_unused_levels(self): new_labels = [] changed = False - for idx, (lev, lab) in enumerate(zip(self.levels, self.labels)): - na_idxs = np.where(lab == -1)[0] - - if len(na_idxs): - lab = np.delete(lab, na_idxs) + for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) + na_idx = np.where(uniques == -1)[0] # nothing unused - if len(uniques) != len(lev): + if len(uniques) != len(lev) + len(na_idx): changed = True + if len(na_idx): + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] + # labels get mapped from uniques to 0:len(uniques) - label_mapping = np.zeros(len(lev)) - label_mapping[uniques] = np.arange(len(uniques)) + # -1 (if present) is mapped to last position + label_mapping = np.zeros(len(lev) + len(na_idx)) + # ... and reassigned value -1: + label_mapping[uniques] = np.arange(len(uniques)) - len(na_idx) lab = label_mapping[lab] # new levels are simple - lev = lev.take(uniques) - - if len(na_idxs): - lab = np.insert(lab, na_idxs - np.arange(len(na_idxs)), -1) - else: - lab = self.labels[idx] + lev = lev.take(uniques[len(na_idx):]) new_levels.append(lev) new_labels.append(lab) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ae6a810ece510..fddbb2de83dca 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -216,7 +216,6 @@ class UInt64Index(NumericIndex): _inner_indexer = libjoin.inner_join_indexer_uint64 _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False - _na_value = 0 _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 6535eee386e8b..cb0c4a9ce2a86 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -31,12 +31,12 @@ import pandas.tseries.offsets as offsets from pandas._libs.lib import infer_dtype -from pandas._libs import tslib, period, index as libindex -from pandas._libs.period import (Period, IncompatibleFrequency, - get_period_field_arr, _validate_end_alias, - _quarter_to_myear) +from pandas._libs import tslib, index as libindex +from pandas._libs.tslibs.period import (Period, IncompatibleFrequency, + get_period_field_arr, + _validate_end_alias, _quarter_to_myear) from pandas._libs.tslibs.fields import isleapyear_arr -from pandas._libs.tslibs import resolution +from pandas._libs.tslibs import resolution, period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds from pandas.core.base import _shared_docs @@ -418,7 +418,7 @@ def _int64index(self): @property def values(self): - return self.asobject.values + return self.astype(object).values @property def _values(self): @@ -428,7 +428,7 @@ def __array__(self, dtype=None): if is_integer_dtype(dtype): return self.asi8 else: - return self.asobject.values + return self.astype(object).values def 
__array_wrap__(self, result, context=None): """ @@ -476,7 +476,7 @@ def _to_embed(self, keep_tz=False, dtype=None): if dtype is not None: return self.astype(dtype)._to_embed(keep_tz=keep_tz) - return self.asobject.values + return self.astype(object).values @property def _formatter_func(self): @@ -506,7 +506,7 @@ def asof_locs(self, where, mask): def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_integer_dtype(dtype): if copy: return self._int64index.copy() @@ -656,7 +656,7 @@ def end_time(self): def _mpl_repr(self): # how to represent ourselves to matplotlib - return self.asobject.values + return self.astype(object).values def to_timestamp(self, freq=None, how='start'): """ @@ -971,7 +971,7 @@ def _convert_tolerance(self, tolerance, target): def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8)) @@ -1018,7 +1018,7 @@ def _apply_meta(self, rawarr): def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - values = self.asobject.values + values = self.astype(object).values if date_format: formatter = lambda dt: dt.strftime(date_format) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 22fb7c255b12c..77e05ccf4db22 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -482,7 +482,7 @@ def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_timedelta64_ns_dtype(dtype): if copy is True: return self.copy() @@ -852,16 +852,18 @@ def insert(self, loc, item): ------- new_index : Index """ - # try to convert if possible if _is_convertible_to_td(item): try: item = Timedelta(item) except Exception: pass + elif is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value freq = None - if isinstance(item, Timedelta) or item is NaT: + if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): # check freq can be preserved on edge cases if self.freq is not None: @@ -881,7 +883,7 @@ def insert(self, loc, item): # fall back to object index if isinstance(item, compat.string_types): - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) raise TypeError( "cannot insert TimedeltaIndex with incompatible label") diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 90733fa6d68d1..c6642657e386e 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -405,7 +405,8 @@ def _setitem_with_indexer(self, indexer, value): new_values = np.concatenate([self.obj._values, new_values]) except TypeError: - new_values = np.concatenate([self.obj.asobject, + as_obj = self.obj.astype(object) + new_values = np.concatenate([as_obj, new_values]) self.obj._data = self.obj._constructor( new_values, index=new_index, name=self.obj.name)._data diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ca4984cc16673..4169a001655cb 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1847,8 +1847,10 @@ def _can_hold_element(self, element): if tipo is not None: return (issubclass(tipo.type, (np.floating, np.integer)) and not issubclass(tipo.type, (np.datetime64, np.timedelta64))) - return (isinstance(element, (float, int, 
np.floating, np.int_)) and - not isinstance(element, (bool, np.bool_, datetime, timedelta, + return ( + isinstance( + element, (float, int, np.floating, np.int_, compat.long)) + and not isinstance(element, (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64))) def to_native_types(self, slicer=None, na_rep='', float_format=None, @@ -1896,9 +1898,11 @@ def _can_hold_element(self, element): if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) - return (isinstance(element, - (float, int, complex, np.float_, np.int_)) and - not isinstance(element, (bool, np.bool_))) + return ( + isinstance( + element, + (float, int, complex, np.float_, np.int_, compat.long)) + and not isinstance(element, (bool, np.bool_))) def should_store(self, value): return issubclass(value.dtype.type, np.complexfloating) @@ -1956,7 +1960,8 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) - return isinstance(element, (timedelta, np.timedelta64)) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) def fillna(self, value, **kwargs): @@ -2190,7 +2195,7 @@ def _try_coerce_args(self, values, other): if isinstance(other, ABCDatetimeIndex): # to store DatetimeTZBlock as object - other = other.asobject.values + other = other.astype(object).values return values, False, other, False @@ -3377,7 +3382,7 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, blocks.append(block) # note that some DatetimeTZ, Categorical are always ndim==1 - ndim = set(b.ndim for b in blocks) + ndim = {b.ndim for b in blocks} if 2 in ndim: @@ -3484,7 +3489,7 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False, mgr = self # figure out our mask a-priori to avoid repeated replacements - values = self.as_matrix() + values = self.as_array() def comp(s): if isna(s): @@ -3670,9 +3675,24 @@ def copy(self, deep=True, mgr=None): return self.apply('copy', axes=new_axes, deep=deep, do_integrity_check=False) - def as_matrix(self, items=None): + def as_array(self, transpose=False, items=None): + """Convert the blockmanager data into a numpy array. + + Parameters + ---------- + transpose : boolean, default False + If True, transpose the returned array + items : list of strings or None + Names of block items that will be included in the returned + array.
``None`` means that all block items will be used + + Returns + ------- + arr : ndarray + """ if len(self.blocks) == 0: - return np.empty(self.shape, dtype=float) + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr if items is not None: mgr = self.reindex_axis(items, axis=0) @@ -3680,9 +3700,11 @@ def as_matrix(self, items=None): mgr = self if self._is_single_block or not self.is_mixed_type: - return mgr.blocks[0].get_values() + arr = mgr.blocks[0].get_values() else: - return mgr._interleave() + arr = mgr._interleave() + + return arr.transpose() if transpose else arr def _interleave(self): """ @@ -3891,7 +3913,7 @@ def get_scalar(self, tup): """ Retrieve single item """ - full_loc = list(ax.get_loc(x) for ax, x in zip(self.axes, tup)) + full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] blk = self.blocks[self._blknos[full_loc[0]]] values = blk.values @@ -4871,7 +4893,7 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): if _can_consolidate: if dtype is None: - if len(set(b.dtype for b in blocks)) != 1: + if len({b.dtype for b in blocks}) != 1: raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype diff --git a/pandas/core/ops.py b/pandas/core/ops.py index fa50036b6eb95..2fb0cbb14c225 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -146,7 +146,7 @@ def names(x): construct_result=_construct_divmod_result, ) - new_methods = dict((names(k), v) for k, v in new_methods.items()) + new_methods = {names(k): v for k, v in new_methods.items()} return new_methods @@ -850,7 +850,7 @@ def wrapper(self, other, axis=None): # tested in test_nat_comparisons # (pandas.tests.series.test_operators.TestSeriesOperators) return self._constructor(na_op(self.values, - other.asobject.values), + other.astype(object).values), index=self.index) return self._constructor(na_op(self.values, np.asarray(other)), diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 0a5e705071b5e..0f3c5cb85249a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -464,7 +464,7 @@ def to_excel(self, path, na_rep='', engine=None, **kwargs): def as_matrix(self): self._consolidate_inplace() - return self._data.as_matrix() + return self._data.as_array() # ---------------------------------------------------------------------- # Getting and setting elements diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 691787125043d..80ee680d2b9d2 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -105,7 +105,7 @@ def _combine_with_constructor(self, other, func): new_axes.append(getattr(self, a).union(getattr(other, a))) # reindex: could check that everything's the same size, but forget it - d = dict((a, ax) for a, ax in zip(self._AXIS_ORDERS, new_axes)) + d = {a: ax for a, ax in zip(self._AXIS_ORDERS, new_axes)} d['copy'] = False this = self.reindex(**d) other = other.reindex(**d) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6988528af415f..9f5439b68558b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,7 +24,7 @@ from pandas._libs import lib, tslib from pandas._libs.lib import Timestamp -from pandas._libs.period import IncompatibleFrequency +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.util._decorators import Appender from pandas.core.generic import _shared_docs @@ -1014,22 +1014,18 @@ class TimeGrouper(Grouper): Parameters ---------- freq : pandas date offset or offset alias for identifying bin edges - closed : closed end of interval; left or 
right - label : interval boundary to use for labeling; left or right - nperiods : optional, integer + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex - - Notes - ----- - Use begin, end, nperiods to generate intervals that cannot be derived - directly from the associated object """ + _attributes = Grouper._attributes + ('closed', 'label', 'how', + 'loffset', 'kind', 'convention', + 'base') def __init__(self, freq='Min', closed=None, label=None, how='mean', - nperiods=None, axis=0, - fill_method=None, limit=None, loffset=None, kind=None, - convention=None, base=0, **kwargs): + axis=0, fill_method=None, limit=None, loffset=None, + kind=None, convention=None, base=0, **kwargs): freq = to_offset(freq) end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) @@ -1048,7 +1044,6 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.closed = closed self.label = label - self.nperiods = nperiods self.kind = kind self.convention = convention or 'E' diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 99286d807a205..11d69359f5c65 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,8 +1,8 @@ # flake8: noqa from pandas.core.reshape.concat import concat -from pandas.core.reshape.melt import melt -from pandas.core.reshape.merge import ( - merge, ordered_merge, merge_ordered, merge_asof) +from pandas.core.reshape.melt import melt, lreshape, wide_to_long +from pandas.core.reshape.reshape import pivot_simple as pivot, get_dummies +from pandas.core.reshape.merge import merge, merge_ordered, merge_asof from pandas.core.reshape.pivot import pivot_table, crosstab from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6139f093202fe..9bd5abb2cd476 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -568,7 +568,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = list(names) else: # make sure that all of the passed indices have the same nlevels - if not len(set(idx.nlevels for idx in indexes)) == 1: + if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 846d04221fe7f..16439b30d5bb4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -6,14 +6,12 @@ from pandas import compat from pandas.core.categorical import Categorical -from pandas.core.frame import DataFrame -from pandas.core.index import MultiIndex +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.frame import _shared_docs from pandas.util._decorators import Appender import re -import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import notna @@ -27,7 +25,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] - elif (isinstance(frame.columns, MultiIndex) and + elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') @@ -39,7 +37,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] - elif 
(isinstance(frame.columns, MultiIndex) and + elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') @@ -54,7 +52,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, frame.columns = frame.columns.get_level_values(col_level) if var_name is None: - if isinstance(frame.columns, MultiIndex): + if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: @@ -81,6 +79,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) + from pandas import DataFrame return DataFrame(mdata, columns=mcolumns) @@ -137,6 +136,8 @@ def lreshape(data, groups, dropna=True, label=None): for target, names in zip(keys, values): to_concat = [data[col].values for col in names] + + import pandas.core.dtypes.concat as _concat mdata[target] = _concat._concat_compat(to_concat) pivot_cols.append(target) @@ -148,8 +149,9 @@ def lreshape(data, groups, dropna=True, label=None): for c in pivot_cols: mask &= notna(mdata[c]) if not mask.all(): - mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) + mdata = {k: v[mask] for k, v in compat.iteritems(mdata)} + from pandas import DataFrame return DataFrame(mdata, columns=id_cols + pivot_cols) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec0..bad7088a126cf 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -139,19 +139,6 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, return result, lby -def ordered_merge(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y')): - - warnings.warn("ordered_merge is deprecated and replaced by merge_ordered", - FutureWarning, stacklevel=2) - return merge_ordered(left, right, on=on, - left_on=left_on, right_on=right_on, - left_by=left_by, right_by=right_by, - fill_method=fill_method, suffixes=suffixes) - - def merge_ordered(left, right, on=None, left_on=None, right_on=None, left_by=None, right_by=None, @@ -204,7 +191,7 @@ def merge_ordered(left, right, on=None, 4 c 2 b 5 e 3 b - >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + >>> merge_ordered(A, B, fill_method='ffill', left_by='group') key lvalue group rvalue 0 a 1 a NaN 1 b 1 a 1 @@ -253,9 +240,6 @@ def _merger(x, y): return result -ordered_merge.__doc__ = merge_ordered.__doc__ - - def merge_asof(left, right, on=None, left_on=None, right_on=None, left_index=False, right_index=False, @@ -603,6 +587,8 @@ def get_result(self): self._maybe_add_join_keys(result, left_indexer, right_indexer) + self._maybe_restore_index_levels(result) + return result def _indicator_pre_merge(self, left, right): @@ -645,6 +631,39 @@ def _indicator_post_merge(self, result): axis=1) return result + def _maybe_restore_index_levels(self, result): + """ + Restore index levels specified as `on` parameters + + Here we check for cases where `self.left_on` and `self.right_on` pairs + each reference an index level in their respective DataFrames. The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. 
It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + + Returns + ------- + None + """ + names_to_restore = [] + for name, left_key, right_key in zip(self.join_names, + self.left_on, + self.right_on): + if (self.orig_left._is_level_reference(left_key) and + self.orig_right._is_level_reference(right_key) and + name not in result.index.names): + + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None @@ -714,8 +733,17 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: key_col = Index(lvals).where(~mask, rvals) - if name in result: + if result._is_label_reference(name): result[name] = key_col + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + idx_list = [result.index.get_level_values(level_name) + if level_name != name else key_col + for level_name in result.index.names] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) else: result.insert(i, name or 'key_{i}'.format(i=i), key_col) @@ -812,7 +840,8 @@ def _get_merge_keys(self): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -821,7 +850,8 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -834,7 +864,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left[lk]._values) + left_keys.append(left._get_label_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -846,7 +876,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left[k]._values) + left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -860,7 +890,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right[k]._values) + right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -870,10 +900,10 @@ def _get_merge_keys(self): left_keys = [self.left.index.values] if left_drop: - self.left = self.left.drop(left_drop, axis=1) + self.left = self.left._drop_labels_or_levels(left_drop) if right_drop: - self.right = self.right.drop(right_drop, axis=1) + self.right = self.right._drop_labels_or_levels(right_drop) return left_keys, right_keys, join_names @@ -906,16 +936,31 @@ def _maybe_coerce_merge_keys(self): continue # if we are numeric, then allow differing - # kinds to proceed, eg. int64 and int8 + # kinds to proceed, eg. 
int64 and int8, int and float # further if we are object, but we infer to # the same, then proceed if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: - continue + pass + + # check whether ints and floats + elif is_integer_dtype(rk) and is_float_dtype(lk): + if not (lk == lk.astype(rk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) + + elif is_float_dtype(rk) and is_integer_dtype(lk): + if not (rk == rk.astype(lk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) # let's infer and see if we are ok - if lib.infer_dtype(lk) == lib.infer_dtype(rk): - continue + elif lib.infer_dtype(lk) == lib.infer_dtype(rk): + pass # Houston, we have a problem! # let's coerce to object if the dtypes aren't @@ -924,14 +969,15 @@ def _maybe_coerce_merge_keys(self): # then we would lose type information on some # columns, and end up trying to merge # incompatible dtypes. See GH 16900. - if name in self.left.columns: - typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) - if name in self.right.columns: - typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + else: + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign( + **{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign( + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
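The int/float key check added to `_maybe_coerce_merge_keys` above only warns when the cast actually loses information. A minimal sketch of the user-facing behaviour, assuming only what the hunks above add (frame contents and column names are made up for illustration):

    import pandas as pd

    left = pd.DataFrame({'key': [1, 2, 3], 'lval': ['a', 'b', 'c']})
    right = pd.DataFrame({'key': [1.0, 2.5, 3.0], 'rval': ['x', 'y', 'z']})

    # 'key' is int64 on the left and float64 on the right; 2.5 has no exact
    # int representation, so this merge now emits a UserWarning before the
    # join proceeds (exactly representable keys like 1.0 and 3.0 still match).
    result = pd.merge(left, right, on='key')

If every float key round-trips through the int dtype unchanged, `(rk == rk.astype(lk.dtype)).all()` holds and no warning is raised.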
@@ -1529,7 +1575,8 @@ def _get_join_keys(llab, rlab, shape, sort): rkey = stride * rlab[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - stride //= shape[i] + with np.errstate(divide='ignore'): + stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride diff --git a/pandas/core/series.py b/pandas/core/series.py index be1de4c6814ba..19c84c34d7d1d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -18,7 +18,7 @@ is_bool, is_integer, is_integer_dtype, is_float_dtype, - is_extension_type, is_datetimetz, + is_extension_type, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -34,18 +34,17 @@ from pandas.core.dtypes.cast import ( maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, - maybe_cast_to_datetime, maybe_castable) + maybe_cast_to_datetime, maybe_castable, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike from pandas.core.common import (is_bool_indexer, _default_index, _asarray_tuplesafe, _values_from_object, - _try_sort, _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, - _dict_compat, standardize_mapping, _any_none) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -150,7 +149,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _metadata = ['name'] _accessors = frozenset(['dt', 'cat', 'str']) _deprecations = generic.NDFrame._deprecations | frozenset( - ['sortlevel', 'reshape', 'get_value', 'set_value', 'from_csv']) + ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value', + 'from_csv']) _allow_index_ops = True def __init__(self, data=None, index=None, dtype=None, name=None, @@ -198,32 +198,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data.reindex(index, copy=copy) data = data._data elif isinstance(data, dict): - if index is None: - if isinstance(data, OrderedDict): - index = Index(data) - else: - index = Index(_try_sort(data)) - try: - if isinstance(index, DatetimeIndex): - if len(data): - # coerce back to datetime objects for lookup - data = _dict_compat(data) - data = lib.fast_multiget(data, - index.asobject.values, - default=np.nan) - else: - data = np.nan - # GH #12169 - elif isinstance(index, (PeriodIndex, TimedeltaIndex)): - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - else: - data = lib.fast_multiget(data, index.values, - default=np.nan) - except TypeError: - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False elif isinstance(data, SingleBlockManager): if index is None: index = data.index @@ -271,6 +248,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None, self.name = name self._set_axis(0, index, fastpath=True) + def _init_dict(self, data, index=None, dtype=None): + """ + Derive the "_data" and "index" attributes of a new Series from a + dictionary input. 
+ + Parameters + ---------- + data : dict or dict-like + Data used to populate the new Series + index : Index or index-like, default None + index for the new Series: if None, use dict keys + dtype : dtype, default None + dtype for the new Series: if None, infer from data + + Returns + ------- + _data : BlockManager for the new Series + index : index for the new Series + """ + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] + # raises KeyError), so we iterate the entire dict, and align + if data: + keys, values = zip(*compat.iteritems(data)) + else: + keys, values = [], [] + + # Input is now list-like, so rely on "standard" construction: + s = Series(values, index=keys, dtype=dtype) + + # Now we just make sure the order is respected, if any + if index is not None: + s = s.reindex(index, copy=False) + elif not isinstance(data, OrderedDict): + try: + s = s.sort_index() + except TypeError: + pass + return s._data, s.index + @classmethod def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False): @@ -434,12 +450,15 @@ def get_values(self): @property def asobject(self): - """ + """DEPRECATED: Use ``astype(object)`` instead. + return object Series which contains boxed values *this is an internal non-public method* """ - return self._data.asobject + warnings.warn("'asobject' is deprecated. Use 'astype(object)'" + " instead", FutureWarning, stacklevel=2) + return self.astype(object).values # ops def ravel(self, order='C'): @@ -1307,7 +1326,7 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.asobject.values + result = result.astype(object).values return result @@ -1626,7 +1645,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): ignore_index : boolean, default False If True, do not use the index labels. - .. versionadded: 0.19.0 + .. versionadded:: 0.19.0 verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates @@ -2043,7 +2062,7 @@ def nlargest(self, n=5, keep='first'): ---------- n : int Return this many descending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -2090,7 +2109,7 @@ def nsmallest(self, n=5, keep='first'): ---------- n : int Return this many ascending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -2213,7 +2232,7 @@ def unstack(self, level=-1, fill_value=None): fill_value : replace NaN with this value if the unstack produces missing values - .. versionadded: 0.18.0 + .. versionadded:: 0.18.0 Examples -------- @@ -2338,41 +2357,8 @@ def map(self, arg, na_action=None): 3 0 dtype: int64 """ - - if is_extension_type(self.dtype): - values = self._values - if na_action is not None: - raise NotImplementedError - map_f = lambda values, f: values.map(f) - else: - values = self.asobject - - if na_action == 'ignore': - def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) - else: - map_f = lib.map_infer - - if isinstance(arg, dict): - if hasattr(arg, '__missing__'): - # If a dictionary subclass defines a default value method, - # convert arg to a lookup function (GH #15999). 
- dict_with_default = arg - arg = lambda x: dict_with_default[x] - else: - # Dictionary does not have a default. Thus it's safe to - # convert to an indexed series for efficiency. - arg = self._constructor(arg, index=arg.keys()) - - if isinstance(arg, Series): - # arg is a Series - indexer = arg.index.get_indexer(values) - new_values = algorithms.take_1d(arg._values, indexer) - else: - # arg is a function - new_values = map_f(values, arg) - + new_values = super(Series, self)._map_values( + arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) @@ -2567,7 +2553,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): if is_extension_type(self.dtype): mapped = self._values.map(f) else: - values = self.asobject + values = self.astype(object).values mapped = lib.map_infer(values, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): @@ -3143,7 +3129,7 @@ def _sanitize_index(data, index, copy=False): if isinstance(data, ABCIndexClass) and not copy: pass elif isinstance(data, PeriodIndex): - data = data.asobject + data = data.astype(object).values elif isinstance(data, DatetimeIndex): data = data._to_embed(keep_tz=True) elif isinstance(data, np.ndarray): @@ -3248,21 +3234,6 @@ def _try_cast(arr, take_fast_path): else: subarr = _try_cast(data, False) - def create_from_value(value, index, dtype): - # return a new empty value suitable for the dtype - - if is_datetimetz(dtype): - subarr = DatetimeIndex([value] * len(index), dtype=dtype) - elif is_categorical_dtype(dtype): - subarr = Categorical([value] * len(index)) - else: - if not isinstance(dtype, (np.dtype, type(np.dtype))): - dtype = dtype.dtype - subarr = np.empty(len(index), dtype=dtype) - subarr.fill(value) - - return subarr - # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover @@ -3277,7 +3248,8 @@ def create_from_value(value, index, dtype): # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) - subarr = create_from_value(value, index, dtype) + subarr = construct_1d_arraylike_from_scalar( + value, len(index), dtype) else: return subarr.item() @@ -3288,8 +3260,8 @@ def create_from_value(value, index, dtype): # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: - subarr = create_from_value(subarr[0], index, - subarr.dtype) + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): diff --git a/pandas/core/sparse/api.py b/pandas/core/sparse/api.py index f79bb4886da4b..85941e6923338 100644 --- a/pandas/core/sparse/api.py +++ b/pandas/core/sparse/api.py @@ -1,6 +1,5 @@ # pylint: disable=W0611 # flake8: noqa from pandas.core.sparse.array import SparseArray -from pandas.core.sparse.list import SparseList from pandas.core.sparse.series import SparseSeries from pandas.core.sparse.frame import SparseDataFrame diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 0c9a55e0c9acd..36a18d8f8b4a0 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -131,8 +131,7 @@ def _init_dict(self, data, index, columns, dtype=None): # pre-filter out columns if we passed it if columns is not None: columns = _ensure_index(columns) - data = dict((k, v) for k, v in compat.iteritems(data) - if k in columns) + data = {k: v for k, v in compat.iteritems(data) if k in columns} else: columns = Index(_try_sort(list(data.keys()))) @@ -173,7 +172,7 @@ def _init_matrix(self, data, 
index, columns, dtype=None): """ Init self from ndarray or list of lists """ data = _prep_ndarray(data, copy=False) index, columns = self._prep_index(data, index, columns) - data = dict((idx, data[:, i]) for i, idx in enumerate(columns)) + data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) def _init_spmatrix(self, data, index, columns, dtype=None, @@ -307,7 +306,7 @@ def to_dense(self): ------- df : DataFrame """ - data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) + data = {k: v.to_dense() for k, v in compat.iteritems(self)} return DataFrame(data, index=self.index, columns=self.columns) def _apply_columns(self, func): @@ -697,7 +696,7 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, raise NotImplementedError("'method' argument is not supported") # TODO: fill value handling - sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns) + sdict = {k: v for k, v in compat.iteritems(self) if k in columns} return self._constructor( sdict, index=self.index, columns=columns, default_fill_value=self._default_fill_value).__finalize__(self) diff --git a/pandas/core/sparse/list.py b/pandas/core/sparse/list.py deleted file mode 100644 index f3e64b7efc764..0000000000000 --- a/pandas/core/sparse/list.py +++ /dev/null @@ -1,152 +0,0 @@ -import warnings -import numpy as np -from pandas.core.base import PandasObject -from pandas.io.formats.printing import pprint_thing - -from pandas.core.dtypes.common import is_scalar -from pandas.core.sparse.array import SparseArray -from pandas.util._validators import validate_bool_kwarg -import pandas._libs.sparse as splib - - -class SparseList(PandasObject): - - """ - Data structure for accumulating data to be converted into a - SparseArray. 
Has similar API to the standard Python list - - Parameters - ---------- - data : scalar or array-like - fill_value : scalar, default NaN - """ - - def __init__(self, data=None, fill_value=np.nan): - - # see gh-13784 - warnings.warn("SparseList is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) - - self.fill_value = fill_value - self._chunks = [] - - if data is not None: - self.append(data) - - def __unicode__(self): - contents = '\n'.join(repr(c) for c in self._chunks) - return '{self}\n{contents}'.format(self=object.__repr__(self), - contents=pprint_thing(contents)) - - def __len__(self): - return sum(len(c) for c in self._chunks) - - def __getitem__(self, i): - if i < 0: - if i + len(self) < 0: # pragma: no cover - raise ValueError('{index} out of range'.format(index=i)) - i += len(self) - - passed = 0 - j = 0 - while i >= passed + len(self._chunks[j]): - passed += len(self._chunks[j]) - j += 1 - return self._chunks[j][i - passed] - - def __setitem__(self, i, value): - raise NotImplementedError - - @property - def nchunks(self): - return len(self._chunks) - - @property - def is_consolidated(self): - return self.nchunks == 1 - - def consolidate(self, inplace=True): - """ - Internally consolidate chunks of data - - Parameters - ---------- - inplace : boolean, default True - Modify the calling object instead of constructing a new one - - Returns - ------- - splist : SparseList - If inplace=False, new object, otherwise reference to existing - object - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if not inplace: - result = self.copy() - else: - result = self - - if result.is_consolidated: - return result - - result._consolidate_inplace() - return result - - def _consolidate_inplace(self): - new_values = np.concatenate([c.sp_values for c in self._chunks]) - new_index = _concat_sparse_indexes([c.sp_index for c in self._chunks]) - new_arr = SparseArray(new_values, sparse_index=new_index, - fill_value=self.fill_value) - self._chunks = [new_arr] - - def copy(self): - """ - Return copy of the list - - Returns - ------- - new_list : SparseList - """ - new_splist = SparseList(fill_value=self.fill_value) - new_splist._chunks = list(self._chunks) - return new_splist - - def to_array(self): - """ - Return SparseArray from data stored in the SparseList - - Returns - ------- - sparr : SparseArray - """ - self.consolidate(inplace=True) - return self._chunks[0] - - def append(self, value): - """ - Append element or array-like chunk of data to the SparseList - - Parameters - ---------- - value: scalar or array-like - """ - if is_scalar(value): - value = [value] - - sparr = SparseArray(value, fill_value=self.fill_value) - self._chunks.append(sparr) - self._consolidated = False - - -def _concat_sparse_indexes(indexes): - all_indices = [] - total_length = 0 - - for index in indexes: - # increment by offset - inds = index.to_int_index().indices + total_length - - all_indices.append(inds) - total_length += index.length - - return splib.IntIndex(total_length, np.concatenate(all_indices)) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index abef6f6086dbd..9614641aa1abf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1423,6 +1423,10 @@ def cons_row(x): return [x] result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [x * max_len if x[0] is np.nan else x for x in result] if not isinstance(expand, bool): raise
ValueError("expand must be True or False") diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 65f4704da3800..4245b9eb641ba 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -114,7 +114,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series - .. versionadded: 0.18.1 + .. versionadded:: 0.18.1 or DataFrame/dict-like @@ -140,7 +140,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). - .. versionadded: 0.16.1 + .. versionadded:: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware @@ -178,13 +178,13 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If Timestamp convertible, origin is set to Timestamp identified by origin. - .. versionadded: 0.20.0 + .. versionadded:: 0.20.0 cache : boolean, default False If True, use a cache of unique, converted dates to apply the datetime conversion. May produce sigificant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded: 0.22.0 + .. versionadded:: 0.22.0 Returns ------- @@ -629,8 +629,6 @@ def calc_with_mask(carg, mask): return None -normalize_date = tslib.normalize_date - # Fixed time formats for time parsing _time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] diff --git a/pandas/core/window.py b/pandas/core/window.py index 5143dddc5e866..345f9b035a36b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -503,6 +503,9 @@ class Window(_Window): * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. """ def validate(self): diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 4066a3be5e850..37d398f20ef41 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -18,7 +18,8 @@ On Linux, install xclip or xsel via package manager. For example, in Debian: sudo apt-get install xclip -Otherwise on Linux, you will need the gtk or PyQt4 modules installed. +Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed. +qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2 gtk and PyQt4 modules are not available for Python 3, and this module does not work with PyGObject yet. @@ -34,9 +35,9 @@ init_klipper_clipboard, init_no_clipboard) from .windows import init_windows_clipboard -# `import PyQt4` sys.exit()s if DISPLAY is not in the environment. +# `import qtpy` sys.exit()s if DISPLAY is not in the environment. # Thus, we need to detect the presence of $DISPLAY manually -# and not load PyQt4 if it is absent. +# and not load qtpy if it is absent. 
HAS_DISPLAY = os.getenv("DISPLAY", False) CHECK_CMD = "where" if platform.system() == "Windows" else "which" @@ -68,9 +69,23 @@ def determine_clipboard(): return init_gtk_clipboard() try: - # Check if PyQt4 is installed - import PyQt4 # noqa + # qtpy is a small abstraction layer that lets you write + # applications using a single api call to either PyQt or PySide + # https://pypi.python.org/pypi/QtPy + import qtpy # noqa except ImportError: + # If qtpy isn't installed, fall back on importing PyQt5, then PyQt4 + try: + import PyQt5 # noqa + except ImportError: + try: + import PyQt4 # noqa + except ImportError: + pass # fail fast for all non-ImportError exceptions. + else: + return init_qt_clipboard() + else: + return init_qt_clipboard() pass else: return init_qt_clipboard() diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index e32380a383374..285d93e3ca497 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -46,10 +46,19 @@ def paste_gtk(): def init_qt_clipboard(): # $DISPLAY should exist - from PyQt4.QtGui import QApplication - # use the global instance if it exists - app = QApplication.instance() or QApplication([]) + # Try to import from qtpy, but if that fails try PyQt5 then PyQt4 + try: + from qtpy.QtWidgets import QApplication + except ImportError: + try: + from PyQt5.QtWidgets import QApplication + except ImportError: + from PyQt4.QtGui import QApplication + + app = QApplication.instance() + if app is None: + app = QApplication([]) def copy_qt(text): cb = app.clipboard() diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 117c96d00171c..8e9b5497083f6 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -53,7 +53,7 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = set(x.lstrip().count('\t') for x in lines) + counts = {x.lstrip().count('\t') for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = '\t' diff --git a/pandas/io/data.py b/pandas/io/data.py deleted file mode 100644 index e76790a6ab98b..0000000000000 --- a/pandas/io/data.py +++ /dev/null @@ -1,6 +0,0 @@ -raise ImportError( - "The pandas.io.data module is moved to a separate package " - "(pandas-datareader).
After installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/excel.py b/pandas/io/excel.py index fec916dc52d20..882130bedcbf0 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -28,7 +28,6 @@ from pandas.core import config from pandas.io.formats.printing import pprint_thing import pandas.compat as compat -import pandas.compat.openpyxl_compat as openpyxl_compat from warnings import warn from distutils.version import LooseVersion from pandas.util._decorators import Appender, deprecate_kwarg @@ -185,22 +184,6 @@ def _get_default_writer(ext): def get_writer(engine_name): - if engine_name == 'openpyxl': - try: - import openpyxl - - # with version-less openpyxl engine - # make sure we make the intelligent choice for the user - if LooseVersion(openpyxl.__version__) < '2.0.0': - return _writers['openpyxl1'] - elif LooseVersion(openpyxl.__version__) < '2.2.0': - return _writers['openpyxl20'] - else: - return _writers['openpyxl22'] - except ImportError: - # fall through to normal exception handling below - pass - try: return _writers[engine_name] except KeyError: @@ -828,20 +811,15 @@ def close(self): return self.save() -class _Openpyxl1Writer(ExcelWriter): - engine = 'openpyxl1' +class _OpenpyxlWriter(ExcelWriter): + engine = 'openpyxl' supported_extensions = ('.xlsx', '.xlsm') - openpyxl_majorver = 1 def __init__(self, path, engine=None, **engine_kwargs): - if not openpyxl_compat.is_compat(major_ver=self.openpyxl_majorver): - raise ValueError('Installed openpyxl is not supported at this ' - 'time. Use {majorver}.x.y.' - .format(majorver=self.openpyxl_majorver)) # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super(_Openpyxl1Writer, self).__init__(path, **engine_kwargs) + super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs) # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -861,72 +839,6 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): - # Write the frame cells using openpyxl. 
- from openpyxl.cell import get_column_letter - - sheet_name = self._get_sheet_name(sheet_name) - - if sheet_name in self.sheets: - wks = self.sheets[sheet_name] - else: - wks = self.book.create_sheet() - wks.title = sheet_name - self.sheets[sheet_name] = wks - - for cell in cells: - colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks.cell("{col}{row}".format(col=colletter, - row=startrow + cell.row + 1)) - if (isinstance(cell.val, compat.string_types) and - xcell.data_type_for_value(cell.val) != xcell.TYPE_STRING): - xcell.set_value_explicit(cell.val) - else: - xcell.value = _conv_value(cell.val) - style = None - if cell.style: - style = self._convert_to_style(cell.style) - for field in style.__fields__: - xcell.style.__setattr__(field, - style.__getattribute__(field)) - - if isinstance(cell.val, datetime): - xcell.style.number_format.format_code = self.datetime_format - elif isinstance(cell.val, date): - xcell.style.number_format.format_code = self.date_format - - if cell.mergestart is not None and cell.mergeend is not None: - cletterstart = get_column_letter(startcol + cell.col + 1) - cletterend = get_column_letter(startcol + cell.mergeend + 1) - - wks.merge_cells('{start}{row}:{end}{mergestart}' - .format(start=cletterstart, - row=startrow + cell.row + 1, - end=cletterend, - mergestart=startrow + - cell.mergestart + 1)) - - # Excel requires that the format of the first cell in a merged - # range is repeated in the rest of the merged range. - if style: - first_row = startrow + cell.row + 1 - last_row = startrow + cell.mergestart + 1 - first_col = startcol + cell.col + 1 - last_col = startcol + cell.mergeend + 1 - - for row in range(first_row, last_row + 1): - for col in range(first_col, last_col + 1): - if row == first_row and col == first_col: - # Ignore first cell. It is already handled. - continue - colletter = get_column_letter(col) - xcell = wks.cell("{col}{row}" - .format(col=colletter, row=row)) - for field in style.__fields__: - xcell.style.__setattr__( - field, style.__getattribute__(field)) - @classmethod def _convert_to_style(cls, style_dict): """ @@ -948,88 +860,6 @@ def _convert_to_style(cls, style_dict): return xls_style - -register_writer(_Openpyxl1Writer) - - -class _OpenpyxlWriter(_Openpyxl1Writer): - engine = 'openpyxl' - - -register_writer(_OpenpyxlWriter) - - -class _Openpyxl20Writer(_Openpyxl1Writer): - """ - Note: Support for OpenPyxl v2 is currently EXPERIMENTAL (GH7565). - """ - engine = 'openpyxl20' - openpyxl_majorver = 2 - - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): - # Write the frame cells using openpyxl. 
- from openpyxl.cell import get_column_letter - - sheet_name = self._get_sheet_name(sheet_name) - - if sheet_name in self.sheets: - wks = self.sheets[sheet_name] - else: - wks = self.book.create_sheet() - wks.title = sheet_name - self.sheets[sheet_name] = wks - - for cell in cells: - colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks["{col}{row}" - .format(col=colletter, row=startrow + cell.row + 1)] - xcell.value = _conv_value(cell.val) - style_kwargs = {} - - # Apply format codes before cell.style to allow override - if isinstance(cell.val, datetime): - style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format': {'format_code': self.datetime_format}})) - elif isinstance(cell.val, date): - style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format': {'format_code': self.date_format}})) - - if cell.style: - style_kwargs.update(self._convert_to_style_kwargs(cell.style)) - - if style_kwargs: - xcell.style = xcell.style.copy(**style_kwargs) - - if cell.mergestart is not None and cell.mergeend is not None: - cletterstart = get_column_letter(startcol + cell.col + 1) - cletterend = get_column_letter(startcol + cell.mergeend + 1) - - wks.merge_cells('{start}{row}:{end}{mergestart}' - .format(start=cletterstart, - row=startrow + cell.row + 1, - end=cletterend, - mergestart=startrow + - cell.mergestart + 1)) - - # Excel requires that the format of the first cell in a merged - # range is repeated in the rest of the merged range. - if style_kwargs: - first_row = startrow + cell.row + 1 - last_row = startrow + cell.mergestart + 1 - first_col = startcol + cell.col + 1 - last_col = startcol + cell.mergeend + 1 - - for row in range(first_row, last_row + 1): - for col in range(first_col, last_col + 1): - if row == first_row and col == first_col: - # Ignore first cell. It is already handled. - continue - colletter = get_column_letter(col) - xcell = wks["{col}{row}" - .format(col=colletter, row=row)] - xcell.style = xcell.style.copy(**style_kwargs) - @classmethod def _convert_to_style_kwargs(cls, style_dict): """ @@ -1341,13 +1171,7 @@ def _convert_to_number_format(cls, number_format_dict): ------- number_format : str """ - try: - # >= 2.0.0 < 2.1.0 - from openpyxl.styles import NumberFormat - return NumberFormat(**number_format_dict) - except: - # >= 2.1.0 - return number_format_dict['format_code'] + return number_format_dict['format_code'] @classmethod def _convert_to_protection(cls, protection_dict): @@ -1367,17 +1191,6 @@ def _convert_to_protection(cls, protection_dict): return Protection(**protection_dict) - -register_writer(_Openpyxl20Writer) - - -class _Openpyxl22Writer(_Openpyxl20Writer): - """ - Note: Support for OpenPyxl v2.2 is currently EXPERIMENTAL (GH7565). - """ - engine = 'openpyxl22' - openpyxl_majorver = 2 - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None): # Write the frame cells using openpyxl. 
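With the version-sniffing ``openpyxl1``/``openpyxl20``/``openpyxl22`` engines removed above, the ``'openpyxl'`` engine name resolves directly to the single ``_OpenpyxlWriter``. A minimal usage sketch under that assumption (the file name is illustrative):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

# 'openpyxl' now maps straight to _OpenpyxlWriter via register_writer;
# no openpyxl version check happens at engine lookup time any more
writer = pd.ExcelWriter('example.xlsx', engine='openpyxl')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()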
@@ -1443,7 +1256,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, setattr(xcell, k, v) -register_writer(_Openpyxl22Writer) +register_writer(_OpenpyxlWriter) class _XlwtWriter(ExcelWriter): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index e116635c99264..8f25eb3af70cd 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -2231,7 +2231,7 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self): """ we by definition have a TZ """ - values = self.values.asobject + values = self.values.astype(object) is_dates_only = _is_dates_only(values) formatter = (self.formatter or _get_format_datetime64(is_dates_only, diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 12e52123064e2..b452b0cf5ddd4 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -29,9 +29,8 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, The main method a user calls to execute a Query in Google BigQuery and read results into a pandas DataFrame. - Google BigQuery API Client Library v2 for Python is used. - Documentation is available `here - `__ + This function requires the `pandas-gbq package + <https://pandas-gbq.readthedocs.io>`__. Authentication to the Google BigQuery service is via OAuth 2.0. @@ -70,7 +69,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. - 'standard' : Use BigQuery's standard SQL (beta), which is + 'standard' : Use BigQuery's standard SQL, which is compliant with the SQL 2011 standard. For more information see `BigQuery SQL Reference `__ diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 32bab09a0c4ac..21736673350d8 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -5,7 +5,7 @@ import pandas._libs.json as json from pandas._libs.tslib import iNaT -from pandas.compat import StringIO, long, u +from pandas.compat import StringIO, long, u, to_str from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, @@ -458,8 +458,10 @@ def read(self): if self.lines and self.chunksize: obj = concat(self) elif self.lines: + + data = to_str(self.data) obj = self._get_object_parser( - self._combine_lines(self.data.split('\n')) + self._combine_lines(data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: dtype = np.dtype(dtype) return data.astype(dtype), True - except: + except (TypeError, ValueError): return data, False if convert_dates: @@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass if data.dtype.kind == 'f': @@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass # don't coerce 0-len data @@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, if (new_data == data).all(): data = new_data result = True - except: + except (TypeError, ValueError): pass # coerce ints to 64 @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('int64') result = True - except: + except (TypeError, ValueError): pass return data, result @@ -680,7 +682,7 @@ def _try_convert_to_date(self, data): if new_data.dtype ==
'object': try: new_data = data.astype('int64') - except: + except (TypeError, ValueError): pass # ignore numbers that are out of range @@ -697,7 +699,7 @@ def _try_convert_to_date(self, data): unit=date_unit) except ValueError: continue - except: + except Exception: break return new_data, True return data, False @@ -715,10 +717,8 @@ def _parse_no_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + decoded = {str(k): v for k, v in compat.iteritems( + loads(json, precise_float=self.precise_float))} self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: @@ -732,7 +732,7 @@ def _parse_numpy(self): if orient == "split": decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + decoded = {str(k): v for k, v in compat.iteritems(decoded)} self.check_keys_split(decoded) self.obj = Series(**decoded) elif orient == "columns" or orient == "index": @@ -770,7 +770,7 @@ def _parse_numpy(self): elif orient == "split": decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + decoded = {str(k): v for k, v in compat.iteritems(decoded)} self.check_keys_split(decoded) self.obj = DataFrame(**decoded) elif orient == "values": @@ -790,10 +790,8 @@ def _parse_no_numpy(self): self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) elif orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + decoded = {str(k): v for k, v in compat.iteritems( + loads(json, precise_float=self.precise_float))} self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index d062e4f2830ff..595031b04e367 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -181,7 +181,7 @@ def _pull_field(js, spec): return result - if isinstance(data, list) and len(data) is 0: + if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob @@ -207,9 +207,7 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - for i, x in enumerate(meta): - if not isinstance(x, list): - meta[i] = [x] + meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index f6383b42d4975..c81069c8e04c0 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -8,6 +8,7 @@ from libc.limits cimport * from pandas.io.msgpack.exceptions import PackValueError from pandas.io.msgpack import ExtType +import numpy as np cdef extern from "../../src/msgpack/pack.h": @@ -133,7 +134,7 @@ cdef class Packer(object): while True: if o is None: ret = msgpack_pack_nil(&self.pk) - elif isinstance(o, bool): + elif isinstance(o, (bool, np.bool_)): if o: ret = msgpack_pack_true(&self.pk) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 558a1f6d76868..83b1d8ec1a070 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -74,15 +74,19 @@ .. versionadded:: 0.18.1 support for the Python parser. header : int or list of ints, default 'infer' - Row number(s) to use as the column names, and the start of the data. 
- Default behavior is as if set to 0 if no ``names`` passed, otherwise - ``None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of integers that specify row locations for - a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not - specified will be skipped (e.g. 2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file; if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so header=0 denotes the first line of + data rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you should explicitly pass header=None. Duplicates in this list will cause @@ -1133,8 +1137,39 @@ def _evaluate_usecols(usecols, names): If not a callable, returns 'usecols'. """ if callable(usecols): - return set(i for i, name in enumerate(names) - if usecols(name)) + return {i for i, name in enumerate(names) if usecols(name)} + return usecols + + +def _validate_usecols_names(usecols, names): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. Error message will list them.
+ """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: {missing}".format(missing=missing) + ) + return usecols @@ -1750,14 +1785,14 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.orig_names) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.names) self._set_noconvert_columns() @@ -1906,7 +1941,7 @@ def read(self, nrows=None): # rename dict keys data = sorted(data.items()) - data = dict((k, v) for k, (i, v) in zip(names, data)) + data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -1924,7 +1959,7 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - data = dict((k, v) for k, (i, v) in zip(names, data)) + data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) index, names = self._make_index(data, alldata, names) @@ -2300,7 +2335,7 @@ def _exclude_implicit_index(self, alldata): offset += 1 data[col] = alldata[i + offset] else: - data = dict((k, v) for k, v in zip(names, alldata)) + data = {k: v for k, v in zip(names, alldata)} return data @@ -2529,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key): raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] + for col in self.usecols: if isinstance(col, string_types): - col_indices.append(usecols_key.index(col)) + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + _validate_usecols_names(self.usecols, usecols_key) else: col_indices.append(col) else: @@ -3233,9 +3272,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): for i, n in enumerate(index_col): columns.pop(n - i) - col_dict = dict((col_name, - Series([], dtype=dtype[col_name])) - for col_name in columns) + col_dict = {col_name: Series([], dtype=dtype[col_name]) + for col_name in columns} return index, columns, col_dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b9cddce55c096..2a66aea88f6d9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -815,7 +815,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, "all tables must have exactly the same nrows!") # axis is the concentation axes - axis = list(set(t.non_index_axes[0][0] for t in tbls))[0] + axis = list({t.non_index_axes[0][0] for t in tbls})[0] def func(_start, _stop, _where): @@ -2374,8 +2374,7 @@ class GenericFixed(Fixed): """ a generified fixed version """ _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} - _reverse_index_map = dict((v, k) - for k, v in compat.iteritems(_index_type_map)) + _reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)} attributes = [] # indexer helpders @@ -3510,8 +3509,8 @@ def get_blk_items(mgr, blocks): # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict((tuple(b_items.tolist()), (b, b_items)) - for b, b_items in zip(blocks, blk_items)) + by_items = {tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items)} new_blocks = [] 
new_blk_items = [] for ea in existing_table.values_axes: @@ -3659,7 +3658,7 @@ def create_description(self, complib=None, complevel=None, d = dict(name='table', expectedrows=expectedrows) # description from the axes & values - d['description'] = dict((a.cname, a.typ) for a in self.axes) + d['description'] = {a.cname: a.typ for a in self.axes} if complib: if complevel is None: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 401a9c11a774d..975ad1e4ff368 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -641,7 +641,7 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter] + data = [{k: v for k, v in zip(keys, row)} for row in data_iter] conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): diff --git a/pandas/io/wb.py b/pandas/io/wb.py deleted file mode 100644 index 5dc4d9ce1adc4..0000000000000 --- a/pandas/io/wb.py +++ /dev/null @@ -1,6 +0,0 @@ -raise ImportError( - "The pandas.io.wb module is moved to a separate package " - "(pandas-datareader). After installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.") diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 0f06d87726905..2ced5f653825d 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -363,7 +363,8 @@ def __call__(self): tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) - all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).asobject + all_dates = date_range(start=st, end=ed, + freq=freq, tz=tz).astype(object) try: if len(all_dates) > 0: @@ -994,7 +995,7 @@ def _set_default_format(self, vmin, vmax): info) else: format = np.compress(info['maj'], info) - self.formatdict = dict((x, f) for (x, _, _, f) in format) + self.formatdict = {x: f for (x, _, _, f) in format} return self.formatdict def set_locs(self, locs): diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index fad455d6391c3..e47f1919faaf5 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -52,7 +52,7 @@ class TestPDApi(Base): # these are already deprecated; awaiting removal deprecated_classes = ['WidePanel', 'Panel4D', 'TimeGrouper', - 'SparseList', 'Expr', 'Term'] + 'Expr', 'Term'] # these should be deprecated in the future deprecated_classes_in_future = ['Panel'] @@ -103,7 +103,7 @@ class TestPDApi(Base): 'rolling_kurt', 'rolling_max', 'rolling_mean', 'rolling_median', 'rolling_min', 'rolling_quantile', 'rolling_skew', 'rolling_std', 'rolling_sum', - 'rolling_var', 'rolling_window', 'ordered_merge', + 'rolling_var', 'rolling_window', 'pnow', 'match', 'groupby', 'get_store', 'plot_params', 'scatter_matrix'] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index a96dd3c232636..ef12416ef4e1c 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -419,6 +419,10 @@ def test_mixed_dtypes_remain_object_array(self): class TestTypeInference(object): + # Dummy class used for testing with Python objects + class Dummy(): + pass + def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) assert result == 'integer' @@ -655,6 +659,72 @@ def test_infer_dtype_period(self): dtype=object) assert 
lib.infer_dtype(arr) == 'mixed' + @pytest.mark.parametrize( + "data", + [ + [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)], + [Timestamp("20170612"), Timestamp("20170311")], + [Timestamp("20170612", tz='US/Eastern'), + Timestamp("20170311", tz='US/Eastern')], + [date(2017, 6, 12), + Timestamp("20170311", tz='US/Eastern')], + [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], + [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)] + ] + ) + def test_infer_datetimelike_array_datetime(self, data): + assert lib.infer_datetimelike_array(data) == "datetime" + + @pytest.mark.parametrize( + "data", + [ + [timedelta(2017, 6, 12), timedelta(2017, 3, 11)], + [timedelta(2017, 6, 12), date(2017, 3, 11)], + [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], + [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)] + ] + ) + def test_infer_datetimelike_array_timedelta(self, data): + assert lib.infer_datetimelike_array(data) == "timedelta" + + def test_infer_datetimelike_array_date(self): + arr = [date(2017, 6, 12), date(2017, 3, 11)] + assert lib.infer_datetimelike_array(arr) == "date" + + @pytest.mark.parametrize( + "data", + [ + ["2017-06-12", "2017-03-11"], + [20170612, 20170311], + [20170612.5, 20170311.8], + [Dummy(), Dummy()], + [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')], + [Timestamp("20170612"), 20170311], + [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')] + ] + ) + def test_infer_datetimelike_array_mixed(self, data): + assert lib.infer_datetimelike_array(data) == "mixed" + + @pytest.mark.parametrize( + "first, expected", + [ + [[None], "mixed"], + [[np.nan], "mixed"], + [[pd.NaT], "nat"], + [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"], + [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], + [[date(2017, 6, 12), pd.NaT], "date"], + [[timedelta(2017, 6, 12), pd.NaT], "timedelta"], + [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"] + ] + ) + @pytest.mark.parametrize("second", [None, np.nan]) + def test_infer_datetimelike_array_nan_nat_like(self, first, second, + expected): + first.append(second) + assert lib.infer_datetimelike_array(first) == expected + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) assert lib.infer_dtype(arr) == 'floating' diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 3786facdd4ebd..c85fea3c3d71b 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -10,8 +10,8 @@ _frame = pd.DataFrame(_seriesd) _frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) -_intframe = pd.DataFrame(dict((k, v.astype(int)) - for k, v in compat.iteritems(_seriesd))) +_intframe = pd.DataFrame({k: v.astype(int) + for k, v in compat.iteritems(_seriesd)}) _tsframe = pd.DataFrame(_tsd) @@ -32,8 +32,7 @@ def frame2(self): @cache_readonly def intframe(self): # force these all to int64 to avoid platform testing issues - return pd.DataFrame(dict((c, s) for c, s in - compat.iteritems(_intframe)), + return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)}, dtype=np.int64) @cache_readonly @@ -112,7 +111,7 @@ def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 dtypes = dict(A='float32', B='float32', C='float16', D='float64') if isinstance(dtype, compat.string_types): - dtypes = dict((k, dtype) for k, v in dtypes.items()) + dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): @@ -128,7 +127,7 @@ def _check_mixed_float(df, dtype=None): 
def _check_mixed_int(df, dtype=None): dtypes = dict(A='int32', B='uint64', C='uint8', D='int64') if isinstance(dtype, compat.string_types): - dtypes = dict((k, dtype) for k, v in dtypes.items()) + dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index c50aa858a15b5..0b562269ea29d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -234,7 +234,7 @@ def test_itertuples(self): if sys.version >= LooseVersion('2.7'): assert tup2._fields == ('Index', '_1', '_2') - df3 = DataFrame(dict(('f' + str(i), [i]) for i in range(1024))) + df3 = DataFrame({'f' + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) assert not hasattr(tup3, '_fields') @@ -243,31 +243,31 @@ def test_itertuples(self): def test_len(self): assert len(self.frame) == len(self.frame.index) - def test_as_matrix(self): + def test_values(self): frame = self.frame - mat = frame.as_matrix() + arr = frame.values - frameCols = frame.columns - for i, row in enumerate(mat): + frame_cols = frame.columns + for i, row in enumerate(arr): for j, value in enumerate(row): - col = frameCols[j] + col = frame_cols[j] if np.isnan(value): assert np.isnan(frame[col][i]) else: assert value == frame[col][i] # mixed type - mat = self.mixed_frame.as_matrix(['foo', 'A']) - assert mat[0, 0] == 'bar' + arr = self.mixed_frame[['foo', 'A']].values + assert arr[0, 0] == 'bar' df = self.klass({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) - mat = df.as_matrix() - assert mat[0, 0] == 1j + arr = df.values + assert arr[0, 0] == 1j # single block corner case - mat = self.frame.as_matrix(['A', 'B']) + arr = self.frame[['A', 'B']].values expected = self.frame.reindex(columns=['A', 'B']).values - assert_almost_equal(mat, expected) + assert_almost_equal(arr, expected) def test_transpose(self): frame = self.frame @@ -311,8 +311,8 @@ def test_class_axis(self): DataFrame.index # no exception! DataFrame.columns # no exception! - def test_more_asMatrix(self): - values = self.mixed_frame.as_matrix() + def test_more_values(self): + values = self.mixed_frame.values assert values.shape[1] == len(self.mixed_frame.columns) def test_repr_with_mi_nat(self): @@ -369,6 +369,13 @@ def test_values(self): self.frame.values[:, 0] = 5. assert (self.frame.values[:, 0] == 5).all() + def test_as_matrix_deprecated(self): + # GH18458 + with tm.assert_produces_warning(FutureWarning): + result = self.frame.as_matrix(columns=self.frame.columns.tolist()) + expected = self.frame.values + tm.assert_numpy_array_equal(result, expected) + def test_deepcopy(self): cp = deepcopy(self.frame) series = cp['A'] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c29821ba51284..8b1fd7d50cb4d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -67,10 +67,10 @@ def test_consolidate_inplace(self): for letter in range(ord('A'), ord('Z')): self.frame[chr(letter)] = chr(letter) - def test_as_matrix_consolidate(self): + def test_values_consolidate(self): self.frame['E'] = 7. 
assert not self.frame._data.is_consolidated() - _ = self.frame.as_matrix() # noqa + _ = self.frame.values # noqa assert self.frame._data.is_consolidated() def test_modify_values(self): @@ -91,50 +91,50 @@ def test_boolean_set_uncons(self): self.frame[self.frame > 1] = 2 assert_almost_equal(expected, self.frame.values) - def test_as_matrix_numeric_cols(self): + def test_values_numeric_cols(self): self.frame['foo'] = 'bar' - values = self.frame.as_matrix(['A', 'B', 'C', 'D']) + values = self.frame[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - def test_as_matrix_lcd(self): + def test_values_lcd(self): # mixed lcd - values = self.mixed_float.as_matrix(['A', 'B', 'C', 'D']) + values = self.mixed_float[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - values = self.mixed_float.as_matrix(['A', 'B', 'C']) + values = self.mixed_float[['A', 'B', 'C']].values assert values.dtype == np.float32 - values = self.mixed_float.as_matrix(['C']) + values = self.mixed_float[['C']].values assert values.dtype == np.float16 # GH 10364 # B uint64 forces float because there are other signed int types - values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) + values = self.mixed_int[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - values = self.mixed_int.as_matrix(['A', 'D']) + values = self.mixed_int[['A', 'D']].values assert values.dtype == np.int64 # B uint64 forces float because there are other signed int types - values = self.mixed_int.as_matrix(['A', 'B', 'C']) + values = self.mixed_int[['A', 'B', 'C']].values assert values.dtype == np.float64 # as B and C are both unsigned, no forcing to float is needed - values = self.mixed_int.as_matrix(['B', 'C']) + values = self.mixed_int[['B', 'C']].values assert values.dtype == np.uint64 - values = self.mixed_int.as_matrix(['A', 'C']) + values = self.mixed_int[['A', 'C']].values assert values.dtype == np.int32 - values = self.mixed_int.as_matrix(['C', 'D']) + values = self.mixed_int[['C', 'D']].values assert values.dtype == np.int64 - values = self.mixed_int.as_matrix(['A']) + values = self.mixed_int[['A']].values assert values.dtype == np.int32 - values = self.mixed_int.as_matrix(['C']) + values = self.mixed_int[['C']].values assert values.dtype == np.uint8 def test_constructor_with_convert(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6ca90d715cb0b..876e0ea7ea0b3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -120,7 +120,7 @@ def _make_mixed_dtypes_df(typ, ad=None): assert(a.dtype == d) if ad is None: ad = dict() - ad.update(dict((d, a) for d, a in zipper)) + ad.update({d: a for d, a in zipper}) return DataFrame(ad) def _check_mixed_dtypes(df, dtypes=None): @@ -268,13 +268,14 @@ def test_constructor_dict(self): # GH10856 # dict with scalar values should raise error, even if columns passed - with pytest.raises(ValueError): + msg = 'If using all scalar values, you must pass an index' + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}) - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['a']) - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['b']) def test_constructor_multi_index(self): @@ -349,8 +350,8 @@ def test_constructor_subclass_dict(self): data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), 'col2': tm.TestSubDict((x, 20.0 * x) for x in 
range(10))} df = DataFrame(data) - refdf = DataFrame(dict((col, dict(compat.iteritems(val))) - for col, val in compat.iteritems(data))) + refdf = DataFrame({col: dict(compat.iteritems(val)) + for col, val in compat.iteritems(data)}) tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(compat.iteritems(data)) @@ -413,8 +414,7 @@ def test_constructor_dict_of_tuples(self): data = {'a': (1, 2, 3), 'b': (4, 5, 6)} result = DataFrame(data) - expected = DataFrame(dict((k, list(v)) - for k, v in compat.iteritems(data))) + expected = DataFrame({k: list(v) for k, v in compat.iteritems(data)}) tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_multiindex(self): @@ -447,8 +447,8 @@ def test_constructor_dict_datetime64_index(self): dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] def create_data(constructor): - return dict((i, {constructor(s): 2 * i}) - for i, s in enumerate(dates_as_str)) + return {i: {constructor(s): 2 * i} + for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) @@ -472,8 +472,8 @@ def test_constructor_dict_timedelta64_index(self): td_as_int = [1, 2, 3, 4] def create_data(constructor): - return dict((i, {constructor(s): 2 * i}) - for i, s in enumerate(td_as_int)) + return {i: {constructor(s): 2 * i} + for i, s in enumerate(td_as_int)} data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D')) data_timedelta = create_data(lambda x: timedelta(days=x)) @@ -501,8 +501,8 @@ def test_constructor_period(self): assert df['b'].dtype == 'object' # list of periods - df = pd.DataFrame({'a': a.asobject.tolist(), - 'b': b.asobject.tolist()}) + df = pd.DataFrame({'a': a.astype(object).tolist(), + 'b': b.astype(object).tolist()}) assert df['a'].dtype == 'object' assert df['b'].dtype == 'object' @@ -696,8 +696,8 @@ def test_constructor_mrecarray(self): mrecs = mrecords.fromarrays(data, names=names) # fill the comb - comb = dict((k, v.filled()) if hasattr( - v, 'filled') else (k, v) for k, v in comb) + comb = {k: (v.filled() if hasattr(v, 'filled') else v) + for k, v in comb} expected = DataFrame(comb, columns=names) result = DataFrame(mrecs) @@ -1205,6 +1205,19 @@ def test_constructor_from_items(self): columns=['one', 'two', 'three']) tm.assert_frame_equal(rs, xp) + def test_constructor_from_items_scalars(self): + # GH 17312 + with tm.assert_raises_regex(ValueError, + 'The value in each \(key, value\) ' + 'pair must be an array, Series, or dict'): + DataFrame.from_items([('A', 1), ('B', 4)]) + + with tm.assert_raises_regex(ValueError, + 'The value in each \(key, value\) ' + 'pair must be an array, Series, or dict'): + DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'], + orient='index') + def test_constructor_mix_series_nonseries(self): df = DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])}, columns=['A', 'B']) @@ -1854,8 +1867,8 @@ def test_from_records_dictlike(self): for dtype, b in compat.iteritems(blocks): columns.extend(b.columns) - asdict = dict((x, y) for x, y in compat.iteritems(df)) - asdict2 = dict((x, y.values) for x, y in compat.iteritems(df)) + asdict = {x: y for x, y in compat.iteritems(df)} + asdict2 = {x: y.values for x, y in compat.iteritems(df)} # dict of series & dict of ndarrays (have dtype info) results = [] diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 5bdb76494f4c8..7d2d18db8d41c 100644 --- a/pandas/tests/frame/test_convert_to.py +++ 
b/pandas/tests/frame/test_convert_to.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- +from datetime import datetime + import pytest +import pytz import collections import numpy as np @@ -249,3 +252,18 @@ def test_to_dict_box_scalars(self): result = DataFrame(d).to_dict(orient='records') assert isinstance(result[0]['a'], (int, long)) + + def test_frame_to_dict_tz(self): + # GH 18372: when converting to dict with orient='records', tz-aware + # datetime columns were not converted to the required arrays + data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)] + df = DataFrame(list(data), columns=["d", ]) + + result = df.to_dict(orient='records') + expected = [ + {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)}, + {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 5b903c5a1eaf6..f0a21cde4fbd9 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -439,7 +439,7 @@ def test_columns_with_dups(self): xp.columns = ['A', 'A', 'B'] assert_frame_equal(rs, xp) - def test_as_matrix_duplicates(self): + def test_values_duplicates(self): df = DataFrame([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], columns=['one', 'one', 'two', 'two']) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 04dcea2b9d533..f34d25142a057 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -133,6 +133,30 @@ def test_stack_unstack(self): assert_frame_equal(unstacked_cols.T, df) assert_frame_equal(unstacked_cols_df['bar'].T, df) + def test_stack_mixed_level(self): + # GH 18310 + levels = [range(3), [3, 'a', 'b'], [1, 2]] + + # flat columns: + df = DataFrame(1, index=levels[0], columns=levels[1]) + result = df.stack() + expected = Series(1, index=MultiIndex.from_product(levels[:2])) + assert_series_equal(result, expected) + + # MultiIndex columns: + df = DataFrame(1, index=levels[0], + columns=MultiIndex.from_product(levels[1:])) + result = df.stack(1) + expected = DataFrame(1, index=MultiIndex.from_product([levels[0], + levels[2]]), + columns=levels[1]) + assert_frame_equal(result, expected) + + # as above, but the labels actually used in the level are of + # homogeneous type + result = df[['a', 'b']].stack(1) + expected = expected[['a', 'b']] + assert_frame_equal(result, expected) + def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py new file mode 100644 index 0000000000000..456cb48020500 --- /dev/null +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -0,0 +1,431 @@ +import pytest +import pandas as pd +import pandas.util.testing as tm +from pandas.core.dtypes.missing import array_equivalent + + +# Fixtures +# ======== +@pytest.fixture +def df(): + """DataFrame with columns 'L1', 'L2', and 'L3' """ + return pd.DataFrame({'L1': [1, 2, 3], + 'L2': [11, 12, 13], + 'L3': ['A', 'B', 'C']}) + + +@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture
+def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ + df = df.set_index(['L1', 'L2']) + + df['L1'] = df['L3'] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = df.set_index(['L1']) + df = pd.concat([df, df['L2']], axis=1) + + return df + + +@pytest.fixture +def panel(): + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + return pd.Panel() + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names + if name is not None] + return expected_labels, expected_levels + + +def assert_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def assert_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_or_label_reference_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_level_reference(df_levels, expected_levels, axis=axis) + assert_label_reference(df_levels, expected_labels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_reference_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + assert_label_reference(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is a level reference + assert_level_reference(df_ambig, ['L2'], axis=axis) + + # df has a column named L3 and it is not a level reference + assert_label_reference(df_ambig, ['L3'], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_reference(s, ['L1'], axis=0) + assert not s._is_level_reference('L2') + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_reference(s, ['L1', 'L2'], axis=0) + assert not s._is_level_reference('L3') + + +def test_is_level_reference_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._is_level_reference('L1', axis=1) + + +# Panel +# ----- +def test_is_level_reference_panel_error(panel): + msg = ("_is_level_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_level_reference('L1', axis=0) + + +def test_is_label_reference_panel_error(panel): + msg = ("_is_label_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_reference('L1', axis=0) + + +def test_is_label_or_level_reference_panel_error(panel): + msg =
("_is_label_or_level_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_or_level_reference('L1', axis=0) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous + with tm.assert_produces_warning(FutureWarning, + clear=True, + check_stacklevel=False) as w: + + assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) + warning_msg = w[0].message.args[0] + if axis == 0: + assert warning_msg.startswith("'L1' is both an index level " + "and a column label") + else: + assert warning_msg.startswith("'L1' is both a column level " + "and an index label") + + # df_ambig has an on-axis level named L2 and it is not ambiguous + # No warning should be raised + with tm.assert_produces_warning(None): + assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis) + + # df_ambig has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert not df_ambig._is_level_reference('L3', axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index('L1').L2 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + assert not s._check_label_or_level_ambiguity('L3', axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._check_label_or_level_ambiguity('L1', axis=1) + + +# Panel +# ----- +def test_check_label_or_level_ambiguity_panel_error(panel): + msg = ("_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._check_label_or_level_ambiguity('L1', axis=0) + + +# Test _get_label_or_level_values +# =============================== +def assert_label_values(frame, labels, axis): + for label in labels: + if axis == 0: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def assert_level_values(frame, levels, axis): + for level in levels: + if axis == 0: + expected = frame.index.get_level_values(level=level)._values + else: + expected = (frame.columns + .get_level_values(level=level) + ._values) + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose 
frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous but will default to label + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + assert_label_values(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_level_values(df_ambig, ['L2'], axis=axis) + + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_label_values(df_ambig, ['L3'], axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_duplabels = df_duplabels.T + + # df has unambiguous level 'L1' + assert_level_values(df_duplabels, ['L1'], axis=axis) + + # df has unique label 'L3' + assert_label_values(df_duplabels, ['L3'], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with tm.assert_raises_regex(ValueError, expected_msg): + assert_label_values(df_duplabels, ['L2'], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_values(s, ['L1'], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_values(s, ['L1', 'L2'], axis=0) + + +def test_get_label_or_level_values_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._get_label_or_level_values('L1', axis=1) + + +# Panel +# ----- +def test_get_label_or_level_values_panel_error(panel): + msg = ("_get_label_or_level_values is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._get_label_or_level_values('L1', axis=0) + + +# Test _drop_labels_or_levels +# =========================== +def assert_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def assert_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_drop_labels_or_levels_df(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, 
expected_levels, axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df_levels._drop_labels_or_levels('L4', axis=axis) + + +# Series +# ------ +def test_drop_labels_or_levels_series(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_levels_dropped(s, ['L1'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_levels_dropped(s, ['L1', 'L2'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + +# Panel +# ----- +def test_drop_labels_or_levels_panel_error(panel): + msg = ("_drop_labels_or_levels is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._drop_labels_or_levels('L1', axis=0) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 913d3bcc09869..3d27df31cee6e 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -637,7 +637,7 @@ def test_agg_consistency(self): def P1(a): try: return np.percentile(a.dropna(), q=1) - except: + except Exception: return np.nan import datetime as dt @@ -892,3 +892,36 @@ def test_sum_uint64_overflow(self): expected.index.name = 0 result = df.groupby(0).sum() tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("structure, expected", [ + (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), + (3, 4): (3, 4, 4)}})), + (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], + (3, 4): [3, 4, 4]}})) + ]) + def test_agg_structs_dataframe(self, structure, expected): + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby(['A', 'B']).aggregate(structure) + expected.index.names = ['A', 'B'] + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("structure, expected", [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], + index=[1, 3], name='C')), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], + index=[1, 3], name='C')) + ]) + def test_agg_structs_series(self, structure, expected): + # Issue #18079 + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby('A')['C'].aggregate(structure) + expected.index.name = 'A' + assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py index bc13d51c4f4f6..b9718663570bd 100644 --- a/pandas/tests/groupby/test_functional.py +++ b/pandas/tests/groupby/test_functional.py @@ -52,10 +52,10 @@ def test_frame_describe_multikey(self): desc_groups = [] for col in self.tsframe: group = grouped[col].describe() - group_col = pd.MultiIndex([[col] * len(group.columns), - group.columns], - [[0] * len(group.columns), - range(len(group.columns))]) + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + labels=[[0] * len(group.columns), range(len(group.columns))]) group = pd.DataFrame(group.values, 
columns=group_col, index=group.index) @@ -67,8 +67,9 @@ def test_frame_describe_multikey(self): 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() expected = self.tsframe.describe().T - expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], - [range(4), range(len(expected.index))]) + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + labels=[[0, 0, 1, 1], range(len(expected.index))]) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 91a5569b352e9..3436dd9169081 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -25,6 +25,15 @@ from .common import MixIn +class TestGrouper(object): + + def test_repr(self): + # GH18203 + result = repr(pd.Grouper(key='A', level='B')) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected + + class TestGroupBy(MixIn): def test_basic(self): @@ -257,7 +266,7 @@ def test_len(self): assert len(grouped) == len(df) grouped = df.groupby([lambda x: x.year, lambda x: x.month]) - expected = len(set((x.year, x.month) for x in df.index)) + expected = len({(x.year, x.month) for x in df.index}) assert len(grouped) == expected # issue 11016 diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index cc422f2d1cdeb..8702062e9cd0a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -6,7 +6,7 @@ from warnings import catch_warnings from pandas import (date_range, Timestamp, - Index, MultiIndex, DataFrame, Series) + Index, MultiIndex, DataFrame, Series, CategoricalIndex) from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) from pandas.compat import lrange, long @@ -251,6 +251,29 @@ def test_groupby_levels_and_columns(self): by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) + def test_groupby_categorical_index_and_columns(self): + # GH18432 + columns = ['A', 'B', 'A', 'B'] + categories = ['B', 'A'] + data = np.ones((5, 4), int) + cat_columns = CategoricalIndex(columns, + categories=categories, + ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0).sum() + expected_data = 2 * np.ones((5, 2), int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) + expected = DataFrame(data=expected_data, columns=expected_columns) + assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063 diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 3b6e15036cfe2..cee78eab3a636 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -108,7 +108,7 @@ def test_grouper_column_index_level_precedence(frame, assert_frame_equal(result, expected) - # Grouping with level Grouper should produce a difference result but + # Grouping with level Grouper should produce a different result but # still no warning with tm.assert_produces_warning(False): not_expected = frame.groupby(level_groupers).mean() diff --git 
a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 977c639d79711..de0deb442e516 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -238,7 +238,7 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') - results = set(v for v in dir(grp) if not v.startswith('_')) + results = {v for v in dir(grp) if not v.startswith('_')} expected = { 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 81360bc0c13f9..c1ee18526cc01 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -9,8 +9,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, IntervalIndex, - notna, isna) + TimedeltaIndex, PeriodIndex, IntervalIndex, isna) from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.dtypes.common import needs_i8_conversion @@ -529,31 +528,20 @@ def test_numpy_repeat(self): tm.assert_raises_regex(ValueError, msg, np.repeat, i, rep, axis=0) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + + cond = [True] * len(i) + result = i.where(klass(cond)) expected = i tm.assert_index_equal(result, expected) - _nan = i._na_value cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - result = i.where(cond) + expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input @@ -1017,3 +1005,56 @@ def test_searchsorted_monotonic(self, indices): # non-monotonic should raise. 
with pytest.raises(ValueError): indices._searchsorted_monotonic(value, side='left') + + def test_map(self): + # callable + index = self.create_index() + + # we don't infer UInt64 + if isinstance(index, pd.UInt64Index): + expected = index.astype('int64') + else: + expected = index + + result = index.map(lambda x: x) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): + + index = self.create_index() + if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)): + pytest.skip("skipping tests for {}".format(type(index))) + + identity = mapper(index.values, index) + + # we don't infer to UInt64 for a dict + if isinstance(index, pd.UInt64Index) and isinstance(identity, dict): + expected = index.astype('int64') + else: + expected = index + + result = index.map(identity) + tm.assert_index_equal(result, expected) + + # empty mappable + expected = pd.Index([np.nan] * len(index)) + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) + + def test_putmask_with_wrong_mask(self): + # GH18368 + index = self.create_index() + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) + 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) - 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask('foo', 1) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 12b509d4aef3f..7d01a2a70145d 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,5 +1,7 @@ """ generic datetimelike tests """ - +import pytest +import numpy as np +import pandas as pd from .common import Base import pandas.util.testing as tm @@ -38,3 +40,46 @@ def test_view(self, indices): i_view = i.view(self._holder) result = self._holder(i) tm.assert_index_equal(result, i_view) + + def test_map_callable(self): + + expected = self.index + 1 + result = self.index.map(lambda x: x + 1) + tm.assert_index_equal(result, expected) + + # map to NaT + result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): + expected = self.index + 1 + + # don't compare the freqs + if isinstance(expected, pd.DatetimeIndex): + expected.freq = None + + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) + + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) + + # empty map; these map to np.nan because we cannot know + # to re-infer things + expected = pd.Index([np.nan] * len(self.index)) + result = self.index.map(mapper([], [])) + tm.assert_index_equal(result, expected) + + def test_asobject_deprecated(self): + # GH18572 + d = self.create_index() + with tm.assert_produces_warning(FutureWarning): + i = d.asobject + assert isinstance(i, pd.Index) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index edcee0479827f..826e20b8b0586 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ 
b/pandas/tests/indexes/datetimes/test_date_range.py @@ -290,6 +290,22 @@ def test_precision_finer_than_offset(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + dt1, dt2 = '2017-01-01', '2017-01-01' + tz1, tz2 = 'US/Eastern', 'Europe/London' + + @pytest.mark.parametrize("start,end", [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) + ]) + def test_mismatching_tz_raises_err(self, start, end): + # issue 18488 + with pytest.raises(TypeError): + pd.date_range(start, end) + with pytest.raises(TypeError): + pd.DatetimeIndex(start, end, freq=BDay()) + class TestBusinessDateRange(object): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 4ce9441d87970..b3ce22962d5d4 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -145,6 +145,13 @@ def test_insert(self): assert result.tz == expected.tz assert result.freq is None + # GH 18295 (test missing) + expected = DatetimeIndex( + ['20170101', pd.NaT, '20170102', '20170103', '20170104']) + for na in (np.nan, pd.NaT, None): + result = date_range('20170101', periods=4).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 0db26652eb191..41d0dd38cd5f6 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -51,7 +51,7 @@ def test_ops_properties_basic(self): assert s.day == 10 pytest.raises(AttributeError, lambda: s.weekday) - def test_asobject_tolist(self): + def test_astype_object(self): idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') expected_list = [Timestamp('2013-01-31'), @@ -59,7 +59,7 @@ def test_asobject_tolist(self): Timestamp('2013-03-31'), Timestamp('2013-04-30')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -74,7 +74,7 @@ def test_asobject_tolist(self): Timestamp('2013-03-31', tz='Asia/Tokyo'), Timestamp('2013-04-30', tz='Asia/Tokyo')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -87,7 +87,7 @@ def test_asobject_tolist(self): Timestamp('2013-01-02'), pd.NaT, Timestamp('2013-01-04')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -389,26 +389,27 @@ def test_comp_nat(self): pd.Timestamp('2011-01-03')]) right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = rhs == lhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = lhs != rhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, 
expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -636,9 +637,9 @@ def test_equals(self): idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) @@ -646,8 +647,8 @@ def test_equals(self): tz='US/Pacific') assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not idx.asobject.equals(idx2) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) @@ -656,8 +657,8 @@ def test_equals(self): tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.asobject) - assert not idx.asobject.equals(idx3) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index a1287c3102b77..d03951458f12a 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -16,7 +16,7 @@ from pandas._libs import tslib from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools -from pandas.core.tools.datetimes import normalize_date + from pandas.compat import lmap from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype @@ -1160,9 +1160,9 @@ class TestDatetimeParsingWrappers(object): @pytest.mark.parametrize('cache', [True, False]) def test_parsers(self, cache): + # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 - import dateutil - yearfirst = dateutil.__version__ >= LooseVersion('2.5.0') + yearfirst = True cases = {'2011-01-01': datetime(2011, 1, 1), '2Q2005': datetime(2005, 4, 1), @@ -1576,12 +1576,12 @@ def test_coerce_of_invalid_datetimes(self): def test_normalize_date(): value = date(2012, 9, 7) - result = normalize_date(value) + result = tslib.normalize_date(value) assert (result == datetime(2012, 9, 7)) value = datetime(2012, 9, 7, 12) - result = normalize_date(value) + result = tslib.normalize_date(value) assert (result == datetime(2012, 9, 7)) diff --git a/pandas/tests/indexes/period/test_indexing.py 
b/pandas/tests/indexes/period/test_indexing.py index d20ed66c06ce9..6cb4226dffc5a 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -9,7 +9,7 @@ from pandas._libs import tslib, tslibs from pandas import (PeriodIndex, Series, DatetimeIndex, period_range, Period) -from pandas._libs import period as libperiod +from pandas._libs.tslibs import period as libperiod class TestGetItem(object): diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 1d77de0d2d8f3..a78bc6fc577b8 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -27,7 +27,7 @@ def test_ops_properties(self): self.check_ops_properties(PeriodIndex._object_ops, f) self.check_ops_properties(PeriodIndex._bool_ops, f) - def test_asobject_tolist(self): + def test_astype_object(self): idx = pd.period_range(start='2013-01-01', periods=4, freq='M', name='idx') expected_list = [pd.Period('2013-01-31', freq='M'), @@ -35,7 +35,7 @@ def test_asobject_tolist(self): pd.Period('2013-03-31', freq='M'), pd.Period('2013-04-30', freq='M')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -49,7 +49,7 @@ def test_asobject_tolist(self): pd.Period('NaT', freq='D'), pd.Period('2013-01-04', freq='D')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -290,26 +290,27 @@ def test_comp_nat(self): pd.Period('2011-01-03')]) right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = lhs == rhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = lhs != rhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -614,9 +615,9 @@ def test_equals(self): freq=freq) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) @@ -624,8 +625,8 @@ def test_equals(self): freq='H') assert not 
idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not idx.asobject.equals(idx2) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) @@ -634,8 +635,8 @@ def test_equals(self): tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.asobject) - assert not idx.asobject.equals(idx3) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 7fefcc859d447..48378233dd638 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -24,7 +24,7 @@ def setup_method(self, method): def create_index(self): return period_range('20130101', periods=5, freq='D') - def test_astype(self): + def test_astype_conversion(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -61,27 +61,18 @@ def test_pickle_round_trip(self): result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, Series] - expected = pd.PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = PeriodIndex([NaT] + i[1:].tolist(), freq='D') + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_where_other(self): @@ -389,23 +380,23 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - def test_asobject_like(self): + def test_astype_object(self): idx = pd.PeriodIndex([], freq='M') exp = np.array([], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], dtype=object) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) def test_is_(self): @@ -692,11 +683,9 @@ def test_pickle_freq(self): assert new_prng.freqstr == 'M' def test_map(self): - index = PeriodIndex([2005, 2007, 2009], freq='A') - result = index.map(lambda x: x + 1) - expected = index + 1 - tm.assert_index_equal(result, expected) + # test_map_dictlike 
generally tests dict-like mappers, so only callables are checked here + index = PeriodIndex([2005, 2007, 2009], freq='A') result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) @@ -706,3 +695,11 @@ def test_join_self(self, how): index = period_range('1/1/2000', periods=10) joined = index.join(index, how=how) assert index is joined + + def test_insert(self): + # GH 18295 (test missing) + expected = PeriodIndex( + ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q') + for na in (np.nan, pd.NaT, None): + result = period_range('2017Q1', periods=4, freq='Q').insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 074678164e6f9..3774111f44fb2 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,7 +6,7 @@ import pandas.core.indexes.period as period from pandas.compat import lrange from pandas.tseries.frequencies import get_freq, MONTHS -from pandas._libs.period import period_ordinal, period_asfreq +from pandas._libs.tslibs.period import period_ordinal, period_asfreq from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, date_range, to_datetime, period_range) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c55f53601848c..72b312f29a793 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -4,7 +4,10 @@ from datetime import datetime, timedelta +from collections import defaultdict + import pandas.util.testing as tm +from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.indexes.api import Index, MultiIndex from pandas.tests.indexes.common import Base @@ -14,7 +17,7 @@ import numpy as np from pandas import (period_range, date_range, Series, - DataFrame, Float64Index, Int64Index, + DataFrame, Float64Index, Int64Index, UInt64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, isna) from pandas.core.index import _get_combined_index, _ensure_index_from_sequences @@ -103,6 +106,15 @@ def test_construction_list_mixed_tuples(self): assert isinstance(idx2, Index) assert not isinstance(idx2, MultiIndex) + @pytest.mark.parametrize('na_value', [None, np.nan]) + @pytest.mark.parametrize('vtype', [list, tuple, iter]) + def test_construction_list_tuples_nan(self, na_value, vtype): + # GH 18505 : valid tuples containing NaN + values = [(1, 'two'), (3., na_value)] + result = Index(vtype(values)) + expected = MultiIndex.from_tuples(values) + tm.assert_index_equal(result, expected) + def test_constructor_from_index_datetimetz(self): idx = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') @@ -110,7 +122,7 @@ def test_constructor_from_index_datetimetz(self): tm.assert_index_equal(result, idx) assert result.tz == idx.tz - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) assert result.tz == idx.tz @@ -119,7 +131,7 @@ def test_constructor_from_index_timedelta(self): result = pd.Index(idx) tm.assert_index_equal(result, idx) - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) def test_constructor_from_index_period(self): @@ -127,7 +139,7 @@ def test_constructor_from_index_period(self): result = pd.Index(idx) tm.assert_index_equal(result, idx) - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) def 
test_constructor_from_series_datetimetz(self): @@ -201,6 +213,20 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('dtype', [ + int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32', + 'uint16', 'uint8']) + def test_constructor_int_dtype_float(self, dtype): + # GH 18400 + if is_unsigned_integer_dtype(dtype): + index_type = UInt64Index + else: + index_type = Int64Index + + expected = index_type([0, 1, 2, 3]) + result = Index([0., 1., 2., 3.], dtype=dtype) + tm.assert_index_equal(result, expected) + def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] @@ -442,6 +468,12 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) + # GH 18295 (test missing) + expected = Index(['a', np.nan, 'b', 'c']) + for na in (np.nan, pd.NaT, None): + result = Index(list('abc')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = Index(['a', 'b', 'c', 'd'], name='idx') @@ -591,12 +623,13 @@ def test_empty_fancy(self): # Index. pytest.raises(IndexError, idx.__getitem__, empty_farr) - def test_getitem(self): - arr = np.array(self.dateIndex) - exp = self.dateIndex[5] - exp = _to_m8(exp) + def test_getitem_error(self, indices): - assert exp == arr[5] + with pytest.raises(IndexError): + indices[101] + + with pytest.raises(IndexError): + indices['no_int'] def test_intersection(self): first = self.strIndex[:20] @@ -829,6 +862,61 @@ def test_map_tseries_indices_return_index(self): exp = Index(range(24), name='hourly') tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): + # GH 12756 + expected = Index(['foo', 'bar', 'baz']) + result = tm.makeIntIndex(3).map(mapper(expected.values, [0, 1, 2])) + tm.assert_index_equal(result, expected) + + for name in self.indices.keys(): + if name == 'catIndex': + # Tested in test_categorical + continue + elif name == 'repeats': + # Cannot map duplicated index + continue + + index = self.indices[name] + expected = Index(np.arange(len(index), 0, -1)) + + # to match proper result coercion for uints + if name == 'empty': + expected = Index([]) + + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) + + def test_map_with_non_function_missing_values(self): + # GH 12756 + expected = Index([2., np.nan, 'foo']) + input = Index([2, 1, 0]) + + mapper = Series(['foo', 2., 'baz'], index=[0, 2, -1]) + tm.assert_index_equal(expected, input.map(mapper)) + + mapper = {0: 'foo', 2: 2.0, -1: 'baz'} + tm.assert_index_equal(expected, input.map(mapper)) + + def test_map_na_exclusion(self): + idx = Index([1.5, np.nan, 3, np.nan, 5]) + + result = idx.map(lambda x: x * 2, na_action='ignore') + exp = idx * 2 + tm.assert_index_equal(result, exp) + + def test_map_defaultdict(self): + idx = Index([1, 2, 3]) + default_dict = defaultdict(lambda: 'blank') + default_dict[1] = 'stuff' + result = idx.map(default_dict) + expected = Index(['stuff', 'blank', 'blank']) + tm.assert_index_equal(result, expected) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5e40e06d57413..c2eee4e437347 100644 --- a/pandas/tests/indexes/test_category.py +++ 
b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, IntervalIndex, compat, notna +from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -269,28 +269,37 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) - def test_where(self): + result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) + tm.assert_index_equal(result, exp) + + result = ci.map({'A': 10, 'B': 20, 'C': 30}) + tm.assert_index_equal(result, exp) + + def test_map_with_categorical_series(self): + # GH 12756 + a = pd.Index([1, 2, 3, 4]) + b = pd.Series(["even", "odd", "even", "odd"], + dtype="category") + c = pd.Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = pd.Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), - categories=i.categories) - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.CategoricalIndex([np.nan] + i[1:].tolist(), - categories=i.categories) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), + categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_append(self): @@ -353,6 +362,12 @@ def test_insert(self): # invalid pytest.raises(TypeError, lambda: ci.insert(0, 'd')) + # GH 18295 (test missing) + expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) + for na in (np.nan, pd.NaT, None): + result = CategoricalIndex(list('aabcb')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 399d88309072e..dc06e51c6d8e7 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -3,10 +3,10 @@ import pytest import numpy as np from datetime import timedelta -from pandas import (Interval, IntervalIndex, Index, isna, - interval_range, Timestamp, Timedelta, - compat, date_range, timedelta_range, DateOffset) -from pandas.compat import zip +from pandas import ( + Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp, + Timedelta, compat, date_range, timedelta_range, DateOffset) +from pandas.compat import lzip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -19,6 +19,11 @@ def closed(request): return request.param +@pytest.fixture(scope='class', params=[None, 'foo']) +def name(request): + return request.param + + class TestIntervalIndex(Base): _holder = IntervalIndex @@ -29,16 +34,17 @@ def setup_method(self, method): self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) def create_index(self, closed='right'): - return IntervalIndex.from_breaks(np.arange(3), 
closed=closed) + return IntervalIndex.from_breaks(range(11), closed=closed) def create_index_with_nan(self, closed='right'): - return IntervalIndex.from_tuples( - [(0, 1), np.nan, (1, 2)], closed=closed) + mask = [True, False] + [True] * 8 + return IntervalIndex.from_arrays( + np.where(mask, np.arange(10), np.nan), + np.where(mask, np.arange(1, 11), np.nan), closed=closed) - @pytest.mark.parametrize('name', [None, 'foo']) def test_constructors(self, closed, name): left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4]) - ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)] + ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)] expected = IntervalIndex._simple_new( left=left, right=right, closed=closed, name=name) @@ -57,7 +63,7 @@ def test_constructors(self, closed, name): tm.assert_index_equal(result, expected) result = IntervalIndex.from_tuples( - zip(left, right), closed=closed, name=name) + lzip(left, right), closed=closed, name=name) tm.assert_index_equal(result, expected) result = Index(ivs, name=name) @@ -68,6 +74,9 @@ def test_constructors(self, closed, name): tm.assert_index_equal(Index(expected), expected) tm.assert_index_equal(IntervalIndex(expected), expected) + result = IntervalIndex.from_intervals(expected) + tm.assert_index_equal(result, expected) + result = IntervalIndex.from_intervals( expected.values, name=expected.name) tm.assert_index_equal(result, expected) @@ -86,63 +95,118 @@ def test_constructors(self, closed, name): breaks, closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) - def test_constructors_other(self): - - # all-nan - result = IntervalIndex.from_intervals([np.nan]) - expected = np.array([np.nan], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) - - # empty - result = IntervalIndex.from_intervals([]) - expected = np.array([], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) + @pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50]) + def test_constructors_nan(self, closed, data): + # GH 18421 + expected_values = np.array(data, dtype=object) + expected_idx = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_idx.closed == closed + tm.assert_numpy_array_equal(expected_idx.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks([np.nan] + data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + @pytest.mark.parametrize('data', [ + [], + np.array([], dtype='int64'), + np.array([], dtype='float64'), + np.array([], dtype=object)]) + def test_constructors_empty(self, data, closed): + # GH 18421 + expected_dtype = data.dtype if isinstance(data, np.ndarray) else object + expected_values = np.array([], dtype=object) + expected_index = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_index.empty + assert expected_index.closed == 
closed + assert expected_index.dtype.subtype == expected_dtype + tm.assert_numpy_array_equal(expected_index.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) def test_constructors_errors(self): # scalar - msg = ('IntervalIndex(...) must be called with a collection of ' + msg = ('IntervalIndex\(...\) must be called with a collection of ' 'some kind, 5 was passed') - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex(5) # not an interval - msg = "type with value 0 is not an interval" - with pytest.raises(TypeError, message=msg): + msg = ("type <(class|type) 'numpy.int64'> with value 0 " + "is not an interval") + with tm.assert_raises_regex(TypeError, msg): IntervalIndex([0, 1]) - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex.from_intervals([0, 1]) # invalid closed msg = "invalid options for 'closed': invalid" - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') - # mismatched closed + # mismatched closed within intervals msg = 'intervals must all be closed on the same side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2, closed='left')]) - with pytest.raises(ValueError, message=msg): - IntervalIndex.from_arrays([0, 10], [3, 5]) - - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) + # mismatched closed inferred from intervals vs constructor. 
+ msg = 'conflicting values for closed' + with tm.assert_raises_regex(ValueError, msg): + iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')] + IntervalIndex(iv, closed='neither') + # no point in nesting periods in an IntervalIndex msg = 'Period dtypes are not supported, use a PeriodIndex instead' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks( pd.period_range('2000-01-01', periods=3)) # decreasing breaks/arrays msg = 'left side of interval must be <= right side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks(range(10, -1, -1)) - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) def test_constructors_datetimelike(self, closed): @@ -168,55 +232,67 @@ def f(): def test_properties(self, closed): index = self.create_index(closed=closed) - assert len(index) == 2 - assert index.size == 2 - assert index.shape == (2, ) + assert len(index) == 10 + assert index.size == 10 + assert index.shape == (10, ) - tm.assert_index_equal(index.left, Index([0, 1])) - tm.assert_index_equal(index.right, Index([1, 2])) - tm.assert_index_equal(index.mid, Index([0.5, 1.5])) + tm.assert_index_equal(index.left, Index(np.arange(10))) + tm.assert_index_equal(index.right, Index(np.arange(1, 11))) + tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5))) assert index.closed == closed - expected = np.array([Interval(0, 1, closed=closed), - Interval(1, 2, closed=closed)], dtype=object) + ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))] + expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) # with nans index = self.create_index_with_nan(closed=closed) - assert len(index) == 3 - assert index.size == 3 - assert index.shape == (3, ) + assert len(index) == 10 + assert index.size == 10 + assert index.shape == (10, ) - tm.assert_index_equal(index.left, Index([0, np.nan, 1])) - tm.assert_index_equal(index.right, Index([1, np.nan, 2])) - tm.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) + expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9]) + expected_right = expected_left + 1 + expected_mid = expected_left + 0.5 + tm.assert_index_equal(index.left, expected_left) + tm.assert_index_equal(index.right, expected_right) + tm.assert_index_equal(index.mid, expected_mid) assert index.closed == closed - expected = np.array([Interval(0, 1, closed=closed), np.nan, - Interval(1, 2, closed=closed)], dtype=object) + ivs = [Interval(l, r, closed) if notna(l) else np.nan + for l, r in zip(expected_left, expected_right)] + expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) def test_with_nans(self, closed): index = self.create_index(closed=closed) assert not index.hasnans - tm.assert_numpy_array_equal(index.isna(), - np.array([False, False])) - tm.assert_numpy_array_equal(index.notna(), - np.array([True, True])) + + result = index.isna() + expected = np.repeat(False, len(index)) + tm.assert_numpy_array_equal(result, expected) + + result = index.notna() + expected = np.repeat(True, len(index)) + tm.assert_numpy_array_equal(result, expected) index = self.create_index_with_nan(closed=closed) assert index.hasnans - tm.assert_numpy_array_equal(index.notna(), 
- np.array([True, False, True])) - tm.assert_numpy_array_equal(index.isna(), - np.array([False, True, False])) + + result = index.isna() + expected = np.array([False, True] + [False] * (len(index) - 2)) + tm.assert_numpy_array_equal(result, expected) + + result = index.notna() + expected = np.array([True, False] + [True] * (len(index) - 2)) + tm.assert_numpy_array_equal(result, expected) def test_copy(self, closed): - expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + expected = self.create_index(closed=closed) result = expected.copy() assert result.equals(expected) @@ -290,44 +366,79 @@ def test_astype(self, closed): expected = pd.Categorical(idx, ordered=True) tm.assert_categorical_equal(result, expected) - def test_where(self, closed): - expected = self.create_index(closed=closed) - result = expected.where(expected.notna()) + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def test_where(self, closed, klass): + idx = self.create_index(closed=closed) + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) tm.assert_index_equal(result, expected) - idx = IntervalIndex.from_breaks([1, 2], closed=closed) - result = idx.where([True, False]) - expected = IntervalIndex.from_intervals( - [Interval(1.0, 2.0, closed=closed), np.nan]) + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - pass - def test_delete(self, closed): - expected = IntervalIndex.from_breaks([1, 2], closed=closed) + expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed) result = self.create_index(closed=closed).delete(0) tm.assert_index_equal(result, expected) - def test_insert(self): - expected = IntervalIndex.from_breaks(range(4)) - actual = self.index.insert(2, Interval(2, 3)) - assert expected.equals(actual) + @pytest.mark.parametrize('data', [ + interval_range(0, periods=10, closed='neither'), + interval_range(1.7, periods=8, freq=2.5, closed='both'), + interval_range(Timestamp('20170101'), periods=12, closed='left'), + interval_range(Timedelta('1 day'), periods=6, closed='right'), + IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]), + IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])]) + def test_insert(self, data): + item = data[0] + idx_item = IntervalIndex([item]) + + # start + expected = idx_item.append(data) + result = data.insert(0, item) + tm.assert_index_equal(result, expected) - pytest.raises(ValueError, self.index.insert, 0, 1) - pytest.raises(ValueError, self.index.insert, 0, - Interval(2, 3, closed='left')) + # end + expected = data.append(idx_item) + result = data.insert(len(data), item) + tm.assert_index_equal(result, expected) + + # mid + expected = data[:3].append(idx_item).append(data[3:]) + result = data.insert(3, item) + tm.assert_index_equal(result, expected) + + # invalid type + msg = 'can only insert Interval objects and NA into an IntervalIndex' + with tm.assert_raises_regex(ValueError, msg): + data.insert(1, 'foo') + + # invalid closed + msg = 'inserted item must be closed on the same side as the index' + for closed in {'left', 'right', 'both', 'neither'} - {item.closed}: + with tm.assert_raises_regex(ValueError, msg): + bad_item = Interval(item.left, item.right, closed=closed) + data.insert(1, bad_item) + + # GH 18295 (test missing) + na_idx = IntervalIndex([np.nan], closed=data.closed) + for na in (np.nan, pd.NaT, None): + expected = 
data[:1].append(na_idx).append(data[1:]) + result = data.insert(1, na) + tm.assert_index_equal(result, expected) def test_take(self, closed): index = self.create_index(closed=closed) - actual = index.take([0, 1]) - tm.assert_index_equal(actual, index) + result = index.take(range(10)) + tm.assert_index_equal(result, index) + result = index.take([0, 0, 1]) expected = IntervalIndex.from_arrays( [0, 0, 1], [1, 1, 2], closed=closed) - actual = index.take([0, 0, 1]) - tm.assert_index_equal(actual, expected) + tm.assert_index_equal(result, expected) def test_unique(self, closed): # unique non-overlapping @@ -683,50 +794,85 @@ def test_non_contiguous(self, closed): assert 1.5 not in index def test_union(self, closed): - idx = self.create_index(closed=closed) - other = IntervalIndex.from_arrays([2], [3], closed=closed) - expected = IntervalIndex.from_arrays( - range(3), range(1, 4), closed=closed) - actual = idx.union(other) - assert expected.equals(actual) + index = self.create_index(closed=closed) + other = IntervalIndex.from_breaks(range(5, 13), closed=closed) + + expected = IntervalIndex.from_breaks(range(13), closed=closed) + result = index.union(other) + tm.assert_index_equal(result, expected) - actual = other.union(idx) - assert expected.equals(actual) + result = other.union(index) + tm.assert_index_equal(result, expected) - tm.assert_index_equal(idx.union(idx), idx) - tm.assert_index_equal(idx.union(idx[:1]), idx) + tm.assert_index_equal(index.union(index), index) + tm.assert_index_equal(index.union(index[:1]), index) def test_intersection(self, closed): - idx = self.create_index(closed=closed) - other = IntervalIndex.from_breaks([1, 2, 3], closed=closed) - expected = IntervalIndex.from_breaks([1, 2], closed=closed) - actual = idx.intersection(other) - assert expected.equals(actual) + index = self.create_index(closed=closed) + other = IntervalIndex.from_breaks(range(5, 13), closed=closed) - tm.assert_index_equal(idx.intersection(idx), idx) + expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + result = other.intersection(index) + tm.assert_index_equal(result, expected) + + tm.assert_index_equal(index.intersection(index), index) def test_difference(self, closed): - idx = self.create_index(closed=closed) - tm.assert_index_equal(idx.difference(idx[:1]), idx[1:]) + index = self.create_index(closed=closed) + tm.assert_index_equal(index.difference(index[:1]), index[1:]) - def test_symmetric_difference(self): - result = self.index[:1].symmetric_difference(self.index[1:]) - expected = self.index + def test_symmetric_difference(self, closed): + idx = self.create_index(closed=closed) + result = idx[1:].symmetric_difference(idx[:-1]) + expected = IntervalIndex([idx[0], idx[-1]]) tm.assert_index_equal(result, expected) - def test_set_operation_errors(self): - pytest.raises(ValueError, self.index.union, self.index.left) + @pytest.mark.parametrize('op_name', [ + 'union', 'intersection', 'difference', 'symmetric_difference']) + def test_set_operation_errors(self, closed, op_name): + index = self.create_index(closed=closed) + set_op = getattr(index, op_name) - other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - pytest.raises(ValueError, self.index.union, other) + # test errors + msg = ('can only do set operations between two IntervalIndex objects ' + 'that are closed on the same side') + with tm.assert_raises_regex(ValueError, msg): + set_op(Index([1, 2, 3])) + + for other_closed in {'right', 
'left', 'both', 'neither'} - {closed}: + other = self.create_index(closed=other_closed) + with tm.assert_raises_regex(ValueError, msg): + set_op(other) def test_isin(self, closed): - idx = self.create_index(closed=closed) - actual = idx.isin(idx) - tm.assert_numpy_array_equal(np.array([True, True]), actual) + index = self.create_index(closed=closed) + + expected = np.array([True] + [False] * (len(index) - 1)) + result = index.isin(index[:1]) + tm.assert_numpy_array_equal(result, expected) + + result = index.isin([index[0]]) + tm.assert_numpy_array_equal(result, expected) + + other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed) + expected = np.array([True] * (len(index) - 1) + [False]) + result = index.isin(other) + tm.assert_numpy_array_equal(result, expected) - actual = idx.isin(idx[:1]) - tm.assert_numpy_array_equal(np.array([True, False]), actual) + result = index.isin(other.tolist()) + tm.assert_numpy_array_equal(result, expected) + + for other_closed in {'right', 'left', 'both', 'neither'}: + other = self.create_index(closed=other_closed) + expected = np.repeat(closed == other_closed, len(index)) + result = index.isin(other) + tm.assert_numpy_array_equal(result, expected) + + result = index.isin(other.tolist()) + tm.assert_numpy_array_equal(result, expected) def test_comparison(self): actual = Interval(0, 1) < self.index @@ -799,23 +945,24 @@ def test_missing_values(self, closed): np.array([True, False, False])) def test_sort_values(self, closed): - expected = IntervalIndex.from_breaks([1, 2, 3, 4], closed=closed) - actual = IntervalIndex.from_tuples( - [(3, 4), (1, 2), (2, 3)], closed=closed).sort_values() - tm.assert_index_equal(expected, actual) + index = self.create_index(closed=closed) - # nan - idx = self.create_index_with_nan(closed=closed) - mask = idx.isna() - tm.assert_numpy_array_equal(mask, np.array([False, True, False])) + result = index.sort_values() + tm.assert_index_equal(result, index) - result = idx.sort_values() - mask = result.isna() - tm.assert_numpy_array_equal(mask, np.array([False, False, True])) + result = index.sort_values(ascending=False) + tm.assert_index_equal(result, index[::-1]) - result = idx.sort_values(ascending=False) - mask = result.isna() - tm.assert_numpy_array_equal(mask, np.array([True, False, False])) + # with nan + index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)]) + + result = index.sort_values() + expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) + tm.assert_index_equal(result, expected) + + result = index.sort_values(ascending=False) + expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) + tm.assert_index_equal(result, expected) def test_datetime(self): dates = date_range('2000', periods=3) @@ -865,7 +1012,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is True - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) @@ -873,7 +1020,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) @@ -881,7 
+1028,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', otherwise True (GH16560) @@ -895,58 +1042,58 @@ class TestIntervalRange(object): - def test_construction_from_numeric(self, closed): + def test_construction_from_numeric(self, closed, name): # combinations of start/end/periods without freq expected = IntervalIndex.from_breaks( - np.arange(0, 6), name='foo', closed=closed) + np.arange(0, 6), name=name, closed=closed) - result = interval_range(start=0, end=5, name='foo', closed=closed) + result = interval_range(start=0, end=5, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=0, periods=5, name='foo', closed=closed) + result = interval_range(start=0, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=5, periods=5, name='foo', closed=closed) + result = interval_range(end=5, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) # combinations of start/end/periods with freq expected = IntervalIndex.from_tuples([(0, 2), (2, 4), (4, 6)], - name='foo', closed=closed) + name=name, closed=closed) - result = interval_range(start=0, end=6, freq=2, name='foo', + result = interval_range(start=0, end=6, freq=2, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=0, periods=3, freq=2, name='foo', + result = interval_range(start=0, periods=3, freq=2, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=6, periods=3, freq=2, name='foo', + result = interval_range(end=6, periods=3, freq=2, name=name, closed=closed) tm.assert_index_equal(result, expected) # output truncates early if freq causes end to be skipped.
expected = IntervalIndex.from_tuples([(0.0, 1.5), (1.5, 3.0)], - name='foo', closed=closed) - result = interval_range(start=0, end=4, freq=1.5, name='foo', + name=name, closed=closed) + result = interval_range(start=0, end=4, freq=1.5, name=name, closed=closed) tm.assert_index_equal(result, expected) - def test_construction_from_timestamp(self, closed): + def test_construction_from_timestamp(self, closed, name): # combinations of start/end/periods without freq start, end = Timestamp('2017-01-01'), Timestamp('2017-01-06') breaks = date_range(start=start, end=end) - expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) - result = interval_range(start=start, end=end, name='foo', + result = interval_range(start=start, end=end, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=start, periods=5, name='foo', + result = interval_range(start=start, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=end, periods=5, name='foo', + result = interval_range(end=end, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) @@ -954,23 +1101,23 @@ def test_construction_from_timestamp(self, closed): freq = '2D' start, end = Timestamp('2017-01-01'), Timestamp('2017-01-07') breaks = date_range(start=start, end=end, freq=freq) - expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=start, periods=3, freq=freq, name='foo', + result = interval_range(start=start, periods=3, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=end, periods=3, freq=freq, name='foo', + result = interval_range(end=end, periods=3, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) # output truncates early if freq causes end to be skipped. end = Timestamp('2017-01-08') - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) @@ -978,41 +1125,41 @@ def test_construction_from_timestamp(self, closed): freq = 'M' start, end = Timestamp('2017-01-01'), Timestamp('2017-12-31') breaks = date_range(start=start, end=end, freq=freq) - expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=start, periods=11, freq=freq, name='foo', + result = interval_range(start=start, periods=11, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=end, periods=11, freq=freq, name='foo', + result = interval_range(end=end, periods=11, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) # output truncates early if freq causes end to be skipped. 
end = Timestamp('2018-01-15') - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - def test_construction_from_timedelta(self, closed): + def test_construction_from_timedelta(self, closed, name): # combinations of start/end/periods without freq start, end = Timedelta('1 day'), Timedelta('6 days') breaks = timedelta_range(start=start, end=end) - expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) - result = interval_range(start=start, end=end, name='foo', + result = interval_range(start=start, end=end, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=start, periods=5, name='foo', + result = interval_range(start=start, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=end, periods=5, name='foo', + result = interval_range(end=end, periods=5, name=name, closed=closed) tm.assert_index_equal(result, expected) @@ -1020,23 +1167,23 @@ def test_construction_from_timedelta(self, closed): freq = '2D' start, end = Timedelta('1 day'), Timedelta('7 days') breaks = timedelta_range(start=start, end=end, freq=freq) - expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(start=start, periods=3, freq=freq, name='foo', + result = interval_range(start=start, periods=3, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) - result = interval_range(end=end, periods=3, freq=freq, name='foo', + result = interval_range(end=end, periods=3, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) # output truncates early if freq causes end to be skipped. 
end = Timedelta('7 days 1 hour') - result = interval_range(start=start, end=end, freq=freq, name='foo', + result = interval_range(start=start, end=end, freq=freq, name=name, closed=closed) tm.assert_index_equal(result, expected) @@ -1054,10 +1201,6 @@ def test_constructor_coverage(self): end=end.to_pydatetime()) tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.tz_localize('UTC'), - end=end.tz_localize('UTC')) - tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 506a9e1c64b10..a2c0a75e21f43 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -672,8 +672,9 @@ def test_from_arrays(self): for lev, lab in zip(self.index.levels, self.index.labels): arrays.append(np.asarray(lev).take(lab)) - result = MultiIndex.from_arrays(arrays) - assert list(result) == list(self.index) + # list of arrays as input + result = MultiIndex.from_arrays(arrays, names=self.index.names) + tm.assert_index_equal(result, self.index) # infer correctly result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')], @@ -681,6 +682,21 @@ def test_from_arrays(self): assert result.levels[0].equals(Index([Timestamp('20130101')])) assert result.levels[1].equals(Index(['a', 'b'])) + def test_from_arrays_iterator(self): + # GH 18434 + arrays = [] + for lev, lab in zip(self.index.levels, self.index.labels): + arrays.append(np.asarray(lev).take(lab)) + + # iterator as input + result = MultiIndex.from_arrays(iter(arrays), names=self.index.names) + tm.assert_index_equal(result, self.index) + + # invalid iterator input + with tm.assert_raises_regex( + TypeError, "Input must be a list / sequence of array-likes."): + MultiIndex.from_arrays(0) + def test_from_arrays_index_series_datetimetz(self): idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') @@ -825,7 +841,25 @@ def test_from_product(self): expected = MultiIndex.from_tuples(tuples, names=names) tm.assert_index_equal(result, expected) - assert result.names == names + + def test_from_product_iterator(self): + # GH 18434 + first = ['foo', 'bar', 'buz'] + second = ['a', 'b', 'c'] + names = ['first', 'second'] + tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), + ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), + ('buz', 'c')] + expected = MultiIndex.from_tuples(tuples, names=names) + + # iterator as input + result = MultiIndex.from_product(iter([first, second]), names=names) + tm.assert_index_equal(result, expected) + + # Invalid non-iterable input + with tm.assert_raises_regex( + TypeError, "Input must be a list / sequence of iterables."): + MultiIndex.from_product(0) def test_from_product_empty(self): # 0 levels @@ -1584,7 +1618,9 @@ def test_is_(self): # shouldn't change assert mi2.is_(mi) mi4 = mi3.view() - mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True) + + # GH 17464 - Remove duplicate MultiIndex levels + mi4.set_levels([lrange(10), lrange(10)], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() mi5.set_levels(mi5.levels, inplace=True) @@ -1725,8 +1761,28 @@ def test_from_tuples(self): 'from empty list', MultiIndex.from_tuples, []) - idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) - assert len(idx) == 2 + expected = MultiIndex(levels=[[1, 3], [2, 4]], + labels=[[0, 1], [0, 1]], + names=['a', 'b']) + + # input tuples + result = 
MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) + tm.assert_index_equal(result, expected) + + def test_from_tuples_iterator(self): + # GH 18434 + # input iterator for tuples + expected = MultiIndex(levels=[[1, 3], [2, 4]], + labels=[[0, 1], [0, 1]], + names=['a', 'b']) + + result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b']) + tm.assert_index_equal(result, expected) + + # input non-iterables + with tm.assert_raises_regex( + TypeError, 'Input must be a list / sequence of tuple-likes.'): + MultiIndex.from_tuples(0) def test_from_tuples_empty(self): # GH 16777 @@ -2178,7 +2234,7 @@ def check(nlevels, with_nulls): if with_nulls: # inject some null values labels[500] = -1 # common nan value - labels = list(labels.copy() for i in range(nlevels)) + labels = [labels.copy() for i in range(nlevels)] for i in range(nlevels): labels[i][500 + i - nlevels // 2] = -1 @@ -2396,13 +2452,11 @@ def test_isna_behavior(self): pd.isna(self.index) def test_level_setting_resets_attributes(self): - ind = MultiIndex.from_arrays([ + ind = pd.MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) assert ind.is_monotonic - ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], - inplace=True) - + ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. assert not ind.is_monotonic @@ -2773,7 +2827,7 @@ def test_groupby(self): # GH5620 groups = self.index.groupby(self.index) - exp = dict((key, [key]) for key in self.index) + exp = {key: [key] for key in self.index} tm.assert_dict_equal(groups, exp) def test_index_name_retained(self): @@ -3029,3 +3083,16 @@ def test_million_record_attribute_error(self): with tm.assert_raises_regex(AttributeError, "'Series' object has no attribute 'foo'"): df['a'].foo() + + def test_duplicate_multiindex_labels(self): + # GH 17464 + # Make sure that a MultiIndex with duplicate levels throws a ValueError + with pytest.raises(ValueError): + ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) + + # And that using set_levels with duplicate levels fails + ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], + [1, 2, 1, 2, 3]]) + with pytest.raises(ValueError): + ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], + inplace=True) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a96c677852339..cbd819fa9cfb7 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -7,7 +7,7 @@ import numpy as np -from pandas import (date_range, notna, Series, Index, Float64Index, +from pandas import (date_range, Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex) import pandas.util.testing as tm @@ -175,6 +175,26 @@ def test_modulo(self): expected = Index(index.values % 2) tm.assert_index_equal(index % 2, expected) + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): + i = self.create_index() + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = Float64Index([i._na_value] + i[1:].tolist()) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_insert(self): + # GH 18295 (test missing) + expected = Float64Index([0, np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = self.create_index().insert(1, na) + tm.assert_index_equal(result, expected) + class TestFloat64Index(Numeric): _holder = Float64Index @@ -726,31 +746,6
@@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) assert isinstance(arr, Index) - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist()) - - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7d88b547746f6..96d5981abc1bb 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -10,7 +10,7 @@ import numpy as np -from pandas import (isna, notna, Series, Index, Float64Index, +from pandas import (isna, Series, Index, Float64Index, Int64Index, RangeIndex) import pandas.util.testing as tm @@ -295,6 +295,12 @@ def test_insert(self): # test 0th element tm.assert_index_equal(idx[0:4], result.insert(0, idx[0])) + # GH 18295 (test missing) + expected = Float64Index([0, np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = RangeIndex(5).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = RangeIndex(5, name='Foo') @@ -934,31 +940,6 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) assert len(i) == 0 - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist()) - - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_append(self): # GH16212 RI = RangeIndex diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index cb88bac6386f7..e64c4e6ac54a5 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -57,6 +57,12 @@ def test_insert(self): assert result.name == expected.name assert result.freq == expected.freq + # GH 18295 (test missing) + expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + for na in (np.nan, pd.NaT, None): + result = timedelta_range('1day', '3day').insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 67238665a2e8a..fac3745ba4fb4 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -25,12 +25,12 @@ def test_ops_properties(self): self.check_ops_properties(TimedeltaIndex._field_ops, f) self.check_ops_properties(TimedeltaIndex._object_ops, f) - def 
test_asobject_tolist(self): + def test_astype_object(self): idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') expected_list = [Timedelta('1 days'), Timedelta('2 days'), Timedelta('3 days'), Timedelta('4 days')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -43,7 +43,7 @@ def test_asobject_tolist(self): expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT, Timedelta('4 days')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -217,26 +217,27 @@ def test_comp_nat(self): pd.Timedelta('3 days')]) right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = rhs == lhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = rhs != lhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -473,18 +474,18 @@ def test_equals(self): idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not idx.asobject.equals(idx2) - assert not idx.asobject.equals(idx2.asobject) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.astype(object).equals(idx2.astype(object)) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 533b06088f1bf..e25384ebf7d62 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -187,6 +187,7 @@ def test_misc_coverage(self): assert not idx.equals(list(non_td)) def test_map(self): + # test_map_dictlike generally tests rng = timedelta_range('1 day', periods=10) diff --git 
a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 2c93d2afd1760..22b3fd9073bab 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -439,3 +439,21 @@ def test_indexing_with_category(self): res = (cat[['A']] == 'foo') tm.assert_frame_equal(res, exp) + + def test_map_with_dict_or_series(self): + orig_values = ['a', 'B', 1, 'a'] + new_values = ['one', 2, 3.0, 'one'] + cur_index = pd.CategoricalIndex(orig_values, name='XXX') + expected = pd.CategoricalIndex(new_values, + name='XXX', categories=[3.0, 2, 'one']) + + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + mapper = {o: n for o, n in + zip(orig_values[:-1], new_values[:-1])} + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py index 3792293f48b99..e29dc627a5d94 100644 --- a/pandas/tests/indexing/test_interval.py +++ b/pandas/tests/indexing/test_interval.py @@ -54,7 +54,7 @@ def test_getitem_with_scalar(self): def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] if direction == 'decreasing': - tpls = reversed(tpls) + tpls = tpls[::-1] idx = IntervalIndex.from_tuples(tpls, closed=closed) s = Series(list('abc'), idx) diff --git a/pandas/tests/indexing/test_panel.py b/pandas/tests/indexing/test_panel.py index 4d7768c9e8083..c4f7bd28e4d90 100644 --- a/pandas/tests/indexing/test_panel.py +++ b/pandas/tests/indexing/test_panel.py @@ -119,7 +119,7 @@ def test_panel_getitem(self): df = DataFrame( np.random.randn( len(ind), 5), index=ind, columns=list('ABCDE')) - panel = Panel(dict(('frame_' + c, df) for c in list('ABC'))) + panel = Panel({'frame_' + c: df for c in list('ABC')}) test2 = panel.loc[:, "2002":"2002-12-31"] test1 = panel.loc[:, "2002"] diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 32609362e49af..3ad3b771b2ab2 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,6 +2,7 @@ import pandas as pd from pandas.util import testing as tm +import numpy as np class TestTimedeltaIndexing(object): @@ -47,3 +48,23 @@ def test_string_indexing(self): expected = df.iloc[0] sliced = df.loc['0 days'] tm.assert_series_equal(sliced, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_masked_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series[series == series[0]] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_listlike_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series.iloc[0] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 4c0c7d8598a8e..08f769e02e267 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -476,7 +476,7 @@ def test_copy(self, mgr): def test_sparse(self): mgr = create_mgr('a: sparse-1; b: sparse-2') # 
what to test here? - assert mgr.as_matrix().dtype == np.float64 + assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') @@ -485,32 +485,32 @@ def test_sparse_mixed(self): # what to test here? - def test_as_matrix_float(self): + def test_as_array_float(self): mgr = create_mgr('c: f4; d: f2; e: f8') - assert mgr.as_matrix().dtype == np.float64 + assert mgr.as_array().dtype == np.float64 mgr = create_mgr('c: f4; d: f2') - assert mgr.as_matrix().dtype == np.float32 + assert mgr.as_array().dtype == np.float32 - def test_as_matrix_int_bool(self): + def test_as_array_int_bool(self): mgr = create_mgr('a: bool-1; b: bool-2') - assert mgr.as_matrix().dtype == np.bool_ + assert mgr.as_array().dtype == np.bool_ mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') - assert mgr.as_matrix().dtype == np.int64 + assert mgr.as_array().dtype == np.int64 mgr = create_mgr('c: i4; d: i2; e: u1') - assert mgr.as_matrix().dtype == np.int32 + assert mgr.as_array().dtype == np.int32 - def test_as_matrix_datetime(self): + def test_as_array_datetime(self): mgr = create_mgr('h: datetime-1; g: datetime-2') - assert mgr.as_matrix().dtype == 'M8[ns]' + assert mgr.as_array().dtype == 'M8[ns]' - def test_as_matrix_datetime_tz(self): + def test_as_array_datetime_tz(self): mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]' assert mgr.get('g').dtype == 'datetime64[ns, CET]' - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' def test_astype(self): # coerce all @@ -607,49 +607,49 @@ def test_interleave(self): for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]', 'm8[ns]']: mgr = create_mgr('a: {0}'.format(dtype)) - assert mgr.as_matrix().dtype == dtype + assert mgr.as_array().dtype == dtype mgr = create_mgr('a: {0}; b: {0}'.format(dtype)) - assert mgr.as_matrix().dtype == dtype + assert mgr.as_array().dtype == dtype # will be converted according the actual dtype of the underlying mgr = create_mgr('a: category') - assert mgr.as_matrix().dtype == 'i8' + assert mgr.as_array().dtype == 'i8' mgr = create_mgr('a: category; b: category') - assert mgr.as_matrix().dtype == 'i8' + assert mgr.as_array().dtype == 'i8' mgr = create_mgr('a: category; b: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: category2; b: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' # combinations mgr = create_mgr('a: f8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f8; b: i8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f4; b: i8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f4; b: i8; d: object') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: bool; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: complex') - assert mgr.as_matrix().dtype == 'complex' + assert mgr.as_array().dtype == 'complex' mgr = create_mgr('a: f8; b: category') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: category') - assert 
mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: bool') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: bool') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: m8[ns]') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' def test_interleave_non_unique_cols(self): df = DataFrame([ @@ -831,7 +831,7 @@ def test_equals_block_order_different_dtypes(self): def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) - assert mgr.as_matrix().tolist() == [0., 1., 2., 3., 4.] + assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.] def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] @@ -878,7 +878,7 @@ class TestIndexing(object): def test_get_slice(self): def assert_slice_ok(mgr, axis, slobj): # import pudb; pudb.set_trace() - mat = mgr.as_matrix() + mat = mgr.as_array() # we maybe using an ndarray to test slicing and # might not be the full length of the axis @@ -889,7 +889,7 @@ def assert_slice_ok(mgr, axis, slobj): len(ax) - len(slobj), dtype=bool)]) sliced = mgr.get_slice(slobj, axis=axis) mat_slobj = (slice(None), ) * axis + (slobj, ) - tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_matrix(), + tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(), check_dtype=False) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) @@ -930,10 +930,10 @@ def assert_slice_ok(mgr, axis, slobj): def test_take(self): def assert_take_ok(mgr, axis, indexer): - mat = mgr.as_matrix() + mat = mgr.as_array() taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal(np.take(mat, indexer, axis), - taken.as_matrix(), check_dtype=False) + taken.as_array(), check_dtype=False) tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) @@ -950,14 +950,14 @@ def assert_take_ok(mgr, axis, indexer): def test_reindex_axis(self): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): - mat = mgr.as_matrix() + mat = mgr.as_array() indexer = mgr.axes[axis].get_indexer_for(new_labels) reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis, fill_value=fill_value), - reindexed.as_matrix(), + reindexed.as_array(), check_dtype=False) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -996,13 +996,13 @@ def test_reindex_indexer(self): def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): - mat = mgr.as_matrix() + mat = mgr.as_array() reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) reindexed = mgr.reindex_indexer(new_labels, indexer, axis, fill_value=fill_value) tm.assert_numpy_array_equal(reindexed_mat, - reindexed.as_matrix(), + reindexed.as_array(), check_dtype=False) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -1222,7 +1222,9 @@ class TestCanHoldElement(object): @pytest.mark.parametrize('value, dtype', [ (1, 'i8'), (1.0, 'f8'), + (2**63, 'f8'), (1j, 'complex128'), + (2**63, 'complex128'), (True, 'bool'), (np.timedelta64(20, 'ns'), '= 2.2""" - - @classmethod - def setup_class(cls): - _skip_if_no_openpyxl() - import openpyxl - ver = 
openpyxl.__version__ - if (not (LooseVersion(ver) >= LooseVersion('2.0.0') and - LooseVersion(ver) < LooseVersion('2.2.0'))): - pytest.skip("openpyxl %s >= 2.2" % str(ver)) - - cls.setup_class = setup_class - return cls - - -@raise_on_incompat_version(2) -@skip_openpyxl_gt21 -class TestOpenpyxl20Tests(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'openpyxl20' - check_skip = staticmethod(lambda *args, **kwargs: None) - - def test_to_excel_styleconverter(self): - import openpyxl - from openpyxl import styles - - hstyle = { - "font": { - "color": '00FF0000', - "bold": True, - }, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": { - "horizontal": "center", - "vertical": "top", - }, - "fill": { - "patternType": 'solid', - 'fgColor': { - 'rgb': '006666FF', - 'tint': 0.3, - }, - }, - "number_format": { - "format_code": "0.00" - }, - "protection": { - "locked": True, - "hidden": False, - }, - } - - font_color = styles.Color('00FF0000') - font = styles.Font(bold=True, color=font_color) - side = styles.Side(style=styles.borders.BORDER_THIN) - border = styles.Border(top=side, right=side, bottom=side, left=side) - alignment = styles.Alignment(horizontal='center', vertical='top') - fill_color = styles.Color(rgb='006666FF', tint=0.3) - fill = styles.PatternFill(patternType='solid', fgColor=fill_color) - - # ahh openpyxl API changes - ver = openpyxl.__version__ - if ver >= LooseVersion('2.0.0') and ver < LooseVersion('2.1.0'): - number_format = styles.NumberFormat(format_code='0.00') - else: - number_format = '0.00' # XXX: Only works with openpyxl-2.1.0 - - protection = styles.Protection(locked=True, hidden=False) - - kw = _Openpyxl20Writer._convert_to_style_kwargs(hstyle) - assert kw['font'] == font - assert kw['border'] == border - assert kw['alignment'] == alignment - assert kw['fill'] == fill - assert kw['number_format'] == number_format - assert kw['protection'] == protection - - def test_write_cells_merge_styled(self): - from pandas.io.formats.excel import ExcelCell - from openpyxl import styles - - sheet_name = 'merge_styled' - - sty_b1 = {'font': {'color': '00FF0000'}} - sty_a2 = {'font': {'color': '0000FF00'}} - - initial_cells = [ - ExcelCell(col=1, row=0, val=42, style=sty_b1), - ExcelCell(col=0, row=1, val=99, style=sty_a2), - ] - - sty_merged = {'font': {'color': '000000FF', 'bold': True}} - sty_kwargs = _Openpyxl20Writer._convert_to_style_kwargs(sty_merged) - openpyxl_sty_merged = styles.Style(**sty_kwargs) - merge_cells = [ - ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), - ] - - with ensure_clean('.xlsx') as path: - writer = _Openpyxl20Writer(path) - writer.write_cells(initial_cells, sheet_name=sheet_name) - writer.write_cells(merge_cells, sheet_name=sheet_name) - - wks = writer.sheets[sheet_name] - xcell_b1 = wks['B1'] - xcell_a2 = wks['A2'] - assert xcell_b1.style == openpyxl_sty_merged - assert xcell_a2.style == openpyxl_sty_merged - - -def skip_openpyxl_lt22(cls): - """Skip test case if openpyxl < 2.2""" - - @classmethod - def setup_class(cls): - _skip_if_no_openpyxl() - import openpyxl - ver = openpyxl.__version__ - if LooseVersion(ver) < LooseVersion('2.2.0'): - pytest.skip("openpyxl %s < 2.2" % str(ver)) - - cls.setup_class = setup_class - return cls - - -@raise_on_incompat_version(2) -@skip_openpyxl_lt22 -class TestOpenpyxl22Tests(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'openpyxl22' - check_skip = staticmethod(lambda *args, **kwargs: None) + check_skip = 
staticmethod(_skip_if_no_openpyxl) def test_to_excel_styleconverter(self): from openpyxl import styles @@ -2174,7 +1975,7 @@ def test_to_excel_styleconverter(self): protection = styles.Protection(locked=True, hidden=False) - kw = _Openpyxl22Writer._convert_to_style_kwargs(hstyle) + kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) assert kw['font'] == font assert kw['border'] == border assert kw['alignment'] == alignment @@ -2183,9 +1984,6 @@ def test_to_excel_styleconverter(self): assert kw['protection'] == protection def test_write_cells_merge_styled(self): - if not openpyxl_compat.is_compat(major_ver=2): - pytest.skip('incompatible openpyxl version') - from pandas.io.formats.excel import ExcelCell sheet_name = 'merge_styled' @@ -2199,7 +1997,7 @@ def test_write_cells_merge_styled(self): ] sty_merged = {'font': {'color': '000000FF', 'bold': True}} - sty_kwargs = _Openpyxl22Writer._convert_to_style_kwargs(sty_merged) + sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) openpyxl_sty_merged = sty_kwargs['font'] merge_cells = [ ExcelCell(col=0, row=0, val='pandas', @@ -2207,7 +2005,7 @@ def test_write_cells_merge_styled(self): ] with ensure_clean('.xlsx') as path: - writer = _Openpyxl22Writer(path) + writer = _OpenpyxlWriter(path) writer.write_cells(initial_cells, sheet_name=sheet_name) writer.write_cells(merge_cells, sheet_name=sheet_name) @@ -2322,7 +2120,7 @@ def test_column_format(self): try: read_num_format = cell.number_format - except: + except Exception: read_num_format = cell.style.number_format._format_code assert read_num_format == num_format @@ -2366,9 +2164,7 @@ def test_ExcelWriter_dispatch(self): writer_klass = _XlsxWriter except ImportError: _skip_if_no_openpyxl() - if not openpyxl_compat.is_compat(major_ver=1): - pytest.skip('incompatible openpyxl version') - writer_klass = _Openpyxl1Writer + writer_klass = _OpenpyxlWriter with ensure_clean('.xlsx') as path: writer = ExcelWriter(path) @@ -2461,10 +2257,6 @@ def custom_converter(css): pytest.importorskip('jinja2') pytest.importorskip(engine) - if engine == 'openpyxl' and openpyxl_compat.is_compat(major_ver=1): - pytest.xfail('openpyxl1 does not support some openpyxl2-compatible ' - 'style dicts') - # Prepare spreadsheets df = DataFrame(np.random.randn(10, 3)) @@ -2482,9 +2274,6 @@ def custom_converter(css): # For other engines, we only smoke test return openpyxl = pytest.importorskip('openpyxl') - if not openpyxl_compat.is_compat(major_ver=2): - pytest.skip('incompatible openpyxl version') - wb = openpyxl.load_workbook(path) # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index a28adcf1ee771..2d1671840ce8f 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -180,6 +180,15 @@ def test_scalar_float(self): x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) + def test_scalar_bool(self): + x = np.bool_(1) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + + x = np.bool_(0) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + def test_scalar_complex(self): x = np.random.rand() + 1j * np.random.rand() x_rec = self.encode_decode(x) @@ -263,7 +272,7 @@ def test_numpy_array_complex(self): x.dtype == x_rec.dtype) def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] + x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish 
list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -401,6 +410,7 @@ def setup_method(self, method): 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, 'H': Categorical([1, 2, 3, 4, 5]), 'I': Categorical([1, 2, 3, 4, 5], ordered=True), + 'J': (np.bool_(1), 2, 3, 4, 5), } self.d['float'] = Series(data['A']) @@ -410,6 +420,7 @@ def setup_method(self, method): self.d['dt_tz'] = Series(data['G']) self.d['cat_ordered'] = Series(data['H']) self.d['cat_unordered'] = Series(data['I']) + self.d['numpy_bool_mixed'] = Series(data['J']) def test_basic(self): @@ -607,8 +618,8 @@ def setup_method(self, method): 'E': [datetime.timedelta(days=x) for x in range(1000)], } self.frame = { - 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), - 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), + 'float': DataFrame({k: data[k] for k in ['A', 'A']}), + 'int': DataFrame({k: data[k] for k in ['B', 'B']}), 'mixed': DataFrame(data), } @@ -794,8 +805,8 @@ def setup_method(self, method): 'G': [400] * 1000 } self.frame = { - 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), - 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), + 'float': DataFrame({k: data[k] for k in ['A', 'A']}), + 'int': DataFrame({k: data[k] for k in ['B', 'B']}), 'mixed': DataFrame(data), } self.utf_encodings = ['utf8', 'utf16', 'utf32'] diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ca20d089ee5f7..3fcbf90d12494 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1391,7 +1391,7 @@ def test_append_with_strings(self): with catch_warnings(record=True): wp = tm.makePanel() wp2 = wp.rename_axis( - dict((x, "%s_extra" % x) for x in wp.minor_axis), axis=2) + {x: "%s_extra" % x for x in wp.minor_axis}, axis=2) def check_col(key, name, size): assert getattr(store.get_storer(key) @@ -4269,9 +4269,10 @@ def test_select_as_multiple(self): ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') - @pytest.mark.skipf( + @pytest.mark.skipif( LooseVersion(tables.__version__) < '3.1.0', - "tables version does not support fix for nan selection bug: GH 4858") + reason=("tables version does not support fix for nan selection " + "bug: GH 4858")) def test_nan_selection_bug_4858(self): with ensure_clean_store(self.path) as store: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 78b47960e1a04..b3ead7d9c8333 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -590,7 +590,7 @@ def test_105(self): def test_date_export_formats(self): columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] - conversions = dict(((c, c) for c in columns)) + conversions = {c: c for c in columns} data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) original = DataFrame([data], columns=columns) original.index.name = 'index' @@ -774,7 +774,7 @@ def test_big_dates(self): tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) - date_conversion = dict((c, c[-2:]) for c in columns) + date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} with tm.ensure_clean() as path: expected.index.name = 'index' diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index dfab539e9474c..2e62b22b2b69e 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -12,6 +12,7 @@ import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_is_valid_plot_return_object) +import pandas.util._test_decorators as td import numpy as np from 
numpy import random @@ -23,8 +24,6 @@ This is a common base class used for various plotting tests """ -tm._skip_if_no_mpl() - def _skip_if_no_scipy_gaussian_kde(): try: @@ -43,6 +42,7 @@ def _ok_for_gaussian_kde(kind): return plotting._compat._mpl_ge_1_5_0() +@td.skip_if_no_mpl class TestPlotBase(object): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 4b1cb2ccbd3dd..1bc49e9e5f96a 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -8,6 +8,7 @@ from pandas import Series, DataFrame, MultiIndex from pandas.compat import range, lzip import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy import random @@ -19,8 +20,6 @@ """ Test cases for .boxplot method """ -tm._skip_if_no_mpl() - def _skip_if_mpl_14_or_dev_boxplot(): # GH 8382 @@ -31,6 +30,7 @@ def _skip_if_mpl_14_or_dev_boxplot(): pytest.skip("Matplotlib Regression in 1.4 and current dev.") +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @pytest.mark.slow @@ -174,6 +174,7 @@ def test_fontsize(self): xlabelsize=16, ylabelsize=16) +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index d66012e2a56a0..8f237a7f810c3 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -16,13 +16,13 @@ from pandas.util.testing import assert_series_equal, ensure_clean import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.tests.plotting.common import (TestPlotBase, _skip_if_no_scipy_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestTSPlot(TestPlotBase): def setup_method(self, method): @@ -272,7 +272,7 @@ def test_irreg_hf(self): _, ax = self.plt.subplots() df2 = df.copy() - df2.index = df.index.asobject + df2.index = df.index.astype(object) df2.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() assert (np.fabs(diffs[1:] - sec) < 1e-8).all() @@ -712,9 +712,9 @@ def test_mixed_freq_irregular_first(self): assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() - tm.assert_numpy_array_equal(x1, s2.index.asobject.values) + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) x2 = lines[1].get_xdata() - tm.assert_numpy_array_equal(x2, s1.index.asobject.values) + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_regular_first_df(self): # GH 9852 @@ -744,9 +744,9 @@ def test_mixed_freq_irregular_first_df(self): assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() - tm.assert_numpy_array_equal(x1, s2.index.asobject.values) + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) x2 = lines[1].get_xdata() - tm.assert_numpy_array_equal(x2, s1.index.asobject.values) + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_hf_first(self): idxh = date_range('1/1/1999', periods=365, freq='D') @@ -1019,7 +1019,7 @@ def test_irreg_dtypes(self): # np.datetime64 idx = date_range('1/1/2000', periods=10) - idx = idx[[0, 2, 5, 9]].asobject + idx = idx[[0, 2, 5, 9]].astype(object) df = DataFrame(np.random.randn(len(idx), 3), idx) _, ax = self.plt.subplots() _check_plot_works(df.plot, ax=ax) diff --git a/pandas/tests/plotting/test_deprecated.py 
b/pandas/tests/plotting/test_deprecated.py index 970de6ff881ab..d2f8e13a2444b 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -4,6 +4,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td import pytest from numpy.random import randn @@ -18,9 +19,8 @@ pandas.tools.plotting """ -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDeprecatedNameSpace(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3887271edb2a3..5c72d778a1220 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -15,6 +15,7 @@ from pandas.compat import range, lrange, lmap, lzip, u, zip, PY3 from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import rand, randn @@ -24,9 +25,8 @@ _skip_if_no_scipy_gaussian_kde, _ok_for_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index de48b58133e9a..a7c99a06c34e9 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -5,14 +5,14 @@ from pandas import Series, DataFrame import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from pandas.tests.plotting.common import TestPlotBase -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): def test_series_groupby_plotting_nominally_works(self): diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 5f7b2dd2d6ca9..864d39eba29c5 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -6,6 +6,7 @@ from pandas import Series, DataFrame import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import randn @@ -14,9 +15,7 @@ from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) -tm._skip_if_no_mpl() - - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): @@ -141,6 +140,7 @@ def test_plot_fails_when_ax_differs_from_figure(self): self.ts.hist(ax=ax1, figure=fig2) +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @pytest.mark.slow @@ -251,6 +251,7 @@ def test_tight_layout(self): tm.close() +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6f476553091d9..8b0a981760c72 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -7,6 +7,7 @@ from pandas import DataFrame from pandas.compat import lmap import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy import random @@ -15,9 +16,8 @@ import pandas.plotting as plotting from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): @@ -49,6 +49,7 @@ def test_bootstrap_plot(self): _check_plot_works(bootstrap_plot, series=self.ts, size=10) +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): def test_scatter_matrix_axis(self): diff --git 
a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index d04065ee34339..6dd7e1e9882b2 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -3,7 +3,7 @@ """ Test cases for Series.plot """ -import itertools +from itertools import chain import pytest from datetime import datetime @@ -12,6 +12,7 @@ from pandas import Series, DataFrame, date_range from pandas.compat import range, lrange import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import randn @@ -21,9 +22,8 @@ _skip_if_no_scipy_gaussian_kde, _ok_for_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): @@ -333,8 +333,7 @@ def test_pie_series(self): autopct='%.2f', fontsize=7) pcts = ['{0:.2f}'.format(s * 100) for s in series.values / float(series.sum())] - iters = [iter(series.index), iter(pcts)] - expected_texts = list(next(it) for it in itertools.cycle(iters)) + expected_texts = list(chain.from_iterable(zip(series.index, pcts))) self._check_text_labels(ax.texts, expected_texts) for t in ax.texts: assert t.get_fontsize() == 7 diff --git a/pandas/tests/reshape/test_join.py b/pandas/tests/reshape/test_join.py index 75c01fabea8f6..76791b08a26be 100644 --- a/pandas/tests/reshape/test_join.py +++ b/pandas/tests/reshape/test_join.py @@ -788,7 +788,7 @@ def _assert_same_contents(join_chunk, source): jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values svalues = source.fillna(NA_SENTINEL).drop_duplicates().values - rows = set(tuple(row) for row in jvalues) + rows = {tuple(row) for row in jvalues} assert(len(rows) == len(source)) assert(all(tuple(row) in rows for row in svalues)) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py new file mode 100644 index 0000000000000..3c38512548c70 --- /dev/null +++ b/pandas/tests/reshape/test_melt.py @@ -0,0 +1,537 @@ +# -*- coding: utf-8 -*- +# pylint: disable-msg=W0612,E1101 + +import pytest + +from pandas import DataFrame +import pandas as pd + +from numpy import nan +import numpy as np + +from pandas import melt, lreshape, wide_to_long +import pandas.util.testing as tm +from pandas.compat import range + + +class TestMelt(object): + + def setup_method(self, method): + self.df = tm.makeTimeDataFrame()[:10] + self.df['id1'] = (self.df['A'] > 0).astype(np.int64) + self.df['id2'] = (self.df['B'] > 0).astype(np.int64) + + self.var_name = 'var' + self.value_name = 'val' + + self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 + ], [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298, -0.873361]]) + self.df1.columns = [list('ABC'), list('abc')] + self.df1.columns.names = ['CAP', 'low'] + + def test_top_level_method(self): + result = melt(self.df) + assert result.columns.tolist() == ['variable', 'value'] + + def test_method_signatures(self): + tm.assert_frame_equal(self.df.melt(), + melt(self.df)) + + tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], + value_vars=['A', 'B']), + melt(self.df, + id_vars=['id1', 'id2'], + value_vars=['A', 'B'])) + + tm.assert_frame_equal(self.df.melt(var_name=self.var_name, + value_name=self.value_name), + melt(self.df, + var_name=self.var_name, + value_name=self.value_name)) + + tm.assert_frame_equal(self.df1.melt(col_level=0), + melt(self.df1, col_level=0)) + + def test_default_col_names(self): + result = self.df.melt() + assert result.columns.tolist() == ['variable', 'value'] + + result1 = self.df.melt(id_vars=['id1']) + 
assert result1.columns.tolist() == ['id1', 'variable', 'value'] + + result2 = self.df.melt(id_vars=['id1', 'id2']) + assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] + + def test_value_vars(self): + result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') + assert len(result3) == 10 + + result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) + expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', 'value']) + tm.assert_frame_equal(result4, expected4) + + def test_value_vars_types(self): + # GH 15348 + expected = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', 'value']) + + for type_ in (tuple, list, np.array): + result = self.df.melt(id_vars=['id1', 'id2'], + value_vars=type_(('A', 'B'))) + tm.assert_frame_equal(result, expected) + + def test_vars_work_with_multiindex(self): + expected = DataFrame({ + ('A', 'a'): self.df1[('A', 'a')], + 'CAP': ['B'] * len(self.df1), + 'low': ['b'] * len(self.df1), + 'value': self.df1[('B', 'b')], + }, columns=[('A', 'a'), 'CAP', 'low', 'value']) + + result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) + tm.assert_frame_equal(result, expected) + + def test_tuple_vars_fail_with_multiindex(self): + # melt should fail with an informative error message if + # the columns have a MultiIndex and a tuple is passed + # for id_vars or value_vars. + tuple_a = ('A', 'a') + list_a = [tuple_a] + tuple_b = ('B', 'b') + list_b = [tuple_b] + + for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), + (tuple_a, tuple_b)): + with tm.assert_raises_regex(ValueError, r'MultiIndex'): + self.df1.melt(id_vars=id_vars, value_vars=value_vars) + + def test_custom_var_name(self): + result5 = self.df.melt(var_name=self.var_name) + assert result5.columns.tolist() == ['var', 'value'] + + result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) + assert result6.columns.tolist() == ['id1', 'var', 'value'] + + result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) + assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] + + result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + var_name=self.var_name) + assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] + + result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + var_name=self.var_name) + expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', self.var_name, 'value']) + tm.assert_frame_equal(result9, expected9) + + def test_custom_value_name(self): + result10 = self.df.melt(value_name=self.value_name) + assert result10.columns.tolist() == ['variable', 'val'] + + result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) + assert result11.columns.tolist() == ['id1', 'variable', 'val'] + + result12 = self.df.melt(id_vars=['id1', 'id2'], + value_name=self.value_name) + assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] + + result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + value_name=self.value_name) + assert result13.columns.tolist() == 
['id1', 'id2', 'variable', 'val'] + + result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + value_name=self.value_name) + expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + self.value_name: (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', + self.value_name]) + tm.assert_frame_equal(result14, expected14) + + def test_custom_var_and_value_name(self): + + result15 = self.df.melt(var_name=self.var_name, + value_name=self.value_name) + assert result15.columns.tolist() == ['var', 'val'] + + result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, + value_name=self.value_name) + assert result16.columns.tolist() == ['id1', 'var', 'val'] + + result17 = self.df.melt(id_vars=['id1', 'id2'], + var_name=self.var_name, + value_name=self.value_name) + assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] + + result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + var_name=self.var_name, + value_name=self.value_name) + assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] + + result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + var_name=self.var_name, + value_name=self.value_name) + expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A'] * 10 + ['B'] * 10, + self.value_name: (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', self.var_name, + self.value_name]) + tm.assert_frame_equal(result19, expected19) + + df20 = self.df.copy() + df20.columns.name = 'foo' + result20 = df20.melt() + assert result20.columns.tolist() == ['foo', 'value'] + + def test_col_level(self): + res1 = self.df1.melt(col_level=0) + res2 = self.df1.melt(col_level='CAP') + assert res1.columns.tolist() == ['CAP', 'value'] + assert res2.columns.tolist() == ['CAP', 'value'] + + def test_multiindex(self): + res = self.df1.melt() + assert res.columns.tolist() == ['CAP', 'low', 'value'] + + +class TestLreshape(object): + + def test_pairs(self): + data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009'], + 'visitdt2': + ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], + 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt1': [1823, 3338, 1549, 3298, 4306], + 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], + 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + + df = DataFrame(data) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + result = lreshape(df, spec) + + exp_data = {'birthdt': + ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, + 4133, 1766, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, + 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Male', + 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', '21jan2009', + '22jan2009', '31dec2008', '03feb2009', + '05feb2009', '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 
1549.0, 3298.0, 4306.0, 2011.0, + 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = {'birthdt': + ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', '08jan2009', '20dec2008', + '30dec2008', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, + 3139, 4133, 1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, + 101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', '21jan2009', nan, + '22jan2009', '31dec2008', '03feb2009', + '05feb2009', nan, nan, '02jan2009', + '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, + 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, + 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + pytest.raises(ValueError, lreshape, df, spec) + + +class TestWideToLong(object): + + def test_simple(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A1970": {0: "a", + 1: "b", + 2: "c"}, + "A1980": {0: "d", + 1: "e", + 2: "f"}, + "B1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_stubs(self): + # GH9204 + df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) + df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] + stubs = ['inc', 'edu'] + + # TODO: unused? 
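As an editorial aside (not part of the diff): `test_stubs` relies on `wide_to_long` matching each stub name against column prefixes and peeling the suffix off into the new `j` index level. A minimal, hedged sketch of that reshape, mirroring the frame shape used in `test_simple` above (illustrative data only):

```python
import pandas as pd

# Hypothetical frame with wide columns 'A1970'/'A1980'/'B1970'/'B1980'.
df = pd.DataFrame({'A1970': ['a', 'b'], 'A1980': ['d', 'e'],
                   'B1970': [2.5, 1.2], 'B1980': [3.2, 1.3]})
df['id'] = df.index

# Stubs 'A' and 'B' match the column prefixes; the numeric suffix
# ('1970'/'1980') becomes the new 'year' index level.
long_df = pd.wide_to_long(df, ['A', 'B'], i='id', j='year')
# long_df is indexed by ('id', 'year') and has columns 'A' and 'B'.
```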
+ df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa + + assert stubs == ['inc', 'edu'] + + def test_separating_character(self): + # GH14779 + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A.1970": {0: "a", + 1: "b", + 2: "c"}, + "A.1980": {0: "d", + 1: "e", + 2: "f"}, + "B.1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B.1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_escapable_characters(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A(quarterly)1970": {0: "a", + 1: "b", + 2: "c"}, + "A(quarterly)1980": {0: "d", + 1: "e", + 2: "f"}, + "B(quarterly)1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B(quarterly)1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index( + ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] + long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], + i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_unbalanced(self): + # test that we can have a varying amount of time variables + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], + 'A': [1.0, 3.0, 2.0, 4.0], + 'B': [5.0, np.nan, 6.0, np.nan], + 'id': [0, 0, 1, 1], + 'year': ['2010', '2011', '2010', '2011']} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_character_overlap(self): + # Test we handle overlapping characters in both id_vars and value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'BBBX': [91, 92, 93], + 'BBBZ': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'BBBX': [91, 92, 93, 91, 92, 93], + 'BBBZ': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['BBBX', 'BBBZ', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_separator(self): + # if an invalid separator is supplied a empty data frame is returned + sep = 'nope!' 
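A hedged sketch (illustrative only) of why the separator matters in the test below: `sep` is inserted between the stub and the suffix when matching column names, so a separator that matches nothing leaves no value columns and, as the assertions below expect for this version of pandas, an effectively empty result.

```python
import pandas as pd

df = pd.DataFrame({'A.2010': [1.0, 2.0], 'A.2011': [3.0, 4.0]})
df['id'] = df.index

# Matching separator: 'A.2010' splits into stub 'A' and suffix '2010'.
tidy = pd.wide_to_long(df, ['A'], i='id', j='year', sep='.')
assert not tidy.empty

# Non-matching separator: no wide columns are recognized, so the
# result has no rows (the behavior test_invalid_separator asserts).
empty = pd.wide_to_long(df, ['A'], i='id', j='year', sep='nope!')
assert empty.empty
```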
+ df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'A2010': [], + 'A2011': [], + 'B2010': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_num_string_disambiguation(self): + # Test that we can disambiguate number value_vars from + # string value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'Arating': [91, 92, 93], + 'Arating_old': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'Arating': [91, 92, 93, 91, 92, 93], + 'Arating_old': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['Arating', 'Arating_old', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_suffixtype(self): + # If all stubs names end with a string, but a numeric suffix is + # assumed, an empty data frame is returned + df = pd.DataFrame({'Aone': [1.0, 2.0], + 'Atwo': [3.0, 4.0], + 'Bone': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'Aone': [], + 'Atwo': [], + 'Bone': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_multiple_id_columns(self): + # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm + df = pd.DataFrame({ + 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + }) + exp_frame = pd.DataFrame({ + 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, + 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], + 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', + '2', '1', '2', '1', '2', '1', '2', '1', '2'] + }) + exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] + long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_non_unique_idvars(self): + # GH16382 + # Raise an error message if non unique id vars (i) are passed + df = pd.DataFrame({ + 'A_A1': [1, 2, 3, 4, 5], + 'B_B1': [1, 2, 3, 4, 5], + 'x': [1, 1, 1, 1, 1] + }) + with pytest.raises(ValueError): + wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') diff --git a/pandas/tests/reshape/test_merge.py 
b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..cd0701e3864fc 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -13,7 +13,10 @@ from pandas.core.reshape.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_object_dtype, +) from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -69,6 +72,15 @@ def test_merge_common(self): exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) + def test_merge_index_as_on_arg(self): + # GH14355 + + left = self.df.set_index('key1') + right = self.df2.set_index('key1') + result = merge(left, right, on='key1') + expected = merge(self.df, self.df2, on='key1').set_index('key1') + assert_frame_equal(result, expected) + def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) @@ -861,6 +873,12 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + def test_merge_two_empty_df_no_division_error(self): + # GH17776, PR #17846 + a = pd.DataFrame({'a': [], 'b': [], 'c': []}) + with np.errstate(divide='raise'): + merge(a, a, on=('a', 'b')) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: @@ -1408,6 +1426,42 @@ def test_join_multi_dtypes(self, d1, d2): expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ + ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), + ]) + def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): + # GH 16572 + # Check that float column is not cast to object if + # merging on float and int columns + A = DataFrame({'X': int_vals}) + B = DataFrame({'Y': float_vals}) + expected = DataFrame(exp_vals) + + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + + def test_merge_on_ints_floats_warning(self): + # GH 16572 + # merge will produce a warning when merging on int and + # float columns where the float values are not exactly + # equal to their int representation + A = DataFrame({'X': [1, 2, 3]}) + B = DataFrame({'Y': [1.1, 2.5, 3.0]}) + expected = DataFrame({'X': [3], 'Y': [3.0]}) + + with tm.assert_produces_warning(UserWarning): + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + @pytest.fixture def left(): diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py new file mode 100644 index 0000000000000..4c638f8e441fa --- /dev/null +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -0,0 +1,215 @@ +import numpy as np +import pytest + +from pandas import DataFrame +from pandas.util import testing 
as tm +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture +def df1(): + return DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11))) + + +@pytest.fixture +def df2(): + return DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12))) + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def left_df(request, df1): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + if levels: + df1 = df1.set_index(levels) + + return df1 + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def right_df(request, df2): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + + if levels: + df2 = df2.set_index(levels) + + return df2 + + +def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + +@pytest.mark.parametrize('on,how', + [(['outer'], 'inner'), + (['inner'], 'left'), + (['outer', 'inner'], 'right'), + (['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_on(left_df, right_df, on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = 
left_df.merge(right_df, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_on,right_on,how', + [(['outer'], ['outer'], 'inner'), + (['inner'], ['inner'], 'right'), + (['outer', 'inner'], ['outer', 'inner'], 'left'), + (['inner', 'outer'], ['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_lefton_righton( + left_df, right_df, left_on, right_on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = left_df.merge(right_df, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_index', + ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize('how', + ['inner', 'left', 'right', 'outer']) +def test_join_indexes_and_columns_on(df1, df2, left_index, how): + + # Construct left_df + left_df = df1.set_index(left_index) + + # Construct right_df + right_df = df2.set_index(['outer', 'inner']) + + # Result + expected = (left_df.reset_index() + .join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + .set_index(left_index)) + + # Perform join + result = left_df.join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + + assert_frame_equal(result, expected, check_like=True) + + +def test_merge_index_column_precedence(df1, df2): + + # Construct left_df with both an index and a column named 'outer'. + # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + left_df = df1.set_index('outer') + left_df['outer'] = left_df['inner'] + + # Construct right_df with an index level named 'outer' + right_df = df2.set_index('outer') + + # Construct expected result. 
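To make the feature these parametrized tests cover concrete, here is a minimal, hypothetical sketch (names invented for illustration) of the GH 14355 behavior under test: merging on a name that is an index level on one side and a column on the other.

```python
import pandas as pd

left = pd.DataFrame({'key': ['a', 'b', 'c'],
                     'lval': [1, 2, 3]}).set_index('key')
right = pd.DataFrame({'key': ['b', 'c', 'd'], 'rval': [4, 5, 6]})

# 'key' names an index level on the left and a column on the right;
# merge resolves the name on each side independently.
result = left.merge(right, on='key')
```

When a name exists both as a column and as an index level on the same frame, the column takes precedence and a FutureWarning is raised, which is exactly what the surrounding `test_merge_index_column_precedence` asserts.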
+ # The 'outer' column from left_df is chosen and the resulting + # frame has no index levels + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) + + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + + # Perform the same using the left_on and right_on parameters + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) + + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_merge_ordered.py b/pandas/tests/reshape/test_merge_ordered.py index 9b1806ee52c1d..a4c8793cc0ade 100644 --- a/pandas/tests/reshape/test_merge_ordered.py +++ b/pandas/tests/reshape/test_merge_ordered.py @@ -6,7 +6,7 @@ from numpy import nan -class TestOrderedMerge(object): +class TestMergeOrdered(object): def setup_method(self, method): self.left = DataFrame({'key': ['a', 'c', 'e'], @@ -15,13 +15,6 @@ def setup_method(self, method): self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3., 4]}) - def test_deprecation(self): - - with tm.assert_produces_warning(FutureWarning): - pd.ordered_merge(self.left, self.right, on='key') - - # GH #813 - def test_basic(self): result = merge_ordered(self.left, self.right, on='key') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 5d4aa048ae303..3e68ff7cf2f59 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -11,209 +11,9 @@ from pandas.util.testing import assert_frame_equal -from pandas.core.reshape.reshape import get_dummies -from pandas.core.reshape.melt import melt, lreshape, wide_to_long +from pandas import get_dummies import pandas.util.testing as tm -from pandas.compat import range, u - - -class TestMelt(object): - - def setup_method(self, method): - self.df = tm.makeTimeDataFrame()[:10] - self.df['id1'] = (self.df['A'] > 0).astype(np.int64) - self.df['id2'] = (self.df['B'] > 0).astype(np.int64) - - self.var_name = 'var' - self.value_name = 'val' - - self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 - ], [-1.321405, 0.368915, -1.055342], - [-0.807333, 0.08298, -0.873361]]) - self.df1.columns = [list('ABC'), list('abc')] - self.df1.columns.names = ['CAP', 'low'] - - def test_top_level_method(self): - result = melt(self.df) - assert result.columns.tolist() == ['variable', 'value'] - - def test_method_signatures(self): - tm.assert_frame_equal(self.df.melt(), - melt(self.df)) - - tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], - value_vars=['A', 'B']), - melt(self.df, - id_vars=['id1', 'id2'], - value_vars=['A', 'B'])) - - tm.assert_frame_equal(self.df.melt(var_name=self.var_name, - value_name=self.value_name), - melt(self.df, - var_name=self.var_name, - value_name=self.value_name)) - - tm.assert_frame_equal(self.df1.melt(col_level=0), - melt(self.df1, col_level=0)) - - def test_default_col_names(self): - result = self.df.melt() - assert result.columns.tolist() == ['variable', 'value'] - - result1 = self.df.melt(id_vars=['id1']) - assert result1.columns.tolist() == ['id1', 'variable', 
'value'] - - result2 = self.df.melt(id_vars=['id1', 'id2']) - assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] - - def test_value_vars(self): - result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') - assert len(result3) == 10 - - result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) - expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) - tm.assert_frame_equal(result4, expected4) - - def test_value_vars_types(self): - # GH 15348 - expected = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) - - for type_ in (tuple, list, np.array): - result = self.df.melt(id_vars=['id1', 'id2'], - value_vars=type_(('A', 'B'))) - tm.assert_frame_equal(result, expected) - - def test_vars_work_with_multiindex(self): - expected = DataFrame({ - ('A', 'a'): self.df1[('A', 'a')], - 'CAP': ['B'] * len(self.df1), - 'low': ['b'] * len(self.df1), - 'value': self.df1[('B', 'b')], - }, columns=[('A', 'a'), 'CAP', 'low', 'value']) - - result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) - tm.assert_frame_equal(result, expected) - - def test_tuple_vars_fail_with_multiindex(self): - # melt should fail with an informative error message if - # the columns have a MultiIndex and a tuple is passed - # for id_vars or value_vars. - tuple_a = ('A', 'a') - list_a = [tuple_a] - tuple_b = ('B', 'b') - list_b = [tuple_b] - - for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), - (tuple_a, tuple_b)): - with tm.assert_raises_regex(ValueError, r'MultiIndex'): - self.df1.melt(id_vars=id_vars, value_vars=value_vars) - - def test_custom_var_name(self): - result5 = self.df.melt(var_name=self.var_name) - assert result5.columns.tolist() == ['var', 'value'] - - result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) - assert result6.columns.tolist() == ['id1', 'var', 'value'] - - result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) - assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name) - assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name) - expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, 'value']) - tm.assert_frame_equal(result9, expected9) - - def test_custom_value_name(self): - result10 = self.df.melt(value_name=self.value_name) - assert result10.columns.tolist() == ['variable', 'val'] - - result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) - assert result11.columns.tolist() == ['id1', 'variable', 'val'] - - result12 = self.df.melt(id_vars=['id1', 'id2'], - value_name=self.value_name) - assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - value_name=self.value_name) - assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result14 = 
self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - value_name=self.value_name) - expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', - self.value_name]) - tm.assert_frame_equal(result14, expected14) - - def test_custom_var_and_value_name(self): - - result15 = self.df.melt(var_name=self.var_name, - value_name=self.value_name) - assert result15.columns.tolist() == ['var', 'val'] - - result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, - value_name=self.value_name) - assert result16.columns.tolist() == ['id1', 'var', 'val'] - - result17 = self.df.melt(id_vars=['id1', 'id2'], - var_name=self.var_name, - value_name=self.value_name) - assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name, - value_name=self.value_name) - assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name, - value_name=self.value_name) - expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, - self.value_name]) - tm.assert_frame_equal(result19, expected19) - - df20 = self.df.copy() - df20.columns.name = 'foo' - result20 = df20.melt() - assert result20.columns.tolist() == ['foo', 'value'] - - def test_col_level(self): - res1 = self.df1.melt(col_level=0) - res2 = self.df1.melt(col_level='CAP') - assert res1.columns.tolist() == ['CAP', 'value'] - assert res2.columns.tolist() == ['CAP', 'value'] - - def test_multiindex(self): - res = self.df1.melt() - assert res.columns.tolist() == ['CAP', 'low', 'value'] +from pandas.compat import u class TestGetDummies(object): @@ -673,327 +473,3 @@ def test_preserve_categorical_dtype(self): result = make_axis_dummies(df, transform=lambda x: x) tm.assert_frame_equal(result, expected) - - -class TestLreshape(object): - - def test_pairs(self): - data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009'], - 'visitdt2': - ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], - 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], - 'wt1': [1823, 3338, 1549, 3298, 4306], - 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], - 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} - - df = DataFrame(data) - - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], - 'wt': ['wt%d' % i for i in range(1, 4)]} - result = lreshape(df, spec) - - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, - 4133, 1766, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, - 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Male', - 'Female', 'Female'], - 'visitdt': ['11jan2009', 
'22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', '02jan2009', '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, - 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} - exp = DataFrame(exp_data, columns=result.columns) - tm.assert_frame_equal(result, exp) - - result = lreshape(df, spec, dropna=False) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '20dec2008', '30dec2008', - '21dec2008', '11jan2009', '08jan2009', '20dec2008', - '30dec2008', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, - 3139, 4133, 1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, - 101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', nan, - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', nan, nan, '02jan2009', - '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, - 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, - 4805.0]} - exp = DataFrame(exp_data, columns=result.columns) - tm.assert_frame_equal(result, exp) - - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], - 'wt': ['wt%d' % i for i in range(1, 4)]} - pytest.raises(ValueError, lreshape, df, spec) - - -class TestWideToLong(object): - - def test_simple(self): - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A1970": {0: "a", - 1: "b", - 2: "c"}, - "A1980": {0: "d", - 1: "e", - 2: "f"}, - "B1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ["A", "B"], i="id", j="year") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_stubs(self): - # GH9204 - df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) - df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] - stubs = ['inc', 'edu'] - - # TODO: unused? 
- df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa - - assert stubs == ['inc', 'edu'] - - def test_separating_character(self): - # GH14779 - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A.1970": {0: "a", - 1: "b", - 2: "c"}, - "A.1980": {0: "d", - 1: "e", - 2: "f"}, - "B.1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B.1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_escapable_characters(self): - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A(quarterly)1970": {0: "a", - 1: "b", - 2: "c"}, - "A(quarterly)1980": {0: "d", - 1: "e", - 2: "f"}, - "B(quarterly)1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B(quarterly)1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], - "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index( - ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] - long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], - i="id", j="year") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_unbalanced(self): - # test that we can have a varying amount of time variables - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], - 'A': [1.0, 3.0, 2.0, 4.0], - 'B': [5.0, np.nan, 6.0, np.nan], - 'id': [0, 0, 1, 1], - 'year': ['2010', '2011', '2010', '2011']} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') - tm.assert_frame_equal(long_frame, exp_frame) - - def test_character_overlap(self): - # Test we handle overlapping characters in both id_vars and value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'BBBX': [91, 92, 93], - 'BBBZ': [91, 92, 93] - }) - df['id'] = df.index - exp_frame = pd.DataFrame({ - 'BBBX': [91, 92, 93, 91, 92, 93], - 'BBBZ': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': ['11', '11', '11', '12', '12', '12']}) - exp_frame = exp_frame.set_index(['id', 'year'])[ - ['BBBX', 'BBBZ', 'A', 'B', 'BB']] - long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_invalid_separator(self): - # if an invalid separator is supplied a empty data frame is returned - sep = 'nope!' 
- df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'A2010': [], - 'A2011': [], - 'B2010': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[[ - 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] - exp_frame.index.set_levels([[0, 1], []], inplace=True) - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_num_string_disambiguation(self): - # Test that we can disambiguate number value_vars from - # string value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'Arating': [91, 92, 93], - 'Arating_old': [91, 92, 93] - }) - df['id'] = df.index - exp_frame = pd.DataFrame({ - 'Arating': [91, 92, 93, 91, 92, 93], - 'Arating_old': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': ['11', '11', '11', '12', '12', '12']}) - exp_frame = exp_frame.set_index(['id', 'year'])[ - ['Arating', 'Arating_old', 'A', 'B', 'BB']] - long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_invalid_suffixtype(self): - # If all stubs names end with a string, but a numeric suffix is - # assumed, an empty data frame is returned - df = pd.DataFrame({'Aone': [1.0, 2.0], - 'Atwo': [3.0, 4.0], - 'Bone': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'Aone': [], - 'Atwo': [], - 'Bone': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[[ - 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] - exp_frame.index.set_levels([[0, 1], []], inplace=True) - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_multiple_id_columns(self): - # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm - df = pd.DataFrame({ - 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - }) - exp_frame = pd.DataFrame({ - 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, - 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], - 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], - 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], - 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', - '2', '1', '2', '1', '2', '1', '2', '1', '2'] - }) - exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] - long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') - tm.assert_frame_equal(long_frame, exp_frame) - - def test_non_unique_idvars(self): - # GH16382 - # Raise an error message if non unique id vars (i) are passed - df = pd.DataFrame({ - 'A_A1': [1, 2, 3, 4, 5], - 'B_B1': [1, 2, 3, 4, 5], - 'x': [1, 1, 1, 1, 1] - }) - with pytest.raises(ValueError): - wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') diff --git a/pandas/tests/scalar/test_period.py 
b/pandas/tests/scalar/test_period.py index 8cfdf7a461879..eb6363689cca0 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -10,7 +10,8 @@ from pandas.compat import text_type, iteritems from pandas.compat.numpy import np_datetime64_compat -from pandas._libs import tslib, period as libperiod +from pandas._libs import tslib +from pandas._libs.tslibs import period as libperiod from pandas._libs.tslibs.parsing import DateParseError from pandas import Period, Timestamp, offsets from pandas._libs.tslibs.resolution import DAYS, _MONTHS as MONTHS @@ -1038,6 +1039,29 @@ def test_add_raises(self): with tm.assert_raises_regex(TypeError, msg): dt1 + dt2 + boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] + + @pytest.mark.parametrize('lbox', boxes) + @pytest.mark.parametrize('rbox', boxes) + def test_add_timestamp_raises(self, rbox, lbox): + # GH # 17983 + ts = pd.Timestamp('2017') + per = pd.Period('2017', freq='M') + + # We may get a different message depending on which class raises + # the error. + msg = (r"cannot add|unsupported operand|" + r"can only operate on a|incompatible type|" + r"ufunc add cannot use operands") + with tm.assert_raises_regex(TypeError, msg): + lbox(ts) + rbox(per) + + with tm.assert_raises_regex(TypeError, msg): + lbox(per) + rbox(ts) + + with tm.assert_raises_regex(TypeError, msg): + lbox(per) + rbox(per) + def test_sub(self): dt1 = Period('2011-01-01', freq='D') dt2 = Period('2011-01-15', freq='D') diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 17c818779c76d..001f6c1fdbef4 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -15,6 +15,28 @@ class TestTimedeltaArithmetic(object): _multiprocess_can_split_ = True + def test_arithmetic_overflow(self): + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') + + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) + + def test_ops_error_str(self): + # GH 13624 + td = Timedelta('1 day') + + for left, right in [(td, 'a'), ('a', td)]: + + with pytest.raises(TypeError): + left + right + + with pytest.raises(TypeError): + left > right + + assert not left == right + assert left != right + def test_to_timedelta_on_nanoseconds(self): # GH 9273 result = Timedelta(nanoseconds=100) @@ -93,38 +115,53 @@ def test_ops_offsets(self): assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td - # TODO: Split by op, better name - def test_ops(self): + def test_unary_ops(self): td = Timedelta(10, unit='d') + + # __neg__, __pos__ assert -td == Timedelta(-10, unit='d') + assert -td == Timedelta('-10d') assert +td == Timedelta(10, unit='d') - assert td - td == Timedelta(0, unit='ns') + + # __abs__, __abs__(__neg__) + assert abs(td) == td + assert abs(-td) == td + assert abs(-td) == Timedelta('10d') + + def test_binary_ops_nat(self): + td = Timedelta(10, unit='d') + assert (td - pd.NaT) is pd.NaT - assert td + td == Timedelta(20, unit='d') assert (td + pd.NaT) is pd.NaT - assert td * 2 == Timedelta(20, unit='d') assert (td * pd.NaT) is pd.NaT - assert td / 2 == Timedelta(5, unit='d') - assert td // 2 == Timedelta(5, unit='d') - assert abs(td) == td - assert abs(-td) == td - assert td / td == 1 assert (td / pd.NaT) is np.nan assert (td // pd.NaT) is np.nan + def test_binary_ops_integers(self): + td = Timedelta(10, unit='d') + + assert td * 2 == 
Timedelta(20, unit='d') + assert td / 2 == Timedelta(5, unit='d') + assert td // 2 == Timedelta(5, unit='d') + # invert - assert -td == Timedelta('-10d') assert td * -1 == Timedelta('-10d') assert -1 * td == Timedelta('-10d') - assert abs(-td) == Timedelta('10d') - - # invalid multiply with another timedelta - pytest.raises(TypeError, lambda: td * td) # can't operate with integers pytest.raises(TypeError, lambda: td + 2) pytest.raises(TypeError, lambda: td - 2) + def test_binary_ops_with_timedelta(self): + td = Timedelta(10, unit='d') + + assert td - td == Timedelta(0, unit='ns') + assert td + td == Timedelta(20, unit='d') + assert td / td == 1 + + # invalid multiply with another timedelta + pytest.raises(TypeError, lambda: td * td) + class TestTimedeltas(object): _multiprocess_can_split_ = True @@ -733,14 +770,6 @@ def test_timedelta_arithmetic(self): tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) - def test_arithmetic_overflow(self): - - with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') - - with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) - def test_apply_to_timedelta(self): timedelta_NaT = pd.to_timedelta('NaT') @@ -803,18 +832,3 @@ def test_isoformat(self): result = Timedelta(minutes=1).isoformat() expected = 'P0DT0H1M0S' assert result == expected - - def test_ops_error_str(self): - # GH 13624 - td = Timedelta('1 day') - - for l, r in [(td, 'a'), ('a', td)]: - - with pytest.raises(TypeError): - l + r - - with pytest.raises(TypeError): - l > r - - assert not l == r - assert l != r diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 545ed7f1ebbf3..e23911e8d2003 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -16,9 +16,8 @@ import pandas.util.testing as tm from pandas.tseries import offsets, frequencies -from pandas._libs import period -from pandas._libs.tslibs.timezones import get_timezone -from pandas._libs.tslibs import conversion +from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz +from pandas._libs.tslibs import conversion, period from pandas.compat import long, PY3 from pandas.util.testing import assert_series_equal @@ -46,8 +45,35 @@ def test_overflow_offset(self): with pytest.raises(OverflowError): stamp - offset + def test_delta_preserve_nanos(self): + val = Timestamp(long(1337299200000000123)) + result = val + timedelta(1) + assert result.nanosecond == val.nanosecond -class TestTimestamp(object): + +class TestTimestampProperties(object): + + def test_properties_business(self): + ts = Timestamp('2017-10-01', freq='B') + control = Timestamp('2017-10-01') + assert ts.dayofweek == 6 + assert not ts.is_month_start # not a weekday + assert not ts.is_quarter_start # not a weekday + # Control case: non-business is month/qtr start + assert control.is_month_start + assert control.is_quarter_start + + ts = Timestamp('2017-09-30', freq='B') + control = Timestamp('2017-09-30') + assert ts.dayofweek == 5 + assert not ts.is_month_end # not a weekday + assert not ts.is_quarter_end # not a weekday + # Control case: non-business is month/qtr start + assert control.is_month_end + assert control.is_quarter_end + + +class TestTimestampConstructors(object): def test_constructor(self): base_str = '2014-07-01 09:00' @@ -269,6 +295,17 @@ def test_constructor_fromordinal(self): assert Timestamp('2000-01-01', tz='US/Eastern') == ts assert 
base.toordinal() == ts.toordinal() + # GH#3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + assert ts.to_pydatetime() == dt + + # with a tzinfo + stamp = Timestamp('2011-4-16', tz='US/Eastern') + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') + assert ts.to_pydatetime() == dt_tz + def test_constructor_offset_depr(self): # see gh-12160 with tm.assert_produces_warning(FutureWarning, @@ -299,6 +336,9 @@ def test_constructor_offset_depr_fromordinal(self): with tm.assert_raises_regex(TypeError, msg): Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + +class TestTimestamp(object): + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') @@ -314,57 +354,52 @@ def test_conversion(self): assert type(result) == type(expected) assert result.dtype == expected.dtype - def test_repr(self): - dates = ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001'] - + @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) + @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', + '2014-01-01 00:00:00.000000001']) + def test_repr(self, date, freq): # dateutil zone change (only matters for repr) - if (dateutil.__version__ >= LooseVersion('2.3') and - (dateutil.__version__ <= LooseVersion('2.4.0') or - dateutil.__version__ >= LooseVersion('2.6.0'))): + if dateutil.__version__ >= LooseVersion('2.6.0'): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] else: timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles'] - freqs = ['D', 'M', 'S', 'N'] - - for date in dates: - for tz in timezones: - for freq in freqs: - - # avoid to match with timezone name - freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') - else: - tz_repr = tz - - date_only = Timestamp(date) - assert date in repr(date_only) - assert tz_repr not in repr(date_only) - assert freq_repr not in repr(date_only) - assert date_only == eval(repr(date_only)) - - date_tz = Timestamp(date, tz=tz) - assert date in repr(date_tz) - assert tz_repr in repr(date_tz) - assert freq_repr not in repr(date_tz) - assert date_tz == eval(repr(date_tz)) - - date_freq = Timestamp(date, freq=freq) - assert date in repr(date_freq) - assert tz_repr not in repr(date_freq) - assert freq_repr in repr(date_freq) - assert date_freq == eval(repr(date_freq)) - - date_tz_freq = Timestamp(date, tz=tz, freq=freq) - assert date in repr(date_tz_freq) - assert tz_repr in repr(date_tz_freq) - assert freq_repr in repr(date_tz_freq) - assert date_tz_freq == eval(repr(date_tz_freq)) + for tz in timezones: + # avoid to match with timezone name + freq_repr = "'{0}'".format(freq) + if tz.startswith('dateutil'): + tz_repr = tz.replace('dateutil', '') + else: + tz_repr = tz + + date_only = Timestamp(date) + assert date in repr(date_only) + assert tz_repr not in repr(date_only) + assert freq_repr not in repr(date_only) + assert date_only == eval(repr(date_only)) + + date_tz = Timestamp(date, tz=tz) + assert date in repr(date_tz) + assert tz_repr in repr(date_tz) + assert freq_repr not in repr(date_tz) + assert date_tz == eval(repr(date_tz)) + + date_freq = Timestamp(date, freq=freq) + assert date in repr(date_freq) + assert tz_repr not in repr(date_freq) + assert freq_repr in repr(date_freq) + assert date_freq == eval(repr(date_freq)) + + date_tz_freq = Timestamp(date, tz=tz, freq=freq) + assert date in repr(date_tz_freq) + assert tz_repr in repr(date_tz_freq) + assert freq_repr 
in repr(date_tz_freq) + assert date_tz_freq == eval(repr(date_tz_freq)) + + def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) @@ -375,6 +410,16 @@ def test_repr(self): 'pytz.FixedOffset(-240)') assert date_with_utc_offset == eval(expr) + def test_timestamp_repr_pre1900(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + assert iso8601 in result + def test_bounds_with_different_units(self): out_of_bounds_dates = ('1677-09-21', '2262-04-12', ) @@ -453,32 +498,34 @@ def test_tz_localize_errors_ambiguous(self): pytest.raises(AmbiguousTimeError, ts.tz_localize, 'US/Pacific', errors='coerce') - def test_tz_localize_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t) - localized = ts.tz_localize(tz) - assert localized == Timestamp(t, tz=tz) - - with pytest.raises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - assert reset == ts - assert reset.tzinfo is None - - def test_tz_convert_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t, tz='UTC') - converted = ts.tz_convert(tz) - - reset = converted.tz_convert(None) - assert reset == Timestamp(t) - assert reset.tzinfo is None - assert reset == converted.tz_convert('UTC').tz_localize(None) + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + def test_tz_localize_roundtrip(self, tz): + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t) + localized = ts.tz_localize(tz) + assert localized == Timestamp(t, tz=tz) + + with pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + assert reset == ts + assert reset.tzinfo is None + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + def test_tz_convert_roundtrip(self, tz): + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t, tz='UTC') + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + assert reset == Timestamp(t) + assert reset.tzinfo is None + assert reset == converted.tz_convert('UTC').tz_localize(None) def test_barely_oob_dts(self): one_us = np.timedelta64(1).astype('timedelta64[us]') @@ -885,6 +932,51 @@ def test_roundtrip(self): assert result == Timestamp(str(base) + ".200005") assert result.microsecond == 5 + 200 * 1000 + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1): 5} + stamp = Timestamp(datetime(2011, 1, 1)) + assert d[stamp] == 5 + + @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']) + def test_is_leap_year(self, tz): + # GH 13727 + dt = Timestamp('2000-01-01 00:00:00', tz=tz) + assert dt.is_leap_year + assert isinstance(dt.is_leap_year, bool) + + dt = Timestamp('1999-01-01 00:00:00', tz=tz) + assert not dt.is_leap_year + + dt = Timestamp('2004-01-01 00:00:00', tz=tz) + assert dt.is_leap_year + + dt = Timestamp('2100-01-01 00:00:00', tz=tz) + assert not 
dt.is_leap_year + + def test_timestamp(self): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = Timestamp.now() + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') + utsc = tsc.tz_convert('UTC') + + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + if PY3: + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() + + +class TestTimestampComparison(object): def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = long(1337299200000000000) @@ -916,7 +1008,6 @@ def test_comparison(self): assert other >= val def test_compare_invalid(self): - # GH 8058 val = Timestamp('20130101 12:01:02') assert not val == 'foo' @@ -1007,16 +1098,6 @@ def test_cant_compare_tz_naive_w_aware_dateutil(self): assert not a == b.to_pydatetime() assert not a.to_pydatetime() == b - def test_delta_preserve_nanos(self): - val = Timestamp(long(1337299200000000123)) - result = val + timedelta(1) - assert result.nanosecond == val.nanosecond - - def test_hash_equivalent(self): - d = {datetime(2011, 1, 1): 5} - stamp = Timestamp(datetime(2011, 1, 1)) - assert d[stamp] == 5 - def test_timestamp_compare_scalars(self): # case where ndim == 0 lhs = np.datetime64(datetime(2013, 12, 6)) @@ -1077,43 +1158,20 @@ def test_timestamp_compare_series(self): result = right_f(Timestamp('nat'), s_nat) tm.assert_series_equal(result, expected) - def test_is_leap_year(self): - # GH 13727 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - dt = Timestamp('2000-01-01 00:00:00', tz=tz) - assert dt.is_leap_year - assert isinstance(dt.is_leap_year, bool) - - dt = Timestamp('1999-01-01 00:00:00', tz=tz) - assert not dt.is_leap_year - - dt = Timestamp('2004-01-01 00:00:00', tz=tz) - assert dt.is_leap_year - - dt = Timestamp('2100-01-01 00:00:00', tz=tz) - assert not dt.is_leap_year - - def test_timestamp(self): - # GH#17329 - # tz-naive --> treat it as if it were UTC for purposes of timestamp() - ts = Timestamp.now() - uts = ts.replace(tzinfo=utc) - assert ts.timestamp() == uts.timestamp() - - tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') - utsc = tsc.tz_convert('UTC') - - # utsc is a different representation of the same time - assert tsc.timestamp() == utsc.timestamp() - - if PY3: - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + def test_timestamp_compare_with_early_datetime(self): + # e.g. 
datetime.min + stamp = Timestamp('2012-01-01') - # should agree with datetime.timestamp method - dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() + assert not stamp == datetime.min + assert not stamp == datetime(1600, 1, 1) + assert not stamp == datetime(2700, 1, 1) + assert stamp != datetime.min + assert stamp != datetime(1600, 1, 1) + assert stamp != datetime(2700, 1, 1) + assert stamp > datetime(1600, 1, 1) + assert stamp >= datetime(1600, 1, 1) + assert stamp < datetime(2700, 1, 1) + assert stamp <= datetime(2700, 1, 1) class TestTimestampNsOperations(object): @@ -1260,7 +1318,9 @@ def test_addition_subtraction_preserve_frequency(self): assert (timestamp_instance - timedelta64_instance).freq == original_freq - def test_resolution(self): + @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Eastern']) + def test_resolution(self, tz): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], @@ -1269,12 +1329,9 @@ def test_resolution(self): RESO_HR, RESO_MIN, RESO_SEC, RESO_MS, RESO_US]): - for tz in [None, 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Eastern']: - idx = date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) - result = period.resolution(idx.asi8, idx.tz) - assert result == expected + idx = date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) + result = period.resolution(idx.asi8, idx.tz) + assert result == expected class TestTimestampToJulianDate(object): @@ -1300,8 +1357,7 @@ def test_compare_hour13(self): assert r == 2451769.0416666666666666 -class TestTimeSeries(object): - +class TestTimestampConversion(object): def test_timestamp_to_datetime(self): stamp = Timestamp('20090415', tz='US/Eastern', freq='D') dtval = stamp.to_pydatetime() @@ -1323,53 +1379,30 @@ def test_timestamp_to_datetime_explicit_pytz(self): def test_timestamp_to_datetime_explicit_dateutil(self): tm._skip_if_windows_python_3() - from pandas._libs.tslibs.timezones import dateutil_gettz as gettz stamp = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo - def test_timestamp_date_out_of_range(self): - pytest.raises(ValueError, Timestamp, '1676-01-01') - pytest.raises(ValueError, Timestamp, '2263-01-01') - - def test_timestamp_repr(self): - # pre-1900 - stamp = Timestamp('1850-01-01', tz='US/Eastern') - repr(stamp) - - iso8601 = '1850-01-01 01:23:45.012345' - stamp = Timestamp(iso8601, tz='US/Eastern') - result = repr(stamp) - assert iso8601 in result - - def test_timestamp_from_ordinal(self): + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. 
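Context for the new test above: datetime.datetime stores only microseconds, so Timestamp.to_pydatetime() can lose at most the sub-microsecond remainder, and the assertions below check that the round trip agrees once both nanosecond values are floored to microseconds. A minimal standalone sketch of the same contract — illustrative only, not part of the patch, using only public pandas and stdlib APIs:

import warnings
import pandas as pd

ts = pd.Timestamp('2000-01-01 00:00:00.000000001')  # one nanosecond past midnight
with warnings.catch_warnings():
    # to_pydatetime() emits a UserWarning when it discards nonzero nanoseconds
    warnings.simplefilter('ignore', UserWarning)
    dt = ts.to_pydatetime()                         # microsecond resolution only

# Flooring both nanosecond values to microseconds recovers equality.
assert pd.Timestamp(dt).value // 1000 == ts.value // 1000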
+ exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == + Timestamp.max.value / 1000) - # GH 3042 - dt = datetime(2011, 4, 16, 0, 0) - ts = Timestamp.fromordinal(dt.toordinal()) - assert ts.to_pydatetime() == dt + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == + Timestamp.min.value / 1000) - # with a tzinfo - stamp = Timestamp('2011-4-16', tz='US/Eastern') - dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') - assert ts.to_pydatetime() == dt_tz - def test_timestamp_compare_with_early_datetime(self): - # e.g. datetime.min - stamp = Timestamp('2012-01-01') +class TestTimeSeries(object): - assert not stamp == datetime.min - assert not stamp == datetime(1600, 1, 1) - assert not stamp == datetime(2700, 1, 1) - assert stamp != datetime.min - assert stamp != datetime(1600, 1, 1) - assert stamp != datetime(2700, 1, 1) - assert stamp > datetime(1600, 1, 1) - assert stamp >= datetime(1600, 1, 1) - assert stamp < datetime(2700, 1, 1) - assert stamp <= datetime(2700, 1, 1) + def test_timestamp_date_out_of_range(self): + pytest.raises(ValueError, Timestamp, '1676-01-01') + pytest.raises(ValueError, Timestamp, '2263-01-01') def test_timestamp_equality(self): @@ -1462,16 +1495,3 @@ def test_min_valid(self): def test_max_valid(self): # Ensure that Timestamp.max is a valid Timestamp Timestamp(Timestamp.max) - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. 
- exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == - Timestamp.max.value / 1000) - - exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == - Timestamp.min.value / 1000) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ff788fb2347b8..cfc319da1598d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -848,6 +848,12 @@ def test_value_counts_nunique(self): result = series.nunique() assert result == 11 + # GH 18051 + s = pd.Series(pd.Categorical([])) + assert s.nunique() == 0 + s = pd.Series(pd.Categorical([np.nan])) + assert s.nunique() == 0 + def test_unique(self): # 714 also, dtype=float @@ -873,6 +879,14 @@ def test_unique(self): expected = np.array([1, 2, 3, None], dtype=object) tm.assert_numpy_array_equal(result, expected) + # GH 18051 + s = pd.Series(pd.Categorical([])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([]), + check_dtype=False) + s = pd.Series(pd.Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), + check_dtype=False) + @pytest.mark.parametrize( "tc1, tc2", [ diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index d0693984689a6..8899ab585d6cb 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -377,6 +377,14 @@ def test_map(self): exp = Series([np.nan, 'B', 'C', 'D']) tm.assert_series_equal(a.map(c), exp) + @pytest.mark.parametrize("index", tm.all_index_generator(10)) + def test_map_empty(self, index): + s = Series(index) + result = s.map({}) + + expected = pd.Series(np.nan, index=s.index) + tm.assert_series_equal(result, expected) + def test_map_compat(self): # related GH 8024 s = Series([True, True, False], index=[1, 2, 3]) @@ -422,8 +430,10 @@ def test_map_dict_with_tuple_keys(self): converted to a multi-index, preventing tuple values from being mapped properly. 
""" + # GH 18496 df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df['labels'] = df['a'].map(label_mappings) df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) # All labels should be filled now diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 71ac00975af03..6cf60e818c845 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self): # categorical assert pd.concat([Series(dtype='category'), Series(dtype='category')]).dtype == 'category' - assert pd.concat([Series(dtype='category'), + # GH 18515 + assert pd.concat([Series(np.array([]), dtype='category'), Series(dtype='float64')]).dtype == 'float64' assert pd.concat([Series(dtype='category'), Series(dtype='object')]).dtype == 'object' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index e62b19294a07b..c814cade77e5c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -4,6 +4,7 @@ import pytest from datetime import datetime, timedelta +from collections import OrderedDict from numpy import nan import numpy as np @@ -79,17 +80,42 @@ def test_constructor(self): m = MultiIndex.from_arrays([[1, 2], [3, 4]]) pytest.raises(NotImplementedError, Series, m) - def test_constructor_empty(self): + @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) + def test_constructor_empty(self, input_class): empty = Series() - empty2 = Series([]) + empty2 = Series(input_class()) - # the are Index() and RangeIndex() which don't compare type equal + # these are Index() and RangeIndex() which don't compare type equal # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) - empty = Series(index=lrange(10)) - empty2 = Series(np.nan, index=lrange(10)) - assert_series_equal(empty, empty2) + # With explicit dtype: + empty = Series(dtype='float64') + empty2 = Series(input_class(), dtype='float64') + assert_series_equal(empty, empty2, check_index_type=False) + + # GH 18515 : with dtype=category: + empty = Series(dtype='category') + empty2 = Series(input_class(), dtype='category') + assert_series_equal(empty, empty2, check_index_type=False) + + if input_class is not list: + # With index: + empty = Series(index=lrange(10)) + empty2 = Series(input_class(), index=lrange(10)) + assert_series_equal(empty, empty2) + + # With index and dtype float64: + empty = Series(np.nan, index=lrange(10)) + empty2 = Series(input_class(), index=lrange(10), dtype='float64') + assert_series_equal(empty, empty2) + + @pytest.mark.parametrize('input_arg', [np.nan, float('nan')]) + def test_constructor_nan(self, input_arg): + empty = Series(dtype='float64', index=lrange(10)) + empty2 = Series(input_arg, index=lrange(10)) + + assert_series_equal(empty, empty2, check_index_type=False) def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] @@ -251,7 +277,7 @@ def test_constructor_maskedarray(self): def test_series_ctor_plus_datetimeindex(self): rng = date_range('20090415', '20090519', freq='B') - data = dict((k, 1) for k in rng) + data = {k: 1 for k in rng} result = Series(data, index=rng) assert result.index is rng @@ -606,7 +632,7 @@ def test_constructor_periodindex(self): pi = period_range('20130101', periods=5, freq='D') s = Series(pi) - expected = Series(pi.asobject) + expected = 
Series(pi.astype(object)) assert_series_equal(s, expected) assert s.dtype == 'object' @@ -625,6 +651,21 @@ def test_constructor_dict(self): expected.iloc[1] = 1 assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + def test_constructor_dict_nan_key(self, value): + # GH 18480 + d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} + result = Series(d).sort_values() + expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) + assert_series_equal(result, expected) + + # MultiIndex: + d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} + result = Series(d).sort_values() + expected = Series(['a', 'b', 'c'], + index=Index([(1, 1), (2, np.nan), (3, value)])) + assert_series_equal(result, expected) + def test_constructor_dict_datetime64_index(self): # GH 9456 @@ -658,6 +699,14 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data + def test_constructor_dict_of_tuples(self): + data = {(1, 2): 3, + (None, 5): 6} + result = Series(data).sort_values() + expected = Series([3, 6], + index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + tm.assert_series_equal(result, expected) + def test_constructor_set(self): values = set([1, 2, 3, 4, 5]) pytest.raises(TypeError, Series, values) @@ -777,6 +826,15 @@ def f(): s = Series([pd.NaT, np.nan, '1 Day']) assert s.dtype == 'timedelta64[ns]' + # GH 16406 + def test_constructor_mixed_tz(self): + s = Series([Timestamp('20130101'), + Timestamp('20130101', tz='US/Eastern')]) + expected = Series([Timestamp('20130101'), + Timestamp('20130101', tz='US/Eastern')], + dtype='object') + assert_series_equal(s, expected) + def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index e810eadd2dee9..b79d8def905af 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -228,7 +228,7 @@ def get_dir(s): results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) s = Series(period_range('20130101', periods=5, - freq='D', name='xxx').asobject) + freq='D', name='xxx').astype(object)) results = get_dir(s) tm.assert_almost_equal( results, list(sorted(set(ok_for_period + ok_for_period_methods)))) @@ -387,7 +387,7 @@ def test_sub_of_datetime_from_TimeSeries(self): assert result.dtype == 'timedelta64[ns]' def test_between(self): - s = Series(bdate_range('1/1/2000', periods=20).asobject) + s = Series(bdate_range('1/1/2000', periods=20).astype(object)) s[::2] = np.nan result = s[s.between(s[3], s[17])] diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index ad6d019b5287e..163950b75bc34 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -37,6 +37,12 @@ def test_astype(self, dtype): assert as_typed.dtype == dtype assert as_typed.name == s.name + def test_asobject_deprecated(self): + s = Series(np.random.randn(5), name='foo') + with tm.assert_produces_warning(FutureWarning): + o = s.asobject + assert isinstance(o, np.ndarray) + def test_dtype(self): assert self.ts.dtype == np.dtype('float64') diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index d141b378fe214..c0ef5a3694bf3 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1734,7 +1734,7 @@ def test_timedelta_assignment(self): def test_underlying_data_conversion(self): # GH 4080 - df = DataFrame(dict((c, 
[1, 2, 3]) for c in ['a', 'b', 'c'])) + df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']}) df.set_index(['a', 'b', 'c'], inplace=True) s = Series([1], index=[(2, 2, 2)]) df['val'] = 0 diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index c1ef70bba8634..b0d0e2a51b5f4 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -134,13 +134,13 @@ def test_shift_dst(self): assert res.dtype == 'datetime64[ns, US/Eastern]' res = s.shift(1) - exp_vals = [NaT] + dates.asobject.values.tolist()[:9] + exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] exp = Series(exp_vals) tm.assert_series_equal(res, exp) assert res.dtype == 'datetime64[ns, US/Eastern]' res = s.shift(-2) - exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT] + exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] exp = Series(exp_vals) tm.assert_series_equal(res, exp) assert res.dtype == 'datetime64[ns, US/Eastern]' diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index e65059156c5b9..bb5dbdcaaa7c4 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -78,16 +78,16 @@ def test_fill_value_when_combine_const(self): res = df.add(2, fill_value=0) tm.assert_sp_frame_equal(res, exp) - def test_as_matrix(self): - empty = self.empty.as_matrix() + def test_values(self): + empty = self.empty.values assert empty.shape == (0, 0) no_cols = SparseDataFrame(index=np.arange(10)) - mat = no_cols.as_matrix() + mat = no_cols.values assert mat.shape == (10, 0) no_index = SparseDataFrame(columns=np.arange(10)) - mat = no_index.as_matrix() + mat = no_index.values assert mat.shape == (0, 10) def test_copy(self): diff --git a/pandas/tests/sparse/test_list.py b/pandas/tests/sparse/test_list.py deleted file mode 100644 index 6c721ca813a21..0000000000000 --- a/pandas/tests/sparse/test_list.py +++ /dev/null @@ -1,111 +0,0 @@ -from pandas.compat import range - -from numpy import nan -import numpy as np - -from pandas.core.sparse.api import SparseList, SparseArray -import pandas.util.testing as tm - - -class TestSparseList(object): - - def setup_method(self, method): - self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) - self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) - - def test_deprecation(self): - # see gh-13784 - with tm.assert_produces_warning(FutureWarning): - SparseList() - - def test_constructor(self): - with tm.assert_produces_warning(FutureWarning): - lst1 = SparseList(self.na_data[:5]) - with tm.assert_produces_warning(FutureWarning): - exp = SparseList() - - exp.append(self.na_data[:5]) - tm.assert_sp_list_equal(lst1, exp) - - def test_len(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - assert len(splist) == 5 - splist.append(arr[5]) - assert len(splist) == 6 - splist.append(arr[6:]) - assert len(splist) == 10 - - def test_append_na(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr)) - - def test_append_zero(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.zero_data - splist = SparseList(fill_value=0) - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - # list always produces int64, but SA constructor - # is platform 
dtype aware - sparr = splist.to_array() - exp = SparseArray(arr, fill_value=0) - tm.assert_sp_array_equal(sparr, exp, check_dtype=False) - - def test_consolidate(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = self.na_data - exp_sparr = SparseArray(arr) - - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - consol = splist.consolidate(inplace=False) - assert consol.nchunks == 1 - assert splist.nchunks == 3 - tm.assert_sp_array_equal(consol.to_array(), exp_sparr) - - splist.consolidate() - assert splist.nchunks == 1 - tm.assert_sp_array_equal(splist.to_array(), exp_sparr) - - def test_copy(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = self.na_data - exp_sparr = SparseArray(arr) - - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - - cp = splist.copy() - cp.append(arr[6:]) - assert splist.nchunks == 2 - tm.assert_sp_array_equal(cp.to_array(), exp_sparr) - - def test_getitem(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - for i in range(len(arr)): - tm.assert_almost_equal(splist[i], arr[i]) - tm.assert_almost_equal(splist[-i], arr[-i]) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 31f4ca146040e..df76390d7ce7a 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -437,7 +437,7 @@ def test_value_counts_unique_nunique(self): for r in result: assert isinstance(r, Timestamp) tm.assert_numpy_array_equal(result, - orig._values.asobject.values) + orig._values.astype(object).values) else: tm.assert_numpy_array_equal(result, orig.values) @@ -525,8 +525,8 @@ def test_value_counts_unique_nunique_null(self): Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan - tm.assert_numpy_array_equal(result[1:], - values[2:].asobject.values) + vals = values[2:].astype(object).values + tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b570672124976..b661bde434814 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1955,11 +1955,6 @@ def test_deprecated_labels(self): res = cat.labels tm.assert_numpy_array_equal(res, exp) - def test_deprecated_from_array(self): - # GH13854, `.from_array` is deprecated - with tm.assert_produces_warning(FutureWarning): - Categorical.from_array([0, 1]) - def test_datetime_categorical_comparison(self): dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True) tm.assert_numpy_array_equal(dt_cat > dt_cat[0], @@ -4817,7 +4812,7 @@ def test_constructor(self): assert isinstance(sc, tm.SubclassedCategorical) tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) - def test_from_array(self): + def test_from_codes(self): sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) assert isinstance(sc, tm.SubclassedCategorical) exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 0e783d67517ac..f51bf0c4e476a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1094,21 +1094,21 @@ def test_ctor_dict(self): assert_panel_equal(Panel(d4), Panel(items=['A', 'B'])) # cast - dcasted = dict((k, 
v.reindex(wp.major_axis).fillna(0)) - for k, v in compat.iteritems(d)) + dcasted = {k: v.reindex(wp.major_axis).fillna(0) + for k, v in compat.iteritems(d)} result = Panel(dcasted, dtype=int) - expected = Panel(dict((k, v.astype(int)) - for k, v in compat.iteritems(dcasted))) + expected = Panel({k: v.astype(int) + for k, v in compat.iteritems(dcasted)}) assert_panel_equal(result, expected) result = Panel(dcasted, dtype=np.int32) - expected = Panel(dict((k, v.astype(np.int32)) - for k, v in compat.iteritems(dcasted))) + expected = Panel({k: v.astype(np.int32) + for k, v in compat.iteritems(dcasted)}) assert_panel_equal(result, expected) def test_constructor_dict_mixed(self): with catch_warnings(record=True): - data = dict((k, v.values) for k, v in self.panel.iteritems()) + data = {k: v.values for k, v in self.panel.iteritems()} result = Panel(data) exp_major = Index(np.arange(len(self.panel.major_axis))) tm.assert_index_equal(result.major_axis, exp_major) @@ -1350,18 +1350,18 @@ def test_apply_slabs(self): # make sure that we don't trigger any warnings result = self.panel.apply(f, axis=['items', 'major_axis']) - expected = Panel(dict((ax, f(self.panel.loc[:, :, ax])) - for ax in self.panel.minor_axis)) + expected = Panel({ax: f(self.panel.loc[:, :, ax]) + for ax in self.panel.minor_axis}) assert_panel_equal(result, expected) result = self.panel.apply(f, axis=['major_axis', 'minor_axis']) - expected = Panel(dict((ax, f(self.panel.loc[ax])) - for ax in self.panel.items)) + expected = Panel({ax: f(self.panel.loc[ax]) + for ax in self.panel.items}) assert_panel_equal(result, expected) result = self.panel.apply(f, axis=['minor_axis', 'items']) - expected = Panel(dict((ax, f(self.panel.loc[:, ax])) - for ax in self.panel.major_axis)) + expected = Panel({ax: f(self.panel.loc[:, ax]) + for ax in self.panel.major_axis}) assert_panel_equal(result, expected) # with multi-indexes @@ -1994,8 +1994,8 @@ def test_shift(self): # negative numbers, #2164 result = self.panel.shift(-1) - expected = Panel(dict((i, f.shift(-1)[:-1]) - for i, f in self.panel.iteritems())) + expected = Panel({i: f.shift(-1)[:-1] + for i, f in self.panel.iteritems()}) assert_panel_equal(result, expected) # mixed dtypes #6959 diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index c0e8770dff8b8..c42bedebe2f23 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -689,7 +689,7 @@ def test_ctor_dict(self): def test_constructor_dict_mixed(self): with catch_warnings(record=True): - data = dict((k, v.values) for k, v in self.panel4d.iteritems()) + data = {k: v.values for k, v in self.panel4d.iteritems()} result = Panel4D(data) exp_major = Index(np.arange(len(self.panel4d.major_axis))) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c9e40074c06ad..10c3c0ea507c1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -13,6 +13,7 @@ import pandas as pd import pandas.tseries.offsets as offsets import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas import (Series, DataFrame, Panel, Index, isna, notna, Timestamp) @@ -32,7 +33,7 @@ from pandas.core.indexes.timedeltas import timedelta_range, TimedeltaIndex from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, assert_index_equal) -from pandas._libs.period import IncompatibleFrequency +from pandas._libs.tslibs.period import IncompatibleFrequency bday = BDay() @@ -234,9 +235,8 @@ def test_groupby_resample_on_api(self): 
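For readers unfamiliar with the API named in this hunk header: resample(..., on=<column>) buckets on a datetime column rather than the index, and it composes with groupby, as the first line of the hunk body shows. A minimal sketch — illustrative only, not part of the patch; the frame and column names are made up:

import numpy as np
import pandas as pd

df = pd.DataFrame({'key': ['a', 'b'] * 4,
                   'dates': pd.date_range('2017-01-01', periods=8, freq='12H'),
                   'x': np.arange(8.0)})

# Daily mean of 'x' per 'key', bucketing on the 'dates' column (not the index).
result = df.groupby('key').resample('D', on='dates').mean()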
result = df.groupby('key').resample('D', on='dates').mean() assert_frame_equal(result, expected) + @td.skip_if_no_mpl def test_plot_api(self): - tm._skip_if_no_mpl() - # .resample(....).plot(...) # hitting warnings # GH 12448 @@ -816,21 +816,23 @@ def test_resample_empty_dataframe(self): # test size for GH13212 (currently stays as df) - def test_resample_empty_dtypes(self): + @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) + @pytest.mark.parametrize( + "dtype", + [np.float, np.int, np.object, 'datetime64[ns]']) + def test_resample_empty_dtypes(self, index, dtype): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. (GH #10228) - for index in tm.all_timeseries_index_generator(0): - for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): - for how in downsample_methods + upsample_methods: - empty_series = Series([], index, dtype) - try: - getattr(empty_series.resample('d'), how)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass + for how in downsample_methods + upsample_methods: + empty_series = Series([], index, dtype) + try: + getattr(empty_series.resample('d'), how)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass def test_resample_loffset_arg_type(self): # GH 13218, 15002 @@ -3416,3 +3418,11 @@ def test_aggregate_with_nat(self): # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet + + def test_repr(self): + # GH18203 + result = repr(TimeGrouper(key='A', freq='H')) + expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', base=0)") + assert result == expected diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f1b97081b6d93..8aa69bcbfdf7f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2086,6 +2086,18 @@ def test_rsplit_to_multiindex_expand(self): tm.assert_index_equal(result, exp) assert result.nlevels == 2 + def test_split_nan_expand(self): + # gh-18450 + s = Series(["foo,bar,baz", NA]) + result = s.str.split(",", expand=True) + exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]]) + tm.assert_frame_equal(result, exp) + + # check that these are actually np.nan and not None + # TODO see GH 18463 + # tm.assert_frame_equal does not differentiate + assert all(np.isnan(x) for x in result.iloc[1]) + def test_split_with_name(self): # GH 12617 diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2427bcea4053d..8135e263f412f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2482,6 +2482,14 @@ def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, min_periods=5) + @pytest.mark.parametrize('window', range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index 25446c24b28c0..76f24123ea0e1 100644 ---
a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -7,6 +7,19 @@ def offset_types(request): return request.param +@pytest.fixture(params=[getattr(offsets, o) for o in offsets.__all__ if + issubclass(getattr(offsets, o), offsets.MonthOffset) + and o != 'MonthOffset']) +def month_classes(request): + return request.param + + +@pytest.fixture(params=[getattr(offsets, o) for o in offsets.__all__ if + issubclass(getattr(offsets, o), offsets.Tick)]) +def tick_classes(request): + return request.param + + @pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific']) def tz(request): diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py new file mode 100644 index 0000000000000..45f12c6931fd9 --- /dev/null +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -0,0 +1,606 @@ +# -*- coding: utf-8 -*- +""" +Tests for Fiscal Year and Fiscal Quarter offset classes +""" +from datetime import datetime + +from dateutil.relativedelta import relativedelta +import pytest + +import pandas.util.testing as tm + +from pandas.tseries.frequencies import get_offset, _INVALID_FREQ_ERROR +from pandas.tseries.offsets import FY5253Quarter, FY5253 +from pandas._libs.tslibs.offsets import WeekDay + +from .common import assert_offset_equal, assert_onOffset +from .test_offsets import Base + + +def makeFY5253LastOfMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="last", **kwds) + + +def makeFY5253NearestEndMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="nearest", **kwds) + + +def makeFY5253NearestEndMonth(*args, **kwds): + return FY5253(*args, variation="nearest", **kwds) + + +def makeFY5253LastOfMonth(*args, **kwds): + return FY5253(*args, variation="last", **kwds) + + +def test_get_offset_name(): + assert (makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, + qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4") + assert (makeFY5253NearestEndMonthQuarter( + weekday=1, startingMonth=3, + qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3") + + +def test_get_offset(): + with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + get_offset('gibberish') + with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + get_offset('QS-JAN-B') + + pairs = [ + ("RE-N-DEC-MON", + makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), + ("RE-L-DEC-TUE", + makeFY5253LastOfMonth(weekday=1, startingMonth=12)), + ("REQ-L-MAR-TUE-4", + makeFY5253LastOfMonthQuarter(weekday=1, + startingMonth=3, + qtr_with_extra_week=4)), + ("REQ-L-DEC-MON-3", + makeFY5253LastOfMonthQuarter(weekday=0, + startingMonth=12, + qtr_with_extra_week=3)), + ("REQ-N-DEC-MON-3", + makeFY5253NearestEndMonthQuarter(weekday=0, + startingMonth=12, + qtr_with_extra_week=3))] + + for name, expected in pairs: + offset = get_offset(name) + assert offset == expected, ("Expected %r to yield %r (actual: %r)" % + (name, expected, offset)) + + +class TestFY5253LastOfMonth(Base): + offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, + weekday=WeekDay.SAT) + offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, + weekday=WeekDay.SAT) + + on_offset_cases = [ + # From Wikipedia (see: + # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end) + (offset_lom_sat_aug, datetime(2006, 8, 26), True), + (offset_lom_sat_aug, datetime(2007, 8, 25), True), + (offset_lom_sat_aug, datetime(2008, 8, 30), True), + (offset_lom_sat_aug, 
datetime(2009, 8, 29), True), + (offset_lom_sat_aug, datetime(2010, 8, 28), True), + (offset_lom_sat_aug, datetime(2011, 8, 27), True), + (offset_lom_sat_aug, datetime(2012, 8, 25), True), + (offset_lom_sat_aug, datetime(2013, 8, 31), True), + (offset_lom_sat_aug, datetime(2014, 8, 30), True), + (offset_lom_sat_aug, datetime(2015, 8, 29), True), + (offset_lom_sat_aug, datetime(2016, 8, 27), True), + (offset_lom_sat_aug, datetime(2017, 8, 26), True), + (offset_lom_sat_aug, datetime(2018, 8, 25), True), + (offset_lom_sat_aug, datetime(2019, 8, 31), True), + + (offset_lom_sat_aug, datetime(2006, 8, 27), False), + (offset_lom_sat_aug, datetime(2007, 8, 28), False), + (offset_lom_sat_aug, datetime(2008, 8, 31), False), + (offset_lom_sat_aug, datetime(2009, 8, 30), False), + (offset_lom_sat_aug, datetime(2010, 8, 29), False), + (offset_lom_sat_aug, datetime(2011, 8, 28), False), + + (offset_lom_sat_aug, datetime(2006, 8, 25), False), + (offset_lom_sat_aug, datetime(2007, 8, 24), False), + (offset_lom_sat_aug, datetime(2008, 8, 29), False), + (offset_lom_sat_aug, datetime(2009, 8, 28), False), + (offset_lom_sat_aug, datetime(2010, 8, 27), False), + (offset_lom_sat_aug, datetime(2011, 8, 26), False), + (offset_lom_sat_aug, datetime(2019, 8, 30), False), + + # From GMCR (see for example: + # http://yahoo.brand.edgar-online.com/Default.aspx? + # companyid=3184&formtypeID=7) + (offset_lom_sat_sep, datetime(2010, 9, 25), True), + (offset_lom_sat_sep, datetime(2011, 9, 24), True), + (offset_lom_sat_sep, datetime(2012, 9, 29), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + def test_apply(self): + offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, + weekday=WeekDay.SAT) + offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8, + weekday=WeekDay.SAT) + + date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25), + datetime(2008, 8, 30), datetime(2009, 8, 29), + datetime(2010, 8, 28), datetime(2011, 8, 27), + datetime(2012, 8, 25), datetime(2013, 8, 31), + datetime(2014, 8, 30), datetime(2015, 8, 29), + datetime(2016, 8, 27)] + + tests = [ + (offset_lom_aug_sat, date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, date_seq_lom_aug_sat), + (offset_lom_aug_sat, [ + datetime(2006, 8, 25)] + date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, [ + datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), + (makeFY5253LastOfMonth(n=-1, startingMonth=8, + weekday=WeekDay.SAT), + list(reversed(date_seq_lom_aug_sat))), + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + assert current == datum + + +class TestFY5253NearestEndMonth(Base): + + def test_get_target_month_end(self): + assert (makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SAT).get_target_month_end( + datetime(2013, 1, 1)) == datetime(2013, 8, 31)) + assert (makeFY5253NearestEndMonth( + startingMonth=12, weekday=WeekDay.SAT).get_target_month_end( + datetime(2013, 1, 1)) == datetime(2013, 12, 31)) + assert (makeFY5253NearestEndMonth( + startingMonth=2, weekday=WeekDay.SAT).get_target_month_end( + datetime(2013, 1, 1)) == datetime(2013, 2, 28)) + + def test_get_year_end(self): + assert (makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SAT).get_year_end( + datetime(2013, 1, 1)) == datetime(2013, 8, 31)) + assert (makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SUN).get_year_end( + datetime(2013, 1, 1)) == 
datetime(2013, 9, 1)) + assert (makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.FRI).get_year_end( + datetime(2013, 1, 1)) == datetime(2013, 8, 30)) + + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest") + assert (offset_n.get_year_end(datetime(2012, 1, 1)) == + datetime(2013, 1, 1)) + assert (offset_n.get_year_end(datetime(2012, 1, 10)) == + datetime(2013, 1, 1)) + + assert (offset_n.get_year_end(datetime(2013, 1, 1)) == + datetime(2013, 12, 31)) + assert (offset_n.get_year_end(datetime(2013, 1, 2)) == + datetime(2013, 12, 31)) + assert (offset_n.get_year_end(datetime(2013, 1, 3)) == + datetime(2013, 12, 31)) + assert (offset_n.get_year_end(datetime(2013, 1, 10)) == + datetime(2013, 12, 31)) + + JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest") + assert (JNJ.get_year_end(datetime(2006, 1, 1)) == + datetime(2006, 12, 31)) + + offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8, + weekday=WeekDay.SAT) + offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8, + weekday=WeekDay.THU) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest") + + on_offset_cases = [ + # From Wikipedia (see: + # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar + # #Saturday_nearest_the_end_of_month) + # 2006-09-02 2006 September 2 + # 2007-09-01 2007 September 1 + # 2008-08-30 2008 August 30 (leap year) + # 2009-08-29 2009 August 29 + # 2010-08-28 2010 August 28 + # 2011-09-03 2011 September 3 + # 2012-09-01 2012 September 1 (leap year) + # 2013-08-31 2013 August 31 + # 2014-08-30 2014 August 30 + # 2015-08-29 2015 August 29 + # 2016-09-03 2016 September 3 (leap year) + # 2017-09-02 2017 September 2 + # 2018-09-01 2018 September 1 + # 2019-08-31 2019 August 31 + (offset_lom_aug_sat, datetime(2006, 9, 2), True), + (offset_lom_aug_sat, datetime(2007, 9, 1), True), + (offset_lom_aug_sat, datetime(2008, 8, 30), True), + (offset_lom_aug_sat, datetime(2009, 8, 29), True), + (offset_lom_aug_sat, datetime(2010, 8, 28), True), + (offset_lom_aug_sat, datetime(2011, 9, 3), True), + + (offset_lom_aug_sat, datetime(2016, 9, 3), True), + (offset_lom_aug_sat, datetime(2017, 9, 2), True), + (offset_lom_aug_sat, datetime(2018, 9, 1), True), + (offset_lom_aug_sat, datetime(2019, 8, 31), True), + + (offset_lom_aug_sat, datetime(2006, 8, 27), False), + (offset_lom_aug_sat, datetime(2007, 8, 28), False), + (offset_lom_aug_sat, datetime(2008, 8, 31), False), + (offset_lom_aug_sat, datetime(2009, 8, 30), False), + (offset_lom_aug_sat, datetime(2010, 8, 29), False), + (offset_lom_aug_sat, datetime(2011, 8, 28), False), + + (offset_lom_aug_sat, datetime(2006, 8, 25), False), + (offset_lom_aug_sat, datetime(2007, 8, 24), False), + (offset_lom_aug_sat, datetime(2008, 8, 29), False), + (offset_lom_aug_sat, datetime(2009, 8, 28), False), + (offset_lom_aug_sat, datetime(2010, 8, 27), False), + (offset_lom_aug_sat, datetime(2011, 8, 26), False), + (offset_lom_aug_sat, datetime(2019, 8, 30), False), + + # From Micron, see: + # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_lom_aug_thu, datetime(2012, 8, 30), True), + (offset_lom_aug_thu, datetime(2011, 9, 1), True), + + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + def test_apply(self): + date_seq_nem_8_sat = 
[datetime(2006, 9, 2), datetime(2007, 9, 1), + datetime(2008, 8, 30), datetime(2009, 8, 29), + datetime(2010, 8, 28), datetime(2011, 9, 3)] + + JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1), + datetime(2006, 12, 31), datetime(2007, 12, 30), + datetime(2008, 12, 28), datetime(2010, 1, 3), + datetime(2011, 1, 2), datetime(2012, 1, 1), + datetime(2012, 12, 30)] + + DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, + variation="nearest") + + tests = [ + (makeFY5253NearestEndMonth(startingMonth=8, + weekday=WeekDay.SAT), + date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(n=1, startingMonth=8, + weekday=WeekDay.SAT), + date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 1)] + date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(n=1, startingMonth=8, + weekday=WeekDay.SAT), + [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]), + (makeFY5253NearestEndMonth(n=-1, startingMonth=8, + weekday=WeekDay.SAT), + list(reversed(date_seq_nem_8_sat))), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, + weekday=WeekDay.SUN), JNJ), + (makeFY5253NearestEndMonth(n=-1, startingMonth=12, + weekday=WeekDay.SUN), + list(reversed(JNJ))), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, + weekday=WeekDay.SUN), + [datetime(2005, 1, 2), datetime(2006, 1, 1)]), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, + weekday=WeekDay.SUN), + [datetime(2006, 1, 2), datetime(2006, 12, 31)]), + (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]) + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + assert current == datum + + +class TestFY5253LastOfMonthQuarter(Base): + + def test_isAnchored(self): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, + qtr_with_extra_week=4).isAnchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, + qtr_with_extra_week=4).isAnchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, + qtr_with_extra_week=4).isAnchored() + + def test_equality(self): + assert (makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, + qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + assert (makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, + qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4)) + assert (makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, + qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( + startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + + def test_offset(self): + offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + + offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + + GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26), + datetime(2010, 9, 25), datetime(2010, 12, 25), + datetime(2011, 3, 26), datetime(2011, 6, 25), + datetime(2011, 9, 24), datetime(2011, 12, 24), + datetime(2012, 3, 24), datetime(2012, 6, 
23), + datetime(2012, 9, 29), datetime(2012, 12, 29), + datetime(2013, 3, 30), datetime(2013, 6, 29)] + + assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1]) + assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1), + expected=GMCR[0]) + assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2]) + + assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2]) + assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4]) + + assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2]) + assert_offset_equal(offset_neg1, + base=GMCR[-1] + relativedelta(days=+1), + expected=GMCR[-1]) + assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3]) + + date = GMCR[0] + relativedelta(days=-1) + for expected in GMCR: + assert_offset_equal(offset, date, expected) + date = date + offset + + date = GMCR[-1] + relativedelta(days=+1) + for expected in reversed(GMCR): + assert_offset_equal(offset_neg1, date, expected) + date = date + offset_neg1 + + lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + + on_offset_cases = [ + # From Wikipedia + (lomq_aug_sat_4, datetime(2006, 8, 26), True), + (lomq_aug_sat_4, datetime(2007, 8, 25), True), + (lomq_aug_sat_4, datetime(2008, 8, 30), True), + (lomq_aug_sat_4, datetime(2009, 8, 29), True), + (lomq_aug_sat_4, datetime(2010, 8, 28), True), + (lomq_aug_sat_4, datetime(2011, 8, 27), True), + (lomq_aug_sat_4, datetime(2019, 8, 31), True), + + (lomq_aug_sat_4, datetime(2006, 8, 27), False), + (lomq_aug_sat_4, datetime(2007, 8, 28), False), + (lomq_aug_sat_4, datetime(2008, 8, 31), False), + (lomq_aug_sat_4, datetime(2009, 8, 30), False), + (lomq_aug_sat_4, datetime(2010, 8, 29), False), + (lomq_aug_sat_4, datetime(2011, 8, 28), False), + + (lomq_aug_sat_4, datetime(2006, 8, 25), False), + (lomq_aug_sat_4, datetime(2007, 8, 24), False), + (lomq_aug_sat_4, datetime(2008, 8, 29), False), + (lomq_aug_sat_4, datetime(2009, 8, 28), False), + (lomq_aug_sat_4, datetime(2010, 8, 27), False), + (lomq_aug_sat_4, datetime(2011, 8, 26), False), + (lomq_aug_sat_4, datetime(2019, 8, 30), False), + + # From GMCR + (lomq_sep_sat_4, datetime(2010, 9, 25), True), + (lomq_sep_sat_4, datetime(2011, 9, 24), True), + (lomq_sep_sat_4, datetime(2012, 9, 29), True), + + (lomq_sep_sat_4, datetime(2013, 6, 29), True), + (lomq_sep_sat_4, datetime(2012, 6, 23), True), + (lomq_sep_sat_4, datetime(2012, 6, 30), False), + + (lomq_sep_sat_4, datetime(2013, 3, 30), True), + (lomq_sep_sat_4, datetime(2012, 3, 24), True), + + (lomq_sep_sat_4, datetime(2012, 12, 29), True), + (lomq_sep_sat_4, datetime(2011, 12, 24), True), + + # INTC (extra week in Q1) + # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844 + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=1), + datetime(2011, 4, 2), True), + + # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7 + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=1), + datetime(2012, 12, 29), True), + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=1), + datetime(2011, 12, 31), True), + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=1), + datetime(2010, 12, 25), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, 
case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + def test_year_has_extra_week(self): + # End of long Q1 + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2)) + + # Start of long Q1 + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26)) + + # End of year before year with long Q1 + assert not makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25)) + + for year in [x + for x in range(1994, 2011 + 1) + if x not in [2011, 2005, 2000, 1994]]: + assert not makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week( + datetime(year, 4, 2)) + + # Other long years + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2)) + + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2)) + + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, + qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2)) + + def test_get_weeks(self): + sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=1) + sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, + weekday=WeekDay.SAT, + qtr_with_extra_week=4) + + assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13] + assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14] + assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13] + + +class TestFY5253NearestEndMonthQuarter(Base): + + offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.SAT, + qtr_with_extra_week=4) + offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.THU, + qtr_with_extra_week=4) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest") + + on_offset_cases = [ + # From Wikipedia + (offset_nem_sat_aug_4, datetime(2006, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2007, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2008, 8, 30), True), + (offset_nem_sat_aug_4, datetime(2009, 8, 29), True), + (offset_nem_sat_aug_4, datetime(2010, 8, 28), True), + (offset_nem_sat_aug_4, datetime(2011, 9, 3), True), + + (offset_nem_sat_aug_4, datetime(2016, 9, 3), True), + (offset_nem_sat_aug_4, datetime(2017, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2018, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2019, 8, 31), True), + + (offset_nem_sat_aug_4, datetime(2006, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 31), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 30), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 28), False), + + (offset_nem_sat_aug_4, datetime(2006, 8, 25), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 24), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 26), False), + (offset_nem_sat_aug_4, datetime(2019, 8, 
30), False), + + # From Micron, see: + # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_nem_thu_aug_4, datetime(2012, 8, 30), True), + (offset_nem_thu_aug_4, datetime(2011, 9, 1), True), + + # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13 + (offset_nem_thu_aug_4, datetime(2013, 5, 30), True), + (offset_nem_thu_aug_4, datetime(2013, 2, 28), True), + (offset_nem_thu_aug_4, datetime(2012, 11, 29), True), + (offset_nem_thu_aug_4, datetime(2012, 5, 31), True), + (offset_nem_thu_aug_4, datetime(2007, 3, 1), True), + (offset_nem_thu_aug_4, datetime(1994, 3, 3), True), + + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + def test_offset(self): + offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8, + weekday=WeekDay.THU, + qtr_with_extra_week=4) + + MU = [datetime(2012, 5, 31), + datetime(2012, 8, 30), datetime(2012, 11, 29), + datetime(2013, 2, 28), datetime(2013, 5, 30)] + + date = MU[0] + relativedelta(days=-1) + for expected in MU: + assert_offset_equal(offset, date, expected) + date = date + offset + + assert_offset_equal(offset, + datetime(2012, 5, 31), + datetime(2012, 8, 30)) + assert_offset_equal(offset, + datetime(2012, 5, 30), + datetime(2012, 5, 31)) + + offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last", + qtr_with_extra_week=4) + + assert_offset_equal(offset2, + datetime(2013, 1, 15), + datetime(2013, 3, 30)) diff --git a/pandas/tests/tseries/offsets/test_liboffsets.py b/pandas/tests/tseries/offsets/test_liboffsets.py index 321104222936b..8aa32bc600ee6 100644 --- a/pandas/tests/tseries/offsets/test_liboffsets.py +++ b/pandas/tests/tseries/offsets/test_liboffsets.py @@ -6,7 +6,6 @@ import pytest -from pandas._libs import tslib from pandas import Timestamp import pandas._libs.tslibs.offsets as liboffsets @@ -15,25 +14,21 @@ def test_get_lastbday(): dt = datetime(2017, 11, 30) assert dt.weekday() == 3 # i.e. this is a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_lastbday(wkday, days_in_month) == 30 + assert liboffsets.get_lastbday(dt.year, dt.month) == 30 dt = datetime(1993, 10, 31) assert dt.weekday() == 6 # i.e. this is not a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_lastbday(wkday, days_in_month) == 29 + assert liboffsets.get_lastbday(dt.year, dt.month) == 29 def test_get_firstbday(): dt = datetime(2017, 4, 1) assert dt.weekday() == 5 # i.e. not a weekday - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_firstbday(wkday, days_in_month) == 3 + assert liboffsets.get_firstbday(dt.year, dt.month) == 3 dt = datetime(1993, 10, 1) assert dt.weekday() == 4 # i.e. 
a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_firstbday(wkday, days_in_month) == 1 + assert liboffsets.get_firstbday(dt.year, dt.month) == 1 def test_shift_month(): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6821017c89c3a..d6b64896b8a60 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,7 +1,6 @@ import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta -from dateutil.relativedelta import relativedelta import pytest from pandas.compat import range @@ -17,6 +16,7 @@ get_offset, get_standard_freq) from pandas.core.indexes.datetimes import ( _to_m8, DatetimeIndex, _daterange_cache) +import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import WeekDay, CacheableOffset from pandas.tseries.offsets import (BDay, CDay, BQuarterEnd, BMonthEnd, BusinessHour, WeekOfMonth, CBMonthEnd, @@ -620,50 +620,50 @@ def test_onOffset(self): for offset, d, expected in tests: assert_onOffset(offset, d, expected) - def test_apply(self): - tests = [] - - tests.append((BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)} - )) - - tests.append((-BDay(), {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)} - )) - - tests.append((-2 * BDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)} - )) - - tests.append((BDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)} - )) + apply_cases = [] + apply_cases.append((BDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + apply_cases.append((2 * BDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + apply_cases.append((-BDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + apply_cases.append((-2 * BDay(), { 
+ datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + apply_cases.append((BDay(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -850,483 +850,469 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2014, 7, 7, 9) - def test_normalize(self): - tests = [] - - tests.append((BusinessHour(normalize=True), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - tests.append((BusinessHour(-1, normalize=True), - {datetime(2014, 7, 1, 8): datetime(2014, 6, 30), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30), - datetime(2014, 7, 1, 0): datetime(2014, 6, 30), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - tests.append((BusinessHour(1, normalize=True, start='17:00', - end='04:00'), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 2), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - for offset, cases in tests: - for dt, expected in compat.iteritems(cases): - assert offset.apply(dt) == expected - - def test_onOffset(self): - tests = [] - - tests.append((BusinessHour(), {datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False})) - - tests.append((BusinessHour(start='10:00', end='15:00'), - {datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) - - tests.append((BusinessHour(start='19:00', end='05:00'), - 
{datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False})) - - for offset, cases in tests: - for dt, expected in compat.iteritems(cases): - assert offset.onOffset(dt) == expected - - def test_opening_time(self): - tests = [] - - # opening time should be affected by sign of n, not by n's value and - # end - tests.append(( - [BusinessHour(), BusinessHour(n=2), BusinessHour( - n=4), BusinessHour(end='10:00'), BusinessHour(n=2, end='4:00'), - BusinessHour(n=4, end='15:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - # if timestamp is on opening time, next opening time is - # as it is - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), datetime( - 2014, 7, 2, 9)), - # 2014-07-05 is saturday - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), datetime( - 2014, 7, 7, 9))})) - - tests.append(([BusinessHour(start='11:15'), - BusinessHour(n=2, start='11:15'), - BusinessHour(n=3, start='11:15'), - BusinessHour(start='11:15', end='10:00'), - BusinessHour(n=2, start='11:15', end='4:00'), - BusinessHour(n=3, start='11:15', end='15:00')], - {datetime(2014, 7, 1, 11): (datetime( - 2014, 7, 1, 11, 15), datetime(2014, 6, 30, 11, 15)), - datetime(2014, 7, 1, 18): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 1, 23): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 10): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 11, 15): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime( - 2014, 7, 3, 11, 15), datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 4, 10): (datetime( - 2014, 7, 4, 11, 15), datetime(2014, 7, 3, 11, 15)), - datetime(2014, 7, 4, 23): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 6, 10): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 
11, 15))})) - - tests.append(([BusinessHour(-1), BusinessHour(n=-2), - BusinessHour(n=-4), - BusinessHour(n=-1, end='10:00'), - BusinessHour(n=-2, end='4:00'), - BusinessHour(n=-4, end='15:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 3, 9)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 8, 9))})) - - tests.append(([BusinessHour(start='17:00', end='05:00'), - BusinessHour(n=3, start='17:00', end='03:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), - datetime(2014, 6, 30, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 3, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 17, 1): (datetime( - 2014, 7, 8, 17), datetime(2014, 7, 7, 17)), })) - - tests.append(([BusinessHour(-1, start='17:00', end='05:00'), - BusinessHour(n=-2, start='17:00', end='03:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime( - 2014, 7, 1, 17), datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 17))})) - - for _offsets, cases in tests: - 
for offset in _offsets: - for dt, (exp_next, exp_prev) in compat.iteritems(cases): - assert offset._next_opening_time(dt) == exp_next - assert offset._prev_opening_time(dt) == exp_prev - - def test_apply(self): - tests = [] - - tests.append(( - BusinessHour(), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, - 30)})) - - tests.append((BusinessHour( - 4), {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, - 30)})) - - tests.append( - (BusinessHour(-1), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime( - 2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 1, 9, 30, 15): datetime( - 2014, 6, 30, 16, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, - 30)})) - - tests.append((BusinessHour( - -4), {datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - 
datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, - 30)})) - - tests.append((BusinessHour(start='13:00', end='16:00'), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, - 13, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) - - tests.append((BusinessHour(n=2, start='13:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), - datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30) - })) - - tests.append((BusinessHour(n=-1, start='13:00', end='16:00'), - {datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, - 15, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), - datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) - - tests.append((BusinessHour(n=-3, start='10:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), - datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30) - })) - - tests.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), - datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), - datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), - datetime(2014, 
7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), - datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30) - })) - - tests.append((BusinessHour(n=-1, start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30) - })) - - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - def test_apply_large_n(self): - tests = [] + normalize_cases = [] + normalize_cases.append((BusinessHour(normalize=True), { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) + + normalize_cases.append((BusinessHour(-1, normalize=True), { + datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) + + normalize_cases.append((BusinessHour(1, normalize=True, start='17:00', + end='04:00'), { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) + + @pytest.mark.parametrize('case', normalize_cases) + def test_normalize(self, case): + offset, cases = case + for dt, expected in compat.iteritems(cases): + assert offset.apply(dt) == expected + + on_offset_cases = [] + on_offset_cases.append((BusinessHour(), { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False})) + + on_offset_cases.append((BusinessHour(start='10:00', end='15:00'), { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + 
datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False})) + + on_offset_cases.append((BusinessHour(start='19:00', end='05:00'), { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False})) - tests.append( - (BusinessHour(40), # A week later - {datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), - datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), - datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), - datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), - datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), - datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), - datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), - datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, - 30)})) + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, cases = case + for dt, expected in compat.iteritems(cases): + assert offset.onOffset(dt) == expected - tests.append( - (BusinessHour(-25), # 3 days and 1 hour before - {datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), - datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, - 30)})) - - # 5 days and 3 hours later - tests.append((BusinessHour(28, start='21:00', end='02:00'), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), - datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), - datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, - 30)})) + opening_time_cases = [] + # opening time should be affected by sign of n, not by n's value and + # end + opening_time_cases.append(([BusinessHour(), BusinessHour(n=2), + BusinessHour(n=4), BusinessHour(end='10:00'), + BusinessHour(n=2, end='4:00'), + BusinessHour(n=4, end='15:00')], { + datetime(2014, 
7, 1, 11): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9)), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9))})) + + opening_time_cases.append(([BusinessHour(start='11:15'), + BusinessHour(n=2, start='11:15'), + BusinessHour(n=3, start='11:15'), + BusinessHour(start='11:15', end='10:00'), + BusinessHour(n=2, start='11:15', end='4:00'), + BusinessHour(n=3, start='11:15', + end='15:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15))})) + + opening_time_cases.append(([BusinessHour(-1), BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end='10:00'), + BusinessHour(n=-2, end='4:00'), + BusinessHour(n=-4, end='15:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), + 
datetime(2014, 7, 7, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9))})) + + opening_time_cases.append(([BusinessHour(start='17:00', end='05:00'), + BusinessHour(n=3, start='17:00', + end='03:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17)), })) + + opening_time_cases.append(([BusinessHour(-1, start='17:00', end='05:00'), + BusinessHour(n=-2, start='17:00', + end='03:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17))})) + + @pytest.mark.parametrize('case', opening_time_cases) + def test_opening_time(self, case): + _offsets, cases = case + for offset in _offsets: + for dt, (exp_next, exp_prev) in compat.iteritems(cases): + assert offset._next_opening_time(dt) == exp_next + assert offset._prev_opening_time(dt) == exp_prev + + apply_cases = [] + apply_cases.append((BusinessHour(), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 
2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) + + apply_cases.append((BusinessHour(4), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) + + apply_cases.append((BusinessHour(-1), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)})) + + apply_cases.append((BusinessHour(-4), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)})) + + apply_cases.append((BusinessHour(start='13:00', end='16:00'), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 
15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) + + apply_cases.append((BusinessHour(n=2, start='13:00', end='16:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)})) + + apply_cases.append((BusinessHour(n=-1, start='13:00', end='16:00'), { + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) + + apply_cases.append((BusinessHour(n=-3, start='10:00', end='16:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)})) + + apply_cases.append((BusinessHour(start='19:00', end='05:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)})) + + apply_cases.append((BusinessHour(n=-1, start='19:00', end='05:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + 
datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)})) + + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + apply_large_n_cases = [] + # A week later + apply_large_n_cases.append((BusinessHour(40), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)})) + + # 3 days and 1 hour before + apply_large_n_cases.append((BusinessHour(-25), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + + # 5 days and 3 hours later + apply_large_n_cases.append((BusinessHour(28, start='21:00', end='02:00'), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + + @pytest.mark.parametrize('case', apply_large_n_cases) + def test_apply_large_n(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_nanoseconds(self): tests = [] @@ -1742,58 +1728,58 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CDay(), datetime(2008, 1, 1), True), - (CDay(), datetime(2008, 1, 5), False)] - - for offset, d, expected in tests: - 
assert_onOffset(offset, d, expected) - - def test_apply(self): - tests = [] + on_offset_cases = [(CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False)] - tests.append((CDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)} - )) - - tests.append((-CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)} - )) - - tests.append((-2 * CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)} - )) - - tests.append((CDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, d, expected = case + assert_onOffset(offset, d, expected) + + apply_cases = [] + apply_cases.append((CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + apply_cases.append((2 * CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + apply_cases.append((-CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + apply_cases.append((-2 * CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + apply_cases.append((CDay(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in 
compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -1987,37 +1973,40 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CBMonthEnd(), datetime(2008, 1, 31), True), - (CBMonthEnd(), datetime(2008, 1, 1), False)] - - for offset, d, expected in tests: - assert_onOffset(offset, d, expected) + on_offset_cases = [(CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False)] - def test_apply(self): - cbm = CBMonthEnd() - tests = [] + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, d, expected = case + assert_onOffset(offset, d, expected) - tests.append((cbm, {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) + apply_cases = [] + apply_cases.append((CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) - tests.append((2 * cbm, {datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 2, 7): datetime(2008, 3, 31)})) + apply_cases.append((2 * CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31)})) - tests.append((-cbm, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 2, 8): datetime(2008, 1, 31)})) + apply_cases.append((-CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31)})) - tests.append((-2 * cbm, {datetime(2008, 1, 1): datetime(2007, 11, 30), - datetime(2008, 2, 9): datetime(2007, 12, 31)} - )) + apply_cases.append((-2 * CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31)})) - tests.append((CBMonthEnd(0), - {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) + apply_cases.append((CBMonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -2101,36 +2090,40 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CBMonthBegin(), datetime(2008, 1, 1), True), - (CBMonthBegin(), datetime(2008, 1, 31), False)] - - for offset, dt, expected in tests: - assert_onOffset(offset, dt, expected) + on_offset_cases = [(CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False)] - def test_apply(self): - cbm = CBMonthBegin() - tests = [] + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) - tests.append((cbm, {datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 2, 7): datetime(2008, 3, 3)})) + apply_cases = [] + apply_cases.append((CBMonthBegin(), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3)})) - tests.append((2 * cbm, {datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 2, 7): datetime(2008, 4, 1)})) + 
apply_cases.append((2 * CBMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2008, 3, 3),
+        datetime(2008, 2, 7): datetime(2008, 4, 1)}))
-        tests.append((-cbm, {datetime(2008, 1, 1): datetime(2007, 12, 3),
-                             datetime(2008, 2, 8): datetime(2008, 2, 1)}))
+    apply_cases.append((-CBMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2007, 12, 3),
+        datetime(2008, 2, 8): datetime(2008, 2, 1)}))
-        tests.append((-2 * cbm, {datetime(2008, 1, 1): datetime(2007, 11, 1),
-                                 datetime(2008, 2, 9): datetime(2008, 1, 1)}))
+    apply_cases.append((-2 * CBMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2007, 11, 1),
+        datetime(2008, 2, 9): datetime(2008, 1, 1)}))
-        tests.append((CBMonthBegin(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 7): datetime(2008, 2, 1)}))
+    apply_cases.append((CBMonthBegin(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 7): datetime(2008, 2, 1)}))
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
+    @pytest.mark.parametrize('case', apply_cases)
+    def test_apply(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)

     def test_apply_large_n(self):
         dt = datetime(2012, 10, 23)
@@ -2188,38 +2181,42 @@ def test_isAnchored(self):
         assert not Week(2, weekday=2).isAnchored()
         assert not Week(2).isAnchored()

-    def test_offset(self):
-        tests = []
-
-        tests.append((Week(),  # not business week
-                      {datetime(2008, 1, 1): datetime(2008, 1, 8),
-                       datetime(2008, 1, 4): datetime(2008, 1, 11),
-                       datetime(2008, 1, 5): datetime(2008, 1, 12),
-                       datetime(2008, 1, 6): datetime(2008, 1, 13),
-                       datetime(2008, 1, 7): datetime(2008, 1, 14)}))
-
-        tests.append((Week(weekday=0),  # Mon
-                      {datetime(2007, 12, 31): datetime(2008, 1, 7),
-                       datetime(2008, 1, 4): datetime(2008, 1, 7),
-                       datetime(2008, 1, 5): datetime(2008, 1, 7),
-                       datetime(2008, 1, 6): datetime(2008, 1, 7),
-                       datetime(2008, 1, 7): datetime(2008, 1, 14)}))
-
-        tests.append((Week(0, weekday=0),  # n=0 -> roll forward. Mon
-                      {datetime(2007, 12, 31): datetime(2007, 12, 31),
-                       datetime(2008, 1, 4): datetime(2008, 1, 7),
-                       datetime(2008, 1, 5): datetime(2008, 1, 7),
-                       datetime(2008, 1, 6): datetime(2008, 1, 7),
-                       datetime(2008, 1, 7): datetime(2008, 1, 7)}))
-
-        tests.append((Week(-2, weekday=1),  # n=0 -> roll forward. Mon
-                      {datetime(2010, 4, 6): datetime(2010, 3, 23),
-                       datetime(2010, 4, 8): datetime(2010, 3, 30),
-                       datetime(2010, 4, 5): datetime(2010, 3, 23)}))
+    offset_cases = []
+    # not business week
+    offset_cases.append((Week(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 8),
+        datetime(2008, 1, 4): datetime(2008, 1, 11),
+        datetime(2008, 1, 5): datetime(2008, 1, 12),
+        datetime(2008, 1, 6): datetime(2008, 1, 13),
+        datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+    # Mon
+    offset_cases.append((Week(weekday=0), {
+        datetime(2007, 12, 31): datetime(2008, 1, 7),
+        datetime(2008, 1, 4): datetime(2008, 1, 7),
+        datetime(2008, 1, 5): datetime(2008, 1, 7),
+        datetime(2008, 1, 6): datetime(2008, 1, 7),
+        datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+    # n=0 -> roll forward. Mon
+    offset_cases.append((Week(0, weekday=0), {
+        datetime(2007, 12, 31): datetime(2007, 12, 31),
+        datetime(2008, 1, 4): datetime(2008, 1, 7),
+        datetime(2008, 1, 5): datetime(2008, 1, 7),
+        datetime(2008, 1, 6): datetime(2008, 1, 7),
+        datetime(2008, 1, 7): datetime(2008, 1, 7)}))
+
+    # n=-2 -> rolls back two weeks. Tue
+    offset_cases.append((Week(-2, weekday=1), {
+        datetime(2010, 4, 6): datetime(2010, 3, 23),
+        datetime(2010, 4, 8): datetime(2010, 3, 30),
+        datetime(2010, 4, 5): datetime(2010, 3, 23)}))
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)

     def test_onOffset(self):
         for weekday in range(7):
@@ -2299,8 +2296,7 @@ def test_offset(self):
             (2, 2, 1, date1, datetime(2011, 2, 15)),
             (2, 2, 1, date2, datetime(2011, 2, 15)),
             (2, 2, 1, date3, datetime(2011, 3, 15)),
-            (2, 2, 1, date4, datetime(2011, 3, 15)),
-        ]
+            (2, 2, 1, date4, datetime(2011, 3, 15))]

         for n, week, weekday, dt, expected in test_cases:
             offset = WeekOfMonth(n, week=week, weekday=weekday)
@@ -2313,19 +2309,18 @@ def test_offset(self):
         result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2)
         assert result == datetime(2011, 2, 2)

-    def test_onOffset(self):
-        test_cases = [
-            (0, 0, datetime(2011, 2, 7), True),
-            (0, 0, datetime(2011, 2, 6), False),
-            (0, 0, datetime(2011, 2, 14), False),
-            (1, 0, datetime(2011, 2, 14), True),
-            (0, 1, datetime(2011, 2, 1), True),
-            (0, 1, datetime(2011, 2, 8), False),
-        ]
-
-        for week, weekday, dt, expected in test_cases:
-            offset = WeekOfMonth(week=week, weekday=weekday)
-            assert offset.onOffset(dt) == expected
+    on_offset_cases = [(0, 0, datetime(2011, 2, 7), True),
+                       (0, 0, datetime(2011, 2, 6), False),
+                       (0, 0, datetime(2011, 2, 14), False),
+                       (1, 0, datetime(2011, 2, 14), True),
+                       (0, 1, datetime(2011, 2, 1), True),
+                       (0, 1, datetime(2011, 2, 8), False)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        week, weekday, dt, expected = case
+        offset = WeekOfMonth(week=week, weekday=weekday)
+        assert offset.onOffset(dt) == expected


 class TestLastWeekOfMonth(Base):
@@ -2379,346 +2374,32 @@ def test_offset(self):
         offset_sunday = LastWeekOfMonth(n=1, weekday=WeekDay.SUN)
         assert datetime(2013, 7, 31) + offset_sunday == datetime(2013, 8, 25)

-    def test_onOffset(self):
-        test_cases = [
-            (WeekDay.SUN, datetime(2013, 1, 27), True),
-            (WeekDay.SAT, datetime(2013, 3, 30), True),
-            (WeekDay.MON, datetime(2013, 2, 18), False),  # Not the last Mon
-            (WeekDay.SUN, datetime(2013, 2, 25), False),  # Not a SUN
-            (WeekDay.MON, datetime(2013, 2, 25), True),
-            (WeekDay.SAT, datetime(2013, 11, 30), True),
-
-            (WeekDay.SAT, datetime(2006, 8, 26), True),
-            (WeekDay.SAT, datetime(2007, 8, 25), True),
-            (WeekDay.SAT, datetime(2008, 8, 30), True),
-            (WeekDay.SAT, datetime(2009, 8, 29), True),
-            (WeekDay.SAT, datetime(2010, 8, 28), True),
-            (WeekDay.SAT, datetime(2011, 8, 27), True),
-            (WeekDay.SAT, datetime(2019, 8, 31), True),
-        ]
-
-        for weekday, dt, expected in test_cases:
-            offset = LastWeekOfMonth(weekday=weekday)
-            assert offset.onOffset(dt) == expected
-
-
-class TestBMonthBegin(Base):
-    _offset = BMonthBegin
-
-    def test_offset(self):
-        tests = []
-
-        tests.append((BMonthBegin(),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2006, 9, 1): datetime(2006, 10, 2),
-                       datetime(2007, 1, 1): datetime(2007, 2, 1),
-                       datetime(2006, 12, 1): datetime(2007, 1, 1)}))
-
-        tests.append((BMonthBegin(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2006, 10,
2): datetime(2006, 10, 2), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 15): datetime(2006, 10, 2)})) - - tests.append((BMonthBegin(2), - {datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 1, 15): datetime(2008, 3, 3), - datetime(2006, 12, 29): datetime(2007, 2, 1), - datetime(2006, 12, 31): datetime(2007, 2, 1), - datetime(2007, 1, 1): datetime(2007, 3, 1), - datetime(2006, 11, 1): datetime(2007, 1, 1)})) - - tests.append((BMonthBegin(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 6, 30): datetime(2008, 6, 2), - datetime(2008, 6, 1): datetime(2008, 5, 1), - datetime(2008, 3, 10): datetime(2008, 3, 3), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 12, 30): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 1)})) - - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - def test_onOffset(self): - - tests = [(BMonthBegin(), datetime(2007, 12, 31), False), - (BMonthBegin(), datetime(2008, 1, 1), True), - (BMonthBegin(), datetime(2001, 4, 2), True), - (BMonthBegin(), datetime(2008, 3, 3), True)] - - for offset, dt, expected in tests: - assert_onOffset(offset, dt, expected) - - def test_offsets_compare_equal(self): - # root cause of #456 - offset1 = BMonthBegin() - offset2 = BMonthBegin() - assert not offset1 != offset2 - - -class TestBMonthEnd(Base): - _offset = BMonthEnd - - def test_offset(self): - tests = [] - - tests.append((BMonthEnd(), - {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2007, 1, 31), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2006, 12, 1): datetime(2006, 12, 29)})) - - tests.append((BMonthEnd(0), - {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 29), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31)})) - - tests.append((BMonthEnd(2), - {datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 3, 31), - datetime(2006, 12, 29): datetime(2007, 2, 28), - datetime(2006, 12, 31): datetime(2007, 2, 28), - datetime(2007, 1, 1): datetime(2007, 2, 28), - datetime(2006, 11, 1): datetime(2006, 12, 29)})) - - tests.append((BMonthEnd(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2008, 5, 30), - datetime(2008, 12, 31): datetime(2008, 11, 28), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29)})) - - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - def test_normalize(self): - dt = datetime(2007, 1, 1, 3) - - result = dt + BMonthEnd(normalize=True) - expected = dt.replace(hour=0) + BMonthEnd() - assert result == expected - - def test_onOffset(self): - - tests = [(BMonthEnd(), datetime(2007, 12, 31), True), - (BMonthEnd(), datetime(2008, 1, 1), False)] - - for offset, dt, expected in tests: - assert_onOffset(offset, dt, expected) - - def test_offsets_compare_equal(self): - # root cause of #456 - offset1 = BMonthEnd() - offset2 = BMonthEnd() - assert not offset1 != offset2 
-
-
-class TestMonthBegin(Base):
-    _offset = MonthBegin
-
-    def test_offset(self):
-        tests = []
-
-        # NOTE: I'm not entirely happy with the logic here for Begin -ss
-        # see thread 'offset conventions' on the ML
-        tests.append((MonthBegin(),
-                      {datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2008, 2, 1): datetime(2008, 3, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2006, 12, 1): datetime(2007, 1, 1),
-                       datetime(2007, 1, 31): datetime(2007, 2, 1)}))
-
-        tests.append((MonthBegin(0),
-                      {datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2006, 12, 3): datetime(2007, 1, 1),
-                       datetime(2007, 1, 31): datetime(2007, 2, 1)}))
-
-        tests.append((MonthBegin(2),
-                      {datetime(2008, 2, 29): datetime(2008, 4, 1),
-                       datetime(2008, 1, 31): datetime(2008, 3, 1),
-                       datetime(2006, 12, 31): datetime(2007, 2, 1),
-                       datetime(2007, 12, 28): datetime(2008, 2, 1),
-                       datetime(2007, 1, 1): datetime(2007, 3, 1),
-                       datetime(2006, 11, 1): datetime(2007, 1, 1)}))
-
-        tests.append((MonthBegin(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 1),
-                       datetime(2008, 5, 31): datetime(2008, 5, 1),
-                       datetime(2008, 12, 31): datetime(2008, 12, 1),
-                       datetime(2006, 12, 29): datetime(2006, 12, 1),
-                       datetime(2006, 1, 2): datetime(2006, 1, 1)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-
-class TestMonthEnd(Base):
-    _offset = MonthEnd
-
-    def test_offset(self):
-        tests = []
-
-        tests.append((MonthEnd(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 29),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31),
-                       datetime(2006, 12, 1): datetime(2006, 12, 31)}))
-
-        tests.append((MonthEnd(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31)}))
-
-        tests.append((MonthEnd(2),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                       datetime(2008, 1, 31): datetime(2008, 3, 31),
-                       datetime(2006, 12, 29): datetime(2007, 1, 31),
-                       datetime(2006, 12, 31): datetime(2007, 2, 28),
-                       datetime(2007, 1, 1): datetime(2007, 2, 28),
-                       datetime(2006, 11, 1): datetime(2006, 12, 31)}))
-
-        tests.append((MonthEnd(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 5, 31),
-                       datetime(2008, 12, 31): datetime(2008, 11, 30),
-                       datetime(2006, 12, 29): datetime(2006, 11, 30),
-                       datetime(2006, 12, 30): datetime(2006, 11, 30),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_day_of_month(self):
-        dt = datetime(2007, 1, 1)
-        offset = MonthEnd()
-
-        result = dt + offset
-        assert result == Timestamp(2007, 1, 31)
-
-        result = result + offset
-        assert result == Timestamp(2007, 2, 28)
-
-    def test_normalize(self):
-        dt = datetime(2007, 1, 1, 3)
-
-        result = dt + MonthEnd(normalize=True)
-        expected = dt.replace(hour=0) + MonthEnd()
-        assert result == expected
-
-    def test_onOffset(self):
-
-        tests = [(MonthEnd(), datetime(2007, 12, 31), True),
-                 (MonthEnd(), datetime(2008, 1, 1), False)]
+    on_offset_cases = [
+        (WeekDay.SUN, datetime(2013, 1, 27), True),
+        (WeekDay.SAT, datetime(2013, 3, 30), True),
+        (WeekDay.MON, datetime(2013, 2, 18), False),  # Not the last Mon
+        (WeekDay.SUN, datetime(2013, 2, 25), False),  # Not a SUN
+        (WeekDay.MON, datetime(2013, 2, 25), True),
+        (WeekDay.SAT, datetime(2013, 11, 30), True),
+
+        (WeekDay.SAT, datetime(2006, 8, 26), True),
+        (WeekDay.SAT, datetime(2007, 8, 25), True),
+        (WeekDay.SAT, datetime(2008, 8, 30), True),
+        (WeekDay.SAT, datetime(2009, 8, 29), True),
+        (WeekDay.SAT, datetime(2010, 8, 28), True),
+        (WeekDay.SAT, datetime(2011, 8, 27), True),
+        (WeekDay.SAT, datetime(2019, 8, 31), True)]
-
-        for offset, dt, expected in tests:
-            assert_onOffset(offset, dt, expected)
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        weekday, dt, expected = case
+        offset = LastWeekOfMonth(weekday=weekday)
+        assert offset.onOffset(dt) == expected


 class TestSemiMonthEnd(Base):
     _offset = SemiMonthEnd

-    def _get_tests(self):
-        tests = []
-
-        tests.append((SemiMonthEnd(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 15): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 15),
-                       datetime(2006, 12, 14): datetime(2006, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 15),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15),
-                       datetime(2006, 12, 1): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(day_of_month=20),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 20),
-                       datetime(2008, 1, 15): datetime(2008, 1, 20),
-                       datetime(2008, 1, 21): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 20),
-                       datetime(2006, 12, 14): datetime(2006, 12, 20),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 20),
-                       datetime(2007, 1, 1): datetime(2007, 1, 20),
-                       datetime(2006, 12, 1): datetime(2006, 12, 20),
-                       datetime(2006, 12, 15): datetime(2006, 12, 20)}))
-
-        tests.append((SemiMonthEnd(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 16): datetime(2008, 1, 31),
-                       datetime(2008, 1, 15): datetime(2008, 1, 15),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15)}))
-
-        tests.append((SemiMonthEnd(0, day_of_month=16),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 16),
-                       datetime(2008, 1, 16): datetime(2008, 1, 16),
-                       datetime(2008, 1, 15): datetime(2008, 1, 16),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 16)}))
-
-        tests.append((SemiMonthEnd(2),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 29),
-                       datetime(2006, 12, 29): datetime(2007, 1, 15),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31),
-                       datetime(2007, 1, 16): datetime(2007, 2, 15),
-                       datetime(2006, 11, 1): datetime(2006, 11, 30)}))
-
-        tests.append((SemiMonthEnd(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 6, 15),
-                       datetime(2008, 12, 31): datetime(2008, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 15),
-                       datetime(2006, 12, 30): datetime(2006, 12, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(-1, day_of_month=4),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2007, 1, 4): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 6, 4),
-                       datetime(2008, 12, 31): datetime(2008, 12, 4),
-                       datetime(2006, 12, 5): datetime(2006, 12, 4),
-                       datetime(2006, 12, 30): datetime(2006, 12, 4),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(-2),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 15),
-                       datetime(2008, 6, 30): datetime(2008, 5, 31),
-                       datetime(2008, 3, 15): datetime(2008, 2, 15),
-                       datetime(2008, 12, 31): datetime(2008, 11, 30),
-                       datetime(2006, 12, 29): datetime(2006, 11, 30),
-                       datetime(2006, 12, 14): datetime(2006, 11, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 15)}))
-
-        return tests
-
     def test_offset_whole_year(self):
         dates = (datetime(2007, 12, 31),
                  datetime(2008, 1, 15),
@@ -2760,28 +2441,107 @@ def test_offset_whole_year(self):
         exp = DatetimeIndex(dates)
         tm.assert_index_equal(result, exp)

-    def test_offset(self):
-        for offset, cases in self._get_tests():
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_apply_index(self):
-        for offset, cases in self._get_tests():
-            s = DatetimeIndex(cases.keys())
-            result = offset.apply_index(s)
-            exp = DatetimeIndex(cases.values())
-            tm.assert_index_equal(result, exp)
-
-    def test_onOffset(self):
-
-        tests = [(datetime(2007, 12, 31), True),
-                 (datetime(2007, 12, 15), True),
-                 (datetime(2007, 12, 14), False),
-                 (datetime(2007, 12, 1), False),
-                 (datetime(2008, 2, 29), True)]
+    offset_cases = []
+    offset_cases.append((SemiMonthEnd(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 15): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 15),
+        datetime(2006, 12, 14): datetime(2006, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2007, 1, 15),
+        datetime(2007, 1, 1): datetime(2007, 1, 15),
+        datetime(2006, 12, 1): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(day_of_month=20), {
+        datetime(2008, 1, 1): datetime(2008, 1, 20),
+        datetime(2008, 1, 15): datetime(2008, 1, 20),
+        datetime(2008, 1, 21): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 20),
+        datetime(2006, 12, 14): datetime(2006, 12, 20),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2007, 1, 20),
+        datetime(2007, 1, 1): datetime(2007, 1, 20),
+        datetime(2006, 12, 1): datetime(2006, 12, 20),
+        datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+    offset_cases.append((SemiMonthEnd(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 16): datetime(2008, 1, 31),
+        datetime(2008, 1, 15): datetime(2008, 1, 15),
+        datetime(2008, 1, 31): datetime(2008, 1, 31),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2006, 12, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 15)}))
+
+    offset_cases.append((SemiMonthEnd(0, day_of_month=16), {
+        datetime(2008, 1, 1): datetime(2008, 1, 16),
+        datetime(2008, 1, 16): datetime(2008, 1, 16),
+        datetime(2008, 1, 15): datetime(2008, 1, 16),
+        datetime(2008, 1, 31): datetime(2008, 1, 31),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2006, 12, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 16)}))
+
+    offset_cases.append((SemiMonthEnd(2), {
+        datetime(2008, 1, 1): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 29),
+        datetime(2006, 12, 29): datetime(2007, 1, 15),
+        datetime(2006, 12, 31): datetime(2007, 1, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 31),
+        datetime(2007, 1, 16): datetime(2007, 2, 15),
+        datetime(2006, 11, 1): datetime(2006, 11, 30)}))
+
+    offset_cases.append((SemiMonthEnd(-1), {
+        datetime(2007, 1, 1): datetime(2006, 12, 31),
+        datetime(2008, 6, 30): datetime(2008, 6, 15),
+        datetime(2008, 12, 31): datetime(2008, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 15),
+        datetime(2006, 12, 30): datetime(2006, 12, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(-1, day_of_month=4), {
+        datetime(2007, 1, 1): datetime(2006, 12, 31),
+        datetime(2007, 1, 4): datetime(2006, 12, 31),
+        datetime(2008, 6, 30): datetime(2008, 6, 4),
+        datetime(2008, 12, 31): datetime(2008, 12, 4),
+        datetime(2006, 12, 5): datetime(2006, 12, 4),
+        datetime(2006, 12, 30): datetime(2006, 12, 4),
+        datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(-2), {
+        datetime(2007, 1, 1): datetime(2006, 12, 15),
+        datetime(2008, 6, 30): datetime(2008, 5, 31),
+        datetime(2008, 3, 15): datetime(2008, 2, 15),
+        datetime(2008, 12, 31): datetime(2008, 11, 30),
+        datetime(2006, 12, 29): datetime(2006, 11, 30),
+        datetime(2006, 12, 14): datetime(2006, 11, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 15)}))
-
-        for dt, expected in tests:
-            assert_onOffset(SemiMonthEnd(), dt, expected)
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)
+
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_apply_index(self, case):
+        offset, cases = case
+        s = DatetimeIndex(cases.keys())
+        result = offset.apply_index(s)
+        exp = DatetimeIndex(cases.values())
+        tm.assert_index_equal(result, exp)
+
+    on_offset_cases = [(datetime(2007, 12, 31), True),
+                       (datetime(2007, 12, 15), True),
+                       (datetime(2007, 12, 14), False),
+                       (datetime(2007, 12, 1), False),
+                       (datetime(2008, 2, 29), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        dt, expected = case
+        assert_onOffset(SemiMonthEnd(), dt, expected)

     @pytest.mark.parametrize('klass,assert_func',
                              [(Series, tm.assert_series_equal),
@@ -2810,91 +2570,6 @@ def test_vectorized_offset_addition(self, klass, assert_func):
 class TestSemiMonthBegin(Base):
     _offset = SemiMonthBegin

-    def _get_tests(self):
-        tests = []
-
-        tests.append((SemiMonthBegin(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 15): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 14): datetime(2006, 12, 15),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15),
-                       datetime(2006, 12, 1): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(day_of_month=20),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 20),
-                       datetime(2008, 1, 15): datetime(2008, 1, 20),
-                       datetime(2008, 1, 21): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 14): datetime(2006, 12, 20),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 1): datetime(2007, 1, 20),
-                       datetime(2006, 12, 1): datetime(2006, 12, 20),
-                       datetime(2006, 12, 15): datetime(2006, 12, 20)}))
-
-        tests.append((SemiMonthBegin(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 16): datetime(2008, 2, 1),
-                       datetime(2008, 1, 15): datetime(2008, 1, 15),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 2): datetime(2006, 12, 15),
-                       datetime(2007, 1, 1): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(0, day_of_month=16),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 16): datetime(2008, 1, 16),
-                       datetime(2008, 1, 15): datetime(2008, 1, 16),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 5): datetime(2007, 1, 16),
-                       datetime(2007, 1, 1): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(2),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 15),
-                       datetime(2006, 12, 1): datetime(2007, 1, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 15),
-                       datetime(2006, 12, 15): datetime(2007, 1, 15),
-                       datetime(2007, 1, 1): datetime(2007, 2, 1),
-                       datetime(2007, 1, 16): datetime(2007, 2, 15),
-                       datetime(2006, 11, 1): datetime(2006, 12, 1)}))
-
-        tests.append((SemiMonthBegin(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 15),
-                       datetime(2008, 6, 30): datetime(2008, 6, 15),
-                       datetime(2008, 6, 14): datetime(2008, 6, 1),
-                       datetime(2008, 12, 31): datetime(2008, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2006, 12, 1),
-                       datetime(2007, 1, 1): datetime(2006, 12, 15)}))
-
-        tests.append((SemiMonthBegin(-1, day_of_month=4),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 4),
-                       datetime(2007, 1, 4): datetime(2007, 1, 1),
-                       datetime(2008, 6, 30): datetime(2008, 6, 4),
-                       datetime(2008, 12, 31): datetime(2008, 12, 4),
-                       datetime(2006, 12, 5): datetime(2006, 12, 4),
-                       datetime(2006, 12, 30): datetime(2006, 12, 4),
-                       datetime(2006, 12, 2): datetime(2006, 12, 1),
-                       datetime(2007, 1, 1): datetime(2006, 12, 4)}))
-
-        tests.append((SemiMonthBegin(-2),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 1),
-                       datetime(2008, 6, 30): datetime(2008, 6, 1),
-                       datetime(2008, 6, 14): datetime(2008, 5, 15),
-                       datetime(2008, 12, 31): datetime(2008, 12, 1),
-                       datetime(2006, 12, 29): datetime(2006, 12, 1),
-                       datetime(2006, 12, 15): datetime(2006, 11, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 1)}))
-
-        return tests
-
     def test_offset_whole_year(self):
         dates = (datetime(2007, 12, 15),
                  datetime(2008, 1, 1),
@@ -2936,27 +2611,111 @@ def test_offset_whole_year(self):
         exp = DatetimeIndex(dates)
         tm.assert_index_equal(result, exp)

-    def test_offset(self):
-        for offset, cases in self._get_tests():
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
+    offset_cases = []
+    offset_cases.append((SemiMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 15): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 14): datetime(2006, 12, 15),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 1): datetime(2007, 1, 15),
+        datetime(2006, 12, 1): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2007, 1, 1)}))
+
+    offset_cases.append((SemiMonthBegin(day_of_month=20), {
+        datetime(2008, 1, 1): datetime(2008, 1, 20),
+        datetime(2008, 1, 15): datetime(2008, 1, 20),
+        datetime(2008, 1, 21): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 14): datetime(2006, 12, 20),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 1): datetime(2007, 1, 20),
+        datetime(2006, 12, 1): datetime(2006, 12, 20),
+        datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+    offset_cases.append((SemiMonthBegin(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 16): datetime(2008, 2, 1),
+        datetime(2008, 1, 15): datetime(2008, 1, 15),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 2): datetime(2006, 12, 15),
+        datetime(2007, 1, 1): datetime(2007, 1, 1)}))
+
+    offset_cases.append((SemiMonthBegin(0, day_of_month=16), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 16): datetime(2008, 1, 16),
+        datetime(2008, 1, 15): datetime(2008, 1, 16),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 5): datetime(2007, 1, 16),
+        datetime(2007, 1, 1): datetime(2007, 1, 1)}))
-
-    def test_apply_index(self):
-        for offset, cases in self._get_tests():
-            s = DatetimeIndex(cases.keys())
-            result = offset.apply_index(s)
-            exp = DatetimeIndex(cases.values())
-            tm.assert_index_equal(result, exp)
+    offset_cases.append((SemiMonthBegin(2), {
+        datetime(2008, 1, 1): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 15),
+        datetime(2006, 12, 1): datetime(2007, 1, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 15),
+        datetime(2006, 12, 15): datetime(2007, 1, 15),
+        datetime(2007, 1, 1): datetime(2007, 2, 1),
+        datetime(2007, 1, 16): datetime(2007, 2, 15),
+        datetime(2006, 11, 1): datetime(2006, 12, 1)}))
+
+    offset_cases.append((SemiMonthBegin(-1), {
+        datetime(2007, 1, 1): datetime(2006, 12, 15),
+        datetime(2008, 6, 30): datetime(2008, 6, 15),
+        datetime(2008, 6, 14): datetime(2008, 6, 1),
+        datetime(2008, 12, 31): datetime(2008, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2006, 12, 1),
+        datetime(2007, 1, 1): datetime(2006, 12, 15)}))
+
+    offset_cases.append((SemiMonthBegin(-1, day_of_month=4), {
+        datetime(2007, 1, 1): datetime(2006, 12, 4),
+        datetime(2007, 1, 4): datetime(2007, 1, 1),
+        datetime(2008, 6, 30): datetime(2008, 6, 4),
+        datetime(2008, 12, 31): datetime(2008, 12, 4),
+        datetime(2006, 12, 5): datetime(2006, 12, 4),
+        datetime(2006, 12, 30): datetime(2006, 12, 4),
+        datetime(2006, 12, 2): datetime(2006, 12, 1),
+        datetime(2007, 1, 1): datetime(2006, 12, 4)}))
+
+    offset_cases.append((SemiMonthBegin(-2), {
+        datetime(2007, 1, 1): datetime(2006, 12, 1),
+        datetime(2008, 6, 30): datetime(2008, 6, 1),
+        datetime(2008, 6, 14): datetime(2008, 5, 15),
+        datetime(2008, 12, 31): datetime(2008, 12, 1),
+        datetime(2006, 12, 29): datetime(2006, 12, 1),
+        datetime(2006, 12, 15): datetime(2006, 11, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 1)}))
-
-    def test_onOffset(self):
-        tests = [(datetime(2007, 12, 1), True),
-                 (datetime(2007, 12, 15), True),
-                 (datetime(2007, 12, 14), False),
-                 (datetime(2007, 12, 31), False),
-                 (datetime(2008, 2, 15), True)]
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)
+
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_apply_index(self, case):
+        offset, cases = case
+        s = DatetimeIndex(cases.keys())
+        result = offset.apply_index(s)
+        exp = DatetimeIndex(cases.values())
+        tm.assert_index_equal(result, exp)
-
-        for dt, expected in tests:
-            assert_onOffset(SemiMonthBegin(), dt, expected)
+    on_offset_cases = [(datetime(2007, 12, 1), True),
+                       (datetime(2007, 12, 15), True),
+                       (datetime(2007, 12, 14), False),
+                       (datetime(2007, 12, 31), False),
+                       (datetime(2008, 2, 15), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        dt, expected = case
+        assert_onOffset(SemiMonthBegin(), dt, expected)

     @pytest.mark.parametrize('klass,assert_func',
                              [(Series, tm.assert_series_equal),
@@ -2981,1276 +2740,6 @@ def test_vectorized_offset_addition(self, klass, assert_func):
         assert_func(result2, exp)

-
-class TestBQuarterBegin(Base):
-    _offset = BQuarterBegin
-
-    def test_repr(self):
-        assert (repr(BQuarterBegin()) ==
-                "<BusinessQuarterBegin: startingMonth=3>")
-        assert (repr(BQuarterBegin(startingMonth=3)) ==
-                "<BusinessQuarterBegin: startingMonth=3>")
-        assert (repr(BQuarterBegin(startingMonth=1)) ==
-                "<BusinessQuarterBegin: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert BQuarterBegin(startingMonth=1).isAnchored()
-        assert BQuarterBegin().isAnchored()
-        assert not BQuarterBegin(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((BQuarterBegin(startingMonth=1), {
-        datetime(2008, 1, 1): datetime(2008, 4, 1),
-        datetime(2008, 1, 31): datetime(2008, 4, 1),
-        datetime(2008, 2, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 29): datetime(2008, 4, 1),
-        datetime(2008, 3, 15): datetime(2008, 4, 1),
-        datetime(2008, 3, 31): datetime(2008, 4, 1),
-        datetime(2008, 4, 15): datetime(2008, 7, 1),
-        datetime(2007, 3, 15): datetime(2007, 4, 2),
-        datetime(2007, 2, 28): datetime(2007, 4, 2),
-        datetime(2007, 1, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 15): datetime(2007, 7, 2),
-        datetime(2007, 7, 1): datetime(2007, 7, 2),
-        datetime(2007, 4, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 2): datetime(2007, 7, 2),
-        datetime(2008, 4, 30): datetime(2008, 7, 1)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=2), {
-        datetime(2008, 1, 1): datetime(2008, 2, 1),
-        datetime(2008, 1, 31): datetime(2008, 2, 1),
-        datetime(2008, 1, 15): datetime(2008, 2, 1),
-        datetime(2008, 2, 29): datetime(2008, 5, 1),
-        datetime(2008, 3, 15): datetime(2008, 5, 1),
-        datetime(2008, 3, 31): datetime(2008, 5, 1),
-        datetime(2008, 4, 15): datetime(2008, 5, 1),
-        datetime(2008, 8, 15): datetime(2008, 11, 3),
-        datetime(2008, 9, 15): datetime(2008, 11, 3),
-        datetime(2008, 11, 1): datetime(2008, 11, 3),
-        datetime(2008, 4, 30): datetime(2008, 5, 1)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=0), {
-        datetime(2008, 1, 1): datetime(2008, 1, 1),
-        datetime(2007, 12, 31): datetime(2008, 1, 1),
-        datetime(2008, 2, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 29): datetime(2008, 4, 1),
-        datetime(2008, 1, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 27): datetime(2008, 4, 1),
-        datetime(2008, 3, 15): datetime(2008, 4, 1),
-        datetime(2007, 4, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 2): datetime(2007, 4, 2),
-        datetime(2007, 7, 1): datetime(2007, 7, 2),
-        datetime(2007, 4, 15): datetime(2007, 7, 2),
-        datetime(2007, 7, 2): datetime(2007, 7, 2)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), {
-        datetime(2008, 1, 1): datetime(2007, 10, 1),
-        datetime(2008, 1, 31): datetime(2008, 1, 1),
-        datetime(2008, 2, 15): datetime(2008, 1, 1),
-        datetime(2008, 2, 29): datetime(2008, 1, 1),
-        datetime(2008, 3, 15): datetime(2008, 1, 1),
-        datetime(2008, 3, 31): datetime(2008, 1, 1),
-        datetime(2008, 4, 15): datetime(2008, 4, 1),
-        datetime(2007, 7, 3): datetime(2007, 7, 2),
-        datetime(2007, 4, 3): datetime(2007, 4, 2),
-        datetime(2007, 7, 2): datetime(2007, 4, 2),
-        datetime(2008, 4, 1): datetime(2008, 1, 1)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=2), {
-        datetime(2008, 1, 1): datetime(2008, 7, 1),
-        datetime(2008, 1, 15): datetime(2008, 7, 1),
-        datetime(2008, 2, 29): datetime(2008, 7, 1),
-        datetime(2008, 3, 15): datetime(2008, 7, 1),
-        datetime(2007, 3, 31): datetime(2007, 7, 2),
-        datetime(2007, 4, 15): datetime(2007, 10, 1),
-        datetime(2008, 4, 30): datetime(2008, 10, 1)}))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = BQuarterBegin(n=-1, startingMonth=1)
-        assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2)
-
-
-class TestBQuarterEnd(Base):
-    _offset = BQuarterEnd
-
-    def test_repr(self):
-        assert (repr(BQuarterEnd()) ==
-                "<BusinessQuarterEnd: startingMonth=3>")
-        assert (repr(BQuarterEnd(startingMonth=3)) ==
-                "<BusinessQuarterEnd: startingMonth=3>")
-        assert (repr(BQuarterEnd(startingMonth=1)) ==
-                "<BusinessQuarterEnd: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert BQuarterEnd(startingMonth=1).isAnchored()
-        assert BQuarterEnd().isAnchored()
-        assert not BQuarterEnd(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((BQuarterEnd(startingMonth=1),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 4, 30),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 7, 31), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                          datetime(2008, 1, 31): datetime(2008, 2, 29),
-                          datetime(2008, 2, 15): datetime(2008, 2, 29),
-                          datetime(2008, 2, 29): datetime(2008, 5, 30),
-                          datetime(2008, 3, 15): datetime(2008, 5, 30),
-                          datetime(2008, 3, 31): datetime(2008, 5, 30),
-                          datetime(2008, 4, 15): datetime(2008, 5, 30),
-                          datetime(2008, 4, 30): datetime(2008, 5, 30), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 1, 31),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 4, 30), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 31),
-                          datetime(2008, 1, 31): datetime(2007, 10, 31),
-                          datetime(2008, 2, 15): datetime(2008, 1, 31),
-                          datetime(2008, 2, 29): datetime(2008, 1, 31),
-                          datetime(2008, 3, 15): datetime(2008, 1, 31),
-                          datetime(2008, 3, 31): datetime(2008, 1, 31),
-                          datetime(2008, 4, 15): datetime(2008, 1, 31),
-                          datetime(2008, 4, 30): datetime(2008, 1, 31), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=2),
-                         {datetime(2008, 1, 31): datetime(2008, 7, 31),
-                          datetime(2008, 2, 15): datetime(2008, 7, 31),
-                          datetime(2008, 2, 29): datetime(2008, 7, 31),
-                          datetime(2008, 3, 15): datetime(2008, 7, 31),
-                          datetime(2008, 3, 31): datetime(2008, 7, 31),
-                          datetime(2008, 4, 15): datetime(2008, 7, 31),
-                          datetime(2008, 4, 30): datetime(2008, 10, 31), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = BQuarterEnd(n=-1, startingMonth=1)
-        assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29)
-
-    on_offset_cases = [
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-
-def makeFY5253LastOfMonthQuarter(*args, **kwds):
-    return FY5253Quarter(*args, variation="last", **kwds)
-
-
-def makeFY5253NearestEndMonthQuarter(*args, **kwds):
-    return FY5253Quarter(*args, variation="nearest", **kwds)
-
-
-def makeFY5253NearestEndMonth(*args, **kwds):
-    return FY5253(*args, variation="nearest", **kwds)
-
-
-def makeFY5253LastOfMonth(*args, **kwds):
-    return FY5253(*args, variation="last", **kwds)
-
-
-class TestFY5253LastOfMonth(Base):
-    offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8,
-                                               weekday=WeekDay.SAT)
-    offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9,
-                                               weekday=WeekDay.SAT)
-
-    on_offset_cases = [
-        # From Wikipedia (see:
-        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end)
-        (offset_lom_sat_aug, datetime(2006, 8, 26), True),
-        (offset_lom_sat_aug, datetime(2007, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2008, 8, 30), True),
-        (offset_lom_sat_aug, datetime(2009, 8, 29), True),
-        (offset_lom_sat_aug, datetime(2010, 8, 28), True),
-        (offset_lom_sat_aug, datetime(2011, 8, 27), True),
-        (offset_lom_sat_aug, datetime(2012, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2013, 8, 31), True),
-        (offset_lom_sat_aug, datetime(2014, 8, 30), True),
-        (offset_lom_sat_aug, datetime(2015, 8, 29), True),
-        (offset_lom_sat_aug, datetime(2016, 8, 27), True),
-        (offset_lom_sat_aug, datetime(2017, 8, 26), True),
-        (offset_lom_sat_aug, datetime(2018, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2019, 8, 31), True),
-
-        (offset_lom_sat_aug, datetime(2006, 8, 27), False),
-        (offset_lom_sat_aug, datetime(2007, 8, 28), False),
-        (offset_lom_sat_aug, datetime(2008, 8, 31), False),
-        (offset_lom_sat_aug, datetime(2009, 8, 30), False),
-        (offset_lom_sat_aug, datetime(2010, 8, 29), False),
-        (offset_lom_sat_aug, datetime(2011, 8, 28), False),
-
-        (offset_lom_sat_aug, datetime(2006, 8, 25), False),
-        (offset_lom_sat_aug, datetime(2007, 8, 24), False),
-        (offset_lom_sat_aug, datetime(2008, 8, 29), False),
-        (offset_lom_sat_aug, datetime(2009, 8, 28), False),
-        (offset_lom_sat_aug, datetime(2010, 8, 27), False),
-        (offset_lom_sat_aug, datetime(2011, 8, 26), False),
-        (offset_lom_sat_aug, datetime(2019, 8, 30), False),
-
-        # From GMCR (see for example:
-        # http://yahoo.brand.edgar-online.com/Default.aspx?
-        # companyid=3184&formtypeID=7)
-        (offset_lom_sat_sep, datetime(2010, 9, 25), True),
-        (offset_lom_sat_sep, datetime(2011, 9, 24), True),
-        (offset_lom_sat_sep, datetime(2012, 9, 29), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_apply(self):
-        offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8,
-                                                   weekday=WeekDay.SAT)
-        offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8,
-                                                     weekday=WeekDay.SAT)
-
-        date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25),
-                                datetime(2008, 8, 30), datetime(2009, 8, 29),
-                                datetime(2010, 8, 28), datetime(2011, 8, 27),
-                                datetime(2012, 8, 25), datetime(2013, 8, 31),
-                                datetime(2014, 8, 30), datetime(2015, 8, 29),
-                                datetime(2016, 8, 27)]
-
-        tests = [
-            (offset_lom_aug_sat, date_seq_lom_aug_sat),
-            (offset_lom_aug_sat_1, date_seq_lom_aug_sat),
-            (offset_lom_aug_sat, [
-                datetime(2006, 8, 25)] + date_seq_lom_aug_sat),
-            (offset_lom_aug_sat_1, [
-                datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]),
-            (makeFY5253LastOfMonth(n=-1, startingMonth=8,
-                                   weekday=WeekDay.SAT),
-             list(reversed(date_seq_lom_aug_sat))),
-        ]
-        for test in tests:
-            offset, data = test
-            current = data[0]
-            for datum in data[1:]:
-                current = current + offset
-                assert current == datum
-
-
-class TestFY5253NearestEndMonth(Base):
-
-    def test_get_target_month_end(self):
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=12, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 12, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=2, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 2, 28))
-
-    def test_get_year_end(self):
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SAT).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SUN).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 9, 1))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.FRI).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 30))
-
-        offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                          variation="nearest")
-        assert (offset_n.get_year_end(datetime(2012, 1, 1)) ==
-                datetime(2013, 1, 1))
-        assert (offset_n.get_year_end(datetime(2012, 1, 10)) ==
-                datetime(2013, 1, 1))
-
-        assert (offset_n.get_year_end(datetime(2013, 1, 1)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 2)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 3)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 10)) ==
-                datetime(2013, 12, 31))
-
-        JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest")
-        assert (JNJ.get_year_end(datetime(2006, 1, 1)) ==
-                datetime(2006, 12, 31))
-
-    offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8,
-                                                   weekday=WeekDay.SAT)
-    offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8,
-                                                   weekday=WeekDay.THU)
-    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                      variation="nearest")
-
-    on_offset_cases = [
-        # From Wikipedia (see:
-        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar
-        # #Saturday_nearest_the_end_of_month)
-        # 2006-09-02   2006 September 2
-        # 2007-09-01   2007 September 1
-        # 2008-08-30   2008 August 30    (leap year)
-        # 2009-08-29   2009 August 29
-        # 2010-08-28   2010 August 28
-        # 2011-09-03   2011 September 3
-        # 2012-09-01   2012 September 1  (leap year)
-        # 2013-08-31   2013 August 31
-        # 2014-08-30   2014 August 30
-        # 2015-08-29   2015 August 29
-        # 2016-09-03   2016 September 3  (leap year)
-        # 2017-09-02   2017 September 2
-        # 2018-09-01   2018 September 1
-        # 2019-08-31   2019 August 31
-        (offset_lom_aug_sat, datetime(2006, 9, 2), True),
-        (offset_lom_aug_sat, datetime(2007, 9, 1), True),
-        (offset_lom_aug_sat, datetime(2008, 8, 30), True),
-        (offset_lom_aug_sat, datetime(2009, 8, 29), True),
-        (offset_lom_aug_sat, datetime(2010, 8, 28), True),
-        (offset_lom_aug_sat, datetime(2011, 9, 3), True),
-
-        (offset_lom_aug_sat, datetime(2016, 9, 3), True),
-        (offset_lom_aug_sat, datetime(2017, 9, 2), True),
-        (offset_lom_aug_sat, datetime(2018, 9, 1), True),
-        (offset_lom_aug_sat, datetime(2019, 8, 31), True),
-
-        (offset_lom_aug_sat, datetime(2006, 8, 27), False),
-        (offset_lom_aug_sat, datetime(2007, 8, 28), False),
-        (offset_lom_aug_sat, datetime(2008, 8, 31), False),
-        (offset_lom_aug_sat, datetime(2009, 8, 30), False),
-        (offset_lom_aug_sat, datetime(2010, 8, 29), False),
-        (offset_lom_aug_sat, datetime(2011, 8, 28), False),
-
-        (offset_lom_aug_sat, datetime(2006, 8, 25), False),
-        (offset_lom_aug_sat, datetime(2007, 8, 24), False),
-        (offset_lom_aug_sat, datetime(2008, 8, 29), False),
-        (offset_lom_aug_sat, datetime(2009, 8, 28), False),
-        (offset_lom_aug_sat, datetime(2010, 8, 27), False),
-        (offset_lom_aug_sat, datetime(2011, 8, 26), False),
-        (offset_lom_aug_sat, datetime(2019, 8, 30), False),
-
-        # From Micron, see:
-        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
-        (offset_lom_aug_thu, datetime(2012, 8, 30), True),
-        (offset_lom_aug_thu, datetime(2011, 9, 1), True),
-
-        (offset_n, datetime(2012, 12, 31), False),
-        (offset_n, datetime(2013, 1, 1), True),
-        (offset_n, datetime(2013, 1, 2), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_apply(self):
-        date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1),
-                              datetime(2008, 8, 30), datetime(2009, 8, 29),
-                              datetime(2010, 8, 28), datetime(2011, 9, 3)]
-
-        JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1),
-               datetime(2006, 12, 31), datetime(2007, 12, 30),
-               datetime(2008, 12, 28), datetime(2010, 1, 3),
-               datetime(2011, 1, 2), datetime(2012, 1, 1),
-               datetime(2012, 12, 30)]
-
-        DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5,
-                         variation="nearest")
-
-        tests = [
-            (makeFY5253NearestEndMonth(startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT),
-             [datetime(2006, 9, 1)] + date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]),
-            (makeFY5253NearestEndMonth(n=-1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             list(reversed(date_seq_nem_8_sat))),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN), JNJ),
-            (makeFY5253NearestEndMonth(n=-1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             list(reversed(JNJ))),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             [datetime(2005, 1, 2), datetime(2006, 1, 1)]),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             [datetime(2006, 1, 2), datetime(2006, 12, 31)]),
-            (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)])
-        ]
-        for test in tests:
-            offset, data = test
-            current = data[0]
-            for datum in data[1:]:
-                current = current + offset
-                assert current == datum
-
-
-class TestFY5253LastOfMonthQuarter(Base):
-
-    def test_isAnchored(self):
-        assert makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4).isAnchored()
-        assert makeFY5253LastOfMonthQuarter(
-            weekday=WeekDay.SAT, startingMonth=3,
-            qtr_with_extra_week=4).isAnchored()
-        assert not makeFY5253LastOfMonthQuarter(
-            2, startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4).isAnchored()
-
-    def test_equality(self):
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4))
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4))
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
-            startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4))
-
-    def test_offset(self):
-        offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
-                                              weekday=WeekDay.SAT,
-                                              qtr_with_extra_week=4)
-        offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9,
-                                               weekday=WeekDay.SAT,
-                                               qtr_with_extra_week=4)
-        offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9,
-                                               weekday=WeekDay.SAT,
-                                               qtr_with_extra_week=4)
-
-        offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9,
-                                                   weekday=WeekDay.SAT,
-                                                   qtr_with_extra_week=4)
-        offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9,
-                                                   weekday=WeekDay.SAT,
-                                                   qtr_with_extra_week=4)
-
-        GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26),
-                datetime(2010, 9, 25), datetime(2010, 12, 25),
-                datetime(2011, 3, 26), datetime(2011, 6, 25),
-                datetime(2011, 9, 24), datetime(2011, 12, 24),
-                datetime(2012, 3, 24), datetime(2012, 6, 23),
-                datetime(2012, 9, 29), datetime(2012, 12, 29),
-                datetime(2013, 3, 30), datetime(2013, 6, 29)]
-
-        assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1])
-        assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1),
-                            expected=GMCR[0])
-        assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2])
-
-        assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2])
-        assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4])
-
-        assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2])
-        assert_offset_equal(offset_neg1,
-                            base=GMCR[-1] + relativedelta(days=+1),
-                            expected=GMCR[-1])
-        assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3])
-
-        date = GMCR[0] + relativedelta(days=-1)
-        for expected in GMCR:
-            assert_offset_equal(offset, date, expected)
-            date = date + offset
-
-        date = GMCR[-1] + relativedelta(days=+1)
-        for expected in reversed(GMCR):
-            assert_offset_equal(offset_neg1, date, expected)
-            date = date + offset_neg1
-
-    lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8,
-                                                  weekday=WeekDay.SAT,
-                                                  qtr_with_extra_week=4)
-    lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
-                                                  weekday=WeekDay.SAT,
-                                                  qtr_with_extra_week=4)
-
-    on_offset_cases = [
-        # From Wikipedia
-        (lomq_aug_sat_4, datetime(2006, 8, 26), True),
-        (lomq_aug_sat_4, datetime(2007, 8, 25), True),
-        (lomq_aug_sat_4, datetime(2008, 8, 30), True),
-        (lomq_aug_sat_4, datetime(2009, 8, 29), True),
-        (lomq_aug_sat_4, datetime(2010, 8, 28), True),
-        (lomq_aug_sat_4, datetime(2011, 8, 27), True),
-        (lomq_aug_sat_4, datetime(2019, 8, 31), True),
-
-        (lomq_aug_sat_4, datetime(2006, 8, 27), False),
-        (lomq_aug_sat_4, datetime(2007, 8, 28), False),
-        (lomq_aug_sat_4, datetime(2008, 8, 31), False),
-        (lomq_aug_sat_4, datetime(2009, 8, 30), False),
-        (lomq_aug_sat_4, datetime(2010, 8, 29), False),
-        (lomq_aug_sat_4, datetime(2011, 8, 28), False),
-
-        (lomq_aug_sat_4, datetime(2006, 8, 25), False),
-        (lomq_aug_sat_4, datetime(2007, 8, 24), False),
-        (lomq_aug_sat_4, datetime(2008, 8, 29), False),
-        (lomq_aug_sat_4, datetime(2009, 8, 28), False),
-        (lomq_aug_sat_4, datetime(2010, 8, 27), False),
-        (lomq_aug_sat_4, datetime(2011, 8, 26), False),
-        (lomq_aug_sat_4, datetime(2019, 8, 30), False),
-
-        # From GMCR
-        (lomq_sep_sat_4, datetime(2010, 9, 25), True),
-        (lomq_sep_sat_4, datetime(2011, 9, 24), True),
-        (lomq_sep_sat_4, datetime(2012, 9, 29), True),
-
-        (lomq_sep_sat_4, datetime(2013, 6, 29), True),
-        (lomq_sep_sat_4, datetime(2012, 6, 23), True),
-        (lomq_sep_sat_4, datetime(2012, 6, 30), False),
-
-        (lomq_sep_sat_4, datetime(2013, 3, 30), True),
-        (lomq_sep_sat_4, datetime(2012, 3, 24), True),
-
-        (lomq_sep_sat_4, datetime(2012, 12, 29), True),
-        (lomq_sep_sat_4, datetime(2011, 12, 24), True),
-
-        # INTC (extra week in Q1)
-        # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2011, 4, 2), True),
-
-        # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2012, 12, 29), True),
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2011, 12, 31), True),
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2010, 12, 25), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_year_has_extra_week(self):
-        # End of long Q1
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2))
-
-        # Start of long Q1
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26))
-
-        # End of year before year with long Q1
-        assert not makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25))
-
-        for year in [x
-                     for x in range(1994, 2011 + 1)
-                     if x not in [2011, 2005, 2000, 1994]]:
-            assert not makeFY5253LastOfMonthQuarter(
-                1, startingMonth=12, weekday=WeekDay.SAT,
-                qtr_with_extra_week=1).year_has_extra_week(
-                datetime(year, 4, 2))
-
-        # Other long years
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2))
-
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2))
-
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2))
-
-    def test_get_weeks(self):
-        sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                                 weekday=WeekDay.SAT,
-                                                 qtr_with_extra_week=1)
-        sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                                 weekday=WeekDay.SAT,
-                                                 qtr_with_extra_week=4)
-
-        assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13]
-        assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14]
-        assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13]
-
-
-class TestFY5253NearestEndMonthQuarter(Base):
-
-    offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter(
-        1, startingMonth=8, weekday=WeekDay.SAT,
-        qtr_with_extra_week=4)
-    offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter(
-        1, startingMonth=8, weekday=WeekDay.THU,
-        qtr_with_extra_week=4)
-    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                      variation="nearest")
-
-    on_offset_cases = [
-        # From Wikipedia
-        (offset_nem_sat_aug_4, datetime(2006, 9, 2), True),
-        (offset_nem_sat_aug_4, datetime(2007, 9, 1), True),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 30), True),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 29), True),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 28), True),
-        (offset_nem_sat_aug_4, datetime(2011, 9, 3), True),
-
-        (offset_nem_sat_aug_4, datetime(2016, 9, 3), True),
-        (offset_nem_sat_aug_4, datetime(2017, 9, 2), True),
-        (offset_nem_sat_aug_4, datetime(2018, 9, 1), True),
-        (offset_nem_sat_aug_4, datetime(2019, 8, 31), True),
-
-        (offset_nem_sat_aug_4, datetime(2006, 8, 27), False),
-        (offset_nem_sat_aug_4, datetime(2007, 8, 28), False),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 31), False),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 30), False),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 29), False),
-        (offset_nem_sat_aug_4, datetime(2011, 8, 28), False),
-
-        (offset_nem_sat_aug_4, datetime(2006, 8, 25), False),
-        (offset_nem_sat_aug_4, datetime(2007, 8, 24), False),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 29), False),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 28), False),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 27), False),
-        (offset_nem_sat_aug_4, datetime(2011, 8, 26), False),
-        (offset_nem_sat_aug_4, datetime(2019, 8, 30), False),
-
-        # From Micron, see:
-        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
-        (offset_nem_thu_aug_4, datetime(2012, 8, 30), True),
-        (offset_nem_thu_aug_4, datetime(2011, 9, 1), True),
-
-        # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13
-        (offset_nem_thu_aug_4, datetime(2013, 5, 30), True),
-        (offset_nem_thu_aug_4, datetime(2013, 2, 28), True),
-        (offset_nem_thu_aug_4, datetime(2012, 11, 29), True),
-        (offset_nem_thu_aug_4, datetime(2012, 5, 31), True),
-        (offset_nem_thu_aug_4, datetime(2007, 3, 1), True),
-        (offset_nem_thu_aug_4, datetime(1994, 3, 3), True),
-
-        (offset_n, datetime(2012, 12, 31), False),
-        (offset_n, datetime(2013, 1, 1), True),
-        (offset_n, datetime(2013, 1, 2), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_offset(self):
-        offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8,
-                                                  weekday=WeekDay.THU,
-                                                  qtr_with_extra_week=4)
-
-        MU = [datetime(2012, 5, 31),
-              datetime(2012, 8, 30), datetime(2012, 11, 29),
-              datetime(2013, 2, 28), datetime(2013, 5, 30)]
-
-        date = MU[0] + relativedelta(days=-1)
-        for expected in MU:
-            assert_offset_equal(offset, date, expected)
-            date = date + offset
-
-        assert_offset_equal(offset,
-                            datetime(2012, 5, 31),
-                            datetime(2012, 8, 30))
-        assert_offset_equal(offset,
-                            datetime(2012, 5, 30),
-                            datetime(2012, 5, 31))
-
-        offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last",
-                                qtr_with_extra_week=4)
-
-        assert_offset_equal(offset2,
-                            datetime(2013, 1, 15),
-                            datetime(2013, 3, 30))
-
-
-class TestQuarterBegin(Base):
-
-    def test_repr(self):
-        assert (repr(QuarterBegin()) ==
-                "<QuarterBegin: startingMonth=3>")
-        assert (repr(QuarterBegin(startingMonth=3)) ==
-                "<QuarterBegin: startingMonth=3>")
-        assert (repr(QuarterBegin(startingMonth=1)) ==
-                "<QuarterBegin: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert QuarterBegin(startingMonth=1).isAnchored()
-        assert QuarterBegin().isAnchored()
-        assert not QuarterBegin(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((QuarterBegin(startingMonth=1),
-                         {datetime(2007, 12, 1): datetime(2008, 1, 1),
-                          datetime(2008, 1, 1): datetime(2008, 4, 1),
-                          datetime(2008, 2, 15): datetime(2008, 4, 1),
-                          datetime(2008, 2, 29): datetime(2008, 4, 1),
-                          datetime(2008, 3, 15): datetime(2008, 4, 1),
-                          datetime(2008, 3, 31): datetime(2008, 4, 1),
-                          datetime(2008, 4, 15): datetime(2008, 7, 1),
-                          datetime(2008, 4, 1): datetime(2008, 7, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 1),
-                          datetime(2008, 1, 31): datetime(2008, 2, 1),
-                          datetime(2008, 1, 15): datetime(2008, 2, 1),
-                          datetime(2008, 2, 29): datetime(2008, 5, 1),
-                          datetime(2008, 3, 15): datetime(2008, 5, 1),
-                          datetime(2008, 3, 31): datetime(2008, 5, 1),
-                          datetime(2008, 4, 15): datetime(2008, 5, 1),
-                          datetime(2008, 4, 30): datetime(2008, 5, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 12, 1): datetime(2009, 1, 1),
-                          datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 2, 15): datetime(2008, 4, 1),
-                          datetime(2008, 2, 29): datetime(2008, 4, 1),
-                          datetime(2008, 3, 15): datetime(2008, 4, 1),
-                          datetime(2008, 3, 31): datetime(2008, 4, 1),
-                          datetime(2008, 4, 15): datetime(2008, 7, 1),
-                          datetime(2008, 4, 30): datetime(2008, 7, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 1),
-                          datetime(2008, 1, 31): datetime(2008, 1, 1),
-                          datetime(2008, 2, 15): datetime(2008, 1, 1),
-                          datetime(2008, 2, 29): datetime(2008, 1, 1),
-                          datetime(2008, 3, 15): datetime(2008, 1, 1),
-                          datetime(2008, 3, 31): datetime(2008, 1, 1),
-                          datetime(2008, 4, 15): datetime(2008, 4, 1),
-                          datetime(2008, 4, 30): datetime(2008, 4, 1),
-                          datetime(2008, 7, 1): datetime(2008, 4, 1)}))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=2),
-                         {datetime(2008, 1, 1): datetime(2008, 7, 1),
-                          datetime(2008, 2, 15): datetime(2008, 7, 1),
-                          datetime(2008, 2, 29): datetime(2008, 7, 1),
-                          datetime(2008, 3, 15): datetime(2008, 7, 1),
-                          datetime(2008, 3, 31): datetime(2008, 7, 1),
-                          datetime(2008, 4, 15): datetime(2008, 10, 1),
-                          datetime(2008, 4, 1): datetime(2008, 10, 1), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = QuarterBegin(n=-1, startingMonth=1)
-        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
-
-
-class TestQuarterEnd(Base):
-    _offset = QuarterEnd
-
-    def test_repr(self):
-        assert (repr(QuarterEnd()) ==
-                "<QuarterEnd: startingMonth=3>")
-        assert (repr(QuarterEnd(startingMonth=3)) ==
-                "<QuarterEnd: startingMonth=3>")
-        assert (repr(QuarterEnd(startingMonth=1)) ==
-                "<QuarterEnd: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert QuarterEnd(startingMonth=1).isAnchored()
-        assert QuarterEnd().isAnchored()
-        assert not QuarterEnd(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((QuarterEnd(startingMonth=1),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 4, 30),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 7, 31), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                          datetime(2008, 1, 31): datetime(2008, 2, 29),
-                          datetime(2008, 2, 15): datetime(2008, 2, 29),
-                          datetime(2008, 2, 29): datetime(2008, 5, 31),
-                          datetime(2008, 3, 15): datetime(2008, 5, 31),
-                          datetime(2008, 3, 31): datetime(2008, 5, 31),
-                          datetime(2008, 4, 15): datetime(2008, 5, 31),
-                          datetime(2008, 4, 30): datetime(2008, 5, 31), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 1, 31),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 4, 30), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 31),
-                          datetime(2008, 1, 31): datetime(2007, 10, 31),
-                          datetime(2008, 2, 15): datetime(2008, 1, 31),
-                          datetime(2008, 2, 29): datetime(2008, 1, 31),
-                          datetime(2008, 3, 15): datetime(2008, 1, 31),
-                          datetime(2008, 3, 31): datetime(2008, 1, 31),
-                          datetime(2008, 4, 15): datetime(2008, 1, 31),
-                          datetime(2008, 4, 30): datetime(2008, 1, 31),
-                          datetime(2008, 7, 1): datetime(2008, 4, 30)}))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=2),
-                         {datetime(2008, 1, 31): datetime(2008, 7, 31),
-                          datetime(2008, 2, 15): datetime(2008, 7, 31),
-                          datetime(2008, 2, 29): datetime(2008, 7, 31),
-                          datetime(2008, 3, 15): datetime(2008, 7, 31),
-                          datetime(2008, 3, 31): datetime(2008, 7, 31),
-                          datetime(2008, 4, 15): datetime(2008, 7, 31),
-                          datetime(2008, 4, 30): datetime(2008, 10, 31), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = QuarterEnd(n=-1, startingMonth=1)
-        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
-
-    on_offset_cases = [
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-
-class TestBYearBegin(Base):
-    _offset = BYearBegin
-
-    def test_misspecified(self):
-        pytest.raises(ValueError, BYearBegin, month=13)
-        pytest.raises(ValueError, BYearEnd, month=13)
-
-    offset_cases = []
-    offset_cases.append((BYearBegin(),
-                         {datetime(2008, 1, 1): datetime(2009, 1, 1),
-                          datetime(2008, 6, 30): datetime(2009, 1, 1),
-                          datetime(2008, 12, 31): datetime(2009, 1, 1),
-                          datetime(2011, 1, 1): datetime(2011, 1, 3),
-                          datetime(2011, 1, 3): datetime(2012, 1, 2),
-                          datetime(2005, 12, 30): datetime(2006, 1, 2),
-                          datetime(2005, 12, 31): datetime(2006, 1, 2)}))
-
-    offset_cases.append((BYearBegin(0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 6, 30): datetime(2009, 1, 1),
-                          datetime(2008, 12, 31): datetime(2009, 1, 1),
-                          datetime(2005, 12, 30): datetime(2006, 1, 2),
-                          datetime(2005, 12, 31): datetime(2006, 1, 2), }))
-
-    offset_cases.append((BYearBegin(-1),
-                         {datetime(2007, 1, 1): datetime(2006, 1, 2),
-                          datetime(2009, 1, 4): datetime(2009, 1, 1),
-                          datetime(2009, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 6, 30): datetime(2008, 1, 1),
-                          datetime(2008, 12, 31): datetime(2008, 1, 1),
-                          datetime(2006, 12, 29): datetime(2006, 1, 2),
datetime(2006, 12, 30): datetime(2006, 1, 2), - datetime(2006, 1, 1): datetime(2005, 1, 3), })) - - offset_cases.append((BYearBegin(-2), - {datetime(2007, 1, 1): datetime(2005, 1, 3), - datetime(2007, 6, 30): datetime(2006, 1, 2), - datetime(2008, 12, 31): datetime(2007, 1, 1), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - -class TestYearBegin(Base): - _offset = YearBegin - - def test_misspecified(self): - pytest.raises(ValueError, YearBegin, month=13) - - offset_cases = [] - offset_cases.append((YearBegin(), - {datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(0), - {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(3), - {datetime(2008, 1, 1): datetime(2011, 1, 1), - datetime(2008, 6, 30): datetime(2011, 1, 1), - datetime(2008, 12, 31): datetime(2011, 1, 1), - datetime(2005, 12, 30): datetime(2008, 1, 1), - datetime(2005, 12, 31): datetime(2008, 1, 1), })) - - offset_cases.append((YearBegin(-1), - {datetime(2007, 1, 1): datetime(2006, 1, 1), - datetime(2007, 1, 15): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): datetime(2006, 1, 1), - datetime(2006, 12, 30): datetime(2006, 1, 1), - datetime(2007, 1, 1): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(-2), - {datetime(2007, 1, 1): datetime(2005, 1, 1), - datetime(2008, 6, 30): datetime(2007, 1, 1), - datetime(2008, 12, 31): datetime(2007, 1, 1), })) - - offset_cases.append((YearBegin(month=4), - {datetime(2007, 4, 1): datetime(2008, 4, 1), - datetime(2007, 4, 15): datetime(2008, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1), })) - - offset_cases.append((YearBegin(0, month=4), - {datetime(2007, 4, 1): datetime(2007, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1), })) - - offset_cases.append((YearBegin(4, month=4), - {datetime(2007, 4, 1): datetime(2011, 4, 1), - datetime(2007, 4, 15): datetime(2011, 4, 1), - datetime(2007, 3, 1): datetime(2010, 4, 1), - datetime(2007, 12, 15): datetime(2011, 4, 1), - datetime(2012, 1, 31): datetime(2015, 4, 1), })) - - offset_cases.append((YearBegin(-1, month=4), - {datetime(2007, 4, 1): datetime(2006, 4, 1), - datetime(2007, 3, 1): datetime(2006, 4, 1), - datetime(2007, 12, 15): datetime(2007, 4, 1), - datetime(2012, 1, 31): datetime(2011, 4, 1), })) - - offset_cases.append((YearBegin(-3, month=4), - {datetime(2007, 4, 1): datetime(2004, 4, 1), - datetime(2007, 3, 1): datetime(2004, 4, 1), - datetime(2007, 12, 15): datetime(2005, 4, 1), - datetime(2012, 1, 31): datetime(2009, 4, 1), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases 
= [(YearBegin(), datetime(2007, 1, 3), False), - (YearBegin(), datetime(2008, 1, 1), True), - (YearBegin(), datetime(2006, 12, 31), False), - (YearBegin(), datetime(2006, 1, 2), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestBYearEndLagged(Base): - - def test_bad_month_fail(self): - pytest.raises(Exception, BYearEnd, month=13) - pytest.raises(Exception, BYearEnd, month=0) - - offset_cases = [] - offset_cases.append((BYearEnd(month=6), - {datetime(2008, 1, 1): datetime(2008, 6, 30), - datetime(2007, 6, 30): datetime(2008, 6, 30)}, )) - - offset_cases.append((BYearEnd(n=-1, month=6), - {datetime(2008, 1, 1): datetime(2007, 6, 29), - datetime(2007, 6, 30): datetime(2007, 6, 29)}, )) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert base + offset == expected - - def test_roll(self): - offset = BYearEnd(month=6) - date = datetime(2009, 11, 30) - - assert offset.rollforward(date) == datetime(2010, 6, 30) - assert offset.rollback(date) == datetime(2009, 6, 30) - - on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), - (BYearEnd(month=6), datetime(2007, 6, 30), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestBYearEnd(Base): - _offset = BYearEnd - - offset_cases = [] - offset_cases.append((BYearEnd(), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2006, 12, 29), - datetime(2005, 12, 31): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(0), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(-2), - {datetime(2007, 1, 1): datetime(2005, 12, 30), - datetime(2008, 6, 30): datetime(2006, 12, 29), - datetime(2008, 12, 31): datetime(2006, 12, 29), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), - (BYearEnd(), datetime(2008, 1, 1), False), - (BYearEnd(), datetime(2006, 12, 31), False), - (BYearEnd(), datetime(2006, 12, 29), True)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestYearEnd(Base): - _offset = YearEnd - - def test_misspecified(self): - pytest.raises(ValueError, YearEnd, month=13) - - offset_cases = [] - offset_cases.append((YearEnd(), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): 
datetime(2005, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 31), })) - - offset_cases.append((YearEnd(0), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31), })) - - offset_cases.append((YearEnd(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 31), - datetime(2006, 12, 30): datetime(2005, 12, 31), - datetime(2007, 1, 1): datetime(2006, 12, 31), })) - - offset_cases.append((YearEnd(-2), - {datetime(2007, 1, 1): datetime(2005, 12, 31), - datetime(2008, 6, 30): datetime(2006, 12, 31), - datetime(2008, 12, 31): datetime(2006, 12, 31), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), - (YearEnd(), datetime(2008, 1, 1), False), - (YearEnd(), datetime(2006, 12, 31), True), - (YearEnd(), datetime(2006, 12, 29), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestYearEndDiffMonth(Base): - - offset_cases = [] - offset_cases.append((YearEnd(month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 15): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2009, 3, 31), - datetime(2008, 3, 30): datetime(2008, 3, 31), - datetime(2005, 3, 31): datetime(2006, 3, 31), - datetime(2006, 7, 30): datetime(2007, 3, 31)})) - - offset_cases.append((YearEnd(0, month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 28): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2008, 3, 31), - datetime(2005, 3, 30): datetime(2005, 3, 31), })) - - offset_cases.append((YearEnd(-1, month=3), - {datetime(2007, 1, 1): datetime(2006, 3, 31), - datetime(2008, 2, 28): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2007, 3, 31), - datetime(2006, 3, 29): datetime(2005, 3, 31), - datetime(2006, 3, 30): datetime(2005, 3, 31), - datetime(2007, 3, 1): datetime(2006, 3, 31), })) - - offset_cases.append((YearEnd(-2, month=3), - {datetime(2007, 1, 1): datetime(2005, 3, 31), - datetime(2008, 6, 30): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2006, 3, 31), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), - (YearEnd(month=3), datetime(2008, 1, 1), False), - (YearEnd(month=3), datetime(2006, 3, 31), True), - (YearEnd(month=3), datetime(2006, 3, 29), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - def test_Easter(): assert_offset_equal(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)) assert_offset_equal(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)) @@ -4284,12 +2773,6 @@ def test_get_offset_name(self): assert Week(weekday=4).freqstr == 'W-FRI' assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN" - assert (makeFY5253LastOfMonthQuarter( - weekday=1, 
startingMonth=3, - qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4") - assert (makeFY5253NearestEndMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3") def test_get_offset(): @@ -4302,17 +2785,7 @@ def test_get_offset(): ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)), - ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4)), - ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, - startingMonth=12)), - ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), - ("REQ-L-MAR-TUE-4", makeFY5253LastOfMonthQuarter( - weekday=1, startingMonth=3, qtr_with_extra_week=4)), - ("REQ-L-DEC-MON-3", makeFY5253LastOfMonthQuarter( - weekday=0, startingMonth=12, qtr_with_extra_week=3)), - ("REQ-N-DEC-MON-3", makeFY5253NearestEndMonthQuarter( - weekday=0, startingMonth=12, qtr_with_extra_week=3)), - ] + ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4))] for name, expected in pairs: offset = get_offset(name) @@ -4380,16 +2853,6 @@ def test_get_standard_freq(): assert fstr == get_standard_freq(('q', 5)) -def test_quarterly_dont_normalize(): - date = datetime(2012, 3, 31, 5, 30) - - offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) - - for klass in offsets: - result = date + klass() - assert (result.time() == date.time()) - - class TestOffsetAliases(object): def setup_method(self, method): @@ -4682,9 +3145,45 @@ def test_all_offset_classes(self, tup): assert first == second +# --------------------------------------------------------------------- def test_get_offset_day_error(): # subclass of _BaseOffset must override _day_opt attribute, or we should # get a NotImplementedError with pytest.raises(NotImplementedError): DateOffset()._get_offset_day(datetime.now()) + + +@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +def test_valid_month_attributes(kwd, month_classes): + # GH#18226 + cls = month_classes + # check that we cannot create e.g. MonthEnd(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +def test_valid_tick_attributes(kwd, tick_classes): + # GH#18226 + cls = tick_classes + # check that we cannot create e.g. 
Hour(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +def test_validate_n_error(): + with pytest.raises(TypeError): + DateOffset(n='Doh!') + + with pytest.raises(TypeError): + MonthBegin(n=timedelta(1)) + + with pytest.raises(TypeError): + BDay(n=np.array([1, 2], dtype=np.int64)) + + +def test_require_integers(offset_types): + cls = offset_types + with pytest.raises(ValueError): + cls(n=1.5) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py new file mode 100644 index 0000000000000..22b8cf6119d18 --- /dev/null +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -0,0 +1,1030 @@ +# -*- coding: utf-8 -*- +""" +Tests for Year, Quarter, and Month-based DateOffset subclasses +""" +from datetime import datetime + +import pytest + +import pandas as pd +from pandas import Timestamp +from pandas import compat + +from pandas.tseries.offsets import (BMonthBegin, BMonthEnd, + MonthBegin, MonthEnd, + YearEnd, YearBegin, BYearEnd, BYearBegin, + QuarterEnd, QuarterBegin, + BQuarterEnd, BQuarterBegin) + +from .test_offsets import Base +from .common import assert_offset_equal, assert_onOffset + + +# -------------------------------------------------------------------- +# Misc + +def test_quarterly_dont_normalize(): + date = datetime(2012, 3, 31, 5, 30) + + offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) + + for klass in offsets: + result = date + klass() + assert (result.time() == date.time()) + + +@pytest.mark.parametrize('n', [-2, 1]) +@pytest.mark.parametrize('cls', [MonthBegin, MonthEnd, + BMonthBegin, BMonthEnd, + QuarterBegin, QuarterEnd, + BQuarterBegin, BQuarterEnd, + YearBegin, YearEnd, + BYearBegin, BYearEnd]) +def test_apply_index(cls, n): + offset = cls(n=n) + rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') + ser = pd.Series(rng) + + res = rng + offset + res_v2 = offset.apply_index(rng) + assert (res == res_v2).all() + assert res[0] == rng[0] + offset + assert res[-1] == rng[-1] + offset + res2 = ser + offset + # apply_index is only for indexes, not series, so no res2_v2 + assert res2.iloc[0] == ser.iloc[0] + offset + assert res2.iloc[-1] == ser.iloc[-1] + offset + + +@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(), + BQuarterBegin(), BQuarterEnd()]) +def test_on_offset(offset): + dates = [datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)] + for date in dates: + res = offset.onOffset(date) + slow_version = date == (date + offset) - offset + assert res == slow_version + + +# -------------------------------------------------------------------- +# Months + +class TestMonthBegin(Base): + _offset = MonthBegin + + offset_cases = [] + # NOTE: I'm not entirely happy with the logic here for Begin -ss + # see thread 'offset conventions' on the ML + offset_cases.append((MonthBegin(), { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + offset_cases.append((MonthBegin(0), { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + offset_cases.append((MonthBegin(2), { + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): 
datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + offset_cases.append((MonthBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + +class TestMonthEnd(Base): + _offset = MonthEnd + + def test_day_of_month(self): + dt = datetime(2007, 1, 1) + offset = MonthEnd() + + result = dt + offset + assert result == Timestamp(2007, 1, 31) + + result = result + offset + assert result == Timestamp(2007, 2, 28) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + MonthEnd(normalize=True) + expected = dt.replace(hour=0) + MonthEnd() + assert result == expected + + offset_cases = [] + offset_cases.append((MonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31)})) + + offset_cases.append((MonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + offset_cases.append((MonthEnd(2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31)})) + + offset_cases.append((MonthEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBMonthBegin(Base): + _offset = BMonthBegin + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthBegin() + offset2 = BMonthBegin() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append((BMonthBegin(), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1)})) + + offset_cases.append((BMonthBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 
1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2)})) + + offset_cases.append((BMonthBegin(2), { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + offset_cases.append((BMonthBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBMonthEnd(Base): + _offset = BMonthEnd + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + BMonthEnd(normalize=True) + expected = dt.replace(hour=0) + BMonthEnd() + assert result == expected + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthEnd() + offset2 = BMonthEnd() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append((BMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BMonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + offset_cases.append((BMonthEnd(2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BMonthEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 
1), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + +# -------------------------------------------------------------------- +# Quarters + + +class TestQuarterBegin(Base): + + def test_repr(self): + expected = "<QuarterBegin: startingMonth=3>" + assert repr(QuarterBegin()) == expected + expected = "<QuarterBegin: startingMonth=3>" + assert repr(QuarterBegin(startingMonth=3)) == expected + expected = "<QuarterBegin: startingMonth=1>" + assert repr(QuarterBegin(startingMonth=1)) == expected + + def test_isAnchored(self): + assert QuarterBegin(startingMonth=1).isAnchored() + assert QuarterBegin().isAnchored() + assert not QuarterBegin(2, startingMonth=1).isAnchored() + + def test_offset_corner_case(self): + # corner + offset = QuarterBegin(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) + + offset_cases = [] + offset_cases.append((QuarterBegin(startingMonth=1), { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1)})) + + offset_cases.append((QuarterBegin(startingMonth=2), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1)})) + + offset_cases.append((QuarterBegin(startingMonth=1, n=0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1)})) + + offset_cases.append((QuarterBegin(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1)})) + + offset_cases.append((QuarterBegin(startingMonth=1, n=2), { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + +
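The tables above encode the rolling rules for anchored quarter offsets: with the default n=1 any date rolls forward to the next anchor, n=0 snaps forward only when the date is not already an anchor, and negative n rolls backward. A minimal sketch of those rules, reusing cases from the startingMonth=1 tables above (assumes a pandas version contemporary with this patch):

```python
from datetime import datetime

from pandas.tseries.offsets import QuarterBegin

# startingMonth=1 anchors quarters to the 1st of Jan/Apr/Jul/Oct.
offset = QuarterBegin(startingMonth=1)
assert datetime(2008, 2, 15) + offset == datetime(2008, 4, 1)  # mid-quarter rolls forward
assert datetime(2008, 4, 1) + offset == datetime(2008, 7, 1)   # on-anchor still advances when n=1

# n=0 snaps forward only when the date is not already an anchor.
assert datetime(2008, 1, 1) + QuarterBegin(startingMonth=1, n=0) == datetime(2008, 1, 1)

# negative n rolls back to the previous anchor (the corner case above).
assert datetime(2010, 2, 1) + QuarterBegin(startingMonth=1, n=-1) == datetime(2010, 1, 1)
```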
+class TestQuarterEnd(Base): + _offset = QuarterEnd + + def test_repr(self): + expected = "<QuarterEnd: startingMonth=3>" + assert repr(QuarterEnd()) == expected + expected = "<QuarterEnd: startingMonth=3>" + assert repr(QuarterEnd(startingMonth=3)) == expected + expected = "<QuarterEnd: startingMonth=1>" + assert repr(QuarterEnd(startingMonth=1)) == expected + + def test_isAnchored(self): + assert QuarterEnd(startingMonth=1).isAnchored() + assert QuarterEnd().isAnchored() + assert not QuarterEnd(2, startingMonth=1).isAnchored() + + def test_offset_corner_case(self): + # corner + offset = QuarterEnd(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) + + offset_cases = [] + offset_cases.append((QuarterEnd(startingMonth=1), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31)})) + + offset_cases.append((QuarterEnd(startingMonth=2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=2), { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + +
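The on_offset table above is the fast path for what test_on_offset, earlier in this new file, checks by brute force: a date is on-offset exactly when adding and then subtracting the anchored offset leaves it unchanged. A small sketch of that equivalence for QuarterEnd, using two cases from the table (illustrative only, not part of the patch):

```python
from datetime import datetime

from pandas.tseries.offsets import QuarterEnd

offset = QuarterEnd(1, startingMonth=1)

# 2008-01-31 ends a Jan-anchored quarter; 2008-02-29 does not.
for date, expected in [(datetime(2008, 1, 31), True),
                       (datetime(2008, 2, 29), False)]:
    assert offset.onOffset(date) == expected
    # the "slow" round-trip definition from test_on_offset agrees
    assert (date == (date + offset) - offset) == expected
```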
+class TestBQuarterBegin(Base): + _offset = BQuarterBegin + + def test_repr(self): + expected = "<BusinessQuarterBegin: startingMonth=3>" + assert repr(BQuarterBegin()) == expected + expected = "<BusinessQuarterBegin: startingMonth=3>" + assert repr(BQuarterBegin(startingMonth=3)) == expected + expected = "<BusinessQuarterBegin: startingMonth=1>" + assert repr(BQuarterBegin(startingMonth=1)) == expected + + def test_isAnchored(self): + assert BQuarterBegin(startingMonth=1).isAnchored() + assert BQuarterBegin().isAnchored() + assert not BQuarterBegin(2, startingMonth=1).isAnchored() + + def test_offset_corner_case(self): + # corner + offset = BQuarterBegin(n=-1, startingMonth=1) + assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2) + + offset_cases = [] + offset_cases.append((BQuarterBegin(startingMonth=1), { + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1)})) + + offset_cases.append((BQuarterBegin(startingMonth=2), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1)})) + + offset_cases.append((BQuarterBegin(startingMonth=1, n=0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2)})) + + offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1)})) + + offset_cases.append((BQuarterBegin(startingMonth=1, n=2), { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + +
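TestBQuarterBegin above differs from TestQuarterBegin only in where the anchors land: business-quarter anchors are shifted off weekends, which is why several 2007 cases map to April 2 rather than April 1 (2007-04-01 fell on a Sunday). A short sketch of the contrast; the BQuarterBegin case is taken from the table above, while the QuarterBegin expectation is inferred from its 2008 cases and is an assumption of this sketch:

```python
from datetime import datetime

from pandas.tseries.offsets import BQuarterBegin, QuarterBegin

# 2007-04-01 is a Sunday: the plain quarter anchor stays on the 1st,
# while the business-quarter anchor rolls to Monday the 2nd.
assert datetime(2007, 1, 1) + QuarterBegin(startingMonth=1) == datetime(2007, 4, 1)
assert datetime(2007, 1, 1) + BQuarterBegin(startingMonth=1) == datetime(2007, 4, 2)
```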
+class TestBQuarterEnd(Base): + _offset = BQuarterEnd + + def test_repr(self): + expected = "<BusinessQuarterEnd: startingMonth=3>" + assert repr(BQuarterEnd()) == expected + expected = "<BusinessQuarterEnd: startingMonth=3>" + assert repr(BQuarterEnd(startingMonth=3)) == expected + expected = "<BusinessQuarterEnd: startingMonth=1>" + assert repr(BQuarterEnd(startingMonth=1)) == expected + + def test_isAnchored(self): + assert BQuarterEnd(startingMonth=1).isAnchored() + assert BQuarterEnd().isAnchored() + assert not BQuarterEnd(2, startingMonth=1).isAnchored() + + def test_offset_corner_case(self): + # corner + offset = BQuarterEnd(n=-1, startingMonth=1) + assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29) + + offset_cases = [] + offset_cases.append((BQuarterEnd(startingMonth=1), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31)})) + + offset_cases.append((BQuarterEnd(startingMonth=2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30)})) + + offset_cases.append((BQuarterEnd(startingMonth=1, n=0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), +
datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30)})) + + offset_cases.append((BQuarterEnd(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31)})) + + offset_cases.append((BQuarterEnd(startingMonth=1, n=2), { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + +# -------------------------------------------------------------------- +# Years + + +class TestYearBegin(Base): + _offset = YearBegin + + def test_misspecified(self): + pytest.raises(ValueError, YearBegin, month=13) + + offset_cases = [] + offset_cases.append((YearBegin(), { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + 
datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(3), { + datetime(2008, 1, 1): datetime(2011, 1, 1), + datetime(2008, 6, 30): datetime(2011, 1, 1), + datetime(2008, 12, 31): datetime(2011, 1, 1), + datetime(2005, 12, 30): datetime(2008, 1, 1), + datetime(2005, 12, 31): datetime(2008, 1, 1)})) + + offset_cases.append((YearBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(-2), { + datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1)})) + + offset_cases.append((YearBegin(month=4), { + datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1)})) + + offset_cases.append((YearBegin(0, month=4), { + datetime(2007, 4, 1): datetime(2007, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1)})) + + offset_cases.append((YearBegin(4, month=4), { + datetime(2007, 4, 1): datetime(2011, 4, 1), + datetime(2007, 4, 15): datetime(2011, 4, 1), + datetime(2007, 3, 1): datetime(2010, 4, 1), + datetime(2007, 12, 15): datetime(2011, 4, 1), + datetime(2012, 1, 31): datetime(2015, 4, 1)})) + + offset_cases.append((YearBegin(-1, month=4), { + datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1)})) + + offset_cases.append((YearBegin(-3, month=4), { + datetime(2007, 4, 1): datetime(2004, 4, 1), + datetime(2007, 3, 1): datetime(2004, 4, 1), + datetime(2007, 12, 15): datetime(2005, 4, 1), + datetime(2012, 1, 31): datetime(2009, 4, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestYearEnd(Base): + _offset = YearEnd + + def test_misspecified(self): + pytest.raises(ValueError, YearEnd, month=13) + + offset_cases = [] + offset_cases.append((YearEnd(), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31)})) + + offset_cases.append((YearEnd(0), { + 
datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31)})) + + offset_cases.append((YearEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + offset_cases.append((YearEnd(-2), { + datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestYearEndDiffMonth(Base): + offset_cases = [] + offset_cases.append((YearEnd(month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31)})) + + offset_cases.append((YearEnd(0, month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31)})) + + offset_cases.append((YearEnd(-1, month=3), + {datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31)})) + + offset_cases.append((YearEnd(-2, month=3), + {datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBYearBegin(Base): + _offset = BYearBegin + + def test_misspecified(self): + pytest.raises(ValueError, BYearBegin, month=13) + pytest.raises(ValueError, BYearEnd, month=13) + + offset_cases = [] + offset_cases.append((BYearBegin(), { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + 
datetime(2005, 12, 31): datetime(2006, 1, 2)})) + + offset_cases.append((BYearBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2)})) + + offset_cases.append((BYearBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 3)})) + + offset_cases.append((BYearBegin(-2), { + datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + +class TestBYearEnd(Base): + _offset = BYearEnd + + offset_cases = [] + offset_cases.append((BYearEnd(), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(0), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(-2), { + datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBYearEndLagged(Base): + _offset = BYearEnd + + def test_bad_month_fail(self): + pytest.raises(Exception, BYearEnd, month=13) + pytest.raises(Exception, BYearEnd, month=0) + + offset_cases = [] + offset_cases.append((BYearEnd(month=6), { + datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30)})) + + offset_cases.append((BYearEnd(n=-1, month=6), { + datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + def test_roll(self): + 
offset = BYearEnd(month=6) + date = datetime(2009, 11, 30) + + assert offset.rollforward(date) == datetime(2010, 6, 30) + assert offset.rollback(date) == datetime(2009, 6, 30) + + on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 9666a4c154c63..beea6df086b72 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -720,15 +720,15 @@ def _check_generated_range(self, start, freq): def test_infer_freq(self): rng = period_range('1959Q2', '2009Q3', freq='Q') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-DEC' rng = period_range('1959Q2', '2009Q3', freq='Q-NOV') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-NOV' rng = period_range('1959Q2', '2009Q3', freq='Q-OCT') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-OCT' def test_infer_freq_tz(self): diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 3dfad2d4af75e..5fd2089d234c1 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -424,7 +424,7 @@ def test_with_tz(self): # datetimes with tzinfo set dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - '1/1/2009', tz=pytz.utc) + datetime(2009, 1, 1, tzinfo=pytz.utc)) pytest.raises(Exception, bdate_range, datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', @@ -688,7 +688,7 @@ def test_index_astype_asobject_tzinfos(self): # dates around a dst transition rng = date_range('2/13/2010', '5/6/2010', tz=self.tzstr('US/Eastern')) - objs = rng.asobject + objs = rng.astype(object) for i, x in enumerate(objs): exval = rng[i] assert x == exval @@ -1552,8 +1552,8 @@ def test_append_aware_naive(self): ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.asobject.append( - ts2.index.asobject)) + assert ts_result.index.equals(ts1.index.astype(object).append( + ts2.index.astype(object))) # mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') @@ -1561,7 +1561,7 @@ def test_append_aware_naive(self): ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.asobject.append( + assert ts_result.index.equals(ts1.index.astype(object).append( ts2.index)) def test_equal_join_ensure_utc(self): diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 659ce36de6bab..be4e60c6493c8 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -16,6 +16,7 @@ validate_bool_kwarg) import pandas.util.testing as tm +from pandas.util._test_decorators import safe_import class TestDecorators(object): @@ -482,3 +483,20 @@ def test_make_signature(): assert sig == (['old_arg_name', 'new_arg_name', 'mapping=None', 'stacklevel=2'], ['old_arg_name', 'new_arg_name', 'mapping', 'stacklevel']) + + +def test_safe_import(monkeypatch): + assert not 
safe_import("foo") + assert not safe_import("pandas", min_version="99.99.99") + + # Create dummy module to be imported + import types + import sys + mod_name = "hello123" + mod = types.ModuleType(mod_name) + mod.__version__ = "1.5" + + assert not safe_import(mod_name) + monkeypatch.setitem(sys.modules, mod_name, mod) + assert not safe_import(mod_name, min_version="2.0") + assert safe_import(mod_name, min_version="1.0") diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 957352d9ff13f..0e6cbea21493c 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -430,7 +430,7 @@ def merge_class(base, other): if not isinstance(other, list): other = [other] - other_holidays = dict((holiday.name, holiday) for holiday in other) + other_holidays = {holiday.name: holiday for holiday in other} try: base = base.rules @@ -439,7 +439,7 @@ def merge_class(base, other): if not isinstance(base, list): base = [base] - base_holidays = dict((holiday.name, holiday) for holiday in base) + base_holidays = {holiday.name: holiday for holiday in base} other_holidays.update(base_holidays) return list(other_holidays.values()) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 2097fb22b3ec5..857ec9e9881d9 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- +from datetime import date, datetime, timedelta import functools import operator -from datetime import date, datetime, timedelta from pandas.compat import range from pandas import compat import numpy as np from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex, ABCPeriod -from pandas.core.tools.datetimes import to_datetime, normalize_date +from pandas.core.tools.datetimes import to_datetime from pandas.core.common import AbstractMethodError # import after tools, dateutil check @@ -27,7 +27,7 @@ apply_index_wraps, roll_yearday, shift_month, - BeginMixin, EndMixin, + EndMixin, BaseOffset) @@ -103,7 +103,7 @@ def wrapper(self, other): if self.normalize: # normalize_date returns normal datetime - result = normalize_date(result) + result = tslib.normalize_date(result) if tz is not None and result.tzinfo is None: result = tslib._localize_pydatetime(result, tz) @@ -166,7 +166,7 @@ def __add__(date): normalize = False def __init__(self, n=1, normalize=False, **kwds): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self.kwds = kwds @@ -237,9 +237,9 @@ def apply_index(self, i): i = (i.to_period('W') + weeks).to_timestamp() + \ i.to_perioddelta('W') - timedelta_kwds = dict((k, v) for k, v in self.kwds.items() - if k in ['days', 'hours', 'minutes', - 'seconds', 'microseconds']) + timedelta_kwds = {k: v for k, v in self.kwds.items() + if k in ['days', 'hours', 'minutes', + 'seconds', 'microseconds']} if timedelta_kwds: delta = Timedelta(**timedelta_kwds) i = i + (self.n * delta) @@ -473,7 +473,7 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): _adjust_dst = True def __init__(self, n=1, normalize=False, offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self.kwds = {'offset': offset} self._offset = offset @@ -774,7 +774,7 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n business days - .. versionadded: 0.16.1 + .. 
     """
     _prefix = 'BH'
 
@@ -782,7 +782,7 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset):
 
     def __init__(self, n=1, normalize=False, start='09:00',
                  end='17:00', offset=timedelta(0)):
-        self.n = int(n)
+        self.n = self._validate_n(n)
         self.normalize = normalize
         super(BusinessHour, self).__init__(start=start, end=end,
                                            offset=offset)
@@ -819,7 +819,7 @@ class CustomBusinessDay(BusinessDay):
 
     def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
                  holidays=None, calendar=None, offset=timedelta(0)):
-        self.n = int(n)
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self._offset = offset
         self.kwds = {}
@@ -878,7 +878,7 @@ class CustomBusinessHour(BusinessHourMixin, SingleConstructorOffset):
     """
     DateOffset subclass representing possibly n custom business days
 
-    .. versionadded: 0.18.1
+    .. versionadded:: 0.18.1
     """
     _prefix = 'CBH'
 
@@ -887,7 +887,7 @@ class CustomBusinessHour(BusinessHourMixin, SingleConstructorOffset):
     def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
                  holidays=None, calendar=None,
                  start='09:00', end='17:00', offset=timedelta(0)):
-        self.n = int(n)
+        self.n = self._validate_n(n)
         self.normalize = normalize
         super(CustomBusinessHour, self).__init__(start=start,
                                                  end=end, offset=offset)
@@ -919,13 +919,19 @@ def next_bday(self):
 class MonthOffset(SingleConstructorOffset):
     _adjust_dst = True
 
+    def __init__(self, n=1, normalize=False):
+        self.n = self._validate_n(n)
+        self.normalize = normalize
+        self.kwds = {}
+
     @property
     def name(self):
         if self.isAnchored:
             return self.rule_code
         else:
+            month = liboffsets._int_to_month[self.n]
             return "{code}-{month}".format(code=self.rule_code,
-                                           month=_int_to_month[self.n])
+                                           month=month)
 
     def onOffset(self, dt):
         if self.normalize and not _is_normalized(dt):
@@ -945,28 +951,23 @@ def apply(self, other):
 
         return shift_month(other, n, self._day_opt)
 
+    @apply_index_wraps
+    def apply_index(self, i):
+        shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt)
+        return i._shallow_copy(shifted)
+
 
 class MonthEnd(MonthOffset):
     """DateOffset of one month end"""
     _prefix = 'M'
     _day_opt = 'end'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt)
-        return i._shallow_copy(shifted)
-
 
 class MonthBegin(MonthOffset):
     """DateOffset of one month at beginning"""
     _prefix = 'MS'
     _day_opt = 'start'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt)
-        return i._shallow_copy(shifted)
-
 
 class BusinessMonthEnd(MonthOffset):
     """DateOffset increments between business EOM dates"""
@@ -980,6 +981,158 @@ class BusinessMonthBegin(MonthOffset):
     _day_opt = 'business_start'
 
 
+class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
+    """
+    DateOffset subclass representing one custom business month, incrementing
+    between end of month dates
+
+    Parameters
+    ----------
+    n : int, default 1
+    offset : timedelta, default timedelta(0)
+    normalize : bool, default False
+        Normalize start/end dates to midnight before generating date range
+    weekmask : str, default 'Mon Tue Wed Thu Fri'
+        weekmask of valid business days, passed to ``numpy.busdaycalendar``
+    holidays : list
+        list/array of dates to exclude from the set of valid business days,
+        passed to ``numpy.busdaycalendar``
+    calendar : pd.HolidayCalendar or np.busdaycalendar
+    """
+
+    _cacheable = False
+    _prefix = 'CBM'
+
+    onOffset = DateOffset.onOffset        # override MonthOffset method
+    apply_index = DateOffset.apply_index  # override MonthOffset method
+
+    def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
+                 holidays=None, calendar=None, offset=timedelta(0)):
+        self.n = self._validate_n(n)
+        self.normalize = normalize
+        self._offset = offset
+        self.kwds = {}
+
+        calendar, holidays = _get_calendar(weekmask=weekmask,
+                                           holidays=holidays,
+                                           calendar=calendar)
+        self.kwds['weekmask'] = self.weekmask = weekmask
+        self.kwds['holidays'] = self.holidays = holidays
+        self.kwds['calendar'] = self.calendar = calendar
+        self.kwds['offset'] = offset
+
+    @cache_readonly
+    def cbday(self):
+        kwds = self.kwds
+        return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds)
+
+    @cache_readonly
+    def m_offset(self):
+        return MonthEnd(n=1, normalize=self.normalize)
+
+    @apply_wraps
+    def apply(self, other):
+        n = self.n
+
+        # First move to month offset
+        cur_mend = self.m_offset.rollforward(other)
+
+        # Find this custom month offset
+        cur_cmend = self.cbday.rollback(cur_mend)
+
+        # handle zero case. arbitrarily rollforward
+        if n == 0 and other != cur_cmend:
+            n += 1
+
+        if other < cur_cmend and n >= 1:
+            n -= 1
+        elif other > cur_cmend and n <= -1:
+            n += 1
+
+        new = cur_mend + n * self.m_offset
+        result = self.cbday.rollback(new)
+        return result
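The class above (relocated in this patch, not new) composes ``CustomBusinessDay`` with ``MonthEnd``: roll forward to the calendar month end, then roll back to the nearest valid business day. A minimal sketch of the resulting behavior, assuming the usual ``numpy.busdaycalendar`` semantics (the concrete dates are illustrative only):

from datetime import datetime

from pandas import Timestamp
from pandas.tseries.offsets import CustomBusinessMonthEnd

# 2013-08-31 falls on a Saturday and 2013-08-30 is declared a holiday,
# so the custom month end rolls back to Thursday 2013-08-29.
cbm = CustomBusinessMonthEnd(holidays=[datetime(2013, 8, 30)])
assert Timestamp('2013-08-15') + cbm == Timestamp('2013-08-29')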
+
+
+class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
+    """
+    DateOffset subclass representing one custom business month, incrementing
+    between beginning of month dates
+
+    Parameters
+    ----------
+    n : int, default 1
+    offset : timedelta, default timedelta(0)
+    normalize : bool, default False
+        Normalize start/end dates to midnight before generating date range
+    weekmask : str, default 'Mon Tue Wed Thu Fri'
+        weekmask of valid business days, passed to ``numpy.busdaycalendar``
+    holidays : list
+        list/array of dates to exclude from the set of valid business days,
+        passed to ``numpy.busdaycalendar``
+    calendar : pd.HolidayCalendar or np.busdaycalendar
+    """
+
+    _cacheable = False
+    _prefix = 'CBMS'
+
+    onOffset = DateOffset.onOffset        # override MonthOffset method
+    apply_index = DateOffset.apply_index  # override MonthOffset method
+
+    def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
+                 holidays=None, calendar=None, offset=timedelta(0)):
+        self.n = self._validate_n(n)
+        self.normalize = normalize
+        self._offset = offset
+        self.kwds = {}
+
+        # _get_calendar does validation and possible transformation
+        # of calendar and holidays.
+        calendar, holidays = _get_calendar(weekmask=weekmask,
+                                           holidays=holidays,
+                                           calendar=calendar)
+        self.kwds['calendar'] = self.calendar = calendar
+        self.kwds['weekmask'] = self.weekmask = weekmask
+        self.kwds['holidays'] = self.holidays = holidays
+        self.kwds['offset'] = offset
+
+    @cache_readonly
+    def cbday(self):
+        kwds = self.kwds
+        return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds)
+
+    @cache_readonly
+    def m_offset(self):
+        return MonthBegin(n=1, normalize=self.normalize)
+
+    @apply_wraps
+    def apply(self, other):
+        n = self.n
+        dt_in = other
+
+        # First move to month offset
+        cur_mbegin = self.m_offset.rollback(dt_in)
+
+        # Find this custom month offset
+        cur_cmbegin = self.cbday.rollforward(cur_mbegin)
+
+        # handle zero case. arbitrarily rollforward
+        if n == 0 and dt_in != cur_cmbegin:
+            n += 1
+
+        if dt_in > cur_cmbegin and n <= -1:
+            n += 1
+        elif dt_in < cur_cmbegin and n >= 1:
+            n -= 1
+
+        new = cur_mbegin + n * self.m_offset
+        result = self.cbday.rollforward(new)
+        return result
+
+
+# ---------------------------------------------------------------------
+# Semi-Month Based Offset Classes
+
 class SemiMonthOffset(DateOffset):
     _adjust_dst = True
     _default_day_of_month = 15
@@ -994,7 +1147,8 @@ def __init__(self, n=1, normalize=False, day_of_month=None):
             msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}'
             raise ValueError(msg.format(min=self._min_day_of_month,
                                         day=self.day_of_month))
-        self.n = int(n)
+
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self.kwds = {'day_of_month': self.day_of_month}
@@ -1094,12 +1248,9 @@ def onOffset(self, dt):
 
     def _apply(self, n, other):
         # if other.day is not day_of_month move to day_of_month and update n
-        if other.day < self.day_of_month:
-            other = other.replace(day=self.day_of_month)
-            if n > 0:
-                n -= 1
+        if n > 0 and other.day < self.day_of_month:
+            n -= 1
         elif other.day > self.day_of_month:
-            other = other.replace(day=self.day_of_month)
             n += 1
 
         months = n // 2
@@ -1149,12 +1300,9 @@ def onOffset(self, dt):
     def _apply(self, n, other):
         # if other.day is not day_of_month move to day_of_month and update n
         if other.day < self.day_of_month:
-            other = other.replace(day=self.day_of_month)
            n -= 1
-        elif other.day > self.day_of_month:
-            other = other.replace(day=self.day_of_month)
-            if n <= 0:
-                n += 1
+        elif n <= 0 and other.day > self.day_of_month:
+            n += 1
 
         months = n // 2 + n % 2
         day = 1 if n % 2 else self.day_of_month
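The two ``_apply`` rewrites above drop the redundant ``other.replace(day=...)`` mutation: the subsequent month shift pins the day itself, so only the step count ``n`` needs adjusting. The anchor semantics are intended to be unchanged; roughly, as a sketch:

from pandas import Timestamp
from pandas.tseries.offsets import SemiMonthEnd

# starting before the mid-month anchor, one step lands on day 15 ...
assert Timestamp('2000-01-14') + SemiMonthEnd() == Timestamp('2000-01-15')
# ... and from day 15 the next anchor is month end
assert Timestamp('2000-01-15') + SemiMonthEnd() == Timestamp('2000-01-31')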
@@ -1179,155 +1327,6 @@ def _apply_index_days(self, i, roll):
         return i + (roll % 2) * Timedelta(days=self.day_of_month - 1).value
 
 
-class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
-    """
-    DateOffset subclass representing one custom business month, incrementing
-    between end of month dates
-
-    Parameters
-    ----------
-    n : int, default 1
-    offset : timedelta, default timedelta(0)
-    normalize : bool, default False
-        Normalize start/end dates to midnight before generating date range
-    weekmask : str, Default 'Mon Tue Wed Thu Fri'
-        weekmask of valid business days, passed to ``numpy.busdaycalendar``
-    holidays : list
-        list/array of dates to exclude from the set of valid business days,
-        passed to ``numpy.busdaycalendar``
-    calendar : pd.HolidayCalendar or np.busdaycalendar
-    """
-
-    _cacheable = False
-    _prefix = 'CBM'
-
-    onOffset = DateOffset.onOffset        # override MonthOffset method
-
-    def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
-                 holidays=None, calendar=None, offset=timedelta(0)):
-        self.n = int(n)
-        self.normalize = normalize
-        self._offset = offset
-        self.kwds = {}
-
-        calendar, holidays = _get_calendar(weekmask=weekmask,
-                                           holidays=holidays,
-                                           calendar=calendar)
-        self.kwds['weekmask'] = self.weekmask = weekmask
-        self.kwds['holidays'] = self.holidays = holidays
-        self.kwds['calendar'] = self.calendar = calendar
-        self.kwds['offset'] = offset
-
-    @cache_readonly
-    def cbday(self):
-        kwds = self.kwds
-        return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds)
-
-    @cache_readonly
-    def m_offset(self):
-        kwds = self.kwds
-        kwds = {key: kwds[key] for key in kwds
-                if key not in ['calendar', 'weekmask', 'holidays', 'offset']}
-        return MonthEnd(n=1, normalize=self.normalize, **kwds)
-
-    @apply_wraps
-    def apply(self, other):
-        n = self.n
-        # First move to month offset
-        cur_mend = self.m_offset.rollforward(other)
-        # Find this custom month offset
-        cur_cmend = self.cbday.rollback(cur_mend)
-
-        # handle zero case. arbitrarily rollforward
-        if n == 0 and other != cur_cmend:
-            n += 1
-
-        if other < cur_cmend and n >= 1:
-            n -= 1
-        elif other > cur_cmend and n <= -1:
-            n += 1
-
-        new = cur_mend + n * self.m_offset
-        result = self.cbday.rollback(new)
-        return result
-
-
-class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
-    """
-    DateOffset subclass representing one custom business month, incrementing
-    between beginning of month dates
-
-    Parameters
-    ----------
-    n : int, default 1
-    offset : timedelta, default timedelta(0)
-    normalize : bool, default False
-        Normalize start/end dates to midnight before generating date range
-    weekmask : str, Default 'Mon Tue Wed Thu Fri'
-        weekmask of valid business days, passed to ``numpy.busdaycalendar``
-    holidays : list
-        list/array of dates to exclude from the set of valid business days,
-        passed to ``numpy.busdaycalendar``
-    calendar : pd.HolidayCalendar or np.busdaycalendar
-    """
-
-    _cacheable = False
-    _prefix = 'CBMS'
-
-    onOffset = DateOffset.onOffset        # override MonthOffset method
-
-    def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
-                 holidays=None, calendar=None, offset=timedelta(0)):
-        self.n = int(n)
-        self.normalize = normalize
-        self._offset = offset
-        self.kwds = {}
-
-        # _get_calendar does validation and possible transformation
-        # of calendar and holidays.
-        calendar, holidays = _get_calendar(weekmask=weekmask,
-                                           holidays=holidays,
-                                           calendar=calendar)
-        self.kwds['calendar'] = self.calendar = calendar
-        self.kwds['weekmask'] = self.weekmask = weekmask
-        self.kwds['holidays'] = self.holidays = holidays
-        self.kwds['offset'] = offset
-
-    @cache_readonly
-    def cbday(self):
-        kwds = self.kwds
-        return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds)
-
-    @cache_readonly
-    def m_offset(self):
-        kwds = self.kwds
-        kwds = {key: kwds[key] for key in kwds
-                if key not in ['calendar', 'weekmask', 'holidays', 'offset']}
-        return MonthBegin(n=1, normalize=self.normalize, **kwds)
-
-    @apply_wraps
-    def apply(self, other):
-        n = self.n
-        dt_in = other
-        # First move to month offset
-        cur_mbegin = self.m_offset.rollback(dt_in)
-        # Find this custom month offset
-        cur_cmbegin = self.cbday.rollforward(cur_mbegin)
-
-        # handle zero case. arbitrarily rollforward
-        if n == 0 and dt_in != cur_cmbegin:
-            n += 1
-
-        if dt_in > cur_cmbegin and n <= -1:
-            n += 1
-        elif dt_in < cur_cmbegin and n >= 1:
-            n -= 1
-
-        new = cur_mbegin + n * self.m_offset
-        result = self.cbday.rollforward(new)
-        return result
-
-
 # ---------------------------------------------------------------------
 # Week-Based Offset Classes
 
@@ -1345,7 +1344,7 @@ class Week(EndMixin, DateOffset):
     _prefix = 'W'
 
     def __init__(self, n=1, normalize=False, weekday=None):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self.weekday = weekday
 
@@ -1424,7 +1423,7 @@ class WeekOfMonth(DateOffset):
     _adjust_dst = True
 
    def __init__(self, n=1, normalize=False, week=None, weekday=None):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self.weekday = weekday
         self.week = week
@@ -1460,6 +1459,7 @@ def apply(self, other):
     def getOffsetOfMonth(self, dt):
         w = Week(weekday=self.weekday)
         d = datetime(dt.year, dt.month, 1, tzinfo=dt.tzinfo)
+        # TODO: Is this DST-safe?
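+        # rollforward moves d to the first `weekday` falling on or after
+        # the 1st of the month; adding `week` whole weeks below then
+        # selects the (week + 1)-th such weekday of the month.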
         d = w.rollforward(d)
 
         return d + timedelta(weeks=self.week)
 
@@ -1509,7 +1509,7 @@ class LastWeekOfMonth(DateOffset):
     _prefix = 'LWOM'
 
     def __init__(self, n=1, normalize=False, weekday=None):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self.weekday = weekday
 
@@ -1539,6 +1539,7 @@ def getOffsetOfMonth(self, dt):
         d = datetime(dt.year, dt.month, 1, dt.hour, dt.minute,
                      dt.second, dt.microsecond, tzinfo=dt.tzinfo)
         eom = m.rollforward(d)
+        # TODO: Is this DST-safe?
         w = Week(weekday=self.weekday)
         return w.rollback(eom)
 
@@ -1575,7 +1576,7 @@ class QuarterOffset(DateOffset):
     # point
 
     def __init__(self, n=1, normalize=False, startingMonth=None):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
         if startingMonth is None:
             startingMonth = self._default_startingMonth
@@ -1590,7 +1591,7 @@ def isAnchored(self):
     def _from_name(cls, suffix=None):
         kwargs = {}
         if suffix:
-            kwargs['startingMonth'] = _month_to_int[suffix]
+            kwargs['startingMonth'] = liboffsets._month_to_int[suffix]
         else:
             if cls._from_name_startingMonth is not None:
                 kwargs['startingMonth'] = cls._from_name_startingMonth
@@ -1598,7 +1599,7 @@ def _from_name(cls, suffix=None):
 
     @property
     def rule_code(self):
-        month = _int_to_month[self.startingMonth]
+        month = liboffsets._int_to_month[self.startingMonth]
         return '{prefix}-{month}'.format(prefix=self._prefix, month=month)
 
     @apply_wraps
@@ -1618,6 +1619,18 @@ def apply(self, other):
 
         return shift_month(other, 3 * n - months_since, self._day_opt)
 
+    def onOffset(self, dt):
+        if self.normalize and not _is_normalized(dt):
+            return False
+        modMonth = (dt.month - self.startingMonth) % 3
+        return modMonth == 0 and dt.day == self._get_offset_day(dt)
+
+    @apply_index_wraps
+    def apply_index(self, dtindex):
+        shifted = liboffsets.shift_quarters(dtindex.asi8, self.n,
+                                            self.startingMonth, self._day_opt)
+        return dtindex._shallow_copy(shifted)
+
 
 class BQuarterEnd(QuarterOffset):
     """DateOffset increments between business Quarter dates
@@ -1631,16 +1644,6 @@ class BQuarterEnd(QuarterOffset):
     _prefix = 'BQ'
     _day_opt = 'business_end'
 
-    def onOffset(self, dt):
-        if self.normalize and not _is_normalized(dt):
-            return False
-        modMonth = (dt.month - self.startingMonth) % 3
-        return modMonth == 0 and dt.day == self._get_offset_day(dt)
-
-
-_int_to_month = tslib._MONTH_ALIASES
-_month_to_int = dict((v, k) for k, v in _int_to_month.items())
-
 
 # TODO: This is basically the same as BQuarterEnd
 class BQuarterBegin(QuarterOffset):
@@ -1652,7 +1655,7 @@ class BQuarterBegin(QuarterOffset):
     _day_opt = 'business_start'
 
 
-class QuarterEnd(EndMixin, QuarterOffset):
+class QuarterEnd(QuarterOffset):
     """DateOffset increments between business Quarter dates
     startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
     startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
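With ``onOffset`` and the cython-backed ``apply_index`` hoisted onto ``QuarterOffset`` in the hunks above, adding a quarterly offset to a ``DatetimeIndex`` goes through ``liboffsets.shift_quarters`` instead of falling back to element-wise ``apply``. A quick cross-check of the two paths, as a sketch only (assumes a build with these changes applied):

import pandas as pd
from pandas.tseries.offsets import QuarterEnd

offset = QuarterEnd(n=2, startingMonth=2)
idx = pd.date_range('2007-01-15', periods=4, freq='D')

vectorized = idx + offset  # QuarterOffset.apply_index
elementwise = pd.DatetimeIndex([offset.apply(ts) for ts in idx])  # scalar path

assert vectorized.equals(elementwise)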
@@ -1663,30 +1666,14 @@ class QuarterEnd(EndMixin, QuarterOffset):
     _prefix = 'Q'
     _day_opt = 'end'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        return self._end_apply_index(i, self.freqstr)
-
-    def onOffset(self, dt):
-        if self.normalize and not _is_normalized(dt):
-            return False
-        modMonth = (dt.month - self.startingMonth) % 3
-        return modMonth == 0 and dt.day == self._get_offset_day(dt)
-
 
-class QuarterBegin(BeginMixin, QuarterOffset):
+class QuarterBegin(QuarterOffset):
     _outputName = 'QuarterBegin'
     _default_startingMonth = 3
     _from_name_startingMonth = 1
     _prefix = 'QS'
     _day_opt = 'start'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        freq_month = 12 if self.startingMonth == 1 else self.startingMonth - 1
-        freqstr = 'Q-{month}'.format(month=_int_to_month[freq_month])
-        return self._beg_apply_index(i, freqstr)
-
 
 # ---------------------------------------------------------------------
 # Year-Based Offset Classes
@@ -1707,6 +1694,13 @@ def apply(self, other):
         months = years * 12 + (self.month - other.month)
         return shift_month(other, months, self._day_opt)
 
+    @apply_index_wraps
+    def apply_index(self, dtindex):
+        shifted = liboffsets.shift_quarters(dtindex.asi8, self.n,
+                                            self.month, self._day_opt,
+                                            modby=12)
+        return dtindex._shallow_copy(shifted)
+
     def onOffset(self, dt):
         if self.normalize and not _is_normalized(dt):
             return False
@@ -1725,12 +1719,12 @@ def __init__(self, n=1, normalize=False, month=None):
     def _from_name(cls, suffix=None):
         kwargs = {}
         if suffix:
-            kwargs['month'] = _month_to_int[suffix]
+            kwargs['month'] = liboffsets._month_to_int[suffix]
         return cls(**kwargs)
 
     @property
     def rule_code(self):
-        month = _int_to_month[self.month]
+        month = liboffsets._int_to_month[self.month]
         return '{prefix}-{month}'.format(prefix=self._prefix, month=month)
 
@@ -1750,30 +1744,19 @@ class BYearBegin(YearOffset):
     _day_opt = 'business_start'
 
 
-class YearEnd(EndMixin, YearOffset):
+class YearEnd(YearOffset):
     """DateOffset increments between calendar year ends"""
     _default_month = 12
     _prefix = 'A'
     _day_opt = 'end'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        # convert month anchor to annual period tuple
-        return self._end_apply_index(i, self.freqstr)
-
 
-class YearBegin(BeginMixin, YearOffset):
+class YearBegin(YearOffset):
     """DateOffset increments between calendar year begin dates"""
     _default_month = 1
     _prefix = 'AS'
     _day_opt = 'start'
 
-    @apply_index_wraps
-    def apply_index(self, i):
-        freq_month = 12 if self.month == 1 else self.month - 1
-        freqstr = 'A-{month}'.format(month=_int_to_month[freq_month])
-        return self._beg_apply_index(i, freqstr)
-
 
 # ---------------------------------------------------------------------
 # Special Offset Classes
@@ -1820,7 +1803,7 @@ class FY5253(DateOffset):
 
     def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1,
                  variation="nearest"):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
         self.startingMonth = startingMonth
         self.weekday = weekday
@@ -1956,7 +1939,7 @@ def _get_suffix_prefix(self):
 
     def get_rule_code_suffix(self):
         prefix = self._get_suffix_prefix()
-        month = _int_to_month[self.startingMonth]
+        month = liboffsets._int_to_month[self.startingMonth]
         weekday = _int_to_weekday[self.weekday]
         return '{prefix}-{month}-{weekday}'.format(prefix=prefix,
                                                    month=month,
                                                    weekday=weekday)
@@ -1971,7 +1954,7 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code):
             raise ValueError("Unable to parse varion_code: "
                              "{code}".format(code=varion_code))
 
-        startingMonth = _month_to_int[startingMonth_code]
+        startingMonth = liboffsets._month_to_int[startingMonth_code]
         weekday = _weekday_to_int[weekday_code]
 
         return {"weekday": weekday,
@@ -2032,7 +2015,7 @@ class FY5253Quarter(DateOffset):
 
     def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1,
                  qtr_with_extra_week=1, variation="nearest"):
-        self.n = n
+        self.n = self._validate_n(n)
         self.normalize = normalize
 
         self.weekday = weekday
@@ -2158,6 +2141,11 @@ class Easter(DateOffset):
     """
     _adjust_dst = True
 
+    def __init__(self, n=1, normalize=False):
+        self.n = self._validate_n(n)
+        self.normalize = normalize
+        self.kwds = {}
+
     @apply_wraps
     def apply(self, other):
         current_easter = easter(other.year)
@@ -2199,6 +2187,12 @@ class Tick(SingleConstructorOffset):
     _inc = Timedelta(microseconds=1000)
     _prefix = 'undefined'
 
+    def __init__(self, n=1, normalize=False):
+        # TODO: do Tick classes with normalize=True make sense?
+        self.n = self._validate_n(n)
+        self.normalize = normalize
+        self.kwds = {}
+
     __gt__ = _tick_comp(operator.gt)
     __ge__ = _tick_comp(operator.ge)
     __lt__ = _tick_comp(operator.lt)
@@ -2231,7 +2225,8 @@ def __eq__(self, other):
         if isinstance(other, Tick):
             return self.delta == other.delta
         else:
-            return DateOffset.__eq__(self, other)
+            # TODO: Are there cases where this should raise TypeError?
+            return False
 
     # This is identical to DateOffset.__hash__, but has to be redefined here
     # for Python 3, because we've redefined __eq__.
@@ -2247,7 +2242,8 @@ def __ne__(self, other):
         if isinstance(other, Tick):
             return self.delta != other.delta
         else:
-            return DateOffset.__ne__(self, other)
+            # TODO: Are there cases where this should raise TypeError?
+            return True
 
     @property
     def delta(self):
@@ -2257,6 +2253,7 @@ def delta(self):
     def nanos(self):
         return delta_to_nanoseconds(self.delta)
 
+    # TODO: Should Tick have its own apply_index?
     def apply(self, other):
         # Timestamp can handle tz and nano sec, thus no need to use apply_wraps
         if isinstance(other, Timestamp):
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
new file mode 100644
index 0000000000000..b592a73e5d758
--- /dev/null
+++ b/pandas/util/_test_decorators.py
@@ -0,0 +1,71 @@
+"""
+This module provides decorator functions which can be applied to test objects
+in order to skip those objects when certain conditions occur. A sample use
+case is to detect if the platform is missing ``matplotlib``. If so, any test
+objects which require ``matplotlib`` and are decorated with
+``@td.skip_if_no_mpl`` will be skipped by ``pytest`` during the execution of
+the test suite.
+
+To illustrate, after importing this module:
+
+import pandas.util._test_decorators as td
+
+The decorators can be applied to classes:
+
+@td.skip_if_some_reason
+class Foo():
+    ...
+
+Or individual functions:
+
+@td.skip_if_some_reason
+def test_foo():
+    ...
+
+For more information, refer to the ``pytest`` documentation on ``skipif``.
+"""
+
+import pytest
+
+
+def safe_import(mod_name, min_version=None):
+    """
+    Parameters
+    ----------
+    mod_name : str
+        Name of the module to be imported
+    min_version : str, default None
+        Minimum required version of the specified mod_name
+
+    Returns
+    -------
+    object
+        The imported module if successful, or False
+    """
+    try:
+        mod = __import__(mod_name)
+    except ImportError:
+        return False
+
+    if not min_version:
+        return mod
+    else:
+        import sys
+        # default to None so a module without __version__ fails the
+        # version check instead of raising AttributeError
+        version = getattr(sys.modules[mod_name], '__version__', None)
+        if version:
+            from distutils.version import LooseVersion
+            if LooseVersion(version) >= LooseVersion(min_version):
+                return mod
+
+    return False
+
+
+def _skip_if_no_mpl():
+    mod = safe_import("matplotlib")
+    if mod:
+        mod.use("Agg", warn=False)
+    else:
+        return True
+
+
+skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(),
+                                    reason="Missing matplotlib dependency")
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 1c4c63acb436a..9db09f23eb849 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -325,13 +325,6 @@ def _skip_if_32bit():
         pytest.skip("skipping for 32 bit")
 
 
-def _skip_if_no_mpl():
-    import pytest
-
-    mpl = pytest.importorskip("matplotlib")
-    mpl.use("Agg", warn=False)
-
-
 def _skip_if_mpl_1_5():
     import matplotlib as mpl
 
@@ -1589,13 +1582,6 @@ def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True,
         for col in right:
             assert (col in left)
 
-
-def assert_sp_list_equal(left, right):
-    assert isinstance(left, pd.SparseList)
-    assert isinstance(right, pd.SparseList)
-
-    assert_sp_array_equal(left.to_array(), right.to_array())
-
 # -----------------------------------------------------------------------------
 # Others
 
@@ -1702,7 +1688,8 @@ def all_index_generator(k=10):
     """
     all_make_index_funcs = [makeIntIndex, makeFloatIndex, makeStringIndex,
                             makeUnicodeIndex, makeDateIndex, makePeriodIndex,
-                            makeTimedeltaIndex, makeBoolIndex,
+                            makeTimedeltaIndex, makeBoolIndex, makeRangeIndex,
+                            makeIntervalIndex,
                             makeCategoricalIndex]
     for make_index_func in all_make_index_funcs:
         yield make_index_func(k=k)
@@ -1741,7 +1728,7 @@ def makeObjectSeries(name=None):
 
 def getSeriesData():
     index = makeStringIndex(N)
-    return dict((c, Series(randn(N), index=index)) for c in getCols(K))
+    return {c: Series(randn(N), index=index) for c in getCols(K)}
 
 
 def makeTimeSeries(nper=None, freq='B', name=None):
@@ -1757,11 +1744,11 @@ def makePeriodSeries(nper=None, name=None):
 
 def getTimeSeriesData(nper=None, freq='B'):
-    return dict((c, makeTimeSeries(nper, freq)) for c in getCols(K))
+    return {c: makeTimeSeries(nper, freq) for c in getCols(K)}
 
 
 def getPeriodData(nper=None):
-    return dict((c, makePeriodSeries(nper)) for c in getCols(K))
+    return {c: makePeriodSeries(nper) for c in getCols(K)}
 
 
 # make frame
@@ -1800,14 +1787,14 @@ def makePeriodFrame(nper=None):
 def makePanel(nper=None):
     with warnings.catch_warnings(record=True):
         cols = ['Item' + c for c in string.ascii_uppercase[:K - 1]]
-        data = dict((c, makeTimeDataFrame(nper)) for c in cols)
+        data = {c: makeTimeDataFrame(nper) for c in cols}
         return Panel.fromDict(data)
 
 
 def makePeriodPanel(nper=None):
     with warnings.catch_warnings(record=True):
         cols = ['Item' + c for c in string.ascii_uppercase[:K - 1]]
-        data = dict((c, makePeriodFrame(nper)) for c in cols)
+        data = {c: makePeriodFrame(nper) for c in cols}
         return Panel.fromDict(data)
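Downstream, the new module is meant to be imported as ``td`` (per its docstring). A hypothetical test consuming the ``skip_if_no_mpl`` marker defined above might look like this sketch (test name and body are illustrative, not part of the patch):

import pandas.util._test_decorators as td


@td.skip_if_no_mpl
def test_line_plot_draws():  # hypothetical test
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([1, 2, 3])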
diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py
index 11cc96609be94..4062a96d8e08d 100755
--- a/scripts/merge-pr.py
+++ b/scripts/merge-pr.py
@@ -160,7 +160,7 @@ def merge_pr(pr_num, target_ref):
 
     if body is not None:
         merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))]
 
-    authors = "\n".join("Author: %s" % a for a in distinct_authorsS)
+    authors = "\n".join("Author: %s" % a for a in distinct_authors)
 
     merge_message_flags += ["-m", authors]
 
diff --git a/setup.cfg b/setup.cfg
index 7a88ee8557dc7..828ef80971f7b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,4 +27,4 @@ markers =
     single: mark a test as single cpu only
     slow: mark a test as slow
     network: mark a test as network
-    highmemory: mark a test as a high-memory only
+    high_memory: mark a test as high-memory only
diff --git a/setup.py b/setup.py
index 7e56298d1b20b..004f111115079 100755
--- a/setup.py
+++ b/setup.py
@@ -9,9 +9,11 @@
 import os
 from os.path import join as pjoin
 
+import pkg_resources
 import sys
 import shutil
 from distutils.version import LooseVersion
+from setuptools import setup, Command
 
 # versioning
 import versioneer
@@ -30,7 +32,7 @@ def is_platform_mac():
     return sys.platform == 'darwin'
 
-min_cython_ver = '0.23'
+min_cython_ver = '0.24'
 try:
     import Cython
     ver = Cython.__version__
@@ -38,46 +40,18 @@ def is_platform_mac():
 except ImportError:
     _CYTHON_INSTALLED = False
 
-try:
-    import pkg_resources
-    from setuptools import setup, Command
-    _have_setuptools = True
-except ImportError:
-    # no setuptools installed
-    from distutils.core import setup, Command
-    _have_setuptools = False
 
-setuptools_kwargs = {}
 min_numpy_ver = '1.9.0'
-if sys.version_info[0] >= 3:
-
-    setuptools_kwargs = {'zip_safe': False,
-                         'install_requires': ['python-dateutil >= 2',
-                                              'pytz >= 2011k',
-                                              'numpy >= %s' % min_numpy_ver],
-                         'setup_requires': ['numpy >= %s' % min_numpy_ver]}
-    if not _have_setuptools:
-        sys.exit("need setuptools/distribute for Py3k"
-                 "\n$ pip install distribute")
+setuptools_kwargs = {
+    'install_requires': [
+        'python-dateutil >= 2.5.0',
+        'pytz >= 2011k',
+        'numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver),
+    ],
+    'setup_requires': ['numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver)],
+    'zip_safe': False,
+}
 
-else:
-    setuptools_kwargs = {
-        'install_requires': ['python-dateutil',
-                             'pytz >= 2011k',
-                             'numpy >= %s' % min_numpy_ver],
-        'setup_requires': ['numpy >= %s' % min_numpy_ver],
-        'zip_safe': False,
-    }
-
-    if not _have_setuptools:
-        try:
-            import numpy  # noqa:F401
-            import dateutil  # noqa:F401
-            setuptools_kwargs = {}
-        except ImportError:
-            sys.exit("install requires: 'python-dateutil < 2','numpy'."
-                     "  use pip or easy_install."
-                     "\n   $ pip install 'python-dateutil < 2' 'numpy'")
 
 from distutils.extension import Extension  # noqa:E402
 from distutils.command.build import build  # noqa:E402
@@ -331,7 +305,6 @@ class CheckSDist(sdist_class):
     _pyxfiles = ['pandas/_libs/lib.pyx',
                  'pandas/_libs/hashtable.pyx',
                  'pandas/_libs/tslib.pyx',
-                 'pandas/_libs/period.pyx',
                  'pandas/_libs/index.pyx',
                  'pandas/_libs/algos.pyx',
                  'pandas/_libs/join.pyx',
@@ -341,8 +314,10 @@ class CheckSDist(sdist_class):
                  'pandas/_libs/missing.pyx',
                  'pandas/_libs/testing.pyx',
                  'pandas/_libs/window.pyx',
+                 'pandas/_libs/skiplist.pyx',
                  'pandas/_libs/sparse.pyx',
                  'pandas/_libs/parsers.pyx',
+                 'pandas/_libs/tslibs/period.pyx',
                  'pandas/_libs/tslibs/strptime.pyx',
                  'pandas/_libs/tslibs/np_datetime.pyx',
                  'pandas/_libs/tslibs/timedeltas.pyx',
@@ -365,8 +340,9 @@ def run(self):
         else:
             for pyxfile in self._pyxfiles:
                 cfile = pyxfile[:-3] + 'c'
-                msg = "C-source file '%s' not found." % (cfile) +\
-                    " Run 'setup.py cython' before sdist."
+                msg = ("C-source file '{source}' not found.\n"
+                       "Run 'setup.py cython' before sdist.".format(
+                           source=cfile))
                 assert os.path.isfile(cfile), msg
         sdist_class.run(self)
 
@@ -382,10 +358,10 @@ def check_cython_extensions(self, extensions):
         for src in ext.sources:
             if not os.path.exists(src):
                 print("{}: -> [{}]".format(ext.name, ext.sources))
-                raise Exception("""Cython-generated file '%s' not found.
+                raise Exception("""Cython-generated file '{src}' not found.
                 Cython is required to compile pandas from a development branch.
                 Please install Cython or download a release package of pandas.
-                """ % src)
+                """.format(src=src))
 
     def build_extensions(self):
         self.check_cython_extensions(self.extensions)
@@ -496,7 +472,7 @@ def pxd(name):
         'pyxfile': '_libs/hashing'},
     '_libs.hashtable': {
         'pyxfile': '_libs/hashtable',
-        'pxdfiles': ['_libs/hashtable', '_libs/missing'],
+        'pxdfiles': ['_libs/hashtable', '_libs/missing', '_libs/khash'],
         'depends': (['pandas/_libs/src/klib/khash_python.h'] +
                     _pxi_dep['hashtable'])},
     '_libs.index': {
@@ -516,7 +492,9 @@ def pxd(name):
         'depends': _pxi_dep['join']},
     '_libs.lib': {
         'pyxfile': '_libs/lib',
-        'pxdfiles': ['_libs/src/util', '_libs/missing'],
+        'pxdfiles': ['_libs/src/util',
+                     '_libs/missing',
+                     '_libs/tslibs/conversion'],
         'depends': lib_depends + tseries_depends},
     '_libs.missing': {
         'pyxfile': '_libs/missing',
@@ -529,8 +507,8 @@ def pxd(name):
                     'pandas/_libs/src/numpy_helper.h'],
         'sources': ['pandas/_libs/src/parser/tokenizer.c',
                     'pandas/_libs/src/parser/io.c']},
-    '_libs.period': {
-        'pyxfile': '_libs/period',
+    '_libs.tslibs.period': {
+        'pyxfile': '_libs/tslibs/period',
         'pxdfiles': ['_libs/src/util',
                      '_libs/lib',
                      '_libs/tslibs/timedeltas',
@@ -544,13 +522,15 @@ def pxd(name):
     '_libs.reshape': {
         'pyxfile': '_libs/reshape',
         'depends': _pxi_dep['reshape']},
+    '_libs.skiplist': {
+        'pyxfile': '_libs/skiplist',
+        'depends': ['pandas/_libs/src/skiplist.h']},
     '_libs.sparse': {
         'pyxfile': '_libs/sparse',
         'depends': _pxi_dep['sparse']},
     '_libs.tslib': {
         'pyxfile': '_libs/tslib',
         'pxdfiles': ['_libs/src/util',
-                     '_libs/src/khash',
                      '_libs/tslibs/conversion',
                      '_libs/tslibs/timedeltas',
                      '_libs/tslibs/timestamps',
@@ -591,12 +571,11 @@ def pxd(name):
         'sources': np_datetime_sources},
     '_libs.tslibs.parsing': {
         'pyxfile': '_libs/tslibs/parsing',
-        'pxdfiles': ['_libs/src/util',
-                     '_libs/src/khash']},
+        'pxdfiles': ['_libs/src/util']},
     '_libs.tslibs.resolution': {
         'pyxfile': '_libs/tslibs/resolution',
         'pxdfiles': ['_libs/src/util',
-                     '_libs/src/khash',
+                     '_libs/khash',
                      '_libs/tslibs/frequencies',
                      '_libs/tslibs/timezones'],
         'depends': tseries_depends,
@@ -629,9 +608,7 @@ def pxd(name):
         'pyxfile': '_libs/testing'},
     '_libs.window': {
         'pyxfile': '_libs/window',
-        'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'],
-        'depends': ['pandas/_libs/src/skiplist.pyx',
-                    'pandas/_libs/src/skiplist.h']},
+        'pxdfiles': ['_libs/skiplist', '_libs/src/util']},
     'io.sas._sas': {
         'pyxfile': 'io/sas/sas'}}
 
@@ -647,7 +624,7 @@ def pxd(name):
 
     include = data.get('include', common_include)
 
-    obj = Extension('pandas.%s' % name,
+    obj = Extension('pandas.{name}'.format(name=name),
                     sources=sources,
                     depends=data.get('depends', []),
                     include_dirs=include,
@@ -695,7 +672,7 @@ def pxd(name):
 # ----------------------------------------------------------------------
 # ujson
 
-if suffix == '.pyx' and 'setuptools' in sys.modules:
+if suffix == '.pyx':
     # undo dumb setuptools bug clobbering .pyx sources back to .c
     for ext in extensions:
         if ext.sources[0].endswith(('.c', '.cpp')):
@@ -729,10 +706,6 @@ def pxd(name):
                        sources=['pandas/util/move.c'])
 extensions.append(_move_ext)
 
-
-if _have_setuptools:
-    setuptools_kwargs["test_suite"] = "nose.collector"
-
 # The build cache system does string matching below this point.
 # if you change something, be careful.
diff --git a/test.bat b/test.bat
index 6c69f83866ffd..e07c84f257a69 100644
--- a/test.bat
+++ b/test.bat
@@ -1,3 +1,3 @@
 :: test on windows
 
-pytest --skip-slow --skip-network pandas -n 2 %*
+pytest --skip-slow --skip-network pandas -n 2 -r sxX --strict %*
diff --git a/test.sh b/test.sh
index 23c7ff52d2ce9..1255a39816f78 100755
--- a/test.sh
+++ b/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 command -v coverage >/dev/null && coverage erase
 command -v python-coverage >/dev/null && python-coverage erase
-pytest pandas --cov=pandas
+pytest pandas --cov=pandas -r sxX --strict
diff --git a/test_fast.bat b/test_fast.bat
index 17dc54b580137..81f30dd310e28 100644
--- a/test_fast.bat
+++ b/test_fast.bat
@@ -1,3 +1,3 @@
 :: test on windows
 set PYTHONHASHSEED=314159265
-pytest --skip-slow --skip-network -m "not single" -n 4 pandas
+pytest --skip-slow --skip-network -m "not single" -n 4 -r sxX --strict pandas
diff --git a/test_fast.sh b/test_fast.sh
index 9b984156a796c..1fb55e581d292 100755
--- a/test_fast.sh
+++ b/test_fast.sh
@@ -5,4 +5,4 @@
 # https://github.com/pytest-dev/pytest/issues/1075
 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
 
-pytest pandas --skip-slow --skip-network -m "not single" -n 4 "$@"
+pytest pandas --skip-slow --skip-network -m "not single" -n 4 -r sxX --strict "$@"
diff --git a/versioneer.py b/versioneer.py
index 59228ec63c136..b0ae4fa2dc8e8 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -606,11 +606,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         if verbose:
             print("keywords are unexpanded, not using")
         raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
-    refs = set(r.strip() for r in refnames.strip("()").split(","))
+    refs = {r.strip() for r in refnames.strip("()").split(",")}
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
     TAG = "tag: "
-    tags = set(r[len(TAG):] for r in refs if r.startswith(TAG))
+    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
     if not tags:
         # Either we're using git < 1.8.3, or there really are no tags. We use
         # a heuristic: assume all version tags have a digit. The old git %%d
@@ -619,7 +619,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         # between branches and tags. By ignoring refnames without digits, we
         # filter out many common branch names like "release" and
         # "stabilization", as well as "HEAD" and "master".
-        tags = set(r for r in refs if re.search(r'\d', r))
+        tags = {r for r in refs if re.search(r'\d', r)}
         if verbose:
             print("discarding '%%s', no digits" %% ",".join(refs-tags))
     if verbose:
@@ -960,11 +960,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         if verbose:
             print("keywords are unexpanded, not using")
         raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
-    refs = set(r.strip() for r in refnames.strip("()").split(","))
+    refs = {r.strip() for r in refnames.strip("()").split(",")}
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
     TAG = "tag: "
-    tags = set(r[len(TAG):] for r in refs if r.startswith(TAG))
+    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
     if not tags:
         # Either we're using git < 1.8.3, or there really are no tags. We use
         # a heuristic: assume all version tags have a digit. The old git %d
@@ -973,7 +973,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         # between branches and tags. By ignoring refnames without digits, we
         # filter out many common branch names like "release" and
         # "stabilization", as well as "HEAD" and "master".
-        tags = set(r for r in refs if re.search(r'\d', r))
+        tags = {r for r in refs if re.search(r'\d', r)}
         if verbose:
             print("discarding '%s', no digits" % ",".join(refs-tags))
     if verbose: