From 6c074d10090062f95b4510f126edd6a0bb93b163 Mon Sep 17 00:00:00 2001 From: Jan Werkmann Date: Thu, 23 Nov 2017 16:29:01 +0100 Subject: [PATCH 01/98] Numpy bool msgpack bugfix (#18395) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/msgpack/_packer.pyx | 3 ++- pandas/tests/io/test_packers.py | 13 ++++++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 5829481cdb731..ab7ffecaebc4f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -87,6 +87,7 @@ I/O - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). +- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) Plotting diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index f6383b42d4975..c81069c8e04c0 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -8,6 +8,7 @@ from libc.limits cimport * from pandas.io.msgpack.exceptions import PackValueError from pandas.io.msgpack import ExtType +import numpy as np cdef extern from "../../src/msgpack/pack.h": @@ -133,7 +134,7 @@ cdef class Packer(object): while True: if o is None: ret = msgpack_pack_nil(&self.pk) - elif isinstance(o, bool): + elif isinstance(o, (bool, np.bool_)): if o: ret = msgpack_pack_true(&self.pk) else: diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index a28adcf1ee771..bc58ea1c7c228 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -180,6 +180,15 @@ def test_scalar_float(self): x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) + def test_scalar_bool(self): + x = np.bool_(1) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + + x = np.bool_(0) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + def test_scalar_complex(self): x = np.random.rand() + 1j * np.random.rand() x_rec = self.encode_decode(x) @@ -263,7 +272,7 @@ def test_numpy_array_complex(self): x.dtype == x_rec.dtype) def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] + x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -401,6 +410,7 @@ def setup_method(self, method): 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, 'H': Categorical([1, 2, 3, 4, 5]), 'I': Categorical([1, 2, 3, 4, 5], ordered=True), + 'J': (np.bool_(1), 2, 3, 4, 5), } self.d['float'] = Series(data['A']) @@ -410,6 +420,7 @@ def setup_method(self, method): self.d['dt_tz'] = Series(data['G']) self.d['cat_ordered'] = Series(data['H']) self.d['cat_unordered'] = Series(data['I']) + self.d['numpy_bool_mixed'] = Series(data['J']) def test_basic(self): From 4e0948030de512b353e0a39b3d3c309b77c3f3f2 Mon Sep 17 00:00:00 2001 From: bolkedebruin Date: Thu, 23 Nov 2017 16:35:40 +0100 Subject: [PATCH 02/98] [BUG-FIX] DataFrame created with tzinfo cannot use to_dict(orient="records") (#18416) Closes #18372 --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/frame.py | 2 +- pandas/tests/frame/test_convert_to.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git 
a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index ab7ffecaebc4f..73cbb5cefeb3f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -63,7 +63,7 @@ Conversion - Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`) -- +- Bug in :func:`DataFrame.to_dict` where tz-aware datetime columns were not converted to the required arrays when used with ``orient='records'``, raising a ``TypeError`` (:issue:`18372`) - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7145fa709c345..b05cfe41fd9d1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -993,7 +993,7 @@ def to_dict(self, orient='dict', into=dict): for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): return [into_c((k, _maybe_box_datetimelike(v)) - for k, v in zip(self.columns, row)) + for k, v in zip(self.columns, np.atleast_1d(row))) for row in self.values] elif orient.lower().startswith('i'): return into_c((k, v.to_dict(into)) for k, v in self.iterrows()) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 5bdb76494f4c8..7d2d18db8d41c 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- +from datetime import datetime + import pytest +import pytz import collections import numpy as np @@ -249,3 +252,18 @@ def test_to_dict_box_scalars(self): result = DataFrame(d).to_dict(orient='records') assert isinstance(result[0]['a'], (int, long)) + + def test_frame_to_dict_tz(self): + # GH18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)] + df = DataFrame(list(data), columns=["d", ]) + + result = df.to_dict(orient='records') + expected = [ + {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)}, + {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) From e6a0ef81972ff77663c4b8cb4806c6c7d650f51a Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 23 Nov 2017 16:47:48 +0100 Subject: [PATCH 03/98] REF: smarter NaN handling in remove_unused_levels() (#18438) --- pandas/core/indexes/multi.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cc99505b53bf5..81d892fba0fe2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1365,31 +1365,29 @@ def remove_unused_levels(self): new_labels = [] changed = False - for idx, (lev, lab) in enumerate(zip(self.levels, self.labels)): - na_idxs = np.where(lab == -1)[0] - - if len(na_idxs): - lab = np.delete(lab, na_idxs) + for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) + na_idx = np.where(uniques == -1)[0] # nothing unused - if len(uniques) != len(lev): + if len(uniques) != len(lev) + len(na_idx): changed = True + if len(na_idx): + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] =
uniques[[na_idx[0], 0]] + # labels get mapped from uniques to 0:len(uniques) - label_mapping = np.zeros(len(lev)) - label_mapping[uniques] = np.arange(len(uniques)) + # -1 (if present) is mapped to last position + label_mapping = np.zeros(len(lev) + len(na_idx)) + # ... and reassigned value -1: + label_mapping[uniques] = np.arange(len(uniques)) - len(na_idx) lab = label_mapping[lab] # new levels are simple - lev = lev.take(uniques) - - if len(na_idxs): - lab = np.insert(lab, na_idxs - np.arange(len(na_idxs)), -1) - else: - lab = self.labels[idx] + lev = lev.take(uniques[len(na_idx):]) new_levels.append(lev) new_labels.append(lab) From 04b628fa49063a43c5941b14f047b587aaaf1e00 Mon Sep 17 00:00:00 2001 From: Ted Petrou Date: Thu, 23 Nov 2017 10:50:37 -0500 Subject: [PATCH 04/98] cleaned up imports (#18264) --- pandas/core/api.py | 6 +++--- pandas/core/reshape/api.py | 3 ++- pandas/core/reshape/melt.py | 14 ++++++++------ pandas/tests/reshape/test_reshape.py | 3 +-- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index 1f46aaa40e9eb..8a624da362976 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,9 +23,9 @@ from pandas.core.frame import DataFrame from pandas.core.panel import Panel, WidePanel from pandas.core.panel4d import Panel4D -from pandas.core.reshape.reshape import ( - pivot_simple as pivot, get_dummies) -from pandas.core.reshape.melt import lreshape, wide_to_long + +# TODO: Remove import when statsmodels updates #18264 +from pandas.core.reshape.reshape import get_dummies from pandas.core.indexing import IndexSlice from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 99286d807a205..454a3965d74a6 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,7 +1,8 @@ # flake8: noqa from pandas.core.reshape.concat import concat -from pandas.core.reshape.melt import melt +from pandas.core.reshape.melt import melt, lreshape, wide_to_long +from pandas.core.reshape.reshape import pivot_simple as pivot, get_dummies from pandas.core.reshape.merge import ( merge, ordered_merge, merge_ordered, merge_asof) from pandas.core.reshape.pivot import pivot_table, crosstab diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 846d04221fe7f..36e52f1472f82 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -6,14 +6,12 @@ from pandas import compat from pandas.core.categorical import Categorical -from pandas.core.frame import DataFrame -from pandas.core.index import MultiIndex +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.frame import _shared_docs from pandas.util._decorators import Appender import re -import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import notna @@ -27,7 +25,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] - elif (isinstance(frame.columns, MultiIndex) and + elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') @@ -39,7 +37,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] - elif (isinstance(frame.columns, MultiIndex) and + elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list)): raise 
ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') @@ -54,7 +52,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, frame.columns = frame.columns.get_level_values(col_level) if var_name is None: - if isinstance(frame.columns, MultiIndex): + if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: @@ -81,6 +79,7 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) + from pandas import DataFrame return DataFrame(mdata, columns=mcolumns) @@ -137,6 +136,8 @@ def lreshape(data, groups, dropna=True, label=None): for target, names in zip(keys, values): to_concat = [data[col].values for col in names] + + import pandas.core.dtypes.concat as _concat mdata[target] = _concat._concat_compat(to_concat) pivot_cols.append(target) @@ -150,6 +151,7 @@ def lreshape(data, groups, dropna=True, label=None): if not mask.all(): mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) + from pandas import DataFrame return DataFrame(mdata, columns=id_cols + pivot_cols) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 5d4aa048ae303..59852ee014b92 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -11,8 +11,7 @@ from pandas.util.testing import assert_frame_equal -from pandas.core.reshape.reshape import get_dummies -from pandas.core.reshape.melt import melt, lreshape, wide_to_long +from pandas import melt, lreshape, wide_to_long, get_dummies import pandas.util.testing as tm from pandas.compat import range, u From 369df07e04de93c0853c6a5d5aced1eb3c387daf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 23 Nov 2017 07:54:42 -0800 Subject: [PATCH 05/98] CLN: ASV attrs_caching benchmark (#18441) --- asv_bench/benchmarks/attrs_caching.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index b7610037bed4d..3c091be7a8424 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame try: from pandas.util import cache_readonly @@ -7,9 +8,11 @@ class DataFrameAttributes(object): + goal_time = 0.2 def setup(self): + np.random.seed(1234) self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index @@ -21,6 +24,7 @@ def time_set_index(self): class CacheReadonly(object): + goal_time = 0.2 def setup(self): From 4e98a7bd65f23874e74ef561bc0fe890ea2b3c0a Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Thu, 23 Nov 2017 15:56:04 +0000 Subject: [PATCH 06/98] BUG: Keep float dtype in merge on int and float column (#18352) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/reshape/merge.py | 40 ++++++++++++++++++++--------- pandas/tests/reshape/test_merge.py | 41 +++++++++++++++++++++++++++++- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a4d60b4dfbb2..1a6327554f61a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -51,7 +51,7 @@ Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- 
+- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec0..d00aa1003988a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -906,16 +906,31 @@ def _maybe_coerce_merge_keys(self): continue # if we are numeric, then allow differing - # kinds to proceed, eg. int64 and int8 + # kinds to proceed, eg. int64 and int8, int and float # further if we are object, but we infer to # the same, then proceed if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: - continue + pass + + # check whether ints and floats + elif is_integer_dtype(rk) and is_float_dtype(lk): + if not (lk == lk.astype(rk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) + + elif is_float_dtype(rk) and is_integer_dtype(lk): + if not (rk == rk.astype(lk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) # let's infer and see if we are ok - if lib.infer_dtype(lk) == lib.infer_dtype(rk): - continue + elif lib.infer_dtype(lk) == lib.infer_dtype(rk): + pass # Houston, we have a problem! # let's coerce to object if the dtypes aren't @@ -924,14 +939,15 @@ def _maybe_coerce_merge_keys(self): # then we would lose type information on some # columns, and end up trying to merge # incompatible dtypes. See GH 16900. - if name in self.left.columns: - typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) - if name in self.right.columns: - typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + else: + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign( + **{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign( + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
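For context before the test diff that follows: with the change above, merging an ``int64`` key against a ``float64`` key keeps the float dtype instead of casting both sides to ``object``, and a ``UserWarning`` is emitted when the float values are not exact int representations. Below is a minimal sketch of that post-patch behavior, restating what the new tests in the next file assert; it assumes a pandas build that includes this patch:

import warnings
import pandas as pd

left = pd.DataFrame({'X': [1, 2, 3]})         # int64 join key
right = pd.DataFrame({'Y': [1.0, 2.0, 3.0]})  # float64 join key

# Int-valued floats: the result keeps 'Y' as float64, no object cast.
merged = left.merge(right, left_on='X', right_on='Y')
print(merged.dtypes)  # X: int64, Y: float64

# Floats that are not exact int representations still merge on the
# matching rows, but trigger the UserWarning added in the hunk above.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    left.merge(pd.DataFrame({'Y': [1.1, 2.5, 3.0]}),
               left_on='X', right_on='Y')
assert any('int and float' in str(w.message) for w in caught)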
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..ee7c4e5c90bb8 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -13,7 +13,10 @@ from pandas.core.reshape.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_object_dtype, +) from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1408,6 +1411,42 @@ def test_join_multi_dtypes(self, d1, d2): expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ + ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), + ]) + def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): + # GH 16572 + # Check that float column is not cast to object if + # merging on float and int columns + A = DataFrame({'X': int_vals}) + B = DataFrame({'Y': float_vals}) + expected = DataFrame(exp_vals) + + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + + def test_merge_on_ints_floats_warning(self): + # GH 16572 + # merge will produce a warning when merging on int and + # float columns where the float values are not exactly + # equal to their int representation + A = DataFrame({'X': [1, 2, 3]}) + B = DataFrame({'Y': [1.1, 2.5, 3.0]}) + expected = DataFrame({'X': [3], 'Y': [3.0]}) + + with tm.assert_produces_warning(UserWarning): + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + @pytest.fixture def left(): From 41004d9df1f74e3ff478a1f40e868a0cf255dd07 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 23 Nov 2017 11:05:15 -0500 Subject: [PATCH 07/98] BUG: prevent coercion to datetime64[ns] when a Series is initialized with both tz-naive and tz-aware (#18361) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/_libs/src/inference.pyx | 20 ++++++- pandas/tests/dtypes/test_inference.py | 70 ++++++++++++++++++++++++ pandas/tests/series/test_constructors.py | 9 +++ 4 files changed, 97 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1a6327554f61a..90032a692fd15 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -195,5 +195,5 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) -- +- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values would result in a Series whose dtype was tz-aware instead of object (:issue:`16406`) - diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 066beb29c24ce..6e964077dd56e 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -464,7 +464,8 @@ cpdef
object infer_datetimelike_array(object arr): - timedelta: we have *only* timedeltas and maybe strings, nulls - nat: we do not have *any* date, datetimes or timedeltas, but do have at least a NaT - - mixed: other objects (strings or actual objects) + - mixed: other objects (strings, a mix of tz-aware and tz-naive, or + actual objects) Parameters ---------- @@ -479,6 +480,7 @@ cpdef object infer_datetimelike_array(object arr): cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 + bint seen_tz_aware = 0, seen_tz_naive = 0 bint seen_nat = 0 list objs = [] object v @@ -496,8 +498,20 @@ cpdef object infer_datetimelike_array(object arr): pass elif v is NaT: seen_nat = 1 - elif is_datetime(v) or util.is_datetime64_object(v): - # datetime, or np.datetime64 + elif is_datetime(v): + # datetime + seen_datetime = 1 + + # disambiguate between tz-naive and tz-aware + if v.tzinfo is None: + seen_tz_naive = 1 + else: + seen_tz_aware = 1 + + if seen_tz_naive and seen_tz_aware: + return 'mixed' + elif util.is_datetime64_object(v): + # np.datetime64 seen_datetime = 1 elif is_date(v): seen_date = 1 diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index a96dd3c232636..ef12416ef4e1c 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -419,6 +419,10 @@ def test_mixed_dtypes_remain_object_array(self): class TestTypeInference(object): + # Dummy class used for testing with Python objects + class Dummy(): + pass + def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) assert result == 'integer' @@ -655,6 +659,72 @@ def test_infer_dtype_period(self): dtype=object) assert lib.infer_dtype(arr) == 'mixed' + @pytest.mark.parametrize( + "data", + [ + [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)], + [Timestamp("20170612"), Timestamp("20170311")], + [Timestamp("20170612", tz='US/Eastern'), + Timestamp("20170311", tz='US/Eastern')], + [date(2017, 6, 12), + Timestamp("20170311", tz='US/Eastern')], + [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], + [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)] + ] + ) + def test_infer_datetimelike_array_datetime(self, data): + assert lib.infer_datetimelike_array(data) == "datetime" + + @pytest.mark.parametrize( + "data", + [ + [timedelta(2017, 6, 12), timedelta(2017, 3, 11)], + [timedelta(2017, 6, 12), date(2017, 3, 11)], + [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], + [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)] + ] + ) + def test_infer_datetimelike_array_timedelta(self, data): + assert lib.infer_datetimelike_array(data) == "timedelta" + + def test_infer_datetimelike_array_date(self): + arr = [date(2017, 6, 12), date(2017, 3, 11)] + assert lib.infer_datetimelike_array(arr) == "date" + + @pytest.mark.parametrize( + "data", + [ + ["2017-06-12", "2017-03-11"], + [20170612, 20170311], + [20170612.5, 20170311.8], + [Dummy(), Dummy()], + [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')], + [Timestamp("20170612"), 20170311], + [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')] + ] + ) + def test_infer_datetimelike_array_mixed(self, data): + assert lib.infer_datetimelike_array(data) == "mixed" + + @pytest.mark.parametrize( + "first, expected", + [ + [[None], "mixed"], + [[np.nan], "mixed"], + [[pd.NaT], "nat"], + [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"], + [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], + [[date(2017, 6, 12), pd.NaT], "date"], + 
[[timedelta(2017, 6, 12), pd.NaT], "timedelta"], + [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"] + ] + ) + @pytest.mark.parametrize("second", [None, np.nan]) + def test_infer_datetimelike_array_nan_nat_like(self, first, second, + expected): + first.append(second) + assert lib.infer_datetimelike_array(first) == expected + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) assert lib.infer_dtype(arr) == 'floating' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index e62b19294a07b..86e5cc54bd490 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -777,6 +777,15 @@ def f(): s = Series([pd.NaT, np.nan, '1 Day']) assert s.dtype == 'timedelta64[ns]' + # GH 16406 + def test_constructor_mixed_tz(self): + s = Series([Timestamp('20130101'), + Timestamp('20130101', tz='US/Eastern')]) + expected = Series([Timestamp('20130101'), + Timestamp('20130101', tz='US/Eastern')], + dtype='object') + assert_series_equal(s, expected) + def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') From b45325e283b16ec8869aaea407de8256fc234f33 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 23 Nov 2017 16:13:47 +0000 Subject: [PATCH 08/98] BUG: Copy categorical codes if empty (fixes #18051) (#18436) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/categorical.py | 2 +- pandas/tests/series/test_analytics.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 73cbb5cefeb3f..637ccf0603e0f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -136,6 +136,7 @@ Categorical - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`) - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) +- Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) Other ^^^^^ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 03bf09352862b..deaec20586005 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2276,7 +2276,7 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls - return codes + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ff788fb2347b8..cfc319da1598d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -848,6 +848,12 @@ def test_value_counts_nunique(self): result = series.nunique() assert result == 11 + # GH 18051 + s = pd.Series(pd.Categorical([])) + assert s.nunique() == 0 + s = pd.Series(pd.Categorical([np.nan])) + assert s.nunique() == 0 + def test_unique(self): # 714 also, dtype=float @@ -873,6 +879,14 @@ def test_unique(self): expected = np.array([1, 2, 3, None], dtype=object) tm.assert_numpy_array_equal(result, expected) + # GH 18051 + s = pd.Series(pd.Categorical([])) + tm.assert_categorical_equal(s.unique(), 
pd.Categorical([]), + check_dtype=False) + s = pd.Series(pd.Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), + check_dtype=False) + @pytest.mark.parametrize( "tc1, tc2", [ From 492040b401772e95b755f74f09b20ca236016fb5 Mon Sep 17 00:00:00 2001 From: dmanikowski-reef <32765114+dmanikowski-reef@users.noreply.github.com> Date: Thu, 23 Nov 2017 20:41:03 +0100 Subject: [PATCH 09/98] CLN: Add teardowns for some benchmarks (#17616) (#18388) Added teardowns for hdfstore, io and packers benchmarks. --- asv_bench/benchmarks/hdfstore_bench.py | 6 ++- asv_bench/benchmarks/io_bench.py | 63 ++++++++++++++++---------- asv_bench/benchmarks/packers.py | 29 ++++-------- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 7d490180e8af6..5aa8f76917797 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -40,10 +40,11 @@ def setup(self): def teardown(self): self.store.close() + self.remove(self.f) def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass @@ -115,10 +116,11 @@ def setup(self): def teardown(self): self.store.close() + self.remove(self.f) def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index d5eedf63dfe8a..c718b13912e73 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -8,18 +8,36 @@ import timeit -class frame_to_csv(object): +class _BenchTeardown(object): + """ + base class for teardown method implementation + """ + fname = None + + def remove(self, f): + try: + os.remove(f) + except: + pass + + def teardown(self): + self.remove(self.fname) + + +class frame_to_csv(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.df = DataFrame(np.random.randn(3000, 30)) def time_frame_to_csv(self): - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) -class frame_to_csv2(object): +class frame_to_csv2(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.df = DataFrame({'A': range(50000), }) @@ -28,22 +46,24 @@ def setup(self): self.df['D'] = (self.df.A + 3.0) def time_frame_to_csv2(self): - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) -class frame_to_csv_date_formatting(object): +class frame_to_csv_date_formatting(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.rng = date_range('1/1/2000', periods=1000) self.data = DataFrame(self.rng, index=self.rng) def time_frame_to_csv_date_formatting(self): - self.data.to_csv('__test__.csv', date_format='%Y%m%d') + self.data.to_csv(self.fname, date_format='%Y%m%d') -class frame_to_csv_mixed(object): +class frame_to_csv_mixed(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=self.create_cols('float')) @@ -55,7 +75,7 @@ def setup(self): self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1) def time_frame_to_csv_mixed(self): - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) def create_cols(self, name): return [('%s%03d' % (name, i)) for i in range(5)] @@ -94,28 +114,30 @@ def time_read_csv_infer_datetime_format_ymd(self): read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) -class read_csv_skiprows(object): +class 
read_csv_skiprows(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.index = tm.makeStringIndex(20000) self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index) - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) def time_read_csv_skiprows(self): - read_csv('__test__.csv', skiprows=10000) + read_csv(self.fname, skiprows=10000) -class read_csv_standard(object): +class read_csv_standard(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.index = tm.makeStringIndex(10000) self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) def time_read_csv_standard(self): - read_csv('__test__.csv') + read_csv(self.fname) class read_parse_dates_iso8601(object): @@ -152,15 +174,16 @@ def time_read_uint64_na_values(self): read_csv(StringIO(self.data1), header=None, na_values=self.na_values) -class write_csv_standard(object): +class write_csv_standard(_BenchTeardown): goal_time = 0.2 + fname = '__test__.csv' def setup(self): self.index = tm.makeStringIndex(10000) self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) def time_write_csv_standard(self): - self.df.to_csv('__test__.csv') + self.df.to_csv(self.fname) class read_csv_from_s3(object): @@ -195,7 +218,7 @@ def time_read_nrows(self, compression, engine): compression=compression, engine=engine) -class read_json_lines(object): +class read_json_lines(_BenchTeardown): goal_time = 0.2 fname = "__test__.json" @@ -205,12 +228,6 @@ def setup(self): self.df = DataFrame({('float{0}'.format(i), randn(self.N)) for i in range(self.C)}) self.df.to_json(self.fname,orient="records",lines=True) - def teardown(self): - try: - os.remove(self.fname) - except: - pass - def time_read_json_lines(self): pd.read_json(self.fname, lines=True) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index e3d95aa3586c5..927f1505e85c6 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -9,6 +9,7 @@ import numpy as np from random import randrange + class _Packers(object): goal_time = 0.2 @@ -24,12 +25,15 @@ def _setup(self): def remove(self, f): try: - os.remove(self.f) + os.remove(f) except: pass + def teardown(self): + self.remove(self.f) + + class Packers(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -38,8 +42,8 @@ def setup(self): def time_packers_read_csv(self): pd.read_csv(self.f) + class packers_read_excel(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -54,7 +58,6 @@ def time_packers_read_excel(self): class packers_read_hdf_store(_Packers): - goal_time = 0.2 def setup(self): self._setup() @@ -115,6 +118,7 @@ def setup(self): def time_packers_read_pickle(self): pd.read_pickle(self.f) + class packers_read_sql(_Packers): def setup(self): @@ -177,9 +181,6 @@ def setup(self): def time_write_csv(self): self.df.to_csv(self.f) - def teardown(self): - self.remove(self.f) - class Excel(_Packers): @@ -217,8 +218,6 @@ def time_write_hdf_store(self): def time_write_hdf_table(self): self.df2.to_hdf(self.f, 'df', table=True) - def teardown(self): - 
self.remove(self.f) class JSON(_Packers): @@ -259,9 +258,6 @@ def time_write_json_mixed_float_int_str(self): def time_write_json_lines(self): self.df.to_json(self.f, orient="records", lines=True) - def teardown(self): - self.remove(self.f) - class MsgPack(_Packers): @@ -271,9 +267,6 @@ def setup(self): def time_write_msgpack(self): self.df2.to_msgpack(self.f) - def teardown(self): - self.remove(self.f) - class Pickle(_Packers): @@ -283,9 +276,6 @@ def setup(self): def time_write_pickle(self): self.df2.to_pickle(self.f) - def teardown(self): - self.remove(self.f) - class SQL(_Packers): @@ -313,6 +303,3 @@ def time_write_stata(self): def time_write_stata_with_validation(self): self.df3.to_stata(self.f, {'index': 'tc', }) - - def teardown(self): - self.remove(self.f) From 5e670653e50dcbbafc0ba004b16328f49925f041 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Nov 2017 22:46:09 +0100 Subject: [PATCH 10/98] CI: temp skip geopandas downstream tests (GH18456) (#18457) --- pandas/tests/test_downstream.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 0f0abd8cd3400..1ec25bc8bb295 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -92,6 +92,7 @@ def test_pandas_datareader(): pandas_datareader.get_data_google('AAPL') +@pytest.mark.skip(reason="import issue with fiona GH18456") def test_geopandas(): geopandas = import_module('geopandas') # noqa From e6eac0b308af9869ee123caa8c256bd8a7cc126b Mon Sep 17 00:00:00 2001 From: Ted Petrou Date: Thu, 23 Nov 2017 18:34:43 -0500 Subject: [PATCH 11/98] TST: move melt tests to separate file (#18428) --- pandas/tests/reshape/test_melt.py | 537 +++++++++++++++++++++++++++ pandas/tests/reshape/test_reshape.py | 527 +------------------------- 2 files changed, 539 insertions(+), 525 deletions(-) create mode 100644 pandas/tests/reshape/test_melt.py diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py new file mode 100644 index 0000000000000..3c38512548c70 --- /dev/null +++ b/pandas/tests/reshape/test_melt.py @@ -0,0 +1,537 @@ +# -*- coding: utf-8 -*- +# pylint: disable-msg=W0612,E1101 + +import pytest + +from pandas import DataFrame +import pandas as pd + +from numpy import nan +import numpy as np + +from pandas import melt, lreshape, wide_to_long +import pandas.util.testing as tm +from pandas.compat import range + + +class TestMelt(object): + + def setup_method(self, method): + self.df = tm.makeTimeDataFrame()[:10] + self.df['id1'] = (self.df['A'] > 0).astype(np.int64) + self.df['id2'] = (self.df['B'] > 0).astype(np.int64) + + self.var_name = 'var' + self.value_name = 'val' + + self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 + ], [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298, -0.873361]]) + self.df1.columns = [list('ABC'), list('abc')] + self.df1.columns.names = ['CAP', 'low'] + + def test_top_level_method(self): + result = melt(self.df) + assert result.columns.tolist() == ['variable', 'value'] + + def test_method_signatures(self): + tm.assert_frame_equal(self.df.melt(), + melt(self.df)) + + tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], + value_vars=['A', 'B']), + melt(self.df, + id_vars=['id1', 'id2'], + value_vars=['A', 'B'])) + + tm.assert_frame_equal(self.df.melt(var_name=self.var_name, + value_name=self.value_name), + melt(self.df, + var_name=self.var_name, + value_name=self.value_name)) + + tm.assert_frame_equal(self.df1.melt(col_level=0), + melt(self.df1, col_level=0)) + + def 
test_default_col_names(self): + result = self.df.melt() + assert result.columns.tolist() == ['variable', 'value'] + + result1 = self.df.melt(id_vars=['id1']) + assert result1.columns.tolist() == ['id1', 'variable', 'value'] + + result2 = self.df.melt(id_vars=['id1', 'id2']) + assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] + + def test_value_vars(self): + result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') + assert len(result3) == 10 + + result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) + expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', 'value']) + tm.assert_frame_equal(result4, expected4) + + def test_value_vars_types(self): + # GH 15348 + expected = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', 'value']) + + for type_ in (tuple, list, np.array): + result = self.df.melt(id_vars=['id1', 'id2'], + value_vars=type_(('A', 'B'))) + tm.assert_frame_equal(result, expected) + + def test_vars_work_with_multiindex(self): + expected = DataFrame({ + ('A', 'a'): self.df1[('A', 'a')], + 'CAP': ['B'] * len(self.df1), + 'low': ['b'] * len(self.df1), + 'value': self.df1[('B', 'b')], + }, columns=[('A', 'a'), 'CAP', 'low', 'value']) + + result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) + tm.assert_frame_equal(result, expected) + + def test_tuple_vars_fail_with_multiindex(self): + # melt should fail with an informative error message if + # the columns have a MultiIndex and a tuple is passed + # for id_vars or value_vars. 
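+        # Each pairing built below (tuple/list, list/tuple, tuple/tuple) covers one invalid combination.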
+ tuple_a = ('A', 'a') + list_a = [tuple_a] + tuple_b = ('B', 'b') + list_b = [tuple_b] + + for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), + (tuple_a, tuple_b)): + with tm.assert_raises_regex(ValueError, r'MultiIndex'): + self.df1.melt(id_vars=id_vars, value_vars=value_vars) + + def test_custom_var_name(self): + result5 = self.df.melt(var_name=self.var_name) + assert result5.columns.tolist() == ['var', 'value'] + + result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) + assert result6.columns.tolist() == ['id1', 'var', 'value'] + + result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) + assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] + + result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + var_name=self.var_name) + assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] + + result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + var_name=self.var_name) + expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A'] * 10 + ['B'] * 10, + 'value': (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', self.var_name, 'value']) + tm.assert_frame_equal(result9, expected9) + + def test_custom_value_name(self): + result10 = self.df.melt(value_name=self.value_name) + assert result10.columns.tolist() == ['variable', 'val'] + + result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) + assert result11.columns.tolist() == ['id1', 'variable', 'val'] + + result12 = self.df.melt(id_vars=['id1', 'id2'], + value_name=self.value_name) + assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] + + result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + value_name=self.value_name) + assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val'] + + result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + value_name=self.value_name) + expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A'] * 10 + ['B'] * 10, + self.value_name: (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', 'variable', + self.value_name]) + tm.assert_frame_equal(result14, expected14) + + def test_custom_var_and_value_name(self): + + result15 = self.df.melt(var_name=self.var_name, + value_name=self.value_name) + assert result15.columns.tolist() == ['var', 'val'] + + result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, + value_name=self.value_name) + assert result16.columns.tolist() == ['id1', 'var', 'val'] + + result17 = self.df.melt(id_vars=['id1', 'id2'], + var_name=self.var_name, + value_name=self.value_name) + assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] + + result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', + var_name=self.var_name, + value_name=self.value_name) + assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] + + result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], + var_name=self.var_name, + value_name=self.value_name) + expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A'] * 10 + ['B'] * 10, + self.value_name: (self.df['A'].tolist() + + self.df['B'].tolist())}, + columns=['id1', 'id2', self.var_name, + self.value_name]) + tm.assert_frame_equal(result19, expected19) + + df20 = self.df.copy() + df20.columns.name = 'foo' + result20 = df20.melt() + assert 
result20.columns.tolist() == ['foo', 'value'] + + def test_col_level(self): + res1 = self.df1.melt(col_level=0) + res2 = self.df1.melt(col_level='CAP') + assert res1.columns.tolist() == ['CAP', 'value'] + assert res2.columns.tolist() == ['CAP', 'value'] + + def test_multiindex(self): + res = self.df1.melt() + assert res.columns.tolist() == ['CAP', 'low', 'value'] + + +class TestLreshape(object): + + def test_pairs(self): + data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009'], + 'visitdt2': + ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], + 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt1': [1823, 3338, 1549, 3298, 4306], + 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], + 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + + df = DataFrame(data) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + result = lreshape(df, spec) + + exp_data = {'birthdt': + ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, + 4133, 1766, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, + 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Male', + 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', '21jan2009', + '22jan2009', '31dec2008', '03feb2009', + '05feb2009', '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = {'birthdt': + ['08jan2009', '20dec2008', '30dec2008', '21dec2008', + '11jan2009', '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', '08jan2009', '20dec2008', + '30dec2008', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, + 3139, 4133, 1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, + 101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', '21jan2009', nan, + '22jan2009', '31dec2008', '03feb2009', + '05feb2009', nan, nan, '02jan2009', + '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, + 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, + 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + pytest.raises(ValueError, lreshape, df, spec) + + +class TestWideToLong(object): + + def test_simple(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A1970": {0: "a", + 1: "b", + 2: "c"}, + "A1980": {0: "d", + 1: "e", + 2: "f"}, + "B1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] 
= df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_stubs(self): + # GH9204 + df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) + df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] + stubs = ['inc', 'edu'] + + # TODO: unused? + df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa + + assert stubs == ['inc', 'edu'] + + def test_separating_character(self): + # GH14779 + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A.1970": {0: "a", + 1: "b", + 2: "c"}, + "A.1980": {0: "d", + 1: "e", + 2: "f"}, + "B.1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B.1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_escapable_characters(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A(quarterly)1970": {0: "a", + 1: "b", + 2: "c"}, + "A(quarterly)1980": {0: "d", + 1: "e", + 2: "f"}, + "B(quarterly)1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B(quarterly)1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index( + ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] + long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], + i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_unbalanced(self): + # test that we can have a varying amount of time variables + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], + 'A': [1.0, 3.0, 2.0, 4.0], + 'B': [5.0, np.nan, 6.0, np.nan], + 'id': [0, 0, 1, 1], + 'year': ['2010', '2011', '2010', '2011']} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_character_overlap(self): + # Test we handle overlapping characters in both id_vars and value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'BBBX': [91, 92, 93], + 'BBBZ': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'BBBX': [91, 92, 93, 91, 92, 93], + 'BBBZ': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': 
[1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['BBBX', 'BBBZ', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_separator(self): + # if an invalid separator is supplied a empty data frame is returned + sep = 'nope!' + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'A2010': [], + 'A2011': [], + 'B2010': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_num_string_disambiguation(self): + # Test that we can disambiguate number value_vars from + # string value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'Arating': [91, 92, 93], + 'Arating_old': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'Arating': [91, 92, 93, 91, 92, 93], + 'Arating_old': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['Arating', 'Arating_old', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_suffixtype(self): + # If all stubs names end with a string, but a numeric suffix is + # assumed, an empty data frame is returned + df = pd.DataFrame({'Aone': [1.0, 2.0], + 'Atwo': [3.0, 4.0], + 'Bone': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'Aone': [], + 'Atwo': [], + 'Bone': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_multiple_id_columns(self): + # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm + df = pd.DataFrame({ + 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + }) + exp_frame = pd.DataFrame({ + 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, + 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], + 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', + '2', '1', '2', '1', '2', '1', '2', '1', '2'] + }) + exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] + long_frame = 
wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_non_unique_idvars(self): + # GH16382 + # Raise an error message if non unique id vars (i) are passed + df = pd.DataFrame({ + 'A_A1': [1, 2, 3, 4, 5], + 'B_B1': [1, 2, 3, 4, 5], + 'x': [1, 1, 1, 1, 1] + }) + with pytest.raises(ValueError): + wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 59852ee014b92..3e68ff7cf2f59 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -11,208 +11,9 @@ from pandas.util.testing import assert_frame_equal -from pandas import melt, lreshape, wide_to_long, get_dummies +from pandas import get_dummies import pandas.util.testing as tm -from pandas.compat import range, u - - -class TestMelt(object): - - def setup_method(self, method): - self.df = tm.makeTimeDataFrame()[:10] - self.df['id1'] = (self.df['A'] > 0).astype(np.int64) - self.df['id2'] = (self.df['B'] > 0).astype(np.int64) - - self.var_name = 'var' - self.value_name = 'val' - - self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 - ], [-1.321405, 0.368915, -1.055342], - [-0.807333, 0.08298, -0.873361]]) - self.df1.columns = [list('ABC'), list('abc')] - self.df1.columns.names = ['CAP', 'low'] - - def test_top_level_method(self): - result = melt(self.df) - assert result.columns.tolist() == ['variable', 'value'] - - def test_method_signatures(self): - tm.assert_frame_equal(self.df.melt(), - melt(self.df)) - - tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], - value_vars=['A', 'B']), - melt(self.df, - id_vars=['id1', 'id2'], - value_vars=['A', 'B'])) - - tm.assert_frame_equal(self.df.melt(var_name=self.var_name, - value_name=self.value_name), - melt(self.df, - var_name=self.var_name, - value_name=self.value_name)) - - tm.assert_frame_equal(self.df1.melt(col_level=0), - melt(self.df1, col_level=0)) - - def test_default_col_names(self): - result = self.df.melt() - assert result.columns.tolist() == ['variable', 'value'] - - result1 = self.df.melt(id_vars=['id1']) - assert result1.columns.tolist() == ['id1', 'variable', 'value'] - - result2 = self.df.melt(id_vars=['id1', 'id2']) - assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] - - def test_value_vars(self): - result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') - assert len(result3) == 10 - - result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) - expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) - tm.assert_frame_equal(result4, expected4) - - def test_value_vars_types(self): - # GH 15348 - expected = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) - - for type_ in (tuple, list, np.array): - result = self.df.melt(id_vars=['id1', 'id2'], - value_vars=type_(('A', 'B'))) - tm.assert_frame_equal(result, expected) - - def test_vars_work_with_multiindex(self): - expected = DataFrame({ - ('A', 'a'): self.df1[('A', 'a')], - 'CAP': ['B'] * len(self.df1), - 'low': ['b'] * len(self.df1), - 'value': self.df1[('B', 'b')], - }, columns=[('A', 'a'), 'CAP', 'low', 'value']) - - result = 
self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) - tm.assert_frame_equal(result, expected) - - def test_tuple_vars_fail_with_multiindex(self): - # melt should fail with an informative error message if - # the columns have a MultiIndex and a tuple is passed - # for id_vars or value_vars. - tuple_a = ('A', 'a') - list_a = [tuple_a] - tuple_b = ('B', 'b') - list_b = [tuple_b] - - for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), - (tuple_a, tuple_b)): - with tm.assert_raises_regex(ValueError, r'MultiIndex'): - self.df1.melt(id_vars=id_vars, value_vars=value_vars) - - def test_custom_var_name(self): - result5 = self.df.melt(var_name=self.var_name) - assert result5.columns.tolist() == ['var', 'value'] - - result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) - assert result6.columns.tolist() == ['id1', 'var', 'value'] - - result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) - assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name) - assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name) - expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, 'value']) - tm.assert_frame_equal(result9, expected9) - - def test_custom_value_name(self): - result10 = self.df.melt(value_name=self.value_name) - assert result10.columns.tolist() == ['variable', 'val'] - - result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) - assert result11.columns.tolist() == ['id1', 'variable', 'val'] - - result12 = self.df.melt(id_vars=['id1', 'id2'], - value_name=self.value_name) - assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - value_name=self.value_name) - assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - value_name=self.value_name) - expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', - self.value_name]) - tm.assert_frame_equal(result14, expected14) - - def test_custom_var_and_value_name(self): - - result15 = self.df.melt(var_name=self.var_name, - value_name=self.value_name) - assert result15.columns.tolist() == ['var', 'val'] - - result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, - value_name=self.value_name) - assert result16.columns.tolist() == ['id1', 'var', 'val'] - - result17 = self.df.melt(id_vars=['id1', 'id2'], - var_name=self.var_name, - value_name=self.value_name) - assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name, - value_name=self.value_name) - assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name, - value_name=self.value_name) - expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] 
* 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, - self.value_name]) - tm.assert_frame_equal(result19, expected19) - - df20 = self.df.copy() - df20.columns.name = 'foo' - result20 = df20.melt() - assert result20.columns.tolist() == ['foo', 'value'] - - def test_col_level(self): - res1 = self.df1.melt(col_level=0) - res2 = self.df1.melt(col_level='CAP') - assert res1.columns.tolist() == ['CAP', 'value'] - assert res2.columns.tolist() == ['CAP', 'value'] - - def test_multiindex(self): - res = self.df1.melt() - assert res.columns.tolist() == ['CAP', 'low', 'value'] +from pandas.compat import u class TestGetDummies(object): @@ -672,327 +473,3 @@ def test_preserve_categorical_dtype(self): result = make_axis_dummies(df, transform=lambda x: x) tm.assert_frame_equal(result, expected) - - -class TestLreshape(object): - - def test_pairs(self): - data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009'], - 'visitdt2': - ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], - 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], - 'wt1': [1823, 3338, 1549, 3298, 4306], - 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], - 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} - - df = DataFrame(data) - - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], - 'wt': ['wt%d' % i for i in range(1, 4)]} - result = lreshape(df, spec) - - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, - 4133, 1766, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, - 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Male', - 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', '02jan2009', '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, - 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} - exp = DataFrame(exp_data, columns=result.columns) - tm.assert_frame_equal(result, exp) - - result = lreshape(df, spec, dropna=False) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '20dec2008', '30dec2008', - '21dec2008', '11jan2009', '08jan2009', '20dec2008', - '30dec2008', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, - 3139, 4133, 1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, - 101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', nan, - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', nan, nan, '02jan2009', - '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, - 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, - 4805.0]} - exp = DataFrame(exp_data, columns=result.columns) - 
tm.assert_frame_equal(result, exp) - - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], - 'wt': ['wt%d' % i for i in range(1, 4)]} - pytest.raises(ValueError, lreshape, df, spec) - - -class TestWideToLong(object): - - def test_simple(self): - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A1970": {0: "a", - 1: "b", - 2: "c"}, - "A1980": {0: "d", - 1: "e", - 2: "f"}, - "B1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ["A", "B"], i="id", j="year") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_stubs(self): - # GH9204 - df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) - df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] - stubs = ['inc', 'edu'] - - # TODO: unused? - df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa - - assert stubs == ['inc', 'edu'] - - def test_separating_character(self): - # GH14779 - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A.1970": {0: "a", - 1: "b", - 2: "c"}, - "A.1980": {0: "d", - 1: "e", - 2: "f"}, - "B.1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B.1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_escapable_characters(self): - np.random.seed(123) - x = np.random.randn(3) - df = pd.DataFrame({"A(quarterly)1970": {0: "a", - 1: "b", - 2: "c"}, - "A(quarterly)1980": {0: "d", - 1: "e", - 2: "f"}, - "B(quarterly)1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B(quarterly)1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) - df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], - "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": ['1970', '1970', '1970', '1980', '1980', '1980'], - "id": [0, 1, 2, 0, 1, 2]} - exp_frame = DataFrame(exp_data) - exp_frame = exp_frame.set_index( - ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] - long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], - i="id", j="year") - tm.assert_frame_equal(long_frame, exp_frame) - - def test_unbalanced(self): - # test that we can have a varying amount of time variables - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], - 'A': [1.0, 3.0, 2.0, 4.0], - 'B': [5.0, np.nan, 6.0, np.nan], - 'id': [0, 0, 1, 1], - 'year': ['2010', '2011', '2010', '2011']} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') - tm.assert_frame_equal(long_frame, exp_frame) - - def test_character_overlap(self): - # Test we handle 
overlapping characters in both id_vars and value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'BBBX': [91, 92, 93], - 'BBBZ': [91, 92, 93] - }) - df['id'] = df.index - exp_frame = pd.DataFrame({ - 'BBBX': [91, 92, 93, 91, 92, 93], - 'BBBZ': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': ['11', '11', '11', '12', '12', '12']}) - exp_frame = exp_frame.set_index(['id', 'year'])[ - ['BBBX', 'BBBZ', 'A', 'B', 'BB']] - long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_invalid_separator(self): - # if an invalid separator is supplied a empty data frame is returned - sep = 'nope!' - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'A2010': [], - 'A2011': [], - 'B2010': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[[ - 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] - exp_frame.index.set_levels([[0, 1], []], inplace=True) - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_num_string_disambiguation(self): - # Test that we can disambiguate number value_vars from - # string value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'Arating': [91, 92, 93], - 'Arating_old': [91, 92, 93] - }) - df['id'] = df.index - exp_frame = pd.DataFrame({ - 'Arating': [91, 92, 93, 91, 92, 93], - 'Arating_old': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': ['11', '11', '11', '12', '12', '12']}) - exp_frame = exp_frame.set_index(['id', 'year'])[ - ['Arating', 'Arating_old', 'A', 'B', 'BB']] - long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_invalid_suffixtype(self): - # If all stubs names end with a string, but a numeric suffix is - # assumed, an empty data frame is returned - df = pd.DataFrame({'Aone': [1.0, 2.0], - 'Atwo': [3.0, 4.0], - 'Bone': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'Aone': [], - 'Atwo': [], - 'Bone': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - exp_frame = pd.DataFrame(exp_data) - exp_frame = exp_frame.set_index(['id', 'year'])[[ - 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] - exp_frame.index.set_levels([[0, 1], []], inplace=True) - long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') - tm.assert_frame_equal(long_frame.sort_index(axis=1), - exp_frame.sort_index(axis=1)) - - def test_multiple_id_columns(self): - # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm - df = pd.DataFrame({ - 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 
2.1], - 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - }) - exp_frame = pd.DataFrame({ - 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, - 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], - 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], - 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], - 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', - '2', '1', '2', '1', '2', '1', '2', '1', '2'] - }) - exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] - long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') - tm.assert_frame_equal(long_frame, exp_frame) - - def test_non_unique_idvars(self): - # GH16382 - # Raise an error message if non unique id vars (i) are passed - df = pd.DataFrame({ - 'A_A1': [1, 2, 3, 4, 5], - 'B_B1': [1, 2, 3, 4, 5], - 'x': [1, 1, 1, 1, 1] - }) - with pytest.raises(ValueError): - wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') From 154c41690fdc23b62aa69ad2d6774a02f6334ece Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Nov 2017 12:47:04 +0100 Subject: [PATCH 12/98] Revert "CI: temp skip geopandas downstream tests (GH18456)" (#18466) This reverts commit 5e670653e50dcbbafc0ba004b16328f49925f041. --- pandas/tests/test_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 1ec25bc8bb295..0f0abd8cd3400 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -92,7 +92,6 @@ def test_pandas_datareader(): pandas_datareader.get_data_google('AAPL') -@pytest.mark.skip(reason="import issue with fiona GH18456") def test_geopandas(): geopandas = import_module('geopandas') # noqa From de4b384e5b4c1231825cba4f080fa756a2d601d0 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 24 Nov 2017 09:55:13 -0700 Subject: [PATCH 13/98] BUG: Fix IntervalIndex constructor inconsistencies (#18424) --- doc/source/whatsnew/v0.22.0.txt | 2 + pandas/_libs/interval.pyx | 4 +- pandas/core/indexes/interval.py | 57 +++++++---- pandas/tests/indexes/test_interval.py | 126 ++++++++++++++++++------- pandas/tests/indexing/test_interval.py | 2 +- 5 files changed, 134 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 90032a692fd15..92d9123d2cf4c 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -74,6 +74,7 @@ Other API Changes - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) - :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) +- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) .. 
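For readers of the changelog entry above, a minimal sketch of the new constructor check (illustrative only, not part of the diff; it mirrors the test added further below):

    from pandas import Interval, IntervalIndex

    # Both intervals are closed='both', so that is what the constructor infers;
    # passing a conflicting 'closed' now raises instead of silently overriding it.
    iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')]
    IntervalIndex(iv, closed='neither')
    # ValueError: conflicting values for closed: constructor got 'neither',
    # inferred from data 'both'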
_whatsnew_0220.deprecations: @@ -137,6 +138,7 @@ Indexing - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`MultiIndex.remove_unused_levels`` which would fill nan values (:issue:`18417`) +- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - I/O diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index c09642511207a..39b26c61172ed 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -211,8 +211,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals): int64_t n = len(intervals) ndarray left, right - left = np.empty(n, dtype=object) - right = np.empty(n, dtype=object) + left = np.empty(n, dtype=intervals.dtype) + right = np.empty(n, dtype=intervals.dtype) for i in range(len(intervals)): interval = intervals[i] diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c9bb8748abe7b..cca7a06a2d44b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.missing import notna, isna from pandas.core.dtypes.generic import ABCPeriodIndex from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, @@ -31,7 +32,9 @@ from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex from pandas.compat.numpy import function as nv -from pandas.core import common as com +from pandas.core.common import ( + _all_not_none, _any_none, _asarray_tuplesafe, _count_not_none, + is_bool_indexer, _maybe_box_datetimelike, _not_none) from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset @@ -176,7 +179,7 @@ class IntervalIndex(IntervalMixin, Index): _mask = None - def __new__(cls, data, closed='right', + def __new__(cls, data, closed=None, name=None, copy=False, dtype=None, fastpath=False, verify_integrity=True): @@ -197,8 +200,17 @@ def __new__(cls, data, closed='right', if is_scalar(data): cls._scalar_data_error(data) - data = IntervalIndex.from_intervals(data, name=name) - left, right, closed = data.left, data.right, data.closed + data = maybe_convert_platform(data) + left, right, infer_closed = intervals_to_interval_bounds(data) + + if _all_not_none(closed, infer_closed) and closed != infer_closed: + # GH 18421 + msg = ("conflicting values for closed: constructor got " + "'{closed}', inferred from data '{infer_closed}'" + .format(closed=closed, infer_closed=infer_closed)) + raise ValueError(msg) + + closed = closed or infer_closed return cls._simple_new(left, right, closed, name, copy=copy, verify_integrity=verify_integrity) @@ -376,7 +388,8 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - breaks = np.asarray(breaks) + breaks = maybe_convert_platform(breaks) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, name=name, copy=copy) @@ -416,8 +429,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples 
""" - left = np.asarray(left) - right = np.asarray(right) + left = maybe_convert_platform(left) + right = maybe_convert_platform(right) + return cls._simple_new(left, right, closed, name=name, copy=copy, verify_integrity=True) @@ -460,8 +474,12 @@ def from_intervals(cls, data, name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) + if isinstance(data, IntervalIndex): + left, right, closed = data.left, data.right, data.closed + name = name or data.name + else: + data = maybe_convert_platform(data) + left, right, closed = intervals_to_interval_bounds(data) return cls.from_arrays(left, right, closed, name=name, copy=False) @classmethod @@ -497,8 +515,11 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): IntervalIndex.from_intervals : Construct an IntervalIndex from an array of Interval objects """ - left = [] - right = [] + if len(data): + left, right = [], [] + else: + left = right = data + for d in data: if isna(d): @@ -517,7 +538,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): return cls.from_arrays(left, right, closed, name=name, copy=False) def to_tuples(self): - return Index(com._asarray_tuplesafe(zip(self.left, self.right))) + return Index(_asarray_tuplesafe(zip(self.left, self.right))) @cache_readonly def _multiindex(self): @@ -838,7 +859,7 @@ def get_loc(self, key, method=None): return self._engine.get_loc(key) def get_value(self, series, key): - if com.is_bool_indexer(key): + if is_bool_indexer(key): loc = key elif is_list_like(key): loc = self.get_indexer(key) @@ -1166,7 +1187,7 @@ def _is_type_compatible(a, b): return ((is_number(a) and is_number(b)) or (is_ts_compat(a) and is_ts_compat(b)) or (is_td_compat(a) and is_td_compat(b)) or - com._any_none(a, b)) + _any_none(a, b)) def interval_range(start=None, end=None, periods=None, freq=None, @@ -1244,13 +1265,13 @@ def interval_range(start=None, end=None, periods=None, freq=None, -------- IntervalIndex : an Index of intervals that are all closed on the same side. 
""" - if com._count_not_none(start, end, periods) != 2: + if _count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and periods, ' 'exactly two must be specified') - start = com._maybe_box_datetimelike(start) - end = com._maybe_box_datetimelike(end) - endpoint = next(com._not_none(start, end)) + start = _maybe_box_datetimelike(start) + end = _maybe_box_datetimelike(end) + endpoint = next(_not_none(start, end)) if not _is_valid_endpoint(start): msg = 'start must be numeric or datetime-like, got {start}' diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 399d88309072e..b98359ea0ec4d 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,7 +6,7 @@ from pandas import (Interval, IntervalIndex, Index, isna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) -from pandas.compat import zip +from pandas.compat import lzip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -38,7 +38,7 @@ def create_index_with_nan(self, closed='right'): @pytest.mark.parametrize('name', [None, 'foo']) def test_constructors(self, closed, name): left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4]) - ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)] + ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)] expected = IntervalIndex._simple_new( left=left, right=right, closed=closed, name=name) @@ -57,7 +57,7 @@ def test_constructors(self, closed, name): tm.assert_index_equal(result, expected) result = IntervalIndex.from_tuples( - zip(left, right), closed=closed, name=name) + lzip(left, right), closed=closed, name=name) tm.assert_index_equal(result, expected) result = Index(ivs, name=name) @@ -68,6 +68,9 @@ def test_constructors(self, closed, name): tm.assert_index_equal(Index(expected), expected) tm.assert_index_equal(IntervalIndex(expected), expected) + result = IntervalIndex.from_intervals(expected) + tm.assert_index_equal(result, expected) + result = IntervalIndex.from_intervals( expected.values, name=expected.name) tm.assert_index_equal(result, expected) @@ -86,63 +89,118 @@ def test_constructors(self, closed, name): breaks, closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) - def test_constructors_other(self): - - # all-nan - result = IntervalIndex.from_intervals([np.nan]) - expected = np.array([np.nan], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) - - # empty - result = IntervalIndex.from_intervals([]) - expected = np.array([], dtype=object) - tm.assert_numpy_array_equal(result.values, expected) + @pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50]) + def test_constructors_nan(self, closed, data): + # GH 18421 + expected_values = np.array(data, dtype=object) + expected_idx = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_idx.closed == closed + tm.assert_numpy_array_equal(expected_idx.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks([np.nan] + data, closed=closed) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + 
tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_idx) + tm.assert_numpy_array_equal(result.values, expected_values) + + @pytest.mark.parametrize('data', [ + [], + np.array([], dtype='int64'), + np.array([], dtype='float64'), + np.array([], dtype=object)]) + def test_constructors_empty(self, data, closed): + # GH 18421 + expected_dtype = data.dtype if isinstance(data, np.ndarray) else object + expected_values = np.array([], dtype=object) + expected_index = IntervalIndex(data, closed=closed) + + # validate the expected index + assert expected_index.empty + assert expected_index.closed == closed + assert expected_index.dtype.subtype == expected_dtype + tm.assert_numpy_array_equal(expected_index.values, expected_values) + + result = IntervalIndex.from_tuples(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_breaks(data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + result = IntervalIndex.from_arrays(data, data, closed=closed) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) + + if closed == 'right': + # Can't specify closed for IntervalIndex.from_intervals + result = IntervalIndex.from_intervals(data) + tm.assert_index_equal(result, expected_index) + tm.assert_numpy_array_equal(result.values, expected_values) def test_constructors_errors(self): # scalar - msg = ('IntervalIndex(...) must be called with a collection of ' + msg = ('IntervalIndex\(...\) must be called with a collection of ' 'some kind, 5 was passed') - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex(5) # not an interval - msg = "type with value 0 is not an interval" - with pytest.raises(TypeError, message=msg): + msg = ("type <(class|type) 'numpy.int64'> with value 0 " + "is not an interval") + with tm.assert_raises_regex(TypeError, msg): IntervalIndex([0, 1]) - with pytest.raises(TypeError, message=msg): + with tm.assert_raises_regex(TypeError, msg): IntervalIndex.from_intervals([0, 1]) # invalid closed msg = "invalid options for 'closed': invalid" - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') - # mismatched closed + # mismatched closed within intervals msg = 'intervals must all be closed on the same side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2, closed='left')]) - with pytest.raises(ValueError, message=msg): - IntervalIndex.from_arrays([0, 10], [3, 5]) - - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) + # mismatched closed inferred from intervals vs constructor. 
+ msg = 'conflicting values for closed' + with tm.assert_raises_regex(ValueError, msg): + iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')] + IntervalIndex(iv, closed='neither') + # no point in nesting periods in an IntervalIndex msg = 'Period dtypes are not supported, use a PeriodIndex instead' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks( pd.period_range('2000-01-01', periods=3)) # decreasing breaks/arrays msg = 'left side of interval must be <= right side' - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks(range(10, -1, -1)) - with pytest.raises(ValueError, message=msg): + with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) def test_constructors_datetimelike(self, closed): @@ -865,7 +923,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is True - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) @@ -873,7 +931,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) @@ -881,7 +939,7 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', otherwise True (GH16560) @@ -1054,10 +1112,6 @@ def test_constructor_coverage(self): end=end.to_pydatetime()) tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.tz_localize('UTC'), - end=end.tz_localize('UTC')) - tm.assert_index_equal(result, expected) - result = pd.interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py index 3792293f48b99..e29dc627a5d94 100644 --- a/pandas/tests/indexing/test_interval.py +++ b/pandas/tests/indexing/test_interval.py @@ -54,7 +54,7 @@ def test_getitem_with_scalar(self): def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] if direction == 'decreasing': - tpls = reversed(tpls) + tpls = tpls[::-1] idx = IntervalIndex.from_tuples(tpls, closed=closed) s = Series(list('abc'), idx) From de5faf1d54ad2e10e00113562c447d51a16833b4 Mon Sep 17 00:00:00 2001 From: Andrew Date: Fri, 24 Nov 2017 17:57:03 +0100 Subject: [PATCH 14/98] Lint rule to catch incorrect sphinx directives (#18437) --- ci/lint.sh | 15 ++++++++++++++- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- pandas/core/tools/datetimes.py | 8 ++++---- pandas/tseries/offsets.py | 4 ++-- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 4027737900bf9..5d9fafe6c9064 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -90,13 +90,26 @@ 
if [ "$LINT" ]; then # # Check the following functions: # any(), all(), sum(), max(), min(), list(), dict(), set(), frozenset(), tuple(), str.join() - grep -R --include="*.py*" -E "[^_](any|all|sum|max|min|list|dict|set|frozenset|tuple|join)\(\[.* for .* in .*\]\)" + grep -R --include="*.py*" -E "[^_](any|all|sum|max|min|list|dict|set|frozenset|tuple|join)\(\[.* for .* in .*\]\)" * if [ $? = "0" ]; then RET=1 fi echo "Check for use of lists instead of generators in built-in Python functions DONE" + echo "Check for incorrect sphinx directives" + SPHINX_DIRECTIVES=$(echo \ + "autosummary|contents|currentmodule|deprecated|function|image|"\ + "important|include|ipython|literalinclude|math|module|note|raw|"\ + "seealso|toctree|versionadded|versionchanged|warning" | tr -d "[:space:]") + for path in './pandas' './doc/source' + do + grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. ($SPHINX_DIRECTIVES):[^:]" $path + if [ $? = "0" ]; then + RET=1 + fi + done + echo "Check for incorrect sphinx directives DONE" else echo "NOT Linting" fi diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b05cfe41fd9d1..20ed3f69db99b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4513,7 +4513,7 @@ def unstack(self, level=-1, fill_value=None): fill_value : replace NaN with this value if the unstack produces missing values - .. versionadded: 0.18.0 + .. versionadded:: 0.18.0 See also -------- @@ -4676,7 +4676,7 @@ def diff(self, periods=1, axis=0): axis : {0 or 'index', 1 or 'columns'}, default 0 Take difference over rows (0) or columns (1). - .. versionadded: 0.16.1 + .. versionadded:: 0.16.1 Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index be1de4c6814ba..d7833526c0408 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1626,7 +1626,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): ignore_index : boolean, default False If True, do not use the index labels. - .. versionadded: 0.19.0 + .. versionadded:: 0.19.0 verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates @@ -2213,7 +2213,7 @@ def unstack(self, level=-1, fill_value=None): fill_value : replace NaN with this value if the unstack produces missing values - .. versionadded: 0.18.0 + .. versionadded:: 0.18.0 Examples -------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 65f4704da3800..219fb3f67db97 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -114,7 +114,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series - .. versionadded: 0.18.1 + .. versionadded:: 0.18.1 or DataFrame/dict-like @@ -140,7 +140,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). - .. versionadded: 0.16.1 + .. versionadded:: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware @@ -178,13 +178,13 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If Timestamp convertible, origin is set to Timestamp identified by origin. - .. versionadded: 0.20.0 + .. versionadded:: 0.20.0 cache : boolean, default False If True, use a cache of unique, converted dates to apply the datetime conversion. 
May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded: 0.22.0 + .. versionadded:: 0.22.0 Returns ------- diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 2097fb22b3ec5..eef9d165e2447 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -774,7 +774,7 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n business days - .. versionadded: 0.16.1 + .. versionadded:: 0.16.1 """ _prefix = 'BH' @@ -878,7 +878,7 @@ class CustomBusinessHour(BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n custom business days - .. versionadded: 0.18.1 + .. versionadded:: 0.18.1 """ _prefix = 'CBH' From aec33479b923a05ead1ca35335f00aba87e4145e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Nov 2017 08:59:12 -0800 Subject: [PATCH 15/98] CLN/PERF: simplify tslib.get_time_micros (#18389) --- pandas/_libs/tslib.pyx | 23 ----------------------- pandas/_libs/tslibs/fields.pyx | 20 ++++++++++++++++++++ pandas/core/indexes/datetimes.py | 2 +- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ea4f4728a0741..2c43bed4ad053 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -730,29 +730,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -# ---------------------------------------------------------------------- -# Accessors - - -def get_time_micros(ndarray[int64_t] dtindex): - """ - Datetime as int64 representation to a structured array of fields - """ - cdef: - Py_ssize_t i, n = len(dtindex) - pandas_datetimestruct dts - ndarray[int64_t] micros - - micros = np.empty(n, dtype=np.int64) - - for i in range(n): - dt64_to_dtstruct(dtindex[i], &dts) - micros[i] = 1000000LL * (dts.hour * 60 * 60 + - 60 * dts.min + dts.sec) + dts.us - - return micros - - # ---------------------------------------------------------------------- # Some general helper functions diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index e813fad1d3fa7..3de361c511fbf 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -23,6 +23,26 @@ from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, from nattype cimport NPY_NAT +def get_time_micros(ndarray[int64_t] dtindex): + """ + Return the number of microseconds in the time component of a + nanosecond timestamp. 
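For context on the vectorized replacement below: a day holds 86400000000000 nanoseconds, so taking an int64 timestamp modulo that constant isolates the time of day in nanoseconds, and floor-dividing by 1000 converts it to microseconds. A quick NumPy sketch of the equivalence (illustrative only, not part of the patch):

    import numpy as np

    # 2017-01-01 01:02:03.000004 UTC as nanoseconds since the epoch
    stamp = np.array([1483232523000004000], dtype=np.int64)
    micros = np.mod(stamp, 86400000000000) // 1000
    # 1h 2m 3s -> 3723 seconds -> 3723000004 microseconds into the day
    assert micros[0] == 3723000004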
+ + Parameters + ---------- + dtindex : ndarray[int64_t] + + Returns + ------- + micros : ndarray[int64_t] + """ + cdef: + ndarray[int64_t] micros + + micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL + return micros + + def build_field_sarray(ndarray[int64_t] dtindex): """ Datetime as int64 representation to a structured array of fields diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e08bf4a625bce..111ba0c92aa9b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -926,7 +926,7 @@ def _get_time_micros(self): values = self.asi8 if self.tz is not None and self.tz is not utc: values = self._local_timestamps() - return libts.get_time_micros(values) + return fields.get_time_micros(values) def to_series(self, keep_tz=False): """ From 4fce7846be56e12999fe8758abb2ea2f2794259d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 Nov 2017 11:27:13 -0800 Subject: [PATCH 16/98] CLN: Replace comprehensions list/set/dict functions with corresponding symbols (#18383) --- asv_bench/benchmarks/frame_ctor.py | 6 ++--- asv_bench/benchmarks/packers.py | 2 +- asv_bench/benchmarks/replace.py | 6 ++--- doc/sphinxext/numpydoc/phantom_import.py | 4 ++-- pandas/_libs/tslibs/offsets.pyx | 2 +- pandas/_libs/tslibs/resolution.pyx | 2 +- pandas/_version.py | 6 ++--- pandas/core/common.py | 2 +- pandas/core/dtypes/concat.py | 4 ++-- pandas/core/frame.py | 15 ++++++------ pandas/core/generic.py | 24 +++++++++---------- pandas/core/groupby.py | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/interval.py | 2 +- pandas/core/internals.py | 6 ++--- pandas/core/ops.py | 2 +- pandas/core/panelnd.py | 2 +- pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/melt.py | 2 +- pandas/core/sparse/frame.py | 9 ++++--- pandas/io/clipboards.py | 2 +- pandas/io/json/json.py | 16 +++++-------- pandas/io/parsers.py | 14 +++++------ pandas/io/pytables.py | 11 ++++----- pandas/io/sql.py | 2 +- pandas/plotting/_converter.py | 2 +- pandas/tests/frame/common.py | 11 ++++----- pandas/tests/frame/test_api.py | 2 +- pandas/tests/frame/test_constructors.py | 25 ++++++++++---------- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_whitelist.py | 2 +- pandas/tests/indexes/test_multi.py | 4 ++-- pandas/tests/indexing/test_panel.py | 2 +- pandas/tests/io/formats/test_format.py | 14 +++++------ pandas/tests/io/formats/test_to_latex.py | 20 ++++++++-------- pandas/tests/io/json/test_ujson.py | 2 +- pandas/tests/io/msgpack/test_case.py | 4 ++-- pandas/tests/io/msgpack/test_pack.py | 2 +- pandas/tests/io/test_packers.py | 8 +++---- pandas/tests/io/test_pytables.py | 2 +- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/plotting/test_series.py | 5 ++-- pandas/tests/reshape/test_join.py | 2 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/series/test_indexing.py | 2 +- pandas/tests/test_panel.py | 30 ++++++++++++------------ pandas/tests/test_panel4d.py | 2 +- pandas/tseries/holiday.py | 4 ++-- pandas/tseries/offsets.py | 8 +++---- pandas/util/testing.py | 10 ++++---- versioneer.py | 12 +++++----- 51 files changed, 160 insertions(+), 172 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 7f95e8d06eb72..2ee5f5da7a84a 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -23,9 +23,9 @@ def setup(self): self.some_dict = list(self.data.values())[0] self.dict_list = [dict(zip(self.columns, row)) for row in 
self.frame.values] - self.data2 = dict( - ((i, dict(((j, float(j)) for j in range(100)))) for i in - range(2000))) + self.data2 = {i: {j: float(j) for j in range(100)} + for i in range(2000)} + def time_frame_ctor_list_of_dict(self): DataFrame(self.dict_list) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 927f1505e85c6..758162f000e8d 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -18,7 +18,7 @@ def _setup(self): self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict(('float{0}'.format(i), randn(self.N)) for i in range(self.C)), index=self.index) + self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)}, index=self.index) self.df2 = self.df.copy() self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 63562f90eab2b..157d5fe1e3948 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -23,7 +23,7 @@ class replace_large_dict(object): def setup(self): self.n = (10 ** 6) self.start_value = (10 ** 5) - self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n))) + self.to_rep = {i: self.start_value + i for i in range(self.n)} self.s = Series(np.random.randint(self.n, size=(10 ** 3))) def time_replace_large_dict(self): @@ -35,8 +35,8 @@ class replace_convert(object): def setup(self): self.n = (10 ** 3) - self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n))) - self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n))) + self.to_ts = {i: pd.Timestamp(i) for i in range(self.n)} + self.to_td = {i: pd.Timedelta(i) for i in range(self.n)} self.s = Series(np.random.randint(self.n, size=(10 ** 3))) self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)), 'B': np.random.randint(self.n, size=(10 ** 3))}) diff --git a/doc/sphinxext/numpydoc/phantom_import.py b/doc/sphinxext/numpydoc/phantom_import.py index e0bd645f5db76..f33dd838e8bb3 100755 --- a/doc/sphinxext/numpydoc/phantom_import.py +++ b/doc/sphinxext/numpydoc/phantom_import.py @@ -60,8 +60,8 @@ def import_phantom_module(xml_file): # Sort items so that # - Base classes come before classes inherited from them # - Modules come before their contents - all_nodes = dict((n.attrib['id'], n) for n in root) - + all_nodes = {n.attrib['id']: n for n in root} + def _get_bases(node, recurse=False): bases = [x.attrib['ref'] for x in node.findall('base')] if recurse: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 526595e3a2eda..b03d48bba1649 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -33,7 +33,7 @@ from np_datetime cimport (pandas_datetimestruct, _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] _int_to_month = {(k + 1): v for k, v in enumerate(_MONTHS)} -_month_to_int = dict((v, k) for k, v in _int_to_month.items()) +_month_to_int = {v: k for k, v in _int_to_month.items()} class WeekDay(object): diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 388075903a8ba..0692d985b4877 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -53,7 +53,7 @@ _ONE_HOUR = 60 * _ONE_MINUTE _ONE_DAY = 24 * _ONE_HOUR DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] -_weekday_rule_aliases = dict((k, v) for k, v in 
enumerate(DAYS)) +_weekday_rule_aliases = {k: v for k, v in enumerate(DAYS)} _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] diff --git a/pandas/_version.py b/pandas/_version.py index 4a469ebb8630e..624c7b5cd63a1 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -141,11 +141,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set(r.strip() for r in refnames.strip("()").split(",")) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set(r[len(TAG):] for r in refs if r.startswith(TAG)) + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -154,7 +154,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set(r for r in refs if re.search(r'\d', r)) + tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '{}', no digits".format(",".join(refs - tags))) if verbose: diff --git a/pandas/core/common.py b/pandas/core/common.py index 8e12ce3647340..76a69030463ec 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -347,7 +347,7 @@ def map_indices_py(arr): Returns a dictionary with (element, index) pairs for each element in the given array/list """ - return dict((x, i) for i, x in enumerate(arr)) + return {x: i for i, x in enumerate(arr)} def union(*seqs): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7f9245bb31530..c1ba018adbcec 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -459,7 +459,7 @@ def _concat_datetimetz(to_concat, name=None): it is used in DatetimeIndex.append also """ # do not pass tz to set because tzlocal cannot be hashed - if len(set(str(x.dtype) for x in to_concat)) != 1: + if len({str(x.dtype) for x in to_concat}) != 1: raise ValueError('to_concat must have the same tz') tz = to_concat[0].tz # no need to localize because internal repr will not be changed @@ -525,7 +525,7 @@ def convert_sparse(x, axis): if len(typs) == 1: # concat input as it is if all inputs are sparse # and have the same fill_value - fill_values = set(c.fill_value for c in to_concat) + fill_values = {c.fill_value for c in to_concat} if len(fill_values) == 1: sp_values = [c.sp_values for c in to_concat] indexes = [c.sp_index.to_int_index() for c in to_concat] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20ed3f69db99b..e82eb8635d4c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -347,7 +347,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: data_columns = list(data.dtype.names) - data = dict((k, data[k]) for k in data_columns) + data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns mgr = self._init_dict(data, index, columns, dtype=dtype) @@ -417,8 +417,7 @@ def _init_dict(self, data, index, columns, dtype=None): 
extract_index(list(data.values())) # prefilter if columns passed - data = dict((k, v) for k, v in compat.iteritems(data) - if k in columns) + data = {k: v for k, v in compat.iteritems(data) if k in columns} if index is None: index = extract_index(list(data.values())) @@ -3895,7 +3894,7 @@ def f(col): return self._constructor_sliced(r, index=new_index, dtype=r.dtype) - result = dict((col, f(col)) for col in this) + result = {col: f(col) for col in this} # non-unique else: @@ -3906,7 +3905,7 @@ def f(i): return self._constructor_sliced(r, index=new_index, dtype=r.dtype) - result = dict((i, f(i)) for i, col in enumerate(this.columns)) + result = {i: f(i) for i, col in enumerate(this.columns)} result = self._constructor(result, index=new_index, copy=False) result.columns = new_columns return result @@ -3984,7 +3983,7 @@ def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True): if self.columns.is_unique: def _compare(a, b): - return dict((col, func(a[col], b[col])) for col in a.columns) + return {col: func(a[col], b[col]) for col in a.columns} new_data = expressions.evaluate(_compare, str_rep, self, other) return self._constructor(data=new_data, index=self.index, @@ -3993,8 +3992,8 @@ def _compare(a, b): else: def _compare(a, b): - return dict((i, func(a.iloc[:, i], b.iloc[:, i])) - for i, col in enumerate(a.columns)) + return {i: func(a.iloc[:, i], b.iloc[:, i]) + for i, col in enumerate(a.columns)} new_data = expressions.evaluate(_compare, str_rep, self, other) result = self._constructor(data=new_data, index=self.index, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 782971a742b54..548f228cdd96b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -235,10 +235,10 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, """ cls._AXIS_ORDERS = axes - cls._AXIS_NUMBERS = dict((a, i) for i, a in enumerate(axes)) + cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)} cls._AXIS_LEN = len(axes) cls._AXIS_ALIASES = aliases or dict() - cls._AXIS_IALIASES = dict((v, k) for k, v in cls._AXIS_ALIASES.items()) + cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()} cls._AXIS_NAMES = dict(enumerate(axes)) cls._AXIS_SLICEMAP = slicers or None cls._AXIS_REVERSED = axes_are_reversed @@ -279,21 +279,21 @@ def set_axis(a, i): def _construct_axes_dict(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = dict((a, self._get_axis(a)) for a in (axes or self._AXIS_ORDERS)) + d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d @staticmethod def _construct_axes_dict_from(self, axes, **kwargs): """Return an axes dictionary for the passed axes.""" - d = dict((a, ax) for a, ax in zip(self._AXIS_ORDERS, axes)) + d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)} d.update(kwargs) return d def _construct_axes_dict_for_slice(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = dict((self._AXIS_SLICEMAP[a], self._get_axis(a)) - for a in (axes or self._AXIS_ORDERS)) + d = {self._AXIS_SLICEMAP[a]: self._get_axis(a) + for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d @@ -329,7 +329,7 @@ def _construct_axes_from_arguments(self, args, kwargs, require_all=False): raise TypeError("not enough/duplicate arguments " "specified!") - axes = dict((a, kwargs.pop(a, None)) for a in self._AXIS_ORDERS) + axes = {a: kwargs.pop(a, None) for a in self._AXIS_ORDERS} return axes, kwargs @classmethod @@ -1172,7 +1172,7 @@ def to_dense(self): # Picklability def 
__getstate__(self): - meta = dict((k, getattr(self, k, None)) for k in self._metadata) + meta = {k: getattr(self, k, None) for k in self._metadata} return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta) @@ -4277,8 +4277,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, elif self.ndim == 3: # fill in 2d chunks - result = dict((col, s.fillna(method=method, value=value)) - for col, s in self.iteritems()) + result = {col: s.fillna(method=method, value=value) + for col, s in self.iteritems()} new_obj = self._constructor.\ from_dict(result).__finalize__(self) new_data = new_obj._data @@ -5681,7 +5681,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means other is a DataFrame, and we need to broadcast # self cons = self._constructor_expanddim - df = cons(dict((c, self) for c in other.columns), + df = cons({c: self for c in other.columns}, **other._construct_axes_dict()) return df._align_frame(other, join=join, axis=axis, level=level, copy=copy, @@ -5691,7 +5691,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means self is a DataFrame, and we need to broadcast # other cons = other._constructor_expanddim - df = cons(dict((c, other) for c in self.columns), + df = cons({c: other for c in self.columns}, **self._construct_axes_dict()) return self._align_frame(df, join=join, axis=axis, level=level, copy=copy, fill_value=fill_value, diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8338df33f5cde..ba180cc98cb08 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3840,7 +3840,7 @@ def first_not_none(values): # if all series have a consistent name. If the # series do not have a consistent name, do # nothing. - names = set(v.name for v in values) + names = {v.name for v in values} if len(names) == 1: index.name = list(names)[0] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a34e64724245..1cb40b3ecf255 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -732,7 +732,7 @@ def _coerce_to_ndarray(cls, data): def _get_attributes_dict(self): """ return an attributes dict for my class """ - return dict((k, getattr(self, k, None)) for k in self._attributes) + return {k: getattr(self, k, None) for k in self._attributes} def view(self, cls=None): @@ -1784,7 +1784,7 @@ def append(self, other): if not isinstance(obj, Index): raise TypeError('all inputs must be Index') - names = set(obj.name for obj in to_concat) + names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name return self._concat(to_concat, name) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index cca7a06a2d44b..c7c739b766a9f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1024,7 +1024,7 @@ def _concat_same_dtype(self, to_concat, name): assert that we all have the same .closed we allow a 0-len index here as well """ - if not len(set(i.closed for i in to_concat if len(i))) == 1: + if not len({i.closed for i in to_concat if len(i)}) == 1: msg = ('can only append two IntervalIndex objects ' 'that are closed on the same side') raise ValueError(msg) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ca4984cc16673..e537cb2edc1c4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3377,7 +3377,7 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, blocks.append(block) # note that some DatetimeTZ, Categorical 
are always ndim==1 - ndim = set(b.ndim for b in blocks) + ndim = {b.ndim for b in blocks} if 2 in ndim: @@ -3891,7 +3891,7 @@ def get_scalar(self, tup): """ Retrieve single item """ - full_loc = list(ax.get_loc(x) for ax, x in zip(self.axes, tup)) + full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)] blk = self.blocks[self._blknos[full_loc[0]]] values = blk.values @@ -4871,7 +4871,7 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): if _can_consolidate: if dtype is None: - if len(set(b.dtype for b in blocks)) != 1: + if len({b.dtype for b in blocks}) != 1: raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype diff --git a/pandas/core/ops.py b/pandas/core/ops.py index fa50036b6eb95..934570602c99d 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -146,7 +146,7 @@ def names(x): construct_result=_construct_divmod_result, ) - new_methods = dict((names(k), v) for k, v in new_methods.items()) + new_methods = {names(k): v for k, v in new_methods.items()} return new_methods diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 691787125043d..80ee680d2b9d2 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -105,7 +105,7 @@ def _combine_with_constructor(self, other, func): new_axes.append(getattr(self, a).union(getattr(other, a))) # reindex: could check that everything's the same size, but forget it - d = dict((a, ax) for a, ax in zip(self._AXIS_ORDERS, new_axes)) + d = {a: ax for a, ax in zip(self._AXIS_ORDERS, new_axes)} d['copy'] = False this = self.reindex(**d) other = other.reindex(**d) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6139f093202fe..9bd5abb2cd476 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -568,7 +568,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = list(names) else: # make sure that all of the passed indices have the same nlevels - if not len(set(idx.nlevels for idx in indexes)) == 1: + if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 36e52f1472f82..16439b30d5bb4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -149,7 +149,7 @@ def lreshape(data, groups, dropna=True, label=None): for c in pivot_cols: mask &= notna(mdata[c]) if not mask.all(): - mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) + mdata = {k: v[mask] for k, v in compat.iteritems(mdata)} from pandas import DataFrame return DataFrame(mdata, columns=id_cols + pivot_cols) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 0c9a55e0c9acd..36a18d8f8b4a0 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -131,8 +131,7 @@ def _init_dict(self, data, index, columns, dtype=None): # pre-filter out columns if we passed it if columns is not None: columns = _ensure_index(columns) - data = dict((k, v) for k, v in compat.iteritems(data) - if k in columns) + data = {k: v for k, v in compat.iteritems(data) if k in columns} else: columns = Index(_try_sort(list(data.keys()))) @@ -173,7 +172,7 @@ def _init_matrix(self, data, index, columns, dtype=None): """ Init self from ndarray or list of lists """ data = _prep_ndarray(data, copy=False) index, columns = self._prep_index(data, index, columns) - data = dict((idx, data[:, i]) for i, idx in enumerate(columns)) + data = {idx: data[:, i] for i, 
idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) def _init_spmatrix(self, data, index, columns, dtype=None, @@ -307,7 +306,7 @@ def to_dense(self): ------- df : DataFrame """ - data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) + data = {k: v.to_dense() for k, v in compat.iteritems(self)} return DataFrame(data, index=self.index, columns=self.columns) def _apply_columns(self, func): @@ -697,7 +696,7 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, raise NotImplementedError("'method' argument is not supported") # TODO: fill value handling - sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns) + sdict = {k: v for k, v in compat.iteritems(self) if k in columns} return self._constructor( sdict, index=self.index, columns=columns, default_fill_value=self._default_fill_value).__finalize__(self) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 117c96d00171c..8e9b5497083f6 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -53,7 +53,7 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = set(x.lstrip().count('\t') for x in lines) + counts = {x.lstrip().count('\t') for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = '\t' diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 32bab09a0c4ac..11bf3a9363953 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -715,10 +715,8 @@ def _parse_no_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + decoded = {str(k): v for k, v in compat.iteritems( + loads(json, precise_float=self.precise_float))} self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: @@ -732,7 +730,7 @@ def _parse_numpy(self): if orient == "split": decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + decoded = {str(k): v for k, v in compat.iteritems(decoded)} self.check_keys_split(decoded) self.obj = Series(**decoded) elif orient == "columns" or orient == "index": @@ -770,7 +768,7 @@ def _parse_numpy(self): elif orient == "split": decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + decoded = {str(k): v for k, v in compat.iteritems(decoded)} self.check_keys_split(decoded) self.obj = DataFrame(**decoded) elif orient == "values": @@ -790,10 +788,8 @@ def _parse_no_numpy(self): self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) elif orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + decoded = {str(k): v for k, v in compat.iteritems( + loads(json, precise_float=self.precise_float))} self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 558a1f6d76868..8f6b013558396 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1133,8 +1133,7 @@ def _evaluate_usecols(usecols, names): If not a callable, returns 'usecols'. 
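For illustration only, a hypothetical call (the column names are made up and are not part of the original hunk):

    >>> _evaluate_usecols(lambda name: name.startswith('col'),
    ...                   ['col1', 'x', 'col2'])
    {0, 2}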
""" if callable(usecols): - return set(i for i, name in enumerate(names) - if usecols(name)) + return {i for i, name in enumerate(names) if usecols(name)} return usecols @@ -1906,7 +1905,7 @@ def read(self, nrows=None): # rename dict keys data = sorted(data.items()) - data = dict((k, v) for k, (i, v) in zip(names, data)) + data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) @@ -1924,7 +1923,7 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] - data = dict((k, v) for k, (i, v) in zip(names, data)) + data = {k: v for k, (i, v) in zip(names, data)} names, data = self._do_date_conversions(names, data) index, names = self._make_index(data, alldata, names) @@ -2300,7 +2299,7 @@ def _exclude_implicit_index(self, alldata): offset += 1 data[col] = alldata[i + offset] else: - data = dict((k, v) for k, v in zip(names, alldata)) + data = {k: v for k, v in zip(names, alldata)} return data @@ -3233,9 +3232,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): for i, n in enumerate(index_col): columns.pop(n - i) - col_dict = dict((col_name, - Series([], dtype=dtype[col_name])) - for col_name in columns) + col_dict = {col_name: Series([], dtype=dtype[col_name]) + for col_name in columns} return index, columns, col_dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b9cddce55c096..2a66aea88f6d9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -815,7 +815,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, "all tables must have exactly the same nrows!") # axis is the concentation axes - axis = list(set(t.non_index_axes[0][0] for t in tbls))[0] + axis = list({t.non_index_axes[0][0] for t in tbls})[0] def func(_start, _stop, _where): @@ -2374,8 +2374,7 @@ class GenericFixed(Fixed): """ a generified fixed version """ _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} - _reverse_index_map = dict((v, k) - for k, v in compat.iteritems(_index_type_map)) + _reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)} attributes = [] # indexer helpders @@ -3510,8 +3509,8 @@ def get_blk_items(mgr, blocks): # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict((tuple(b_items.tolist()), (b, b_items)) - for b, b_items in zip(blocks, blk_items)) + by_items = {tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items)} new_blocks = [] new_blk_items = [] for ea in existing_table.values_axes: @@ -3659,7 +3658,7 @@ def create_description(self, complib=None, complevel=None, d = dict(name='table', expectedrows=expectedrows) # description from the axes & values - d['description'] = dict((a.cname, a.typ) for a in self.axes) + d['description'] = {a.cname: a.typ for a in self.axes} if complib: if complevel is None: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 401a9c11a774d..975ad1e4ff368 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -641,7 +641,7 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter] + data = [{k: v for k, v in zip(keys, row)} for row in data_iter] conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 0f06d87726905..9daee918b9f30 100644 --- a/pandas/plotting/_converter.py +++ 
b/pandas/plotting/_converter.py @@ -994,7 +994,7 @@ def _set_default_format(self, vmin, vmax): info) else: format = np.compress(info['maj'], info) - self.formatdict = dict((x, f) for (x, _, _, f) in format) + self.formatdict = {x: f for (x, _, _, f) in format} return self.formatdict def set_locs(self, locs): diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 3786facdd4ebd..c85fea3c3d71b 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -10,8 +10,8 @@ _frame = pd.DataFrame(_seriesd) _frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) -_intframe = pd.DataFrame(dict((k, v.astype(int)) - for k, v in compat.iteritems(_seriesd))) +_intframe = pd.DataFrame({k: v.astype(int) + for k, v in compat.iteritems(_seriesd)}) _tsframe = pd.DataFrame(_tsd) @@ -32,8 +32,7 @@ def frame2(self): @cache_readonly def intframe(self): # force these all to int64 to avoid platform testing issues - return pd.DataFrame(dict((c, s) for c, s in - compat.iteritems(_intframe)), + return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)}, dtype=np.int64) @cache_readonly @@ -112,7 +111,7 @@ def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 dtypes = dict(A='float32', B='float32', C='float16', D='float64') if isinstance(dtype, compat.string_types): - dtypes = dict((k, dtype) for k, v in dtypes.items()) + dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): @@ -128,7 +127,7 @@ def _check_mixed_float(df, dtype=None): def _check_mixed_int(df, dtype=None): dtypes = dict(A='int32', B='uint64', C='uint8', D='int64') if isinstance(dtype, compat.string_types): - dtypes = dict((k, dtype) for k, v in dtypes.items()) + dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index c50aa858a15b5..e81e31b718498 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -234,7 +234,7 @@ def test_itertuples(self): if sys.version >= LooseVersion('2.7'): assert tup2._fields == ('Index', '_1', '_2') - df3 = DataFrame(dict(('f' + str(i), [i]) for i in range(1024))) + df3 = DataFrame({'f' + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) assert not hasattr(tup3, '_fields') diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6ca90d715cb0b..2f947527ce95b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -120,7 +120,7 @@ def _make_mixed_dtypes_df(typ, ad=None): assert(a.dtype == d) if ad is None: ad = dict() - ad.update(dict((d, a) for d, a in zipper)) + ad.update({d: a for d, a in zipper}) return DataFrame(ad) def _check_mixed_dtypes(df, dtypes=None): @@ -349,8 +349,8 @@ def test_constructor_subclass_dict(self): data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} df = DataFrame(data) - refdf = DataFrame(dict((col, dict(compat.iteritems(val))) - for col, val in compat.iteritems(data))) + refdf = DataFrame({col: dict(compat.iteritems(val)) + for col, val in compat.iteritems(data)}) tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(compat.iteritems(data)) @@ -413,8 +413,7 @@ def test_constructor_dict_of_tuples(self): data = {'a': (1, 2, 3), 'b': (4, 5, 6)} result 
= DataFrame(data) - expected = DataFrame(dict((k, list(v)) - for k, v in compat.iteritems(data))) + expected = DataFrame({k: list(v) for k, v in compat.iteritems(data)}) tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_multiindex(self): @@ -447,8 +446,8 @@ def test_constructor_dict_datetime64_index(self): dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] def create_data(constructor): - return dict((i, {constructor(s): 2 * i}) - for i, s in enumerate(dates_as_str)) + return {i: {constructor(s): 2 * i} + for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) @@ -472,8 +471,8 @@ def test_constructor_dict_timedelta64_index(self): td_as_int = [1, 2, 3, 4] def create_data(constructor): - return dict((i, {constructor(s): 2 * i}) - for i, s in enumerate(td_as_int)) + return {i: {constructor(s): 2 * i} + for i, s in enumerate(td_as_int)} data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D')) data_timedelta = create_data(lambda x: timedelta(days=x)) @@ -696,8 +695,8 @@ def test_constructor_mrecarray(self): mrecs = mrecords.fromarrays(data, names=names) # fill the comb - comb = dict((k, v.filled()) if hasattr( - v, 'filled') else (k, v) for k, v in comb) + comb = {k: (v.filled() if hasattr(v, 'filled') else v) + for k, v in comb} expected = DataFrame(comb, columns=names) result = DataFrame(mrecs) @@ -1854,8 +1853,8 @@ def test_from_records_dictlike(self): for dtype, b in compat.iteritems(blocks): columns.extend(b.columns) - asdict = dict((x, y) for x, y in compat.iteritems(df)) - asdict2 = dict((x, y.values) for x, y in compat.iteritems(df)) + asdict = {x: y for x, y in compat.iteritems(df)} + asdict2 = {x: y.values for x, y in compat.iteritems(df)} # dict of series & dict of ndarrays (have dtype info) results = [] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 91a5569b352e9..81153e83471cd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -257,7 +257,7 @@ def test_len(self): assert len(grouped) == len(df) grouped = df.groupby([lambda x: x.year, lambda x: x.month]) - expected = len(set((x.year, x.month) for x in df.index)) + expected = len({(x.year, x.month) for x in df.index}) assert len(grouped) == expected # issue 11016 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 977c639d79711..de0deb442e516 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -238,7 +238,7 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') - results = set(v for v in dir(grp) if not v.startswith('_')) + results = {v for v in dir(grp) if not v.startswith('_')} expected = { 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 506a9e1c64b10..2f8c27f1abb7d 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2178,7 +2178,7 @@ def check(nlevels, with_nulls): if with_nulls: # inject some null values labels[500] = -1 # common nan value - labels = list(labels.copy() for i in range(nlevels)) + labels = [labels.copy() for i in range(nlevels)] for i in range(nlevels): labels[i][500 + i - nlevels // 2] = -1 @@ -2773,7 
+2773,7 @@ def test_groupby(self): # GH5620 groups = self.index.groupby(self.index) - exp = dict((key, [key]) for key in self.index) + exp = {key: [key] for key in self.index} tm.assert_dict_equal(groups, exp) def test_index_name_retained(self): diff --git a/pandas/tests/indexing/test_panel.py b/pandas/tests/indexing/test_panel.py index 4d7768c9e8083..c4f7bd28e4d90 100644 --- a/pandas/tests/indexing/test_panel.py +++ b/pandas/tests/indexing/test_panel.py @@ -119,7 +119,7 @@ def test_panel_getitem(self): df = DataFrame( np.random.randn( len(ind), 5), index=ind, columns=list('ABCDE')) - panel = Panel(dict(('frame_' + c, df) for c in list('ABC'))) + panel = Panel({'frame_' + c: df for c in list('ABC')}) test2 = panel.loc[:, "2002":"2002-12-31"] test1 = panel.loc[:, "2002"] diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 47632b1399991..6553dd66cba5f 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1505,11 +1505,11 @@ def test_repr_html_long_and_wide(self): max_rows = get_option('display.max_rows') h, w = max_rows - 1, max_cols - 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert '...' not in df._repr_html_() h, w = max_rows + 1, max_cols + 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert '...' in df._repr_html_() def test_info_repr(self): @@ -1517,14 +1517,14 @@ def test_info_repr(self): max_cols = get_option('display.max_columns') # Long h, w = max_rows + 1, max_cols - 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_vertically_truncated_repr(df) with option_context('display.large_repr', 'info'): assert has_info_repr(df) # Wide h, w = max_rows - 1, max_cols + 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_horizontally_truncated_repr(df) with option_context('display.large_repr', 'info'): assert has_info_repr(df) @@ -1550,14 +1550,14 @@ def test_info_repr_html(self): max_cols = get_option('display.max_columns') # Long h, w = max_rows + 1, max_cols - 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert r'<class' not in df._repr_html_() with option_context('display.large_repr', 'info'): assert r'<class' in df._repr_html_() # Wide h, w = max_rows - 1, max_cols + 1 - df = DataFrame(dict((k, np.arange(1, 1 + h)) for k in np.arange(w))) + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert ' Date: Fri, 24 Nov 2017 20:03:17 +0000 Subject: [PATCH 17/98] CLN/DEPR: remove pd.ordered_merge (#18459) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/reshape/api.py | 3 +-- pandas/core/reshape/merge.py | 18 +----------------- pandas/tests/api/test_api.py | 2 +- pandas/tests/reshape/test_merge_ordered.py | 9 +-------- 5 files changed, 5 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 92d9123d2cf4c..9716aab69143d 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -95,6 +95,7 @@ Removal of prior version deprecations/changes - The ``levels`` and ``labels`` attributes of a ``MultiIndex`` can no longer be set 
directly (:issue:`4039`). - ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) - ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) +- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) .. _whatsnew_0220.performance: diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 454a3965d74a6..11d69359f5c65 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -3,7 +3,6 @@ from pandas.core.reshape.concat import concat from pandas.core.reshape.melt import melt, lreshape, wide_to_long from pandas.core.reshape.reshape import pivot_simple as pivot, get_dummies -from pandas.core.reshape.merge import ( - merge, ordered_merge, merge_ordered, merge_asof) +from pandas.core.reshape.merge import merge, merge_ordered, merge_asof from pandas.core.reshape.pivot import pivot_table, crosstab from pandas.core.reshape.tile import cut, qcut diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d00aa1003988a..e4b31939250a7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -139,19 +139,6 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, return result, lby -def ordered_merge(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y')): - - warnings.warn("ordered_merge is deprecated and replaced by merge_ordered", - FutureWarning, stacklevel=2) - return merge_ordered(left, right, on=on, - left_on=left_on, right_on=right_on, - left_by=left_by, right_by=right_by, - fill_method=fill_method, suffixes=suffixes) - - def merge_ordered(left, right, on=None, left_on=None, right_on=None, left_by=None, right_by=None, @@ -204,7 +191,7 @@ def merge_ordered(left, right, on=None, 4 c 2 b 5 e 3 b - >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + >>> merge_ordered(A, B, fill_method='ffill', left_by='group') key lvalue group rvalue 0 a 1 a NaN 1 b 1 a 1 @@ -253,9 +240,6 @@ def _merger(x, y): return result -ordered_merge.__doc__ = merge_ordered.__doc__ - - def merge_asof(left, right, on=None, left_on=None, right_on=None, left_index=False, right_index=False, diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index fad455d6391c3..0d1ea1c775aeb 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -103,7 +103,7 @@ class TestPDApi(Base): 'rolling_kurt', 'rolling_max', 'rolling_mean', 'rolling_median', 'rolling_min', 'rolling_quantile', 'rolling_skew', 'rolling_std', 'rolling_sum', - 'rolling_var', 'rolling_window', 'ordered_merge', + 'rolling_var', 'rolling_window', 'pnow', 'match', 'groupby', 'get_store', 'plot_params', 'scatter_matrix'] diff --git a/pandas/tests/reshape/test_merge_ordered.py b/pandas/tests/reshape/test_merge_ordered.py index 9b1806ee52c1d..a4c8793cc0ade 100644 --- a/pandas/tests/reshape/test_merge_ordered.py +++ b/pandas/tests/reshape/test_merge_ordered.py @@ -6,7 +6,7 @@ from numpy import nan -class TestOrderedMerge(object): +class TestMergeOrdered(object): def setup_method(self, method): self.left = DataFrame({'key': ['a', 'c', 'e'], @@ -15,13 +15,6 @@ def setup_method(self, method): self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3., 4]}) - def test_deprecation(self): - - with tm.assert_produces_warning(FutureWarning):
pd.ordered_merge(self.left, self.right, on='key') - - # GH #813 - def test_basic(self): result = merge_ordered(self.left, self.right, on='key') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], From e728f94b1e9f16f4720b0c99d2eec2ff184c0301 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Nov 2017 12:05:55 -0800 Subject: [PATCH 18/98] Remove unused from datetime.pxd, check for fastpath in ensure_datetime64ns (#18453) --- pandas/_libs/src/datetime.pxd | 14 -------------- pandas/_libs/src/numpy_helper.h | 8 -------- pandas/_libs/tslibs/conversion.pyx | 17 ++++++++++------- 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 3fc3625a06634..0624779e50497 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -5,7 +5,6 @@ from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString cdef extern from "numpy/ndarrayobject.h": - ctypedef int64_t npy_timedelta ctypedef int64_t npy_datetime ctypedef enum NPY_CASTING: @@ -15,15 +14,10 @@ cdef extern from "numpy/ndarrayobject.h": NPY_SAME_KIND_CASTING NPY_UNSAFE_CASTING -cdef extern from "numpy_helper.h": - npy_datetime get_datetime64_value(object o) - npy_timedelta get_timedelta64_value(object o) - cdef extern from "numpy/npy_common.h": ctypedef unsigned char npy_bool cdef extern from "datetime/np_datetime.h": - ctypedef enum PANDAS_DATETIMEUNIT: PANDAS_FR_Y PANDAS_FR_M @@ -44,20 +38,12 @@ cdef extern from "datetime/np_datetime.h": npy_int64 year npy_int32 month, day, hour, min, sec, us, ps, as - npy_datetime pandas_datetimestruct_to_datetime( - PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) nogil - void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil - int days_per_month_table[2][12] - int dayofweek(int y, int m, int d) nogil - int is_leapyear(int64_t year) nogil - PANDAS_DATETIMEUNIT get_datetime64_unit(object o) cdef extern from "datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, NPY_CASTING casting, pandas_datetimestruct *out, diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index ad683459ad878..8a9a05723d9fe 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -18,14 +18,6 @@ The full license is in the LICENSE file, distributed with this software. 
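As a migration aid for the ``pd.ordered_merge`` removal in PATCH 17 above: the replacement is a pure rename, so only the call site changes. A minimal sketch with hypothetical frames (the printed output is indicative):

    >>> import pandas as pd
    >>> left = pd.DataFrame({'key': ['a', 'c'], 'lvalue': [1, 2]})
    >>> right = pd.DataFrame({'key': ['b', 'c'], 'rvalue': [1, 2]})
    >>> pd.merge_ordered(left, right, on='key')   # was: pd.ordered_merge(...)
      key  lvalue  rvalue
    0   a     1.0     NaN
    1   b     NaN     1.0
    2   c     2.0     2.0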
PANDAS_INLINE npy_int64 get_nat(void) { return NPY_MIN_INT64; } -PANDAS_INLINE npy_datetime get_datetime64_value(PyObject* obj) { - return ((PyDatetimeScalarObject*)obj)->obval; -} - -PANDAS_INLINE npy_timedelta get_timedelta64_value(PyObject* obj) { - return ((PyTimedeltaScalarObject*)obj)->obval; -} - PANDAS_INLINE int is_integer_object(PyObject* obj) { return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 16e88bcaeea3e..f58ad0a86d106 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -104,13 +104,16 @@ def ensure_datetime64ns(ndarray arr): return result unit = get_datetime64_unit(arr.flat[0]) - for i in range(n): - if ivalues[i] != NPY_NAT: - pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) - iresult[i] = dtstruct_to_dt64(&dts) - check_dts_bounds(&dts) - else: - iresult[i] = NPY_NAT + if unit == PANDAS_FR_ns: + result = arr + else: + for i in range(n): + if ivalues[i] != NPY_NAT: + pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) + iresult[i] = dtstruct_to_dt64(&dts) + check_dts_bounds(&dts) + else: + iresult[i] = NPY_NAT return result From aaee541b538559f8887881ab23d2734dddd920d3 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 24 Nov 2017 13:18:03 -0700 Subject: [PATCH 19/98] Change UInt64Index._na_value from 0 to np.nan (#18401) --- doc/source/whatsnew/v0.22.0.txt | 3 +- pandas/core/indexes/base.py | 25 ++++++++------ pandas/core/indexes/numeric.py | 1 - pandas/tests/indexes/common.py | 28 +++++----------- pandas/tests/indexes/period/test_period.py | 23 ++++--------- pandas/tests/indexes/test_base.py | 17 +++++++++- pandas/tests/indexes/test_category.py | 27 +++++---------- pandas/tests/indexes/test_interval.py | 19 +++++------ pandas/tests/indexes/test_numeric.py | 39 ++++++++-------------- pandas/tests/indexes/test_range.py | 27 +-------------- 10 files changed, 79 insertions(+), 130 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 9716aab69143d..4bdff1355874e 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -52,6 +52,7 @@ Backwards incompatible API changes - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) +- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - @@ -129,7 +130,7 @@ Bug Fixes Conversion ^^^^^^^^^^ -- +- Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1cb40b3ecf255..af9e29a84b472 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -251,7 +251,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. 
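The two whatsnew entries for this patch describe user-visible behaviour; a small sketch of both follows (values and reprs are illustrative, derived from the tests added below):

    >>> import pandas as pd
    >>> idx = pd.Index([0., 1., 2., 3.], dtype='uint64')  # int-like floats now coerce
    >>> idx
    UInt64Index([0, 1, 2, 3], dtype='uint64')
    >>> idx.where([True, False, True, True])  # the NA value is now NaN, so the result is float
    Float64Index([0.0, nan, 2.0, 3.0], dtype='float64')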
try: return cls._try_convert_to_int_index( - data, copy, name) + data, copy, name, dtype) except ValueError: pass @@ -307,7 +307,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if inferred == 'integer': try: return cls._try_convert_to_int_index( - subarr, copy, name) + subarr, copy, name, dtype) except ValueError: pass @@ -664,7 +664,7 @@ def ravel(self, order='C'): # construction helpers @classmethod - def _try_convert_to_int_index(cls, data, copy, name): + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ Attempt to convert an array of data into an integer index. @@ -685,15 +685,18 @@ def _try_convert_to_int_index(cls, data, copy, name): """ from .numeric import Int64Index, UInt64Index - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - # Conversion to int64 failed (possibly due to - # overflow), so let's try now with uint64. + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. try: res = data.astype('u8', copy=False) if (res == data).all(): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ae6a810ece510..fddbb2de83dca 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -216,7 +216,6 @@ class UInt64Index(NumericIndex): _inner_indexer = libjoin.inner_join_indexer_uint64 _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False - _na_value = 0 _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 81360bc0c13f9..43b20f420eb48 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -9,8 +9,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, IntervalIndex, - notna, isna) + TimedeltaIndex, PeriodIndex, IntervalIndex, isna) from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.dtypes.common import needs_i8_conversion @@ -529,31 +528,20 @@ def test_numpy_repeat(self): tm.assert_raises_regex(ValueError, msg, np.repeat, i, rep, axis=0) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + + cond = [True] * len(i) + result = i.where(klass(cond)) expected = i tm.assert_index_equal(result, expected) - _nan = i._na_value cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - result = i.where(cond) + expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - for klass in
klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 7fefcc859d447..52558c27ce707 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -61,27 +61,18 @@ def test_pickle_round_trip(self): result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, Series] - expected = pd.PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = PeriodIndex([NaT] + i[1:].tolist(), freq='D') + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_where_other(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c55f53601848c..99a99cc5cc3eb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta import pandas.util.testing as tm +from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.indexes.api import Index, MultiIndex from pandas.tests.indexes.common import Base @@ -14,7 +15,7 @@ import numpy as np from pandas import (period_range, date_range, Series, - DataFrame, Float64Index, Int64Index, + DataFrame, Float64Index, Int64Index, UInt64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, isna) from pandas.core.index import _get_combined_index, _ensure_index_from_sequences @@ -201,6 +202,20 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('dtype', [ + int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32', + 'uint16', 'uint8']) + def test_constructor_int_dtype_float(self, dtype): + # GH 18400 + if is_unsigned_integer_dtype(dtype): + index_type = UInt64Index + else: + index_type = Int64Index + + expected = index_type([0, 1, 2, 3]) + result = Index([0., 1., 2., 3.], dtype=dtype) + tm.assert_index_equal(result, expected) + def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5e40e06d57413..5e6898f9c8711 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, IntervalIndex, compat, notna +from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -269,28 +269,19 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def 
test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), - categories=i.categories) - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.CategoricalIndex([np.nan] + i[1:].tolist(), - categories=i.categories) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), + categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_append(self): diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index b98359ea0ec4d..7d6f544f6d533 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -348,20 +348,19 @@ def test_astype(self, closed): expected = pd.Categorical(idx, ordered=True) tm.assert_categorical_equal(result, expected) - def test_where(self, closed): - expected = self.create_index(closed=closed) - result = expected.where(expected.notna()) + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def test_where(self, closed, klass): + idx = self.create_index(closed=closed) + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) tm.assert_index_equal(result, expected) - idx = IntervalIndex.from_breaks([1, 2], closed=closed) - result = idx.where([True, False]) - expected = IntervalIndex.from_intervals( - [Interval(1.0, 2.0, closed=closed), np.nan]) + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - pass - def test_delete(self, closed): expected = IntervalIndex.from_breaks([1, 2], closed=closed) result = self.create_index(closed=closed).delete(0) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a96c677852339..030d688f510b0 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -7,7 +7,7 @@ import numpy as np -from pandas import (date_range, notna, Series, Index, Float64Index, +from pandas import (date_range, Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex) import pandas.util.testing as tm @@ -175,6 +175,19 @@ def test_modulo(self): expected = Index(index.values % 2) tm.assert_index_equal(index % 2, expected) + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): + i = self.create_index() + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = Float64Index([i._na_value] + i[1:].tolist()) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + class TestFloat64Index(Numeric): _holder = Float64Index @@ -726,31 +738,20 @@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) assert isinstance(arr, Index) - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist())
- - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7d88b547746f6..b4d1c3760f25a 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -10,7 +10,7 @@ import numpy as np -from pandas import (isna, notna, Series, Index, Float64Index, +from pandas import (isna, Series, Index, Float64Index, Int64Index, RangeIndex) import pandas.util.testing as tm @@ -934,31 +934,6 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) assert len(i) == 0 - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist()) - - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_append(self): # GH16212 RI = RangeIndex From 412988ed972a929203b2e867c8fdc7fcb0e7d312 Mon Sep 17 00:00:00 2001 From: Vince W Date: Fri, 24 Nov 2017 16:17:02 -0600 Subject: [PATCH 20/98] Update clipboard Qt-bindings for flexibility and Python3 compatibility (#17723) --- ci/requirements-3.6_BUILD_TEST.sh | 2 +- doc/source/install.rst | 13 +++++++------ doc/source/io.rst | 2 +- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/io/clipboard/__init__.py | 25 ++++++++++++++++++++----- pandas/io/clipboard/clipboards.py | 15 ++++++++++++--- 6 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ci/requirements-3.6_BUILD_TEST.sh b/ci/requirements-3.6_BUILD_TEST.sh index 84dd27c50d587..2a3adeff836ee 100644 --- a/ci/requirements-3.6_BUILD_TEST.sh +++ b/ci/requirements-3.6_BUILD_TEST.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 36 BUILD_TEST" -conda install -n pandas -c conda-forge pyarrow dask +conda install -n pandas -c conda-forge pyarrow dask pyqt qtpy diff --git a/doc/source/install.rst b/doc/source/install.rst index c805f84d0faaa..b8968e18aecb0 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -251,12 +251,13 @@ Optional Dependencies * `Jinja2 `__: Template engine for conditional HTML formatting. * `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). * `blosc `__: for msgpack compression using ``blosc`` -* One of `PyQt4 - `__, `PySide - `__, `pygtk - `__, `xsel - `__, or `xclip - `__: necessary to use +* One of + `qtpy `__ (requires PyQt or PySide), + `PyQt5 `__, + `PyQt4 `__, + `pygtk `__, + `xsel `__, or + `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
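The dependency bullet above lists several interchangeable Qt providers, which the patch probes in order of preference. A standalone sketch of that import-fallback idiom (the module names are real; the ``binding`` variable is illustrative and not part of the patch):

    # Probe for a usable Qt binding: qtpy first, then PyQt5, then PyQt4.
    binding = None
    try:
        import qtpy  # noqa: F401  # preferred: abstracts over PyQt/PySide
        binding = 'qtpy'
    except ImportError:
        try:
            import PyQt5  # noqa: F401
            binding = 'PyQt5'
        except ImportError:
            try:
                import PyQt4  # noqa: F401
                binding = 'PyQt4'
            except ImportError:
                pass  # no Qt binding available; fall back to xclip/xsel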
* For Google BigQuery I/O - see `here `__ diff --git a/doc/source/io.rst b/doc/source/io.rst index c94d5bc75d4fc..5390fc3399e23 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3053,7 +3053,7 @@ We can see that we got the same content back, which we had earlier written to th .. note:: - You may need to install xclip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. + You may need to install xclip or xsel (with gtk, PyQt5, PyQt4 or qtpy) on Linux to use these methods. .. _io.pickle: diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4bdff1355874e..5549ba4e8f735 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -44,6 +44,7 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 4066a3be5e850..37d398f20ef41 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -18,7 +18,8 @@ On Linux, install xclip or xsel via package manager. For example, in Debian: sudo apt-get install xclip -Otherwise on Linux, you will need the gtk or PyQt4 modules installed. +Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed. +qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2 gtk and PyQt4 modules are not available for Python 3, and this module does not work with PyGObject yet. @@ -34,9 +35,9 @@ init_klipper_clipboard, init_no_clipboard) from .windows import init_windows_clipboard -# `import PyQt4` sys.exit()s if DISPLAY is not in the environment. +# `import qtpy` sys.exit()s if DISPLAY is not in the environment. # Thus, we need to detect the presence of $DISPLAY manually -# and not load PyQt4 if it is absent. +# and not load qtpy if it is absent. HAS_DISPLAY = os.getenv("DISPLAY", False) CHECK_CMD = "where" if platform.system() == "Windows" else "which" @@ -68,9 +69,23 @@ def determine_clipboard(): return init_gtk_clipboard() try: - # Check if PyQt4 is installed - import PyQt4 # noqa + # qtpy is a small abstraction layer that lets you write + # applications using a single api call to either PyQt or PySide + # https://pypi.python.org/pypi/QtPy + import qtpy # noqa except ImportError: + # If qtpy isn't installed, fall back on importing PyQt5, or PyQt4 + try: + import PyQt5 # noqa + except ImportError: + try: + import PyQt4 # noqa + except ImportError: + pass # fail fast for all non-ImportError exceptions.
+ else: + return init_qt_clipboard() + else: + return init_qt_clipboard() pass else: return init_qt_clipboard() diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index e32380a383374..285d93e3ca497 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -46,10 +46,19 @@ def paste_gtk(): def init_qt_clipboard(): # $DISPLAY should exist - from PyQt4.QtGui import QApplication - # use the global instance if it exists - app = QApplication.instance() or QApplication([]) + # Try to import from qtpy, but if that fails try PyQt5 then PyQt4 + try: + from qtpy.QtWidgets import QApplication + except ImportError: + try: + from PyQt5.QtWidgets import QApplication + except ImportError: + from PyQt4.QtGui import QApplication + + app = QApplication.instance() + if app is None: + app = QApplication([]) def copy_qt(text): cb = app.clipboard() From 467ee2bac1a410fdf7e33ad673123062d74bc111 Mon Sep 17 00:00:00 2001 From: Nate Yoder Date: Sat, 25 Nov 2017 06:24:15 -0800 Subject: [PATCH 21/98] Allow indices to be mapped through dictionaries or series (#15081) --- asv_bench/benchmarks/series_methods.py | 24 ++++++ doc/source/whatsnew/v0.22.0.txt | 2 + pandas/core/base.py | 77 ++++++++++++++++- pandas/core/dtypes/cast.py | 35 ++++++++ pandas/core/dtypes/missing.py | 7 +- pandas/core/indexes/base.py | 53 +++++++++--- pandas/core/indexes/datetimelike.py | 12 ++- pandas/core/indexes/datetimes.py | 11 +++ pandas/core/series.py | 83 +++---------------- pandas/tests/indexes/common.py | 27 ++++++ pandas/tests/indexes/datetimelike.py | 40 ++++++++- pandas/tests/indexes/period/test_period.py | 6 +- pandas/tests/indexes/test_base.py | 60 ++++++++++++++ pandas/tests/indexes/test_category.py | 18 ++++ pandas/tests/indexes/test_interval.py | 4 + .../indexes/timedeltas/test_timedelta.py | 1 + pandas/tests/indexing/test_categorical.py | 18 ++++ pandas/tests/series/test_apply.py | 1 + 18 files changed, 386 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3c0e2869357ae..5e8cf3a0350bb 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -123,6 +123,30 @@ def time_series_dropna_datetime(self): self.s.dropna() +class series_map_dict(object): + goal_time = 0.2 + + def setup(self): + map_size = 1000 + self.s = Series(np.random.randint(0, map_size, 10000)) + self.map_dict = {i: map_size - i for i in range(map_size)} + + def time_series_map_dict(self): + self.s.map(self.map_dict) + + +class series_map_series(object): + goal_time = 0.2 + + def setup(self): + map_size = 1000 + self.s = Series(np.random.randint(0, map_size, 10000)) + self.map_series = Series(map_size - np.arange(map_size)) + + def time_series_map_series(self): + self.s.map(self.map_series) + + class series_clip(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 5549ba4e8f735..f97b958d553e0 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -75,6 +75,7 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) -
:func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`). - :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) @@ -108,6 +109,7 @@ Performance Improvements - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - :class:`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) +- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) - The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 90fe350848bf7..cce0f384cb983 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -12,11 +12,12 @@ is_object_dtype, is_list_like, is_scalar, - is_datetimelike) + is_datetimelike, + is_extension_type) from pandas.util._validators import validate_bool_kwarg -from pandas.core import common as com +from pandas.core import common as com, algorithms import pandas.core.nanops as nanops import pandas._libs.lib as lib from pandas.compat.numpy import function as nv @@ -838,6 +839,78 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, klass=self.__class__.__name__, op=name)) return func(**kwds) + def _map_values(self, mapper, na_action=None): + """An internal function that maps values using the input + correspondence (which can be a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + The input correspondence object + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function + + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + + """ + + # we can fastpath dict/Series to an efficient map + # as we know that we are not going to have to yield + # python types + if isinstance(mapper, dict): + if hasattr(mapper, '__missing__'): + # If a dictionary subclass defines a default value method, + # convert mapper to a lookup function (GH #15999). + dict_with_default = mapper + mapper = lambda x: dict_with_default[x] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to a Series for efficiency.
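Aside: the dict branch above feeds into the ``Series`` fastpath that follows, which gathers values positionally instead of calling a Python function per element. A standalone sketch of that idea (inputs are hypothetical):

    import numpy as np
    import pandas as pd
    from pandas.core import algorithms

    mapping = pd.Series({'cat': 1, 'dog': 2})     # the dict, converted to a Series
    values = np.array(['dog', 'cat', 'bird'], dtype=object)
    indexer = mapping.index.get_indexer(values)   # array([ 1,  0, -1]); -1 marks a miss
    mapped = algorithms.take_1d(mapping._values, indexer)  # array([ 2.,  1., nan])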
+ # we specify the keys here to handle the + # possibility that they are tuples + from pandas import Series + mapper = Series(mapper, index=mapper.keys()) + + if isinstance(mapper, ABCSeries): + # Since values were input this means we came from either + # a dict or a series and mapper should be an index + if is_extension_type(self.dtype): + values = self._values + else: + values = self.values + + indexer = mapper.index.get_indexer(values) + new_values = algorithms.take_1d(mapper._values, indexer) + + return new_values + + # we must convert to python types + if is_extension_type(self.dtype): + values = self._values + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) + else: + values = self.astype(object) + values = getattr(values, 'values', values) + if na_action == 'ignore': + def map_f(values, f): + return lib.map_infer_mask(values, f, + isna(values).view(np.uint8)) + else: + map_f = lib.map_infer + + # mapper is a function + new_values = map_f(values, mapper) + + return new_values + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): """ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bc8aacfe90170..a97b84ab9cc5b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1127,3 +1127,38 @@ def cast_scalar_to_array(shape, value, dtype=None): values.fill(fill_value) return values + + +def construct_1d_arraylike_from_scalar(value, length, dtype): + """ + create a np.ndarray / pandas type of specified shape and dtype + filled with values + + Parameters + ---------- + value : scalar value + length : int + dtype : pandas_dtype / np.dtype + + Returns + ------- + np.ndarray / pandas type of length, filled with value + + """ + if is_datetimetz(dtype): + from pandas import DatetimeIndex + subarr = DatetimeIndex([value] * length, dtype=dtype) + elif is_categorical_dtype(dtype): + from pandas import Categorical + subarr = Categorical([value] * length) + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + + # coerce if we have nan for an integer dtype + if is_integer_dtype(dtype) and isna(value): + dtype = np.float64 + subarr = np.empty(length, dtype=dtype) + subarr.fill(value) + + return subarr diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7cae536c5edd9..ce57b544d9d66 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -369,13 +369,14 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype): +def na_value_for_dtype(dtype, compat=True): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype + compat : boolean, default True Returns ------- @@ -389,7 +390,9 @@ def na_value_for_dtype(dtype): elif is_float_dtype(dtype): return np.nan elif is_integer_dtype(dtype): - return 0 + if compat: + return 0 + return np.nan elif is_bool_dtype(dtype): return False return np.nan diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index af9e29a84b472..8a751f0204b60 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,7 +13,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.core.dtypes.generic import ( ABCSeries, ABCMultiIndex, @@ -2827,6 +2826,27 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer + _index_shared_docs['_get_values_from_dict'] = """ + Return the values of the input 
dictionary in the order the keys are
+        in the index. np.nan is returned for index values not in the
+        dictionary.
+
+        Parameters
+        ----------
+        data : dict
+            The dictionary from which to extract the values
+
+        Returns
+        -------
+        np.array
+
+        """
+
+    @Appender(_index_shared_docs['_get_values_from_dict'])
+    def _get_values_from_dict(self, data):
+        return lib.fast_multiget(data, self.values,
+                                 default=np.nan)
+
     def _maybe_promote(self, other):
         # A hack, but it works
         from pandas.core.indexes.datetimes import DatetimeIndex
@@ -2865,13 +2885,15 @@ def groupby(self, values):
 
         return result
 
-    def map(self, mapper):
-        """Apply mapper function to an index.
+    def map(self, mapper, na_action=None):
+        """Map values using input correspondence
 
         Parameters
        ----------
-        mapper : callable
-            Function to be applied.
+        mapper : function, dict, or Series
+        na_action : {None, 'ignore'}
+            If 'ignore', propagate NA values, without passing them to the
+            mapping function
 
         Returns
         -------
@@ -2881,15 +2903,26 @@ def map(self, mapper):
             a MultiIndex will be returned.
         """
+        from .multi import MultiIndex
 
-        mapped_values = self._arrmap(self.values, mapper)
+        new_values = super(Index, self)._map_values(
+            mapper, na_action=na_action)
+
         attributes = self._get_attributes_dict()
-        if mapped_values.size and isinstance(mapped_values[0], tuple):
-            return MultiIndex.from_tuples(mapped_values,
-                                          names=attributes.get('name'))
+        if new_values.size and isinstance(new_values[0], tuple):
+            if isinstance(self, MultiIndex):
+                names = self.names
+            elif attributes.get('name'):
+                names = [attributes.get('name')] * len(new_values[0])
+            else:
+                names = None
+            return MultiIndex.from_tuples(new_values,
+                                          names=names)
+
         attributes['copy'] = False
-        return Index(mapped_values, **attributes)
+
+        # we infer the result types based on the
+        # returned values
+        return Index(new_values, **attributes)
 
     def isin(self, values, level=None):
         """
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 4934ccb49b844..5643d886a4fec 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -136,7 +136,7 @@ def equals(self, other):
         elif not isinstance(other, type(self)):
             try:
                 other = type(self)(other)
-            except:
+            except Exception:
                 return False
 
         if not is_dtype_equal(self.dtype, other.dtype):
@@ -352,7 +352,7 @@ def map(self, f):
 
             # Try to use this result if we can
             if isinstance(result, np.ndarray):
-                self._shallow_copy(result)
+                result = Index(result)
 
             if not isinstance(result, Index):
                 raise TypeError('The map function must return an Index object')
@@ -698,6 +698,14 @@ def __rsub__(self, other):
     def _add_delta(self, other):
         return NotImplemented
 
+    @Appender(_index_shared_docs['_get_values_from_dict'])
+    def _get_values_from_dict(self, data):
+        if len(data):
+            return np.array([data.get(i, np.nan)
+                             for i in self.asobject.values])
+
+        return np.array([np.nan])
+
     def _add_delta_td(self, other):
         # add a delta of a timedeltalike
         # return the i8 result view
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 111ba0c92aa9b..e1def38289243 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -1457,6 +1457,17 @@ def get_value_maybe_box(self, series, key):
                                                          key, tz=self.tz)
         return _maybe_box(self, values, series, key)
 
+    @Appender(_index_shared_docs['_get_values_from_dict'])
+    def _get_values_from_dict(self, data):
+        if len(data):
+            # coerce back to datetime objects for lookup
+            data = com._dict_compat(data)
+            return
lib.fast_multiget(data, + self.asobject.values, + default=np.nan) + + return np.array([np.nan]) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/series.py b/pandas/core/series.py index d7833526c0408..bff7c21ad69b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -18,7 +18,7 @@ is_bool, is_integer, is_integer_dtype, is_float_dtype, - is_extension_type, is_datetimetz, + is_extension_type, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -34,7 +34,8 @@ from pandas.core.dtypes.cast import ( maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, - maybe_cast_to_datetime, maybe_castable) + maybe_cast_to_datetime, maybe_castable, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike from pandas.core.common import (is_bool_indexer, @@ -45,7 +46,6 @@ _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, - _dict_compat, standardize_mapping, _any_none) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -203,23 +203,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, index = Index(data) else: index = Index(_try_sort(data)) + try: - if isinstance(index, DatetimeIndex): - if len(data): - # coerce back to datetime objects for lookup - data = _dict_compat(data) - data = lib.fast_multiget(data, - index.asobject.values, - default=np.nan) - else: - data = np.nan - # GH #12169 - elif isinstance(index, (PeriodIndex, TimedeltaIndex)): - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - else: - data = lib.fast_multiget(data, index.values, - default=np.nan) + data = index._get_values_from_dict(data) except TypeError: data = ([data.get(i, np.nan) for i in index] if data else np.nan) @@ -2338,41 +2324,8 @@ def map(self, arg, na_action=None): 3 0 dtype: int64 """ - - if is_extension_type(self.dtype): - values = self._values - if na_action is not None: - raise NotImplementedError - map_f = lambda values, f: values.map(f) - else: - values = self.asobject - - if na_action == 'ignore': - def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) - else: - map_f = lib.map_infer - - if isinstance(arg, dict): - if hasattr(arg, '__missing__'): - # If a dictionary subclass defines a default value method, - # convert arg to a lookup function (GH #15999). - dict_with_default = arg - arg = lambda x: dict_with_default[x] - else: - # Dictionary does not have a default. Thus it's safe to - # convert to an indexed series for efficiency. 
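For contrast with the ``__missing__`` branch retained in ``_map_values`` above, dict subclasses that define a default are applied as functions rather than converted to a ``Series``, so their fallback value wins over the usual NaN fill (GH 15999). A small sketch of the behavior the refactor carries over (hypothetical data):

```python
from collections import defaultdict

import pandas as pd

d = defaultdict(lambda: 'blank')
d[1] = 'stuff'

# defaultdict defines __missing__, so map() applies it like a function
# and unmapped keys receive the default value instead of NaN
pd.Series([1, 2, 3]).map(d)
# 0    stuff
# 1    blank
# 2    blank
# dtype: object
```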
- arg = self._constructor(arg, index=arg.keys()) - - if isinstance(arg, Series): - # arg is a Series - indexer = arg.index.get_indexer(values) - new_values = algorithms.take_1d(arg._values, indexer) - else: - # arg is a function - new_values = map_f(values, arg) - + new_values = super(Series, self)._map_values( + arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) @@ -3248,21 +3201,6 @@ def _try_cast(arr, take_fast_path): else: subarr = _try_cast(data, False) - def create_from_value(value, index, dtype): - # return a new empty value suitable for the dtype - - if is_datetimetz(dtype): - subarr = DatetimeIndex([value] * len(index), dtype=dtype) - elif is_categorical_dtype(dtype): - subarr = Categorical([value] * len(index)) - else: - if not isinstance(dtype, (np.dtype, type(np.dtype))): - dtype = dtype.dtype - subarr = np.empty(len(index), dtype=dtype) - subarr.fill(value) - - return subarr - # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover @@ -3277,7 +3215,8 @@ def create_from_value(value, index, dtype): # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) - subarr = create_from_value(value, index, dtype) + subarr = construct_1d_arraylike_from_scalar( + value, len(index), dtype) else: return subarr.item() @@ -3288,8 +3227,8 @@ def create_from_value(value, index, dtype): # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: - subarr = create_from_value(subarr[0], index, - subarr.dtype) + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 43b20f420eb48..ee6434431bcfc 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1005,3 +1005,30 @@ def test_searchsorted_monotonic(self, indices): # non-monotonic should raise. 
with pytest.raises(ValueError): indices._searchsorted_monotonic(value, side='left') + + def test_map(self): + index = self.create_index() + + # From output of UInt64Index mapping can't infer that we + # shouldn't default to Int64 + if isinstance(index, UInt64Index): + expected = Index(index.values.tolist()) + else: + expected = index + + tm.assert_index_equal(index.map(lambda x: x), expected) + + identity_dict = {x: x for x in index} + tm.assert_index_equal(index.map(identity_dict), expected) + + # Use values to work around MultiIndex instantiation of series + identity_series = Series(expected.values, index=index) + tm.assert_index_equal(index.map(identity_series), expected) + + # empty mappable + nan_index = pd.Index([np.nan] * len(index)) + series_map = pd.Series() + tm.assert_index_equal(index.map(series_map), nan_index) + + dict_map = {} + tm.assert_index_equal(index.map(dict_map), nan_index) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 12b509d4aef3f..839fccc1441e5 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,5 +1,7 @@ """ generic datetimelike tests """ - +import pytest +import pandas as pd +import numpy as np from .common import Base import pandas.util.testing as tm @@ -38,3 +40,39 @@ def test_view(self, indices): i_view = i.view(self._holder) result = self._holder(i) tm.assert_index_equal(result, i_view) + + def test_map_callable(self): + + expected = self.index + 1 + result = self.index.map(lambda x: x + 1) + tm.assert_index_equal(result, expected) + + # map to NaT + result = self.index.map(lambda x: pd.NaT if x == self.index[0] else x) + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): + expected = self.index + 1 + + # don't compare the freqs + if isinstance(expected, pd.DatetimeIndex): + expected.freq = None + + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) + + expected = pd.Index([pd.NaT] + self.index[1:].tolist()) + result = self.index.map(mapper(expected, self.index)) + tm.assert_index_equal(result, expected) + + # empty map; these map to np.nan because we cannot know + # to re-infer things + expected = pd.Index([np.nan] * len(self.index)) + result = self.index.map(mapper([], [])) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 52558c27ce707..9d5746e07814e 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -683,11 +683,9 @@ def test_pickle_freq(self): assert new_prng.freqstr == 'M' def test_map(self): - index = PeriodIndex([2005, 2007, 2009], freq='A') - result = index.map(lambda x: x + 1) - expected = index + 1 - tm.assert_index_equal(result, expected) + # test_map_dictlike generally tests + index = PeriodIndex([2005, 2007, 2009], freq='A') result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 99a99cc5cc3eb..f5016e6d19a57 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -4,6 +4,8 @@ from datetime import datetime, timedelta +from collections 
import defaultdict + import pandas.util.testing as tm from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.indexes.api import Index, MultiIndex @@ -844,6 +846,64 @@ def test_map_tseries_indices_return_index(self): exp = Index(range(24), name='hourly') tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) + def test_map_with_dict_and_series(self): + # GH 12756 + expected = Index(['foo', 'bar', 'baz']) + mapper = Series(expected.values, index=[0, 1, 2]) + result = tm.makeIntIndex(3).map(mapper) + tm.assert_index_equal(result, expected) + + for name in self.indices.keys(): + if name == 'catIndex': + # Tested in test_categorical + continue + elif name == 'repeats': + # Cannot map duplicated index + continue + + cur_index = self.indices[name] + expected = Index(np.arange(len(cur_index), 0, -1)) + mapper = pd.Series(expected, index=cur_index) + result = cur_index.map(mapper) + + tm.assert_index_equal(result, expected) + + # If the mapper is empty the expected index type is Int64Index + # but the output defaults to Float64 so I treat it independently + mapper = {o: n for o, n in + zip(cur_index, expected)} + + result = cur_index.map(mapper) + if not mapper: + expected = Float64Index([]) + tm.assert_index_equal(result, expected) + + def test_map_with_non_function_missing_values(self): + # GH 12756 + expected = Index([2., np.nan, 'foo']) + input = Index([2, 1, 0]) + + mapper = Series(['foo', 2., 'baz'], index=[0, 2, -1]) + tm.assert_index_equal(expected, input.map(mapper)) + + mapper = {0: 'foo', 2: 2.0, -1: 'baz'} + tm.assert_index_equal(expected, input.map(mapper)) + + def test_map_na_exclusion(self): + idx = Index([1.5, np.nan, 3, np.nan, 5]) + + result = idx.map(lambda x: x * 2, na_action='ignore') + exp = idx * 2 + tm.assert_index_equal(result, exp) + + def test_map_defaultdict(self): + idx = Index([1, 2, 3]) + default_dict = defaultdict(lambda: 'blank') + default_dict[1] = 'stuff' + result = idx.map(default_dict) + expected = Index(['stuff', 'blank', 'blank']) + tm.assert_index_equal(result, expected) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5e6898f9c8711..92d5a53f6570b 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -269,6 +269,24 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) + result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) + tm.assert_index_equal(result, exp) + + result = ci.map({'A': 10, 'B': 20, 'C': 30}) + tm.assert_index_equal(result, exp) + + def test_map_with_categorical_series(self): + # GH 12756 + a = pd.Index([1, 2, 3, 4]) + b = pd.Series(["even", "odd", "even", "odd"], + dtype="category") + c = pd.Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = pd.Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 7d6f544f6d533..b17d241ff50e0 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -530,6 +530,10 @@ def test_repr_max_seq_item_setting(self): def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() + 
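Taken together, these tests pin down the new ``Index.map`` contract from GH 12756: dict and ``Series`` mappers are accepted wherever a callable was, with unmatched values becoming NaN. A quick sketch (hypothetical values; assumes a build with this patch series applied):

```python
import pandas as pd

idx = pd.Index([0, 1, 2])

# Series mapper: each index value is looked up by label
idx.map(pd.Series(['foo', 'bar', 'baz'], index=[0, 1, 2]))
# Index(['foo', 'bar', 'baz'], dtype='object')

# dict mapper: values absent from the dict map to NaN
idx.map({0: 'foo', 2: 'baz'})
# Index(['foo', nan, 'baz'], dtype='object')
```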
@pytest.mark.xfail(reason='get_indexer behavior does not currently work') + def test_map(self): + super(TestIntervalIndex, self).test_map() + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 533b06088f1bf..e25384ebf7d62 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -187,6 +187,7 @@ def test_misc_coverage(self): assert not idx.equals(list(non_td)) def test_map(self): + # test_map_dictlike generally tests rng = timedelta_range('1 day', periods=10) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 2c93d2afd1760..22b3fd9073bab 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -439,3 +439,21 @@ def test_indexing_with_category(self): res = (cat[['A']] == 'foo') tm.assert_frame_equal(res, exp) + + def test_map_with_dict_or_series(self): + orig_values = ['a', 'B', 1, 'a'] + new_values = ['one', 2, 3.0, 'one'] + cur_index = pd.CategoricalIndex(orig_values, name='XXX') + expected = pd.CategoricalIndex(new_values, + name='XXX', categories=[3.0, 2, 'one']) + + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + mapper = {o: n for o, n in + zip(orig_values[:-1], new_values[:-1])} + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index d0693984689a6..fe21ba569ae99 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -424,6 +424,7 @@ def test_map_dict_with_tuple_keys(self): """ df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df['labels'] = df['a'].map(label_mappings) df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) # All labels should be filled now From 5cd4cb23c92f69d5db2a5f6ad6f1a9db01d5073b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 Nov 2017 06:31:38 -0800 Subject: [PATCH 22/98] CLN: ASV ctors benchmark (#18479) --- asv_bench/benchmarks/ctors.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index b5694a3a21502..2c9c382e2db86 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,24 +1,28 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp class Constructors(object): + goal_time = 0.2 def setup(self): - self.arr = np.random.randn(100, 100) + N = 10**2 + np.random.seed(1234) + self.arr = np.random.randn(N, N) self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - self.data = np.random.randn(100) - self.index = Index(np.arange(100)) + self.data = np.random.randn(N) + self.index = Index(np.arange(N)) - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')] * 1000)) + self.s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * N * 10) def time_frame_from_ndarray(self): DataFrame(self.arr) def 
time_series_from_ndarray(self): - pd.Series(self.data, index=self.index) + Series(self.data, index=self.index) def time_index_from_array_string(self): Index(self.arr_str) @@ -26,5 +30,5 @@ def time_index_from_array_string(self): def time_dtindex_from_series(self): DatetimeIndex(self.s) - def time_dtindex_from_series2(self): + def time_dtindex_from_index_with_series(self): Index(self.s) From 200227e456b0aae82248a057f923ae9090b295c9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 Nov 2017 06:36:59 -0800 Subject: [PATCH 23/98] CLN: ASV categoricals benchmark (#18465) --- asv_bench/benchmarks/categoricals.py | 133 +++++++++++++++++---------- 1 file changed, 82 insertions(+), 51 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a5bb5e790dec1..df41a2afad1f8 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,6 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd +import pandas.util.testing as tm try: from pandas.api.types import union_categoricals except ImportError: @@ -8,107 +10,136 @@ pass -class Categoricals(object): +class Concat(object): + goal_time = 0.2 def setup(self): - N = 100000 - self.s = pd.Series((list('aabbcd') * N)).astype('category') + N = 10**5 + self.s = pd.Series(list('aabbcd') * N).astype('category') + + self.a = pd.Categorical(list('aabbcd') * N) + self.b = pd.Categorical(list('bbcdjk') * N) + + def time_concat(self): + pd.concat([self.s, self.s]) + + def time_union(self): + union_categoricals([self.a, self.b]) + - self.a = pd.Categorical((list('aabbcd') * N)) - self.b = pd.Categorical((list('bbcdjk') * N)) +class Constructor(object): + goal_time = 0.2 + + def setup(self): + N = 10**5 self.categories = list('abcde') - self.cat_idx = Index(self.categories) + self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) self.codes = np.tile(range(len(self.categories)), N) - self.datetimes = pd.Series(pd.date_range( - '1995-01-01 00:00:00', periods=10000, freq='s')) + self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', + periods=N / 10, + freq='s')) + self.datetimes_with_nat = self.datetimes.copy() + self.datetimes_with_nat.iloc[-1] = pd.NaT self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) - def time_concat(self): - concat([self.s, self.s]) - - def time_union(self): - union_categoricals([self.a, self.b]) + def time_regular(self): + pd.Categorical(self.values, self.categories) - def time_constructor_regular(self): - Categorical(self.values, self.categories) + def time_fastpath(self): + pd.Categorical(self.codes, self.cat_idx, fastpath=True) - def time_constructor_fastpath(self): - Categorical(self.codes, self.cat_idx, fastpath=True) + def time_datetimes(self): + pd.Categorical(self.datetimes) - def time_constructor_datetimes(self): - Categorical(self.datetimes) + def time_datetimes_with_nat(self): + pd.Categorical(self.datetimes_with_nat) - def time_constructor_datetimes_with_nat(self): - t = self.datetimes - t.iloc[-1] = pd.NaT - Categorical(t) + def time_with_nan(self): + pd.Categorical(self.values_some_nan) - def time_constructor_with_nan(self): - Categorical(self.values_some_nan) + def time_all_nan(self): + pd.Categorical(self.values_all_nan) - def time_constructor_all_nan(self): - Categorical(self.values_all_nan) +class ValueCounts(object): -class Categoricals2(object): goal_time = 0.2 - def setup(self): - n = 500000 + params 
= [True, False] + param_names = ['dropna'] + + def setup(self, dropna): + n = 5 * 10**5 np.random.seed(2718281) arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') + self.ts = pd.Series(arr).astype('category') + + def time_value_counts(self, dropna): + self.ts.value_counts(dropna=dropna) + - self.sel = self.ts.loc[[0]] +class Repr(object): - def time_value_counts(self): - self.ts.value_counts(dropna=False) + goal_time = 0.2 - def time_value_counts_dropna(self): - self.ts.value_counts(dropna=True) + def setup(self): + self.sel = pd.Series(['s1234']).astype('category') def time_rendering(self): str(self.sel) + +class SetCategories(object): + + goal_time = 0.2 + + def setup(self): + n = 5 * 10**5 + np.random.seed(2718281) + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype('category') + def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) -class Categoricals3(object): +class Rank(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 ncats = 100 + np.random.seed(1234) - self.s1 = Series(np.array(tm.makeCategoricalIndex(N, ncats))) - self.s1_cat = self.s1.astype('category') - self.s1_cat_ordered = self.s1.astype('category', ordered=True) + self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str_cat = self.s_str.astype('category') + self.s_str_cat_ordered = self.s_str.astype('category', ordered=True) - self.s2 = Series(np.random.randint(0, ncats, size=N)) - self.s2_cat = self.s2.astype('category') - self.s2_cat_ordered = self.s2.astype('category', ordered=True) + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) + self.s_int_cat = self.s_int.astype('category') + self.s_int_cat_ordered = self.s_int.astype('category', ordered=True) def time_rank_string(self): - self.s1.rank() + self.s_str.rank() def time_rank_string_cat(self): - self.s1_cat.rank() + self.s_str_cat.rank() def time_rank_string_cat_ordered(self): - self.s1_cat_ordered.rank() + self.s_str_cat_ordered.rank() def time_rank_int(self): - self.s2.rank() + self.s_int.rank() def time_rank_int_cat(self): - self.s2_cat.rank() + self.s_int_cat.rank() def time_rank_int_cat_ordered(self): - self.s2_cat_ordered.rank() + self.s_int_cat_ordered.rank() From b71ecbd747c256368274ef2c14680dafffc7d388 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 Nov 2017 06:37:52 -0800 Subject: [PATCH 24/98] CLN: ASV binary ops benchmark (#18444) --- asv_bench/benchmarks/binary_ops.py | 91 +++++++++++++++--------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 0ca21b929ea17..429965c06cb48 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,4 +1,5 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, date_range try: import pandas.core.computation.expressions as expr except ImportError: @@ -6,12 +7,14 @@ class Ops(object): + goal_time = 0.2 params = [[True, False], ['default', 1]] param_names = ['use_numexpr', 'threads'] def setup(self, use_numexpr, threads): + np.random.seed(1234) self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) @@ -20,18 +23,17 @@ def setup(self, use_numexpr, threads): if not use_numexpr: expr.set_use_numexpr(False) - def time_frame_add(self, use_numexpr, threads): - (self.df + self.df2) + self.df + self.df2 def 
time_frame_mult(self, use_numexpr, threads): - (self.df * self.df2) + self.df * self.df2 def time_frame_multi_and(self, use_numexpr, threads): - self.df[((self.df > 0) & (self.df2 > 0))] + self.df[(self.df > 0) & (self.df2 > 0)] def time_frame_comparison(self, use_numexpr, threads): - (self.df > self.df2) + self.df > self.df2 def teardown(self, use_numexpr, threads): expr.set_use_numexpr(True) @@ -39,75 +41,70 @@ def teardown(self, use_numexpr, threads): class Ops2(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) + N = 10**3 + np.random.seed(1234) + self.df = DataFrame(np.random.randn(N, N)) + self.df2 = DataFrame(np.random.randn(N, N)) - self.df_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) - self.df2_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) + self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) + self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) - ## Division + # Division def time_frame_float_div(self): - (self.df // self.df2) + self.df // self.df2 def time_frame_float_div_by_zero(self): - (self.df / 0) + self.df / 0 def time_frame_float_floor_by_zero(self): - (self.df // 0) + self.df // 0 def time_frame_int_div_by_zero(self): - (self.df_int / 0) + self.df_int / 0 - ## Modulo + # Modulo def time_frame_int_mod(self): - (self.df / self.df2) + self.df_int % self.df2_int def time_frame_float_mod(self): - (self.df / self.df2) + self.df % self.df2 class Timeseries(object): + goal_time = 0.2 - def setup(self): - self.N = 1000000 + params = [None, 'US/Eastern'] + param_names = ['tz'] + + def setup(self, tz): + self.N = 10**6 self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T')) + self.s = Series(date_range('20010101', periods=self.N, freq='T', + tz=tz)) self.ts = self.s[self.halfway] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s')) + self.s2 = Series(date_range('20010101', periods=self.N, freq='s', + tz=tz)) - def time_series_timestamp_compare(self): - (self.s <= self.ts) + def time_series_timestamp_compare(self, tz): + self.s <= self.ts - def time_timestamp_series_compare(self): - (self.ts >= self.s) + def time_timestamp_series_compare(self, tz): + self.ts >= self.s - def time_timestamp_ops_diff1(self): + def time_timestamp_ops_diff(self, tz): self.s2.diff() - def time_timestamp_ops_diff2(self): - (self.s - self.s.shift()) - - - -class TimeseriesTZ(Timeseries): - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) - self.ts = self.s[self.halfway] - - self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) + def time_timestamp_ops_diff_with_shift(self, tz): + self.s - self.s.shift() From 9c9a09f5f55d0eb733e7945fa33e6426098b80dc Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 25 Nov 2017 07:45:33 -0700 Subject: [PATCH 25/98] BUG: Fix IntervalIndex.insert to allow inserting NaN (#18300) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/indexes/base.py | 4 ++ pandas/core/indexes/category.py | 4 +- pandas/core/indexes/datetimes.py | 5 +- pandas/core/indexes/interval.py | 23 ++++++--- pandas/core/indexes/timedeltas.py | 6 ++- 
.../tests/indexes/datetimes/test_indexing.py | 7 +++ pandas/tests/indexes/period/test_period.py | 8 +++ pandas/tests/indexes/test_base.py | 6 +++ pandas/tests/indexes/test_category.py | 6 +++ pandas/tests/indexes/test_interval.py | 50 ++++++++++++++++--- pandas/tests/indexes/test_numeric.py | 7 +++ pandas/tests/indexes/test_range.py | 6 +++ .../tests/indexes/timedeltas/test_indexing.py | 6 +++ 14 files changed, 119 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index f97b958d553e0..7229bd38fffa9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -78,6 +78,7 @@ Other API Changes - :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`). - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) +- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8a751f0204b60..b5d912f4201b5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3767,6 +3767,10 @@ def insert(self, loc, item): ------- new_index : Index """ + if is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value + _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._values idx = np.concatenate((_self[:loc], item, _self[loc:])) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d09e5447431ce..26ffb01b9577f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,7 +12,7 @@ is_scalar) from pandas.core.common import (_asarray_tuplesafe, _values_from_object) -from pandas.core.dtypes.missing import array_equivalent +from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core.algorithms import take_1d @@ -690,7 +690,7 @@ def insert(self, loc, item): """ code = self.categories.get_indexer([item]) - if (code == -1): + if (code == -1) and not (is_scalar(item) and isna(item)): raise TypeError("cannot insert an item into a CategoricalIndex " "that is not already an existing category") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e1def38289243..196c881f97526 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1768,6 +1768,9 @@ def insert(self, loc, item): ------- new_index : Index """ + if is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value freq = None @@ -1784,6 +1787,7 @@ def insert(self, loc, item): elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = _to_m8(item, tz=self.tz) + try: new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) @@ -1791,7 +1795,6 @@ def insert(self, loc, item): new_dates = conversion.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) - except (AttributeError, TypeError): # fall back to object index diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c7c739b766a9f..06843150bf46a 100644 --- a/pandas/core/indexes/interval.py +++ 
b/pandas/core/indexes/interval.py @@ -1001,14 +1001,21 @@ def delete(self, loc): return self._shallow_copy(new_left, new_right) def insert(self, loc, item): - if not isinstance(item, Interval): - raise ValueError('can only insert Interval objects into an ' - 'IntervalIndex') - if not item.closed == self.closed: - raise ValueError('inserted item must be closed on the same side ' - 'as the index') - new_left = self.left.insert(loc, item.left) - new_right = self.right.insert(loc, item.right) + if isinstance(item, Interval): + if item.closed != self.closed: + raise ValueError('inserted item must be closed on the same ' + 'side as the index') + left_insert = item.left + right_insert = item.right + elif is_scalar(item) and isna(item): + # GH 18295 + left_insert = right_insert = item + else: + raise ValueError('can only insert Interval objects and NA into ' + 'an IntervalIndex') + + new_left = self.left.insert(loc, left_insert) + new_right = self.right.insert(loc, right_insert) return self._shallow_copy(new_left, new_right) def _as_like_interval_index(self, other, error_msg): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 22fb7c255b12c..97f6ca2e5d642 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -852,16 +852,18 @@ def insert(self, loc, item): ------- new_index : Index """ - # try to convert if possible if _is_convertible_to_td(item): try: item = Timedelta(item) except Exception: pass + elif is_scalar(item) and isna(item): + # GH 18295 + item = self._na_value freq = None - if isinstance(item, Timedelta) or item is NaT: + if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): # check freq can be preserved on edge cases if self.freq is not None: diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 4ce9441d87970..b3ce22962d5d4 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -145,6 +145,13 @@ def test_insert(self): assert result.tz == expected.tz assert result.freq is None + # GH 18295 (test missing) + expected = DatetimeIndex( + ['20170101', pd.NaT, '20170102', '20170103', '20170104']) + for na in (np.nan, pd.NaT, None): + result = date_range('20170101', periods=4).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 9d5746e07814e..13a63de22169e 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -695,3 +695,11 @@ def test_join_self(self, how): index = period_range('1/1/2000', periods=10) joined = index.join(index, how=how) assert index is joined + + def test_insert(self): + # GH 18295 (test missing) + expected = PeriodIndex( + ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q') + for na in (np.nan, pd.NaT, None): + result = period_range('2017Q1', periods=4, freq='Q').insert(1, na) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f5016e6d19a57..7dfd1511da292 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -459,6 +459,12 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) + # GH 18295 (test missing) + expected = Index(['a', 
np.nan, 'b', 'c']) + for na in (np.nan, pd.NaT, None): + result = Index(list('abc')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = Index(['a', 'b', 'c', 'd'], name='idx') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 92d5a53f6570b..c2eee4e437347 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -362,6 +362,12 @@ def test_insert(self): # invalid pytest.raises(TypeError, lambda: ci.insert(0, 'd')) + # GH 18295 (test missing) + expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) + for na in (np.nan, pd.NaT, None): + result = CategoricalIndex(list('aabcb')).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index b17d241ff50e0..33ba0189d747a 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -366,14 +366,50 @@ def test_delete(self, closed): result = self.create_index(closed=closed).delete(0) tm.assert_index_equal(result, expected) - def test_insert(self): - expected = IntervalIndex.from_breaks(range(4)) - actual = self.index.insert(2, Interval(2, 3)) - assert expected.equals(actual) + @pytest.mark.parametrize('data', [ + interval_range(0, periods=10, closed='neither'), + interval_range(1.7, periods=8, freq=2.5, closed='both'), + interval_range(Timestamp('20170101'), periods=12, closed='left'), + interval_range(Timedelta('1 day'), periods=6, closed='right'), + IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]), + IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])]) + def test_insert(self, data): + item = data[0] + idx_item = IntervalIndex([item]) + + # start + expected = idx_item.append(data) + result = data.insert(0, item) + tm.assert_index_equal(result, expected) + + # end + expected = data.append(idx_item) + result = data.insert(len(data), item) + tm.assert_index_equal(result, expected) + + # mid + expected = data[:3].append(idx_item).append(data[3:]) + result = data.insert(3, item) + tm.assert_index_equal(result, expected) + + # invalid type + msg = 'can only insert Interval objects and NA into an IntervalIndex' + with tm.assert_raises_regex(ValueError, msg): + data.insert(1, 'foo') - pytest.raises(ValueError, self.index.insert, 0, 1) - pytest.raises(ValueError, self.index.insert, 0, - Interval(2, 3, closed='left')) + # invalid closed + msg = 'inserted item must be closed on the same side as the index' + for closed in {'left', 'right', 'both', 'neither'} - {item.closed}: + with tm.assert_raises_regex(ValueError, msg): + bad_item = Interval(item.left, item.right, closed=closed) + data.insert(1, bad_item) + + # GH 18295 (test missing) + na_idx = IntervalIndex([np.nan], closed=data.closed) + for na in (np.nan, pd.NaT, None): + expected = data[:1].append(na_idx).append(data[1:]) + result = data.insert(1, na) + tm.assert_index_equal(result, expected) def test_take(self, closed): index = self.create_index(closed=closed) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 030d688f510b0..cbd819fa9cfb7 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -187,6 +187,13 @@ def test_where(self, klass): result = i.where(klass(cond)) tm.assert_index_equal(result, expected) + def test_insert(self): + # GH 18295 (test missing) + expected = Float64Index([0, 
np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = self.create_index().insert(1, na) + tm.assert_index_equal(result, expected) + class TestFloat64Index(Numeric): _holder = Float64Index diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b4d1c3760f25a..96d5981abc1bb 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -295,6 +295,12 @@ def test_insert(self): # test 0th element tm.assert_index_equal(idx[0:4], result.insert(0, idx[0])) + # GH 18295 (test missing) + expected = Float64Index([0, np.nan, 1, 2, 3, 4]) + for na in (np.nan, pd.NaT, None): + result = RangeIndex(5).insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = RangeIndex(5, name='Foo') diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index cb88bac6386f7..e64c4e6ac54a5 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -57,6 +57,12 @@ def test_insert(self): assert result.name == expected.name assert result.freq == expected.freq + # GH 18295 (test missing) + expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + for na in (np.nan, pd.NaT, None): + result = timedelta_range('1day', '3day').insert(1, na) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') From be66ef832ef551d118a8e219a9e98928681df835 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Nov 2017 07:10:23 -0800 Subject: [PATCH 26/98] Cross off a few tslibs-TODOs (#18443) --- pandas/_libs/lib.pyx | 13 ----- pandas/_libs/period.pyx | 1 - pandas/_libs/src/datetime/np_datetime.c | 64 ++++++++++++------------- pandas/_libs/src/datetime/np_datetime.h | 3 -- pandas/_libs/tslib.pyx | 12 ++--- pandas/_libs/tslibs/conversion.pyx | 4 +- pandas/_libs/tslibs/nattype.pxd | 2 +- pandas/_libs/tslibs/nattype.pyx | 2 +- pandas/_libs/tslibs/strptime.pyx | 4 +- pandas/_libs/tslibs/timedeltas.pyx | 8 ++-- 10 files changed, 46 insertions(+), 67 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 956aeaf39b021..2ec4b5cf19b72 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -929,19 +929,6 @@ def write_csv_rows(list data, ndarray data_index, # ------------------------------------------------------------------------------ # Groupby-related functions -@cython.boundscheck(False) -def arrmap(ndarray[object] index, object func): - cdef int length = index.shape[0] - cdef int i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - for i from 0 <= i < length: - result[i] = func(index[i]) - - return result - - @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index d09459898321e..2b09e9376bd3d 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -559,7 +559,6 @@ cdef class _Period(object): int64_t ordinal object freq - _comparables = ['name', 'freqstr'] _typ = 'period' def __cinit__(self, ordinal, freq): diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index 7278cbaff86ca..3c63f42f14b83 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -564,18 +564,15 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, void 
pandas_timedelta_to_timedeltastruct(npy_timedelta val, PANDAS_DATETIMEUNIT fr, - pandas_timedeltastruct *result) { + pandas_timedeltastruct *result) { pandas_datetime_metadata meta; meta.base = fr; - meta.num - 1; + meta.num = 1; convert_timedelta_to_timedeltastruct(&meta, val, result); } -PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { - return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *)obj)->obmeta.base; -} /* * Converts a datetime from a datetimestruct to a datetime based @@ -1001,7 +998,6 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, npy_timedelta td, pandas_timedeltastruct *out) { - npy_int64 perday; npy_int64 frac; npy_int64 sfrac; npy_int64 ifrac; @@ -1016,11 +1012,11 @@ int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, // put frac in seconds if (td < 0 && td % (1000LL * 1000LL * 1000LL) != 0) - frac = td / (1000LL * 1000LL * 1000LL) - 1; + frac = td / (1000LL * 1000LL * 1000LL) - 1; else frac = td / (1000LL * 1000LL * 1000LL); - if (frac < 0) { + if (frac < 0) { sign = -1; // even fraction @@ -1030,66 +1026,66 @@ int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, } else { frac = -frac; } - } else { + } else { sign = 1; out->days = 0; - } + } - if (frac >= 86400) { + if (frac >= 86400) { out->days += frac / 86400LL; frac -= out->days * 86400LL; - } + } - if (frac >= 3600) { + if (frac >= 3600) { out->hrs = frac / 3600LL; frac -= out->hrs * 3600LL; - } else { + } else { out->hrs = 0; - } + } - if (frac >= 60) { + if (frac >= 60) { out->min = frac / 60LL; frac -= out->min * 60LL; - } else { + } else { out->min = 0; - } + } - if (frac >= 0) { + if (frac >= 0) { out->sec = frac; frac -= out->sec; - } else { + } else { out->sec = 0; - } + } - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * (1000LL * 1000LL * 1000LL); + sfrac = (out->hrs * 3600LL + out->min * 60LL + + out->sec) * (1000LL * 1000LL * 1000LL); - if (sign < 0) + if (sign < 0) out->days = -out->days; - ifrac = td - (out->days * DAY_NS + sfrac); + ifrac = td - (out->days * DAY_NS + sfrac); - if (ifrac != 0) { + if (ifrac != 0) { out->ms = ifrac / (1000LL * 1000LL); ifrac -= out->ms * 1000LL * 1000LL; out->us = ifrac / 1000LL; ifrac -= out->us * 1000LL; out->ns = ifrac; - } else { + } else { out->ms = 0; out->us = 0; out->ns = 0; - } + } - out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; - out->microseconds = out->ms * 1000 + out->us; - out->nanoseconds = out->ns; - break; + out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + out->microseconds = out->ms * 1000 + out->us; + out->nanoseconds = out->ns; + break; default: PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); + "NumPy timedelta metadata is corrupted with " + "invalid base unit"); return -1; } diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index c51a4bddac82f..7ee7e1e99a704 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -148,7 +148,4 @@ convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, pandas_timedeltastruct *out); -PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); - - #endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 2c43bed4ad053..6d8cf39114f6f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -60,7 +60,7 @@ 
from tslibs.conversion cimport (tz_convert_single, _TSObject, from tslibs.conversion import tz_convert_single from tslibs.nattype import NaT, nat_strings, iNaT -from tslibs.nattype cimport _checknull_with_nat, NPY_NAT +from tslibs.nattype cimport checknull_with_nat, NPY_NAT from tslibs.timestamps cimport (create_timestamp_from_ts, _NS_UPPER_BOUND, _NS_LOWER_BOUND) @@ -409,7 +409,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT elif is_integer_object(val) or is_float_object(val): @@ -475,7 +475,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): oresult[i] = NaT elif is_integer_object(val) or is_float_object(val): @@ -526,7 +526,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT elif PyDateTime_Check(val): @@ -686,7 +686,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', val = values[i] # set as nan except if its a NaT - if _checknull_with_nat(val): + if checknull_with_nat(val): if PyFloat_Check(val): oresult[i] = np.nan else: @@ -704,7 +704,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): oresult[i] = val elif is_string_object(val): diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f58ad0a86d106..7f3cc0a7e81dd 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -40,7 +40,7 @@ from timezones cimport ( from parsing import parse_datetime_string from nattype import nat_strings, NaT -from nattype cimport NPY_NAT, _checknull_with_nat +from nattype cimport NPY_NAT, checknull_with_nat # ---------------------------------------------------------------------- # Constants @@ -143,7 +143,7 @@ def datetime_to_datetime64(ndarray[object] values): iresult = result.view('i8') for i in range(n): val = values[i] - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT elif PyDateTime_Check(val): if val.tzinfo is not None: diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 34fa1e70305e7..96e02142d501b 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -6,4 +6,4 @@ cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] -cdef bint _checknull_with_nat(object val) +cdef bint checknull_with_nat(object val) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index d2f6006b41f65..2e7b861b24fa8 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -572,7 +572,7 @@ NaT = NaTType() # ---------------------------------------------------------------------- -cdef inline bint _checknull_with_nat(object val): +cdef inline bint checknull_with_nat(object val): """ utility to check if a value is a nat or not """ return val is None or ( PyFloat_Check(val) and val != val) or val is NaT diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 439cc21a360c7..65594de586bac 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -38,7 +38,7 @@ from np_datetime cimport (check_dts_bounds, from util cimport 
is_string_object -from nattype cimport _checknull_with_nat, NPY_NAT +from nattype cimport checknull_with_nat, NPY_NAT from nattype import nat_strings @@ -142,7 +142,7 @@ def array_strptime(ndarray[object] values, object fmt, iresult[i] = NPY_NAT continue else: - if _checknull_with_nat(val): + if checknull_with_nat(val): iresult[i] = NPY_NAT continue else: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6ea30642625fe..b37e5dc620260 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -30,7 +30,7 @@ from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) from nattype import nat_strings, NaT -from nattype cimport _checknull_with_nat, NPY_NAT +from nattype cimport checknull_with_nat, NPY_NAT # ---------------------------------------------------------------------- # Constants @@ -111,7 +111,7 @@ cpdef convert_to_timedelta64(object ts, object unit): # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case """ - if _checknull_with_nat(ts): + if checknull_with_nat(ts): return np.timedelta64(NPY_NAT) elif isinstance(ts, Timedelta): # already in the proper format @@ -443,7 +443,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): cdef bint _validate_ops_compat(other): # return True if we are compat with operating - if _checknull_with_nat(other): + if checknull_with_nat(other): return True elif PyDelta_Check(other) or is_timedelta64_object(other): return True @@ -837,7 +837,7 @@ class Timedelta(_Timedelta): elif is_integer_object(value) or is_float_object(value): # unit=None is de-facto 'ns' value = convert_to_timedelta64(value, unit) - elif _checknull_with_nat(value): + elif checknull_with_nat(value): return NaT else: raise ValueError( From 0bcd77e36b078c9531454a91334c46c25d216b0d Mon Sep 17 00:00:00 2001 From: Xbar Date: Sat, 25 Nov 2017 15:56:49 -0500 Subject: [PATCH 27/98] BUG: in Python3 MultiIndex.from_tuples cannot take "zipped" tuples (#18440) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/indexes/multi.py | 15 +++++++ pandas/tests/indexes/test_multi.py | 64 +++++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 7229bd38fffa9..657d8ecbeb68e 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -144,6 +144,7 @@ Indexing - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`MultiIndex.remove_unused_levels`` which would fill nan values (:issue:`18417`) +- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`) - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 81d892fba0fe2..456999b94c523 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1162,6 +1162,11 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays 
= list(arrays) + # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): @@ -1206,6 +1211,11 @@ def from_tuples(cls, tuples, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + if len(tuples) == 0: if names is None: msg = 'Cannot infer number of levels from empty list' @@ -1260,6 +1270,11 @@ def from_product(cls, iterables, sortorder=None, names=None): from pandas.core.categorical import _factorize_from_iterables from pandas.core.reshape.util import cartesian_product + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 2f8c27f1abb7d..5c2a0254b072b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -672,8 +672,9 @@ def test_from_arrays(self): for lev, lab in zip(self.index.levels, self.index.labels): arrays.append(np.asarray(lev).take(lab)) - result = MultiIndex.from_arrays(arrays) - assert list(result) == list(self.index) + # list of arrays as input + result = MultiIndex.from_arrays(arrays, names=self.index.names) + tm.assert_index_equal(result, self.index) # infer correctly result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')], @@ -681,6 +682,21 @@ def test_from_arrays(self): assert result.levels[0].equals(Index([Timestamp('20130101')])) assert result.levels[1].equals(Index(['a', 'b'])) + def test_from_arrays_iterator(self): + # GH 18434 + arrays = [] + for lev, lab in zip(self.index.levels, self.index.labels): + arrays.append(np.asarray(lev).take(lab)) + + # iterator as input + result = MultiIndex.from_arrays(iter(arrays), names=self.index.names) + tm.assert_index_equal(result, self.index) + + # invalid iterator input + with tm.assert_raises_regex( + TypeError, "Input must be a list / sequence of array-likes."): + MultiIndex.from_arrays(0) + def test_from_arrays_index_series_datetimetz(self): idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') @@ -825,7 +841,25 @@ def test_from_product(self): expected = MultiIndex.from_tuples(tuples, names=names) tm.assert_index_equal(result, expected) - assert result.names == names + + def test_from_product_iterator(self): + # GH 18434 + first = ['foo', 'bar', 'buz'] + second = ['a', 'b', 'c'] + names = ['first', 'second'] + tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), + ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), + ('buz', 'c')] + expected = MultiIndex.from_tuples(tuples, names=names) + + # iterator as input + result = MultiIndex.from_product(iter([first, second]), names=names) + tm.assert_index_equal(result, expected) + + # Invalid non-iterable input + with tm.assert_raises_regex( + TypeError, "Input must be a list / sequence of iterables."): + MultiIndex.from_product(0) def test_from_product_empty(self): # 0 levels @@ -1725,8 +1759,28 @@ def test_from_tuples(self): 'from empty list', MultiIndex.from_tuples, []) - idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) - assert len(idx) == 2 + expected = 
MultiIndex(levels=[[1, 3], [2, 4]], + labels=[[0, 1], [0, 1]], + names=['a', 'b']) + + # input tuples + result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) + tm.assert_index_equal(result, expected) + + def test_from_tuples_iterator(self): + # GH 18434 + # input iterator for tuples + expected = MultiIndex(levels=[[1, 3], [2, 4]], + labels=[[0, 1], [0, 1]], + names=['a', 'b']) + + result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b']) + tm.assert_index_equal(result, expected) + + # input non-iterables + with tm.assert_raises_regex( + TypeError, 'Input must be a list / sequence of tuple-likes.'): + MultiIndex.from_tuples(0) def test_from_tuples_empty(self): # GH 16777 From 06518b22e82bf79c92e36904b097b1b1d954070d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Nov 2017 12:58:00 -0800 Subject: [PATCH 28/98] Prevent passing invalid kwds to DateOffset constructors (#18226) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/_libs/tslibs/offsets.pyx | 29 +++++++++++- pandas/tests/tseries/offsets/conftest.py | 13 ++++++ pandas/tests/tseries/offsets/test_offsets.py | 37 +++++++++++++++ pandas/tseries/offsets.py | 48 ++++++++++++++------ 5 files changed, 112 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 657d8ecbeb68e..4ae3d9be04aa7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -79,7 +79,7 @@ Other API Changes - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) - +- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). .. _whatsnew_0220.deprecations: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b03d48bba1649..4ed4d4a9b7b99 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -261,7 +261,7 @@ def _validate_business_time(t_input): # --------------------------------------------------------------------- # Constructor Helpers -_rd_kwds = set([ +relativedelta_kwds = set([ 'years', 'months', 'weeks', 'days', 'year', 'month', 'week', 'day', 'weekday', 'hour', 'minute', 'second', 'microsecond', @@ -406,6 +406,33 @@ class _BaseOffset(object): # will raise NotImplementedError. return get_day_of_month(other, self._day_opt) + def _validate_n(self, n): + """ + Require that `n` be a nonzero integer. 
+ + Parameters + ---------- + n : int + + Returns + ------- + nint : int + + Raises + ------ + TypeError if `int(n)` raises + ValueError if n != int(n) + """ + try: + nint = int(n) + except (ValueError, TypeError): + raise TypeError('`n` argument must be an integer, ' + 'got {ntype}'.format(ntype=type(n))) + if n != nint: + raise ValueError('`n` argument must be an integer, ' + 'got {n}'.format(n=n)) + return nint + class BaseOffset(_BaseOffset): # Here we add __rfoo__ methods that don't play well with cdef classes diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index 25446c24b28c0..76f24123ea0e1 100644 --- a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -7,6 +7,19 @@ def offset_types(request): return request.param +@pytest.fixture(params=[getattr(offsets, o) for o in offsets.__all__ if + issubclass(getattr(offsets, o), offsets.MonthOffset) + and o != 'MonthOffset']) +def month_classes(request): + return request.param + + +@pytest.fixture(params=[getattr(offsets, o) for o in offsets.__all__ if + issubclass(getattr(offsets, o), offsets.Tick)]) +def tick_classes(request): + return request.param + + @pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific']) def tz(request): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6821017c89c3a..357c95282e78d 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -17,6 +17,7 @@ get_offset, get_standard_freq) from pandas.core.indexes.datetimes import ( _to_m8, DatetimeIndex, _daterange_cache) +import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import WeekDay, CacheableOffset from pandas.tseries.offsets import (BDay, CDay, BQuarterEnd, BMonthEnd, BusinessHour, WeekOfMonth, CBMonthEnd, @@ -4682,9 +4683,45 @@ def test_all_offset_classes(self, tup): assert first == second +# --------------------------------------------------------------------- def test_get_offset_day_error(): # subclass of _BaseOffset must override _day_opt attribute, or we should # get a NotImplementedError with pytest.raises(NotImplementedError): DateOffset()._get_offset_day(datetime.now()) + + +@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +def test_valid_month_attributes(kwd, month_classes): + # GH#18226 + cls = month_classes + # check that we cannot create e.g. MonthEnd(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +def test_valid_tick_attributes(kwd, tick_classes): + # GH#18226 + cls = tick_classes + # check that we cannot create e.g. 
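
Concretely, ``_validate_n`` distinguishes two failure modes: values that cannot be cast with ``int()`` raise ``TypeError``, while values that cast lossily raise ``ValueError``; independently, the narrowed ``__init__`` signatures reject stray relativedelta-style keywords that were previously accepted but unused. A sketch assuming the patch::

    from pandas.tseries.offsets import DateOffset, MonthEnd

    # MonthEnd.__init__ now takes only n/normalize, so relativedelta
    # keywords like weeks= raise instead of being silently stored
    try:
        MonthEnd(weeks=3)
    except TypeError:
        pass

    try:
        DateOffset(n='Doh!')   # int('Doh!') fails -> TypeError
    except TypeError:
        pass

    try:
        MonthEnd(n=1.5)        # int(1.5) == 1 != 1.5 -> ValueError
    except ValueError:
        pass
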
Hour(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +def test_validate_n_error(): + with pytest.raises(TypeError): + DateOffset(n='Doh!') + + with pytest.raises(TypeError): + MonthBegin(n=timedelta(1)) + + with pytest.raises(TypeError): + BDay(n=np.array([1, 2], dtype=np.int64)) + + +def test_require_integers(offset_types): + cls = offset_types + with pytest.raises(ValueError): + cls(n=1.5) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 90496729554f8..7b699349c3f07 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- +from datetime import date, datetime, timedelta import functools import operator -from datetime import date, datetime, timedelta from pandas.compat import range from pandas import compat import numpy as np @@ -166,7 +166,7 @@ def __add__(date): normalize = False def __init__(self, n=1, normalize=False, **kwds): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self.kwds = kwds @@ -473,7 +473,7 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): _adjust_dst = True def __init__(self, n=1, normalize=False, offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self.kwds = {'offset': offset} self._offset = offset @@ -782,7 +782,7 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): def __init__(self, n=1, normalize=False, start='09:00', end='17:00', offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize super(BusinessHour, self).__init__(start=start, end=end, offset=offset) @@ -819,7 +819,7 @@ class CustomBusinessDay(BusinessDay): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self._offset = offset self.kwds = {} @@ -887,7 +887,7 @@ class CustomBusinessHour(BusinessHourMixin, SingleConstructorOffset): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, start='09:00', end='17:00', offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize super(CustomBusinessHour, self).__init__(start=start, end=end, offset=offset) @@ -919,6 +919,11 @@ def next_bday(self): class MonthOffset(SingleConstructorOffset): _adjust_dst = True + def __init__(self, n=1, normalize=False): + self.n = self._validate_n(n) + self.normalize = normalize + self.kwds = {} + @property def name(self): if self.isAnchored: @@ -994,7 +999,8 @@ def __init__(self, n=1, normalize=False, day_of_month=None): msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}' raise ValueError(msg.format(min=self._min_day_of_month, day=self.day_of_month)) - self.n = int(n) + + self.n = self._validate_n(n) self.normalize = normalize self.kwds = {'day_of_month': self.day_of_month} @@ -1205,7 +1211,7 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = normalize self._offset = offset self.kwds = {} @@ -1278,7 +1284,7 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): - self.n = int(n) + self.n = self._validate_n(n) self.normalize = 
normalize self._offset = offset self.kwds = {} @@ -1345,7 +1351,7 @@ class Week(EndMixin, DateOffset): _prefix = 'W' def __init__(self, n=1, normalize=False, weekday=None): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize self.weekday = weekday @@ -1424,7 +1430,7 @@ class WeekOfMonth(DateOffset): _adjust_dst = True def __init__(self, n=1, normalize=False, week=None, weekday=None): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize self.weekday = weekday self.week = week @@ -1509,7 +1515,7 @@ class LastWeekOfMonth(DateOffset): _prefix = 'LWOM' def __init__(self, n=1, normalize=False, weekday=None): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize self.weekday = weekday @@ -1575,7 +1581,7 @@ class QuarterOffset(DateOffset): # point def __init__(self, n=1, normalize=False, startingMonth=None): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize if startingMonth is None: startingMonth = self._default_startingMonth @@ -1820,7 +1826,7 @@ class FY5253(DateOffset): def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest"): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize self.startingMonth = startingMonth self.weekday = weekday @@ -2032,7 +2038,7 @@ class FY5253Quarter(DateOffset): def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, qtr_with_extra_week=1, variation="nearest"): - self.n = n + self.n = self._validate_n(n) self.normalize = normalize self.weekday = weekday @@ -2158,6 +2164,11 @@ class Easter(DateOffset): """ _adjust_dst = True + def __init__(self, n=1, normalize=False): + self.n = self._validate_n(n) + self.normalize = normalize + self.kwds = {} + @apply_wraps def apply(self, other): current_easter = easter(other.year) @@ -2199,6 +2210,12 @@ class Tick(SingleConstructorOffset): _inc = Timedelta(microseconds=1000) _prefix = 'undefined' + def __init__(self, n=1, normalize=False): + # TODO: do Tick classes with normalize=True make sense? + self.n = self._validate_n(n) + self.normalize = normalize + self.kwds = {} + __gt__ = _tick_comp(operator.gt) __ge__ = _tick_comp(operator.ge) __lt__ = _tick_comp(operator.lt) @@ -2257,6 +2274,7 @@ def delta(self): def nanos(self): return delta_to_nanoseconds(self.delta) + # TODO: Should Tick have its own apply_index? 
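
``Tick`` previously fell back to the generic ``DateOffset.__init__``, which truncated ``n`` via ``int(n)``, so ``Hour(n=1.5)`` silently became ``Hour(1)``; with the explicit ``__init__`` added above it is rejected instead. A sketch assuming the patch::

    from pandas.tseries import offsets

    try:
        offsets.Hour(n=1.5)     # lossy cast -> ValueError from _validate_n
    except ValueError:
        pass

    try:
        offsets.Hour(weeks=3)   # Tick takes only n/normalize -> TypeError
    except TypeError:
        pass

    assert offsets.Hour(2).n == 2   # integral n is unchanged
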
    def apply(self, other):
        # Timestamp can handle tz and nano sec, thus no need to use apply_wraps
        if isinstance(other, Timestamp):

From b69c1a26899b38adff8390236ee83ba36af0374e Mon Sep 17 00:00:00 2001
From: Licht Takeuchi
Date: Sun, 26 Nov 2017 06:01:21 +0900
Subject: [PATCH 29/98] BUG: Fix stack overflow in Index.putmask with an
 invalid mask (#18407)

---
 doc/source/whatsnew/v0.21.1.txt |  2 +-
 pandas/core/indexes/base.py     |  5 ++++-
 pandas/tests/indexes/common.py  | 13 +++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 637ccf0603e0f..51fd3b1076ade 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -73,7 +73,7 @@ Indexing
 - Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`)
 - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`)
 - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
--
+- Bug in ``Index.putmask`` when an invalid mask is passed (:issue:`18368`)
 -

 I/O

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b5d912f4201b5..2696f9f94375d 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -1939,7 +1939,10 @@ def putmask(self, mask, value):
         try:
             np.putmask(values, mask, self._convert_for_op(value))
             return self._shallow_copy(values)
-        except (ValueError, TypeError):
+        except (ValueError, TypeError) as err:
+            if is_object_dtype(self):
+                raise err
+
             # coerces to object
             return self.astype(object).putmask(mask, value)

diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index ee6434431bcfc..ba7795d005721 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -1032,3 +1032,16 @@ def test_map(self):

         dict_map = {}
         tm.assert_index_equal(index.map(dict_map), nan_index)
+
+    def test_putmask_with_wrong_mask(self):
+        # GH18368
+        index = self.create_index()
+
+        with pytest.raises(ValueError):
+            index.putmask(np.ones(len(index) + 1, np.bool), 1)
+
+        with pytest.raises(ValueError):
+            index.putmask(np.ones(len(index) - 1, np.bool), 1)
+
+        with pytest.raises(ValueError):
+            index.putmask('foo', 1)

From 3d4422173ee2c169afd19b6762e3b5003d8a954f Mon Sep 17 00:00:00 2001
From: Licht Takeuchi
Date: Sun, 26 Nov 2017 06:13:14 +0900
Subject: [PATCH 30/98] BUG: Fix inaccurate rolling.var calculation (#18481)

---
 doc/source/whatsnew/v0.21.1.txt |  2 +-
 pandas/_libs/window.pyx         | 25 +++++++++++++++++--------
 pandas/tests/test_window.py     |  8 ++++++++
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 51fd3b1076ade..976f3524e3c71 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -103,7 +103,7 @@ Groupby/Resample/Rolling
 - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
 - Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`)
 - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
--
+- Bug in ``rolling.var`` where calculation is inaccurate with a zero-valued array (:issue:`18430`)
 -
 -

diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
index
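
The ``putmask`` fix is observable directly: an ill-shaped mask used to bounce between the ``np.putmask`` failure and the ``astype(object)`` retry until the recursion blew the stack. A sketch assuming the fix is applied::

    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 2, 3])

    # wrong-length mask: np.putmask raises, and the object-dtype retry
    # now re-raises instead of recursing indefinitely
    try:
        idx.putmask(np.ones(len(idx) + 1, dtype=bool), 1)
    except ValueError:
        pass

    # a well-formed mask still works
    assert list(idx.putmask(np.array([True, False, False]), 0)) == [0, 2, 3]
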
4d5ebdc0c581a..95df5a07a390b 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -661,9 +661,11 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -675,9 +677,11 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -689,7 +693,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, Numerically stable implementation using Welford's method. """ cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -749,6 +753,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: @@ -760,10 +767,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2427bcea4053d..8135e263f412f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2482,6 +2482,14 @@ def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, min_periods=5) + @pytest.mark.parametrize('window', range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) From 1fab80852f90cecd852d8f89f7e963cf89d69d79 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 25 Nov 2017 13:26:13 -0800 Subject: [PATCH 31/98] CLN: ASV Algorithms benchmark (#18423) --- asv_bench/benchmarks/algorithms.py | 168 +++++++++++++++-------------- asv_bench/benchmarks/binary_ops.py | 44 ++++++++ 2 files changed, 130 insertions(+), 82 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 40cfec1bcd4c7..7ffb180b49e09 100644 --- 
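
The corrected kernel is Welford's online algorithm: each added observation updates the running mean and the sum of squared deviations (``ssqdm_x``) with non-negative increments, which is what keeps a constant (for example all-zero) window from accumulating a tiny negative variance. A pure-Python sketch of the add path, ignoring the NaN handling in the Cython version::

    def add_var(val, nobs, mean, ssqdm):
        # mirrors add_var in window.pyx: Welford's update
        nobs += 1
        delta = val - mean
        mean += delta / nobs
        ssqdm += (nobs - 1) * delta ** 2 / nobs
        return nobs, mean, ssqdm

    nobs = mean = ssqdm = 0.0
    for v in [2.0, 4.0, 6.0]:
        nobs, mean, ssqdm = add_var(v, nobs, mean, ssqdm)

    assert (nobs, mean, ssqdm) == (3.0, 4.0, 8.0)
    assert ssqdm / (nobs - 1) == 4.0   # matches np.var([2, 4, 6], ddof=1)
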
a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,7 +1,6 @@ from importlib import import_module import numpy as np - import pandas as pd from pandas.util import testing as tm @@ -12,113 +11,118 @@ except: pass -class Algorithms(object): + +class Factorize(object): + goal_time = 0.2 - def setup(self): - N = 100000 - np.random.seed(1234) + params = [True, False] + param_names = ['sort'] - self.int_unique = pd.Int64Index(np.arange(N * 5)) - # cache is_unique - self.int_unique.is_unique + def setup(self, sort): + N = 10**5 + np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - self.int = pd.Int64Index(np.arange(N).repeat(5)) - self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) - # Convenience naming. - self.checked_add = pd.core.algorithms.checked_add_with_arr + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) - self.arr = np.arange(1000000) - self.arrpos = np.arange(1000000) - self.arrneg = np.arange(-1000000, 0) - self.arrmixed = np.array([1, -1]).repeat(500000) - self.strings = tm.makeStringIndex(100000) + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) - self.arr_nan = np.random.choice([True, False], size=1000000) - self.arrmixed_nan = np.random.choice([True, False], size=1000000) - # match - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) +class Duplicated(object): - def time_factorize_string(self): - self.strings.factorize() + goal_time = 0.2 - def time_factorize_int(self): - self.int.factorize() + params = ['first', 'last', False] + param_names = ['keep'] - def time_factorize_float(self): - self.int.factorize() + def setup(self, keep): + N = 10**5 + np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - def time_duplicated_int_unique(self): - self.int_unique.duplicated() + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) - def time_duplicated_int(self): - self.int.duplicated() + def time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) - def time_duplicated_float(self): - self.float.duplicated() + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) - def time_match_strings(self): - pd.match(self.all, self.uniques) - def time_add_overflow_pos_scalar(self): - self.checked_add(self.arr, 1) +class DuplicatedUniqueIndex(object): - def time_add_overflow_neg_scalar(self): - self.checked_add(self.arr, -1) + goal_time = 0.2 - def time_add_overflow_zero_scalar(self): - self.checked_add(self.arr, 0) + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.idx_int_dup.is_unique - def time_add_overflow_pos_arr(self): - self.checked_add(self.arr, self.arrpos) + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() - def time_add_overflow_neg_arr(self): - self.checked_add(self.arr, self.arrneg) - def time_add_overflow_mixed_arr(self): - self.checked_add(self.arr, self.arrmixed) +class Match(object): - def time_add_overflow_first_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + goal_time = 0.2 - def time_add_overflow_second_arg_nan(self): - 
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + def setup(self): + np.random.seed(1234) + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) - def time_add_overflow_both_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, - b_mask=self.arrmixed_nan) + def time_match_string(self): + pd.match(self.all, self.uniques) class Hashing(object): + goal_time = 0.2 - def setup(self): - N = 100000 - - self.df = pd.DataFrame( - {'A': pd.Series(tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=N))), - 'B': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'D': np.random.randn(N), - 'E': np.arange(N), - 'F': pd.date_range('20110101', freq='s', periods=N), - 'G': pd.timedelta_range('1 day', freq='s', periods=N), - }) - self.df['C'] = self.df['B'].astype('category') - self.df.iloc[10:20] = np.nan - - def time_frame(self): - hashing.hash_pandas_object(self.df) - - def time_series_int(self): - hashing.hash_pandas_object(self.df.E) - - def time_series_string(self): - hashing.hash_pandas_object(self.df.B) - - def time_series_categorical(self): - hashing.hash_pandas_object(self.df.C) + def setup_cache(self): + np.random.seed(1234) + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 429965c06cb48..14169ced4b71f 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,5 +1,6 @@ import numpy as np from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr try: import pandas.core.computation.expressions as expr except ImportError: @@ -108,3 +109,46 @@ def time_timestamp_ops_diff(self, tz): def time_timestamp_ops_diff_with_shift(self, tz): self.s - self.s.shift() + + +class AddOverflowScalar(object): + + goal_time = 0.2 + + params = [1, -1, 0] + param_names = ['scalar'] + + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) + + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) + + +class AddOverflowArray(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) + + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) + + def time_add_overflow_arr_mask_nan(self): + 
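
A note on the benchmark mechanics used above (an asv convention, not a pandas API): ``setup_cache`` runs once per benchmark class and its return value is passed as the leading argument to every ``time_*`` method, whereas ``setup`` re-runs for each repeat and parameter combination. A minimal sketch of the pattern, with hypothetical names::

    import numpy as np

    class ExampleBenchmark(object):
        # computed once, then shared by all time_* methods below
        def setup_cache(self):
            return np.arange(10**6)

        def time_sum(self, arr):
            arr.sum()
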
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
+
+    def time_add_overflow_b_mask_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed,
+                             b_mask=self.arr_nan_1)
+
+    def time_add_overflow_both_arg_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
+                             b_mask=self.arr_nan_2)

From 20f65126e0de65876bf412fa4280d8725afe2260 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 25 Nov 2017 16:51:13 -0500
Subject: [PATCH 32/98] Propagating NaN values when using str.split (#18450)
 (#18462)

---
 doc/source/whatsnew/v0.21.1.txt |  6 +++++-
 pandas/core/strings.py          |  4 ++++
 pandas/tests/test_strings.py    | 12 ++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 976f3524e3c71..f8274bda546f7 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -138,9 +138,13 @@ Categorical
 - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)
 - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`)

+String
+^^^^^^
+
+- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`)
+
 Other
 ^^^^^
 -
 -
--

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index abef6f6086dbd..9614641aa1abf 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1423,6 +1423,10 @@ def cons_row(x):
             return [x]

         result = [cons_row(x) for x in result]
+        if result:
+            # propagate nan values to match longest sequence (GH 18450)
+            max_len = max(len(x) for x in result)
+            result = [x * max_len if x[0] is np.nan else x for x in result]

     if not isinstance(expand, bool):
         raise ValueError("expand must be True or False")

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f1b97081b6d93..8aa69bcbfdf7f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -2086,6 +2086,18 @@ def test_rsplit_to_multiindex_expand(self):
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 2

+    def test_split_nan_expand(self):
+        # gh-18450
+        s = Series(["foo,bar,baz", NA])
+        result = s.str.split(",", expand=True)
+        exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # check that these are actually np.nan and not None
+        # TODO see GH 18463
+        # tm.assert_frame_equal does not differentiate
+        assert all(np.isnan(x) for x in result.iloc[1])
+
     def test_split_with_name(self):
         # GH 12617

From 50f432de81078e522b12c1247376f459bb235feb Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 25 Nov 2017 17:13:39 -0500
Subject: [PATCH 33/98] BLD: merge-script.py typo

---
 scripts/merge-pr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py
index 11cc96609be94..4062a96d8e08d 100755
--- a/scripts/merge-pr.py
+++ b/scripts/merge-pr.py
@@ -160,7 +160,7 @@ def merge_pr(pr_num, target_ref):
     if body is not None:
         merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))]

-    authors = "\n".join("Author: %s" % a for a in distinct_authorsS)
+    authors = "\n".join("Author: %s" % a for a in distinct_authors)

     merge_message_flags += ["-m", authors]

From 38f41e64f4b8a0479f8835022af5e7343ccf8498 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 25 Nov 2017 19:30:34 -0500
Subject: [PATCH 34/98] CI: remove pandas-gbq from 3.5 build to avoid conflicts with 3.6
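
The user-visible change from the ``str.split`` patch, as a sketch (assumes the fix above is applied)::

    import numpy as np
    import pandas as pd

    s = pd.Series(["foo,bar,baz", np.nan])
    result = s.str.split(",", expand=True)

    # the missing row now yields NaN in every expanded column rather
    # than NaN in the first column and None in the rest
    assert result.shape == (2, 3)
    assert all(np.isnan(x) for x in result.iloc[1])
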
build_test (#18492) --- ci/requirements-3.5.pip | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 6e4f7b65f9728..0d9e44cf39fa4 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,2 +1 @@ xarray==0.9.1 -pandas-gbq From c44a0630dbf2fe61e717b624bc8b12f7a50967ce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Nov 2017 07:00:31 -0800 Subject: [PATCH 35/98] CLN: ASV eval benchmark (#18500) --- asv_bench/benchmarks/eval.py | 57 +++++++++++++++++------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 6f33590ee9e33..fd18b3f21cf45 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,4 +1,4 @@ -from .pandas_vb_common import * +import numpy as np import pandas as pd try: import pandas.core.computation.expressions as expr @@ -7,64 +7,61 @@ class Eval(object): + goal_time = 0.2 params = [['numexpr', 'python'], [1, 'all']] param_names = ['engine', 'threads'] def setup(self, engine, threads): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) + np.random.seed(1234) + self.df = pd.DataFrame(np.random.randn(20000, 100)) + self.df2 = pd.DataFrame(np.random.randn(20000, 100)) + self.df3 = pd.DataFrame(np.random.randn(20000, 100)) + self.df4 = pd.DataFrame(np.random.randn(20000, 100)) if threads == 1: expr.set_numexpr_threads(1) def time_add(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df + df2 + df3 + df4', engine=engine) + pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) def time_and(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine) + pd.eval('(self.df > 0) & (self.df2 > 0) & ' + '(self.df3 > 0) & (self.df4 > 0)', engine=engine) def time_chained_cmp(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df < df2 < df3 < df4', engine=engine) + pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) def time_mult(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df * df2 * df3 * df4', engine=engine) + pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine) def teardown(self, engine, threads): expr.set_numexpr_threads() class Query(object): + goal_time = 0.2 def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) + np.random.seed(1234) + self.N = 10**6 + self.halfway = (self.N // 2) - 1 + self.index = pd.date_range('20010101', periods=self.N, freq='T') + self.s = pd.Series(self.index) self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) - self.df2 = DataFrame({'dates': self.s.values,}) - - self.df3 = DataFrame({'a': np.random.randn(self.N),}) - self.min_val = self.df3['a'].min() - self.max_val = self.df3['a'].max() + self.df = pd.DataFrame({'a': np.random.randn(self.N), 'dates': self.s}, + index=self.index) + self.data = np.random.randn(self.N) + self.min_val = self.data.min() + self.max_val = self.data.max() def time_query_datetime_index(self): - ts = self.ts - self.df.query('index < @ts') + 
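
The rewritten benchmarks lean on the ``@`` resolver in :meth:`DataFrame.query`/:func:`pandas.eval`: a name prefixed with ``@`` is looked up in the enclosing Python scope rather than among the frame's columns, which is why ``@self.ts`` works inside a method. A standalone sketch::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': np.random.randn(10)},
                      index=pd.date_range('20010101', periods=10, freq='T'))
    cutoff = df.index[4]

    # '@cutoff' is resolved from the local namespace, not from df
    out = df.query('index < @cutoff')
    assert len(out) == 4
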
self.df.query('index < @self.ts') - def time_query_datetime_series(self): - ts = self.ts - self.df2.query('dates < @ts') + def time_query_datetime_column(self): + self.df.query('dates < @self.ts') def time_query_with_boolean_selection(self): - min_val, max_val = self.min_val, self.max_val - self.df.query('(a >= @min_val) & (a <= @max_val)') + self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') From f1aac43c45aac67007867f67043cf911d2690c40 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Nov 2017 07:01:53 -0800 Subject: [PATCH 36/98] CLN: ASV frame_ctor benchmark (#18499) --- asv_bench/benchmarks/frame_ctor.py | 70 +++++++++++---------------- asv_bench/benchmarks/frame_methods.py | 14 ++++++ 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 2ee5f5da7a84a..5fad7b682c2ed 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,32 +1,33 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: - from pandas.tseries.offsets import * + from pandas.tseries import offsets except: from pandas.core.datetools import * -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Creation from nested dict class FromDicts(object): + goal_time = 0.2 def setup(self): - (N, K) = (5000, 50) + np.random.seed(1234) + N, K = 5000, 50 self.index = tm.makeStringIndex(N) self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() + self.frame = DataFrame(np.random.randn(N, K), + index=self.index, + columns=self.columns) + self.data = self.frame.to_dict() self.some_dict = list(self.data.values())[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - + self.dict_list = self.frame.to_dict(orient='records') self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} - def time_frame_ctor_list_of_dict(self): DataFrame(self.dict_list) @@ -38,38 +39,21 @@ def time_series_ctor_from_dict(self): def time_frame_ctor_nested_dict_int64(self): # nested dict, integer indexes, regression described in #621 - DataFrame(self.data) + DataFrame(self.data2) # from a mi-series -class frame_from_series(object): +class FromSeries(object): goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) - self.s = Series(randn(10000), index=self.mi) + self.mi = MultiIndex.from_product([range(100), range(100)]) + self.s = Series(np.random.randn(10000), index=self.mi) def time_frame_from_mi_series(self): DataFrame(self.s) - -#---------------------------------------------------------------------- -# get_numeric_data - -class frame_get_numeric_data(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' - self.df = self.df.consolidate() - - def time_frame_get_numeric_data(self): - self.df._get_numeric_data() - - # ---------------------------------------------------------------------- # From dict with DatetimeIndex with all offsets @@ -84,13 +68,15 @@ def get_period_count(start_date, off): if (ten_offsets_in_days == 0): return 1000 else: - return min((9 * ((Timestamp.max - 
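
``get_period_count`` caps the number of periods so that repeatedly applying a large offset cannot run past ``Timestamp.max`` (nanosecond timestamps only cover roughly 1677 to 2262). The same bound computed by hand for one offset, as a sketch::

    import pandas as pd
    from pandas.tseries.offsets import YearEnd

    start = pd.Timestamp('1/1/1900')
    off = YearEnd()

    # estimate the cost of one offset step from ten applications
    ten_offsets_in_days = ((start + off * 10) - start).days
    span_days = (pd.Timestamp.max - start).days
    periods = min(9 * span_days // ten_offsets_in_days, 1000)

    # stays comfortably inside the representable range
    idx = pd.date_range(start, periods=periods, freq=off)
    assert idx[-1] < pd.Timestamp.max
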
start_date).days // ten_offsets_in_days)), 1000) + periods = 9 * (Timestamp.max - start_date).days // ten_offsets_in_days + return min(periods, 1000) def get_index_for_offset(off): start_date = Timestamp('1/1/1900') - return date_range(start_date, periods=min(1000, get_period_count( - start_date, off)), freq=off) + return date_range(start_date, + periods=get_period_count(start_date, off), + freq=off) all_offsets = offsets.__all__ @@ -100,7 +86,7 @@ def get_index_for_offset(off): all_offsets.extend([off + '_1', off + '_2']) -class FrameConstructorDTIndexFromOffsets(object): +class FromDictwithTimestampOffsets(object): params = [all_offsets, [1, 2]] param_names = ['offset', 'n_steps'] @@ -108,13 +94,15 @@ class FrameConstructorDTIndexFromOffsets(object): offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, 'FY5253': {'startingMonth': 1, 'weekday': 1}, - 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + 'FY5253Quarter': {'qtr_with_extra_week': 1, + 'startingMonth': 1, + 'weekday': 1}} offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, 'FY5253Quarter': {'variation': ['nearest', 'last']}} def setup(self, offset, n_steps): - + np.random.seed(1234) extra = False if offset.endswith("_", None, -1): extra = int(offset[-1]) @@ -127,12 +115,12 @@ def setup(self, offset, n_steps): if extra: extras = self.offset_extra_cases[offset] for extra_arg in extras: - kwargs[extra_arg] = extras[extra_arg][extra -1] + kwargs[extra_arg] = extras[extra_arg][extra - 1] offset = getattr(offsets, offset) self.idx = get_index_for_offset(offset(n_steps, **kwargs)) self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict(self.df.items()) + self.d = self.df.to_dict() def time_frame_ctor(self, offset, n_steps): DataFrame(self.d) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index af72ca1e9a6ab..53ee4d8019938 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,6 +1,20 @@ from .pandas_vb_common import * import string +#---------------------------------------------------------------------- +# get_numeric_data + +class frame_get_numeric_data(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(10000, 25)) + self.df['foo'] = 'bar' + self.df['bar'] = 'baz' + self.df = self.df.consolidate() + + def time_frame_get_numeric_data(self): + self.df._get_numeric_data() #---------------------------------------------------------------------- # lookup From f26bed69cc1dab2e2401cca5f92e016aa9837046 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 26 Nov 2017 15:04:03 +0000 Subject: [PATCH 37/98] DEPR: Deprecate NDFrame.as_matrix (#18458) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/generic.py | 17 ++-- pandas/core/internals.py | 27 +++++-- pandas/core/panel.py | 2 +- pandas/tests/frame/test_api.py | 33 ++++---- pandas/tests/frame/test_block_internals.py | 32 ++++---- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/internals/test_internals.py | 82 ++++++++++---------- pandas/tests/sparse/test_frame.py | 8 +- 9 files changed, 117 insertions(+), 88 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4ae3d9be04aa7..debaa638c42d0 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -87,7 +87,7 @@ Deprecations ~~~~~~~~~~~~ - ``Series.from_array`` and ``SparseSeries.from_array`` are 
deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). -- +- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). - .. _whatsnew_0220.prior_deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 548f228cdd96b..54b0089335b19 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3735,6 +3735,9 @@ def _get_bool_data(self): def as_matrix(self, columns=None): """ + DEPRECATED: as_matrix will be removed in a future version. + Use :meth:`DataFrame.values` instead. + Convert the frame to its Numpy-array representation. Parameters @@ -3770,10 +3773,11 @@ def as_matrix(self, columns=None): -------- pandas.DataFrame.values """ + warnings.warn("Method .as_matrix will be removed in a future version. " + "Use .values instead.", FutureWarning, stacklevel=2) self._consolidate_inplace() - if self._AXIS_REVERSED: - return self._data.as_matrix(columns).T - return self._data.as_matrix(columns) + return self._data.as_array(transpose=self._AXIS_REVERSED, + items=columns) @property def values(self): @@ -3791,7 +3795,8 @@ def values(self): int32. By numpy.find_common_type convention, mixing int64 and uint64 will result in a flot64 dtype. """ - return self.as_matrix() + self._consolidate_inplace() + return self._data.as_array(transpose=self._AXIS_REVERSED) @property def _values(self): @@ -3801,11 +3806,11 @@ def _values(self): @property def _get_values(self): # compat - return self.as_matrix() + return self.values def get_values(self): """same as values (but handles sparseness conversions)""" - return self.as_matrix() + return self.values def get_dtype_counts(self): """Return the counts of dtypes in this object.""" diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e537cb2edc1c4..4f25a19d437ca 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3484,7 +3484,7 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False, mgr = self # figure out our mask a-priori to avoid repeated replacements - values = self.as_matrix() + values = self.as_array() def comp(s): if isna(s): @@ -3670,9 +3670,24 @@ def copy(self, deep=True, mgr=None): return self.apply('copy', axes=new_axes, deep=deep, do_integrity_check=False) - def as_matrix(self, items=None): + def as_array(self, transpose=False, items=None): + """Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : boolean, default False + If True, transpose the return array + items : list of strings or None + Names of block items that will be included in the returned + array. 
``None`` means that all block items will be used + + Returns + ------- + arr : ndarray + """ if len(self.blocks) == 0: - return np.empty(self.shape, dtype=float) + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr if items is not None: mgr = self.reindex_axis(items, axis=0) @@ -3680,9 +3695,11 @@ def as_matrix(self, items=None): mgr = self if self._is_single_block or not self.is_mixed_type: - return mgr.blocks[0].get_values() + arr = mgr.blocks[0].get_values() else: - return mgr._interleave() + arr = mgr._interleave() + + return arr.transpose() if transpose else arr def _interleave(self): """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 0a5e705071b5e..0f3c5cb85249a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -464,7 +464,7 @@ def to_excel(self, path, na_rep='', engine=None, **kwargs): def as_matrix(self): self._consolidate_inplace() - return self._data.as_matrix() + return self._data.as_array() # ---------------------------------------------------------------------- # Getting and setting elements diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index e81e31b718498..0b562269ea29d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -243,31 +243,31 @@ def test_itertuples(self): def test_len(self): assert len(self.frame) == len(self.frame.index) - def test_as_matrix(self): + def test_values(self): frame = self.frame - mat = frame.as_matrix() + arr = frame.values - frameCols = frame.columns - for i, row in enumerate(mat): + frame_cols = frame.columns + for i, row in enumerate(arr): for j, value in enumerate(row): - col = frameCols[j] + col = frame_cols[j] if np.isnan(value): assert np.isnan(frame[col][i]) else: assert value == frame[col][i] # mixed type - mat = self.mixed_frame.as_matrix(['foo', 'A']) - assert mat[0, 0] == 'bar' + arr = self.mixed_frame[['foo', 'A']].values + assert arr[0, 0] == 'bar' df = self.klass({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) - mat = df.as_matrix() - assert mat[0, 0] == 1j + arr = df.values + assert arr[0, 0] == 1j # single block corner case - mat = self.frame.as_matrix(['A', 'B']) + arr = self.frame[['A', 'B']].values expected = self.frame.reindex(columns=['A', 'B']).values - assert_almost_equal(mat, expected) + assert_almost_equal(arr, expected) def test_transpose(self): frame = self.frame @@ -311,8 +311,8 @@ def test_class_axis(self): DataFrame.index # no exception! DataFrame.columns # no exception! - def test_more_asMatrix(self): - values = self.mixed_frame.as_matrix() + def test_more_values(self): + values = self.mixed_frame.values assert values.shape[1] == len(self.mixed_frame.columns) def test_repr_with_mi_nat(self): @@ -369,6 +369,13 @@ def test_values(self): self.frame.values[:, 0] = 5. 
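
For downstream code, the deprecation is a mechanical rewrite: ``df.as_matrix()`` becomes ``df.values`` (``as_matrix`` itself now emits a ``FutureWarning``), and the column-subset form becomes an explicit indexing step, with the usual dtype consolidation rules. A sketch::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [1, 2], 'B': [3.0, 4.0]})

    arr = df.values                 # was: df.as_matrix()
    assert arr.dtype == np.float64  # int64 + float64 consolidate to float64

    arr_a = df[['A']].values        # was: df.as_matrix(columns=['A'])
    assert arr_a.dtype == np.int64
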
assert (self.frame.values[:, 0] == 5).all() + def test_as_matrix_deprecated(self): + # GH18458 + with tm.assert_produces_warning(FutureWarning): + result = self.frame.as_matrix(columns=self.frame.columns.tolist()) + expected = self.frame.values + tm.assert_numpy_array_equal(result, expected) + def test_deepcopy(self): cp = deepcopy(self.frame) series = cp['A'] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c29821ba51284..8b1fd7d50cb4d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -67,10 +67,10 @@ def test_consolidate_inplace(self): for letter in range(ord('A'), ord('Z')): self.frame[chr(letter)] = chr(letter) - def test_as_matrix_consolidate(self): + def test_values_consolidate(self): self.frame['E'] = 7. assert not self.frame._data.is_consolidated() - _ = self.frame.as_matrix() # noqa + _ = self.frame.values # noqa assert self.frame._data.is_consolidated() def test_modify_values(self): @@ -91,50 +91,50 @@ def test_boolean_set_uncons(self): self.frame[self.frame > 1] = 2 assert_almost_equal(expected, self.frame.values) - def test_as_matrix_numeric_cols(self): + def test_values_numeric_cols(self): self.frame['foo'] = 'bar' - values = self.frame.as_matrix(['A', 'B', 'C', 'D']) + values = self.frame[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - def test_as_matrix_lcd(self): + def test_values_lcd(self): # mixed lcd - values = self.mixed_float.as_matrix(['A', 'B', 'C', 'D']) + values = self.mixed_float[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - values = self.mixed_float.as_matrix(['A', 'B', 'C']) + values = self.mixed_float[['A', 'B', 'C']].values assert values.dtype == np.float32 - values = self.mixed_float.as_matrix(['C']) + values = self.mixed_float[['C']].values assert values.dtype == np.float16 # GH 10364 # B uint64 forces float because there are other signed int types - values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) + values = self.mixed_int[['A', 'B', 'C', 'D']].values assert values.dtype == np.float64 - values = self.mixed_int.as_matrix(['A', 'D']) + values = self.mixed_int[['A', 'D']].values assert values.dtype == np.int64 # B uint64 forces float because there are other signed int types - values = self.mixed_int.as_matrix(['A', 'B', 'C']) + values = self.mixed_int[['A', 'B', 'C']].values assert values.dtype == np.float64 # as B and C are both unsigned, no forcing to float is needed - values = self.mixed_int.as_matrix(['B', 'C']) + values = self.mixed_int[['B', 'C']].values assert values.dtype == np.uint64 - values = self.mixed_int.as_matrix(['A', 'C']) + values = self.mixed_int[['A', 'C']].values assert values.dtype == np.int32 - values = self.mixed_int.as_matrix(['C', 'D']) + values = self.mixed_int[['C', 'D']].values assert values.dtype == np.int64 - values = self.mixed_int.as_matrix(['A']) + values = self.mixed_int[['A']].values assert values.dtype == np.int32 - values = self.mixed_int.as_matrix(['C']) + values = self.mixed_int[['C']].values assert values.dtype == np.uint8 def test_constructor_with_convert(self): diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 5b903c5a1eaf6..f0a21cde4fbd9 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -439,7 +439,7 @@ def test_columns_with_dups(self): xp.columns = ['A', 'A', 'B'] assert_frame_equal(rs, xp) - def test_as_matrix_duplicates(self): + def 
test_values_duplicates(self): df = DataFrame([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], columns=['one', 'one', 'two', 'two']) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 4c0c7d8598a8e..a22d0174947e1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -476,7 +476,7 @@ def test_copy(self, mgr): def test_sparse(self): mgr = create_mgr('a: sparse-1; b: sparse-2') # what to test here? - assert mgr.as_matrix().dtype == np.float64 + assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') @@ -485,32 +485,32 @@ def test_sparse_mixed(self): # what to test here? - def test_as_matrix_float(self): + def test_as_array_float(self): mgr = create_mgr('c: f4; d: f2; e: f8') - assert mgr.as_matrix().dtype == np.float64 + assert mgr.as_array().dtype == np.float64 mgr = create_mgr('c: f4; d: f2') - assert mgr.as_matrix().dtype == np.float32 + assert mgr.as_array().dtype == np.float32 - def test_as_matrix_int_bool(self): + def test_as_array_int_bool(self): mgr = create_mgr('a: bool-1; b: bool-2') - assert mgr.as_matrix().dtype == np.bool_ + assert mgr.as_array().dtype == np.bool_ mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') - assert mgr.as_matrix().dtype == np.int64 + assert mgr.as_array().dtype == np.int64 mgr = create_mgr('c: i4; d: i2; e: u1') - assert mgr.as_matrix().dtype == np.int32 + assert mgr.as_array().dtype == np.int32 - def test_as_matrix_datetime(self): + def test_as_array_datetime(self): mgr = create_mgr('h: datetime-1; g: datetime-2') - assert mgr.as_matrix().dtype == 'M8[ns]' + assert mgr.as_array().dtype == 'M8[ns]' - def test_as_matrix_datetime_tz(self): + def test_as_array_datetime_tz(self): mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]' assert mgr.get('g').dtype == 'datetime64[ns, CET]' - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' def test_astype(self): # coerce all @@ -607,49 +607,49 @@ def test_interleave(self): for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]', 'm8[ns]']: mgr = create_mgr('a: {0}'.format(dtype)) - assert mgr.as_matrix().dtype == dtype + assert mgr.as_array().dtype == dtype mgr = create_mgr('a: {0}; b: {0}'.format(dtype)) - assert mgr.as_matrix().dtype == dtype + assert mgr.as_array().dtype == dtype # will be converted according the actual dtype of the underlying mgr = create_mgr('a: category') - assert mgr.as_matrix().dtype == 'i8' + assert mgr.as_array().dtype == 'i8' mgr = create_mgr('a: category; b: category') - assert mgr.as_matrix().dtype == 'i8' + assert mgr.as_array().dtype == 'i8' mgr = create_mgr('a: category; b: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: category2; b: category2') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' # combinations mgr = create_mgr('a: f8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f8; b: i8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f4; b: i8') - assert mgr.as_matrix().dtype == 'f8' + assert mgr.as_array().dtype == 'f8' mgr = create_mgr('a: f4; b: i8; d: object') - assert mgr.as_matrix().dtype 
== 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: bool; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: complex') - assert mgr.as_matrix().dtype == 'complex' + assert mgr.as_array().dtype == 'complex' mgr = create_mgr('a: f8; b: category') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: category') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: bool') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: bool') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: m8[ns]; b: i8') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' mgr = create_mgr('a: M8[ns]; b: m8[ns]') - assert mgr.as_matrix().dtype == 'object' + assert mgr.as_array().dtype == 'object' def test_interleave_non_unique_cols(self): df = DataFrame([ @@ -831,7 +831,7 @@ def test_equals_block_order_different_dtypes(self): def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) - assert mgr.as_matrix().tolist() == [0., 1., 2., 3., 4.] + assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.] def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] @@ -878,7 +878,7 @@ class TestIndexing(object): def test_get_slice(self): def assert_slice_ok(mgr, axis, slobj): # import pudb; pudb.set_trace() - mat = mgr.as_matrix() + mat = mgr.as_array() # we maybe using an ndarray to test slicing and # might not be the full length of the axis @@ -889,7 +889,7 @@ def assert_slice_ok(mgr, axis, slobj): len(ax) - len(slobj), dtype=bool)]) sliced = mgr.get_slice(slobj, axis=axis) mat_slobj = (slice(None), ) * axis + (slobj, ) - tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_matrix(), + tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(), check_dtype=False) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) @@ -930,10 +930,10 @@ def assert_slice_ok(mgr, axis, slobj): def test_take(self): def assert_take_ok(mgr, axis, indexer): - mat = mgr.as_matrix() + mat = mgr.as_array() taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal(np.take(mat, indexer, axis), - taken.as_matrix(), check_dtype=False) + taken.as_array(), check_dtype=False) tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) @@ -950,14 +950,14 @@ def assert_take_ok(mgr, axis, indexer): def test_reindex_axis(self): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): - mat = mgr.as_matrix() + mat = mgr.as_array() indexer = mgr.axes[axis].get_indexer_for(new_labels) reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis, fill_value=fill_value), - reindexed.as_matrix(), + reindexed.as_array(), check_dtype=False) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -996,13 +996,13 @@ def test_reindex_indexer(self): def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): - mat = mgr.as_matrix() + mat = mgr.as_array() reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) reindexed = mgr.reindex_indexer(new_labels, indexer, axis, fill_value=fill_value) 
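
The ``test_interleave`` expectations above encode the promotion rules ``as_array`` applies when blocks of different dtypes are interleaved: numeric dtypes widen to a common type, while bool, datetime or categorical mixed with anything else fall back to ``object``. The same rules seen through ``DataFrame.values``, as a sketch::

    import numpy as np
    import pandas as pd

    assert pd.DataFrame({'a': [1.0], 'b': [1]}).values.dtype == np.float64
    assert pd.DataFrame({'a': [True], 'b': [1]}).values.dtype == np.object_
    assert pd.DataFrame({'a': pd.to_datetime(['2017-01-01']),
                         'b': [1]}).values.dtype == np.object_
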
tm.assert_numpy_array_equal(reindexed_mat, - reindexed.as_matrix(), + reindexed.as_array(), check_dtype=False) tm.assert_index_equal(reindexed.axes[axis], new_labels) diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index e65059156c5b9..bb5dbdcaaa7c4 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -78,16 +78,16 @@ def test_fill_value_when_combine_const(self): res = df.add(2, fill_value=0) tm.assert_sp_frame_equal(res, exp) - def test_as_matrix(self): - empty = self.empty.as_matrix() + def test_values(self): + empty = self.empty.values assert empty.shape == (0, 0) no_cols = SparseDataFrame(index=np.arange(10)) - mat = no_cols.as_matrix() + mat = no_cols.values assert mat.shape == (10, 0) no_index = SparseDataFrame(columns=np.arange(10)) - mat = no_index.as_matrix() + mat = no_index.values assert mat.shape == (0, 10) def test_copy(self): From b08c22ba8d2800335bd743fb90be7658a0a5a688 Mon Sep 17 00:00:00 2001 From: jschendel Date: Sun, 26 Nov 2017 08:05:19 -0700 Subject: [PATCH 38/98] TYPO: IntervalIndex.symmetric_differnce -> IntervalIndex.symmetric_difference (#18476) --- doc/source/whatsnew/v0.22.0.txt | 4 +- pandas/core/indexes/interval.py | 2 +- pandas/tests/indexes/test_interval.py | 288 +++++++++++++++----------- 3 files changed, 174 insertions(+), 120 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index debaa638c42d0..347f6047e0b48 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -114,7 +114,7 @@ Performance Improvements - The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) -- +- Improved performance of ``IntervalIndex.symmetric_difference()`` (:issue:`18475`) .. 
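
With the attribute spelled correctly, the set operation is reachable under its documented name. A sketch assuming the fix (result order aside, the elements are the symmetric difference of the two sets of intervals)::

    import pandas as pd

    ii1 = pd.interval_range(0, 3)   # (0, 1], (1, 2], (2, 3]
    ii2 = pd.interval_range(1, 4)   # (1, 2], (2, 3], (3, 4]

    result = ii1.symmetric_difference(ii2)
    assert set(result) == {pd.Interval(0, 1), pd.Interval(3, 4)}
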
From b08c22ba8d2800335bd743fb90be7658a0a5a688 Mon Sep 17 00:00:00 2001
From: jschendel
Date: Sun, 26 Nov 2017 08:05:19 -0700
Subject: [PATCH 38/98] TYPO: IntervalIndex.symmetric_differnce ->
 IntervalIndex.symmetric_difference (#18476)

---
 doc/source/whatsnew/v0.22.0.txt       |   4 +-
 pandas/core/indexes/interval.py       |   2 +-
 pandas/tests/indexes/test_interval.py | 288 +++++++++++++++-----------
 3 files changed, 174 insertions(+), 120 deletions(-)

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index debaa638c42d0..347f6047e0b48 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -114,7 +114,7 @@ Performance Improvements
 - The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`)
 - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`)
 - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`)
--
+- Improved performance of ``IntervalIndex.symmetric_difference()`` (:issue:`18475`)

 .. _whatsnew_0220.docs:
@@ -146,7 +146,7 @@ Indexing
 - Bug in :func:`MultiIndex.remove_unused_levels`` which would fill nan values (:issue:`18417`)
 - Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`)
 - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
--
+- Bug in ``IntervalIndex.symmetric_difference()`` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`)

 I/O
 ^^^
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 06843150bf46a..3f74694880533 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1171,7 +1171,7 @@ def func(self, other):
     union = _setop('union')
     intersection = _setop('intersection')
     difference = _setop('difference')
-    symmetric_differnce = _setop('symmetric_difference')
+    symmetric_difference = _setop('symmetric_difference')

     # TODO: arithmetic operations

diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py
index 33ba0189d747a..815d5fcde1400 100644
--- a/pandas/tests/indexes/test_interval.py
+++ b/pandas/tests/indexes/test_interval.py
@@ -3,9 +3,9 @@
 import pytest
 import numpy as np
 from datetime import timedelta
-from pandas import (Interval, IntervalIndex, Index, isna,
-                    interval_range, Timestamp, Timedelta,
-                    compat, date_range, timedelta_range, DateOffset)
+from pandas import (
+    Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp,
+    Timedelta, compat, date_range, timedelta_range, DateOffset)
 from pandas.compat import lzip
 from pandas.tseries.offsets import Day
 from pandas._libs.interval import IntervalTree
@@ -19,6 +19,11 @@ def closed(request):
     return request.param


+@pytest.fixture(scope='class', params=[None, 'foo'])
+def name(request):
+    return request.param
+
+
 class TestIntervalIndex(Base):
     _holder = IntervalIndex

@@ -29,13 +34,14 @@ def setup_method(self, method):
         self.indices = dict(intervalIndex=tm.makeIntervalIndex(10))

     def create_index(self, closed='right'):
-        return IntervalIndex.from_breaks(np.arange(3), closed=closed)
+        return IntervalIndex.from_breaks(range(11), closed=closed)

     def create_index_with_nan(self, closed='right'):
-        return IntervalIndex.from_tuples(
-            [(0, 1), np.nan, (1, 2)], closed=closed)
+        mask = [True, False] + [True] * 8
+        return IntervalIndex.from_arrays(
+            np.where(mask, np.arange(10), np.nan),
+            np.where(mask, np.arange(1, 11), np.nan), closed=closed)

-    @pytest.mark.parametrize('name', [None, 'foo'])
     def test_constructors(self, closed, name):
         left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
         ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)]
@@ -226,55 +232,67 @@ def f():

     def test_properties(self, closed):
         index = self.create_index(closed=closed)
-        assert len(index) == 2
-        assert index.size == 2
-        assert index.shape == (2, )
+        assert len(index) == 10
+        assert index.size == 10
+        assert index.shape == (10, )

-        tm.assert_index_equal(index.left, Index([0, 1]))
-        tm.assert_index_equal(index.right, Index([1, 2]))
-        tm.assert_index_equal(index.mid, Index([0.5, 1.5]))
+        tm.assert_index_equal(index.left, Index(np.arange(10)))
+        tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
+        tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

         assert index.closed == closed

-        expected = np.array([Interval(0, 1, closed=closed),
-                             Interval(1, 2, closed=closed)], dtype=object)
+        ivs = [Interval(l, r, closed) for l, r
+               in zip(range(10), range(1, 11))]
+        expected = np.array(ivs, dtype=object)
         tm.assert_numpy_array_equal(np.asarray(index), expected)
         tm.assert_numpy_array_equal(index.values, expected)

         # with nans
         index = self.create_index_with_nan(closed=closed)
-        assert len(index) == 3
-        assert index.size == 3
-        assert index.shape == (3, )
+        assert len(index) == 10
+        assert index.size == 10
+        assert index.shape == (10, )

-        tm.assert_index_equal(index.left, Index([0, np.nan, 1]))
-        tm.assert_index_equal(index.right, Index([1, np.nan, 2]))
-        tm.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5]))
+        expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
+        expected_right = expected_left + 1
+        expected_mid = expected_left + 0.5
+        tm.assert_index_equal(index.left, expected_left)
+        tm.assert_index_equal(index.right, expected_right)
+        tm.assert_index_equal(index.mid, expected_mid)

         assert index.closed == closed

-        expected = np.array([Interval(0, 1, closed=closed), np.nan,
-                             Interval(1, 2, closed=closed)], dtype=object)
+        ivs = [Interval(l, r, closed) if notna(l) else np.nan
+               for l, r in zip(expected_left, expected_right)]
+        expected = np.array(ivs, dtype=object)
         tm.assert_numpy_array_equal(np.asarray(index), expected)
         tm.assert_numpy_array_equal(index.values, expected)

     def test_with_nans(self, closed):
         index = self.create_index(closed=closed)
         assert not index.hasnans
-        tm.assert_numpy_array_equal(index.isna(),
-                                    np.array([False, False]))
-        tm.assert_numpy_array_equal(index.notna(),
-                                    np.array([True, True]))
+
+        result = index.isna()
+        expected = np.repeat(False, len(index))
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = index.notna()
+        expected = np.repeat(True, len(index))
+        tm.assert_numpy_array_equal(result, expected)

         index = self.create_index_with_nan(closed=closed)
         assert index.hasnans
-        tm.assert_numpy_array_equal(index.notna(),
-                                    np.array([True, False, True]))
-        tm.assert_numpy_array_equal(index.isna(),
-                                    np.array([False, True, False]))
+
+        result = index.isna()
+        expected = np.array([False, True] + [False] * (len(index) - 2))
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = index.notna()
+        expected = np.array([True, False] + [True] * (len(index) - 2))
+        tm.assert_numpy_array_equal(result, expected)

     def test_copy(self, closed):
-        expected = IntervalIndex.from_breaks(np.arange(5), closed=closed)
+        expected = self.create_index(closed=closed)

         result = expected.copy()
         assert result.equals(expected)
@@ -362,7 +380,7 @@ def test_where(self, closed, klass):
         tm.assert_index_equal(result, expected)

     def test_delete(self, closed):
-        expected = IntervalIndex.from_breaks([1, 2], closed=closed)
+        expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed)
         result = self.create_index(closed=closed).delete(0)
         tm.assert_index_equal(result, expected)

@@ -414,13 +432,13 @@ def test_insert(self, data):
     def test_take(self, closed):
         index = self.create_index(closed=closed)

-        actual = index.take([0, 1])
-        tm.assert_index_equal(actual, index)
+        result = index.take(range(10))
+        tm.assert_index_equal(result, index)

+        result = index.take([0, 0, 1])
         expected = IntervalIndex.from_arrays(
             [0, 0, 1], [1, 1, 2], closed=closed)
-        actual = index.take([0, 0, 1])
-        tm.assert_index_equal(actual, expected)
+        tm.assert_index_equal(result, expected)

     def test_unique(self, closed):
         # unique non-overlapping
@@ -780,50 +798,85 @@ def test_non_contiguous(self, closed):
         assert 1.5 not in index

     def test_union(self, closed):
-        idx = self.create_index(closed=closed)
-        other = IntervalIndex.from_arrays([2], [3], closed=closed)
-        expected = IntervalIndex.from_arrays(
-            range(3), range(1, 4), closed=closed)
-        actual = idx.union(other)
-        assert expected.equals(actual)
+        index = self.create_index(closed=closed)
+        other = IntervalIndex.from_breaks(range(5, 13), closed=closed)

-        actual = other.union(idx)
-        assert expected.equals(actual)
+        expected = IntervalIndex.from_breaks(range(13), closed=closed)
+        result = index.union(other)
+        tm.assert_index_equal(result, expected)
+
+        result = other.union(index)
+        tm.assert_index_equal(result, expected)

-        tm.assert_index_equal(idx.union(idx), idx)
-        tm.assert_index_equal(idx.union(idx[:1]), idx)
+        tm.assert_index_equal(index.union(index), index)
+        tm.assert_index_equal(index.union(index[:1]), index)

     def test_intersection(self, closed):
-        idx = self.create_index(closed=closed)
-        other = IntervalIndex.from_breaks([1, 2, 3], closed=closed)
-        expected = IntervalIndex.from_breaks([1, 2], closed=closed)
-        actual = idx.intersection(other)
-        assert expected.equals(actual)
+        index = self.create_index(closed=closed)
+        other = IntervalIndex.from_breaks(range(5, 13), closed=closed)
+
+        expected = IntervalIndex.from_breaks(range(5, 11), closed=closed)
+        result = index.intersection(other)
+        tm.assert_index_equal(result, expected)
+
+        result = other.intersection(index)
+        tm.assert_index_equal(result, expected)

-        tm.assert_index_equal(idx.intersection(idx), idx)
+        tm.assert_index_equal(index.intersection(index), index)

     def test_difference(self, closed):
-        idx = self.create_index(closed=closed)
-        tm.assert_index_equal(idx.difference(idx[:1]), idx[1:])
+        index = self.create_index(closed=closed)
+        tm.assert_index_equal(index.difference(index[:1]), index[1:])

-    def test_symmetric_difference(self):
-        result = self.index[:1].symmetric_difference(self.index[1:])
-        expected = self.index
+    def test_symmetric_difference(self, closed):
+        idx = self.create_index(closed=closed)
+        result = idx[1:].symmetric_difference(idx[:-1])
+        expected = IntervalIndex([idx[0], idx[-1]])
         tm.assert_index_equal(result, expected)

-    def test_set_operation_errors(self):
-        pytest.raises(ValueError, self.index.union, self.index.left)
+    @pytest.mark.parametrize('op_name', [
+        'union', 'intersection', 'difference', 'symmetric_difference'])
+    def test_set_operation_errors(self, closed, op_name):
+        index = self.create_index(closed=closed)
+        set_op = getattr(index, op_name)
+
+        # test errors
+        msg = ('can only do set operations between two IntervalIndex objects '
+               'that are closed on the same side')
+        with tm.assert_raises_regex(ValueError, msg):
+            set_op(Index([1, 2, 3]))

-        other = IntervalIndex.from_breaks([0, 1, 2], closed='neither')
-        pytest.raises(ValueError, self.index.union, other)
+        for other_closed in {'right', 'left', 'both', 'neither'} - {closed}:
+            other = self.create_index(closed=other_closed)
+            with tm.assert_raises_regex(ValueError, msg):
+                set_op(other)

     def test_isin(self, closed):
-        idx = self.create_index(closed=closed)
-        actual = idx.isin(idx)
-        tm.assert_numpy_array_equal(np.array([True, True]), actual)
+        index = self.create_index(closed=closed)

-        actual = idx.isin(idx[:1])
-        tm.assert_numpy_array_equal(np.array([True, False]), actual)
+        expected = np.array([True] + [False] * (len(index) - 1))
+        result = index.isin(index[:1])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = index.isin([index[0]])
+        tm.assert_numpy_array_equal(result, expected)
+
+        other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed)
+        expected = np.array([True] * (len(index) - 1) + [False])
+        result = index.isin(other)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = index.isin(other.tolist())
+        tm.assert_numpy_array_equal(result, expected)
+
+        for other_closed in {'right', 'left', 'both', 'neither'}:
+            other = self.create_index(closed=other_closed)
+            expected = np.repeat(closed == other_closed, len(index))
+            result = index.isin(other)
+            tm.assert_numpy_array_equal(result, expected)
+
+            result = index.isin(other.tolist())
+            tm.assert_numpy_array_equal(result, expected)

     def test_comparison(self):
         actual = Interval(0, 1) < self.index
@@ -896,23 +949,24 @@ def test_missing_values(self, closed):
                                     np.array([True, False, False]))

     def test_sort_values(self, closed):
-        expected = IntervalIndex.from_breaks([1, 2, 3, 4], closed=closed)
-        actual = IntervalIndex.from_tuples(
-            [(3, 4), (1, 2), (2, 3)], closed=closed).sort_values()
-        tm.assert_index_equal(expected, actual)
+        index = self.create_index(closed=closed)
+
+        result = index.sort_values()
+        tm.assert_index_equal(result, index)

-        # nan
-        idx = self.create_index_with_nan(closed=closed)
-        mask = idx.isna()
-        tm.assert_numpy_array_equal(mask, np.array([False, True, False]))
+        result = index.sort_values(ascending=False)
+        tm.assert_index_equal(result, index[::-1])

-        result = idx.sort_values()
-        mask = result.isna()
-        tm.assert_numpy_array_equal(mask, np.array([False, False, True]))
+        # with nan
+        index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)])

-        result = idx.sort_values(ascending=False)
-        mask = result.isna()
-        tm.assert_numpy_array_equal(mask, np.array([True, False, False]))
+        result = index.sort_values()
+        expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
+        tm.assert_index_equal(result, expected)
+
+        result = index.sort_values(ascending=False)
+        expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
+        tm.assert_index_equal(result, expected)

     def test_datetime(self):
         dates = date_range('2000', periods=3)
@@ -992,58 +1046,58 @@ def test_is_non_overlapping_monotonic(self, closed):

 class TestIntervalRange(object):

-    def test_construction_from_numeric(self, closed):
+    def test_construction_from_numeric(self, closed, name):
         # combinations of start/end/periods without freq
         expected = IntervalIndex.from_breaks(
-            np.arange(0, 6), name='foo', closed=closed)
+            np.arange(0, 6), name=name, closed=closed)

-        result = interval_range(start=0, end=5, name='foo', closed=closed)
+        result = interval_range(start=0, end=5, name=name, closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=0, periods=5, name='foo', closed=closed)
+        result = interval_range(start=0, periods=5, name=name, closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=5, periods=5, name='foo', closed=closed)
+        result = interval_range(end=5, periods=5, name=name, closed=closed)
         tm.assert_index_equal(result, expected)

         # combinations of start/end/periods with freq
         expected = IntervalIndex.from_tuples([(0, 2), (2, 4), (4, 6)],
-                                             name='foo', closed=closed)
+                                             name=name, closed=closed)

-        result = interval_range(start=0, end=6, freq=2, name='foo',
+        result = interval_range(start=0, end=6, freq=2, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=0, periods=3, freq=2, name='foo',
+        result = interval_range(start=0, periods=3, freq=2, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=6, periods=3, freq=2, name='foo',
+        result = interval_range(end=6, periods=3, freq=2, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

         # output truncates early if freq causes end to be skipped.
         expected = IntervalIndex.from_tuples([(0.0, 1.5), (1.5, 3.0)],
-                                             name='foo', closed=closed)
-        result = interval_range(start=0, end=4, freq=1.5, name='foo',
+                                             name=name, closed=closed)
+        result = interval_range(start=0, end=4, freq=1.5, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-    def test_construction_from_timestamp(self, closed):
+    def test_construction_from_timestamp(self, closed, name):
         # combinations of start/end/periods without freq
         start, end = Timestamp('2017-01-01'), Timestamp('2017-01-06')
         breaks = date_range(start=start, end=end)
-        expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed)
+        expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)

-        result = interval_range(start=start, end=end, name='foo',
+        result = interval_range(start=start, end=end, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=start, periods=5, name='foo',
+        result = interval_range(start=start, periods=5, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=end, periods=5, name='foo',
+        result = interval_range(end=end, periods=5, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

@@ -1051,23 +1105,23 @@ def test_construction_from_timestamp(self, closed):
         freq = '2D'
         start, end = Timestamp('2017-01-01'), Timestamp('2017-01-07')
         breaks = date_range(start=start, end=end, freq=freq)
-        expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed)
+        expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)

-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=start, periods=3, freq=freq, name='foo',
+        result = interval_range(start=start, periods=3, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=end, periods=3, freq=freq, name='foo',
+        result = interval_range(end=end, periods=3, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

         # output truncates early if freq causes end to be skipped.
         end = Timestamp('2017-01-08')
-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

@@ -1075,41 +1129,41 @@ def test_construction_from_timestamp(self, closed):
         freq = 'M'
         start, end = Timestamp('2017-01-01'), Timestamp('2017-12-31')
         breaks = date_range(start=start, end=end, freq=freq)
-        expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed)
+        expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)

-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=start, periods=11, freq=freq, name='foo',
+        result = interval_range(start=start, periods=11, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=end, periods=11, freq=freq, name='foo',
+        result = interval_range(end=end, periods=11, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

         # output truncates early if freq causes end to be skipped.
         end = Timestamp('2018-01-15')
-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-    def test_construction_from_timedelta(self, closed):
+    def test_construction_from_timedelta(self, closed, name):
         # combinations of start/end/periods without freq
         start, end = Timedelta('1 day'), Timedelta('6 days')
         breaks = timedelta_range(start=start, end=end)
-        expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed)
+        expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)

-        result = interval_range(start=start, end=end, name='foo',
+        result = interval_range(start=start, end=end, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=start, periods=5, name='foo',
+        result = interval_range(start=start, periods=5, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=end, periods=5, name='foo',
+        result = interval_range(end=end, periods=5, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

@@ -1117,23 +1171,23 @@ def test_construction_from_timedelta(self, closed):
         freq = '2D'
         start, end = Timedelta('1 day'), Timedelta('7 days')
         breaks = timedelta_range(start=start, end=end, freq=freq)
-        expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed)
+        expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed)

-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(start=start, periods=3, freq=freq, name='foo',
+        result = interval_range(start=start, periods=3, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

-        result = interval_range(end=end, periods=3, freq=freq, name='foo',
+        result = interval_range(end=end, periods=3, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

         # output truncates early if freq causes end to be skipped.
         end = Timedelta('7 days 1 hour')
-        result = interval_range(start=start, end=end, freq=freq, name='foo',
+        result = interval_range(start=start, end=end, freq=freq, name=name,
                                 closed=closed)
         tm.assert_index_equal(result, expected)

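Until this patch the method was only defined under the misspelled name, so the correctly spelled ``IntervalIndex.symmetric_difference`` was missing entirely. A quick sketch of what the corrected tests assert, mirroring the cases above:

    import pandas as pd

    idx = pd.IntervalIndex.from_breaks(range(11))
    # intervals present in exactly one operand: the first and the last
    result = idx[1:].symmetric_difference(idx[:-1])
    expected = pd.IntervalIndex([idx[0], idx[-1]])

    # set operations require another IntervalIndex closed on the same
    # side; anything else now raises ValueError
    try:
        idx.symmetric_difference(pd.Index([1, 2, 3]))
    except ValueError:
        pass
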
From 78b24b288a65ac125f4681211b37a3fe677997aa Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 26 Nov 2017 07:06:08 -0800
Subject: [PATCH 39/98] parametrize offsets tests (#18494)

---
 asv_bench/benchmarks/offset.py                |  239 ++
 asv_bench/benchmarks/timeseries.py            |  213 +-
 pandas/tests/tseries/offsets/test_fiscal.py   |  606 +++
 pandas/tests/tseries/offsets/test_offsets.py  | 3294 +++++------------
 .../tests/tseries/offsets/test_yqm_offsets.py |  994 +++++
 pandas/tseries/offsets.py                     |  305 +-
 6 files changed, 2905 insertions(+), 2746 deletions(-)
 create mode 100644 asv_bench/benchmarks/offset.py
 create mode 100644 pandas/tests/tseries/offsets/test_fiscal.py
 create mode 100644 pandas/tests/tseries/offsets/test_yqm_offsets.py

diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py
new file mode 100644
index 0000000000000..ea826e8270ace
--- /dev/null
+++ b/asv_bench/benchmarks/offset.py
@@ -0,0 +1,239 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+import numpy as np
+
+import pandas as pd
+from pandas import date_range
+
+try:
+    import pandas.tseries.holiday
+except ImportError:
+    pass
+
+hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+
+
+class ApplyIndex(object):
+    goal_time = 0.2
+
+    params = [pd.offsets.YearEnd(), pd.offsets.YearBegin(),
+              pd.offsets.BYearEnd(), pd.offsets.BYearBegin(),
+              pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(),
+              pd.offsets.BQuarterEnd(), pd.offsets.BQuarterBegin(),
+              pd.offsets.MonthEnd(), pd.offsets.MonthBegin(),
+              pd.offsets.BMonthEnd(), pd.offsets.BMonthBegin()]
+
+    def setup(self, param):
+        self.offset = param
+
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        self.ser = pd.Series(self.rng)
+
+    def time_apply_index(self, param):
+        self.rng + self.offset
+
+    def time_apply_series(self, param):
+        self.ser + self.offset
+
+
+class DatetimeIndexArithmetic(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        self.day_offset = pd.offsets.Day()
+        self.relativedelta_offset = pd.offsets.DateOffset(months=2, days=2)
+        self.busday_offset = pd.offsets.BusinessDay()
+
+    def time_add_offset_delta(self):
+        self.rng + self.day_offset
+
+    def time_add_offset_fast(self):
+        self.rng + self.relativedelta_offset
+
+    def time_add_offset_slow(self):
+        self.rng + self.busday_offset
+
+
+class SeriesArithmetic(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        rng = date_range(start='20140101', freq='T', periods=self.N)
+        self.ser = pd.Series(rng)
+        self.day_offset = pd.offsets.Day()
+        self.relativedelta_offset = pd.offsets.DateOffset(months=2, days=2)
+        self.busday_offset = pd.offsets.BusinessDay()
+
+    def time_add_offset_delta(self):
+        self.ser + self.day_offset
+
+    def time_add_offset_fast(self):
+        self.ser + self.relativedelta_offset
+
+    def time_add_offset_slow(self):
+        self.ser + self.busday_offset
+
+
+class YearBegin(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.year = pd.offsets.YearBegin()
+
+    def time_timeseries_year_apply(self):
+        self.year.apply(self.date)
+
+    def time_timeseries_year_incr(self):
+        self.date + self.year
+
+
+class Day(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.day = pd.offsets.Day()
+
+    def time_timeseries_day_apply(self):
+        self.day.apply(self.date)
+
+    def time_timeseries_day_incr(self):
+        self.date + self.day
+
+
+class CBDay(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.dt64 = np.datetime64('2011-01-01 09:00Z')
+        self.cday = pd.offsets.CustomBusinessDay()
+
+    def time_custom_bday_decr(self):
+        self.date - self.cday
+
+    def time_custom_bday_incr(self):
+        self.date + self.cday
+
+    def time_custom_bday_apply(self):
+        self.cday.apply(self.date)
+
+    def time_custom_bday_apply_dt64(self):
+        self.cday.apply(self.dt64)
+
+
+class CBDayHolidays(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)
+
+    def time_custom_bday_cal_incr(self):
+        self.date + 1 * self.cdayh
+
+    def time_custom_bday_cal_decr(self):
+        self.date - 1 * self.cdayh
+
+    def time_custom_bday_cal_incr_n(self):
+        self.date + 10 * self.cdayh
+
+    def time_custom_bday_cal_incr_neg_n(self):
+        self.date - 10 * self.cdayh
+
+
+class CBMonthBegin(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal)
+
+    def time_custom_bmonthbegin_decr_n(self):
+        self.date - (10 * self.cmb)
+
+    def time_custom_bmonthbegin_incr_n(self):
+        self.date + (10 * self.cmb)
+
+
+class CBMonthEnd(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.date = datetime(2011, 1, 1)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal)
+
+    def time_custom_bmonthend_incr(self):
+        self.date + self.cme
+
+    def time_custom_bmonthend_incr_n(self):
+        self.date + (10 * self.cme)
+
+    def time_custom_bmonthend_decr_n(self):
+        self.date - (10 * self.cme)
+
+
+class SemiMonthOffset(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        # date is not on an offset which will be slowest case
+        self.date = datetime(2011, 1, 2)
+        self.semi_month_end = pd.offsets.SemiMonthEnd()
+        self.semi_month_begin = pd.offsets.SemiMonthBegin()
+
+    def time_end_apply(self):
+        self.semi_month_end.apply(self.date)
+
+    def time_end_incr(self):
+        self.date + self.semi_month_end
+
+    def time_end_incr_n(self):
+        self.date + 10 * self.semi_month_end
+
+    def time_end_decr(self):
+        self.date - self.semi_month_end
+
+    def time_end_decr_n(self):
+        self.date - 10 * self.semi_month_end
+
+    def time_end_apply_index(self):
+        self.semi_month_end.apply_index(self.rng)
+
+    def time_end_incr_rng(self):
+        self.rng + self.semi_month_end
+
+    def time_end_decr_rng(self):
+        self.rng - self.semi_month_end
+
+    def time_begin_apply(self):
+        self.semi_month_begin.apply(self.date)
+
+    def time_begin_incr(self):
+        self.date + self.semi_month_begin
+
+    def time_begin_incr_n(self):
+        self.date + 10 * self.semi_month_begin
+
+    def time_begin_decr(self):
+        self.date - self.semi_month_begin
+
+    def time_begin_decr_n(self):
+        self.date - 10 * self.semi_month_begin
+
+    def time_begin_apply_index(self):
+        self.semi_month_begin.apply_index(self.rng)
+
+    def time_begin_incr_rng(self):
+        self.rng + self.semi_month_begin
+
+    def time_begin_decr_rng(self):
+        self.rng - self.semi_month_begin
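asv discovers these classes by convention: it instantiates each class, runs ``setup`` once per parameter combination, and times every method whose name starts with ``time_``. A minimal sketch of the pattern used throughout the new module (the class name here is illustrative, not part of the patch):

    from datetime import datetime

    import pandas as pd


    class MonthEndExample(object):
        # asv reads goal_time and calls setup() before timing
        goal_time = 0.2

        def setup(self):
            self.date = datetime(2011, 1, 1)
            self.offset = pd.offsets.MonthEnd()

        def time_apply(self):
            # rolls the date forward to the next month end
            self.offset.apply(self.date)

        def time_incr(self):
            self.date + self.offset
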
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 2ca2416f58b57..b3996739e33f7 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -2,13 +2,11 @@
     from pandas.plotting._converter import DatetimeConverter
 except ImportError:
     from pandas.tseries.converter import DatetimeConverter
-from .pandas_vb_common import *
+
 import pandas as pd
+from pandas import to_datetime, date_range, Series, DataFrame, period_range
+
 import datetime as dt
-try:
-    import pandas.tseries.holiday
-except ImportError:
-    pass
 from pandas.tseries.frequencies import infer_freq
 import numpy as np

@@ -22,32 +20,38 @@ class DatetimeIndex(object):
     def setup(self):
         self.N = 100000
         self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        self.delta_offset = pd.offsets.Day()
-        self.fast_offset = pd.offsets.DateOffset(months=2, days=2)
-        self.slow_offset = pd.offsets.BusinessDay()

-        self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern')
+        self.rng2 = date_range(start='1/1/2000 9:30', periods=10000,
+                               freq='S', tz='US/Eastern')

-        self.index_repeated = date_range(start='1/1/2000', periods=1000, freq='T').repeat(10)
+        self.index_repeated = date_range(start='1/1/2000',
+                                         periods=1000, freq='T').repeat(10)

         self.rng3 = date_range(start='1/1/2000', periods=1000, freq='H')
         self.df = DataFrame(np.random.randn(len(self.rng3), 2), self.rng3)

-        self.rng4 = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern')
-        self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), index=self.rng4)
+        self.rng4 = date_range(start='1/1/2000', periods=1000,
+                               freq='H', tz='US/Eastern')
+        self.df2 = DataFrame(np.random.randn(len(self.rng4), 2),
+                             index=self.rng4)

         N = 100000
         self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5)
         self.dti_tz = pd.date_range('2011-01-01', freq='H',
                                     periods=N, tz='Asia/Tokyo').repeat(5)

-        self.rng5 = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern')
+        self.rng5 = date_range(start='1/1/2000',
+                               end='3/1/2000', tz='US/Eastern')

-        self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S')
-        self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S')
+        self.dst_rng = date_range(start='10/29/2000 1:00:00',
+                                  end='10/29/2000 1:59:59', freq='S')
+        self.index = date_range(start='10/29/2000',
+                                end='10/29/2000 00:59:59', freq='S')
         self.index = self.index.append(self.dst_rng)
         self.index = self.index.append(self.dst_rng)
-        self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S'))
+        self.index = self.index.append(date_range(start='10/29/2000 2:00:00',
+                                                  end='10/29/2000 3:00:00',
+                                                  freq='S'))

         self.N = 10000
         self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B')
@@ -62,15 +66,6 @@ def setup(self):
     def time_add_timedelta(self):
         (self.rng + dt.timedelta(minutes=2))

-    def time_add_offset_delta(self):
-        (self.rng + self.delta_offset)
-
-    def time_add_offset_fast(self):
-        (self.rng + self.fast_offset)
-
-    def time_add_offset_slow(self):
-        (self.rng + self.slow_offset)
-
     def time_normalize(self):
         self.rng2.normalize()

@@ -116,6 +111,7 @@ def time_to_date(self):
     def time_to_pydatetime(self):
         self.rng.to_pydatetime()

+
 class TimeDatetimeConverter(object):
     goal_time = 0.2

@@ -156,7 +152,7 @@ def time_iter_periodindex_preexit(self):
         self.iter_n(self.idx2, self.M)


-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Resampling

 class ResampleDataFrame(object):
@@ -195,7 +191,8 @@ def setup(self):
         self.rng2 = date_range(start='1/1/2000', end='1/1/2001', freq='T')
         self.ts2 = Series(np.random.randn(len(self.rng2)), index=self.rng2)

-        self.rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U')
+        self.rng3 = date_range(start='2000-01-01 00:00:00',
+                               end='2000-01-01 10:00:00', freq='555000U')
         self.int_ts = Series(5, self.rng3, dtype='int64')
         self.dt_ts = self.int_ts.astype('datetime64[ns]')

@@ -223,7 +220,8 @@ def setup(self):
         self.N = 10000
         self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
         self.ts = Series(np.random.randn(self.N), index=self.rng)
-        self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
+        self.dates = date_range(start='1/1/1990',
+                                periods=(self.N * 10), freq='5s')
         self.ts2 = self.ts.copy()
         self.ts2[250:5000] = np.nan
         self.ts3 = self.ts.copy()
@@ -261,7 +259,8 @@ def setup(self):
         self.N = 10000
         self.M = 100
         self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
-        self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s')
+        self.dates = date_range(start='1/1/1990',
+                                periods=(self.N * 10), freq='5s')
         self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
         self.ts2 = self.ts.copy()
         self.ts2.iloc[250:5000] = np.nan
@@ -306,8 +305,10 @@ def setup(self):
         self.lindex = np.random.permutation(self.N)[:(self.N // 2)]
         self.rindex = np.random.permutation(self.N)[:(self.N // 2)]
-        self.left = Series(self.ts2.values.take(self.lindex), index=self.ts2.index.take(self.lindex))
-        self.right = Series(self.ts2.values.take(self.rindex), index=self.ts2.index.take(self.rindex))
+        self.left = Series(self.ts2.values.take(self.lindex),
+                           index=self.ts2.index.take(self.lindex))
+        self.right = Series(self.ts2.values.take(self.rindex),
+                            index=self.ts2.index.take(self.rindex))

         self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
         self.ts3 = Series(1, index=self.rng3)
@@ -329,26 +330,6 @@ def time_large_lookup_value(self):
         self.ts3.index._cleanup()


-class SeriesArithmetic(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 100000
-        self.s = Series(date_range(start='20140101', freq='T', periods=self.N))
-        self.delta_offset = pd.offsets.Day()
-        self.fast_offset = pd.offsets.DateOffset(months=2, days=2)
-        self.slow_offset = pd.offsets.BusinessDay()
-
-    def time_add_offset_delta(self):
-        (self.s + self.delta_offset)
-
-    def time_add_offset_fast(self):
-        (self.s + self.fast_offset)
-
-    def time_add_offset_slow(self):
-        (self.s + self.slow_offset)
-
-
 class ToDatetime(object):
     goal_time = 0.2

@@ -425,136 +406,6 @@ def time_cache_false_with_dup_string_tzoffset_dates(self):
         to_datetime(self.dup_string_with_tz, cache=False)


-class Offsets(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.date = dt.datetime(2011, 1, 1)
-        self.dt64 = np.datetime64('2011-01-01 09:00Z')
-        self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
-        self.day = pd.offsets.Day()
-        self.year = pd.offsets.YearBegin()
-        self.cday = pd.offsets.CustomBusinessDay()
-        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
-        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
-        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
-
-    def time_timeseries_day_apply(self):
-        self.day.apply(self.date)
-
-    def time_timeseries_day_incr(self):
-        (self.date + self.day)
-
-    def time_timeseries_year_apply(self):
-        self.year.apply(self.date)
-
-    def time_timeseries_year_incr(self):
-        (self.date + self.year)
-
-    # custom business offsets
-
-    def time_custom_bday_decr(self):
-        (self.date - self.cday)
-
-    def time_custom_bday_incr(self):
-        (self.date + self.cday)
-
-    def time_custom_bday_apply(self):
-        self.cday.apply(self.date)
-
-    def time_custom_bday_apply_dt64(self):
-        self.cday.apply(self.dt64)
-
-    def time_custom_bday_cal_incr(self):
-        self.date + 1 * self.cdayh
-
-    def time_custom_bday_cal_decr(self):
-        self.date - 1 * self.cdayh
-
-    def time_custom_bday_cal_incr_n(self):
-        self.date + 10 * self.cdayh
-
-    def time_custom_bday_cal_incr_neg_n(self):
-        self.date - 10 * self.cdayh
-
-    # Increment custom business month
-
-    def time_custom_bmonthend_incr(self):
-        (self.date + self.cme)
-
-    def time_custom_bmonthend_incr_n(self):
-        (self.date + (10 * self.cme))
-
-    def time_custom_bmonthend_decr_n(self):
-        (self.date - (10 * self.cme))
-
-    def time_custom_bmonthbegin_decr_n(self):
-        (self.date - (10 * self.cmb))
-
-    def time_custom_bmonthbegin_incr_n(self):
-        (self.date + (10 * self.cmb))
-
-
-class SemiMonthOffset(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 100000
-        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
-        # date is not on an offset which will be slowest case
-        self.date = dt.datetime(2011, 1, 2)
-        self.semi_month_end = pd.offsets.SemiMonthEnd()
-        self.semi_month_begin = pd.offsets.SemiMonthBegin()
-
-    def time_end_apply(self):
-        self.semi_month_end.apply(self.date)
-
-    def time_end_incr(self):
-        self.date + self.semi_month_end
-
-    def time_end_incr_n(self):
-        self.date + 10 * self.semi_month_end
-
-    def time_end_decr(self):
-        self.date - self.semi_month_end
-
-    def time_end_decr_n(self):
-        self.date - 10 * self.semi_month_end
-
-    def time_end_apply_index(self):
-        self.semi_month_end.apply_index(self.rng)
-
-    def time_end_incr_rng(self):
-        self.rng + self.semi_month_end
-
-    def time_end_decr_rng(self):
-        self.rng - self.semi_month_end
-
-    def time_begin_apply(self):
-        self.semi_month_begin.apply(self.date)
-
-    def time_begin_incr(self):
-        self.date + self.semi_month_begin
-
-    def time_begin_incr_n(self):
-        self.date + 10 * self.semi_month_begin
-
-    def time_begin_decr(self):
-        self.date - self.semi_month_begin
-
-    def time_begin_decr_n(self):
-        self.date - 10 * self.semi_month_begin
-
-    def time_begin_apply_index(self):
-        self.semi_month_begin.apply_index(self.rng)
-
-    def time_begin_incr_rng(self):
-        self.rng + self.semi_month_begin
-
-    def time_begin_decr_rng(self):
-        self.rng - self.semi_month_begin
-
-
 class DatetimeAccessor(object):
     def setup(self):
         self.N = 100000
diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py
new file mode 100644
index 0000000000000..45f12c6931fd9
--- /dev/null
+++ b/pandas/tests/tseries/offsets/test_fiscal.py
@@ -0,0 +1,606 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Fiscal Year and Fiscal Quarter offset classes
+"""
+from datetime import datetime
+
+from dateutil.relativedelta import relativedelta
+import pytest
+
+import pandas.util.testing as tm
+
+from pandas.tseries.frequencies import get_offset, _INVALID_FREQ_ERROR
+from pandas.tseries.offsets import FY5253Quarter, FY5253
+from pandas._libs.tslibs.offsets import WeekDay
+
+from .common import assert_offset_equal, assert_onOffset
+from .test_offsets import Base
+
+
+def makeFY5253LastOfMonthQuarter(*args, **kwds):
+    return FY5253Quarter(*args, variation="last", **kwds)
+
+
+def makeFY5253NearestEndMonthQuarter(*args, **kwds):
+    return FY5253Quarter(*args, variation="nearest", **kwds)
+
+
+def makeFY5253NearestEndMonth(*args, **kwds):
+    return FY5253(*args, variation="nearest", **kwds)
+
+
+def makeFY5253LastOfMonth(*args, **kwds):
+    return FY5253(*args, variation="last", **kwds)
+
+
+def test_get_offset_name():
+    assert (makeFY5253LastOfMonthQuarter(
+        weekday=1, startingMonth=3,
+        qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4")
+    assert (makeFY5253NearestEndMonthQuarter(
+        weekday=1, startingMonth=3,
+        qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3")
+
+
+def test_get_offset():
+    with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR):
+        get_offset('gibberish')
+    with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR):
+        get_offset('QS-JAN-B')
+
+    pairs = [
+        ("RE-N-DEC-MON",
+         makeFY5253NearestEndMonth(weekday=0, startingMonth=12)),
+        ("RE-L-DEC-TUE",
+         makeFY5253LastOfMonth(weekday=1, startingMonth=12)),
+        ("REQ-L-MAR-TUE-4",
+         makeFY5253LastOfMonthQuarter(weekday=1,
+                                      startingMonth=3,
+                                      qtr_with_extra_week=4)),
+        ("REQ-L-DEC-MON-3",
+         makeFY5253LastOfMonthQuarter(weekday=0,
+                                      startingMonth=12,
+                                      qtr_with_extra_week=3)),
+        ("REQ-N-DEC-MON-3",
+         makeFY5253NearestEndMonthQuarter(weekday=0,
+                                          startingMonth=12,
+                                          qtr_with_extra_week=3))]
+
+    for name, expected in pairs:
+        offset = get_offset(name)
+        assert offset == expected, ("Expected %r to yield %r (actual: %r)" %
+                                    (name, expected, offset))
+
+
+class TestFY5253LastOfMonth(Base):
+    offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8,
+                                               weekday=WeekDay.SAT)
+    offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9,
+                                               weekday=WeekDay.SAT)
+
+    on_offset_cases = [
+        # From Wikipedia (see:
+        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end)
+        (offset_lom_sat_aug, datetime(2006, 8, 26), True),
+        (offset_lom_sat_aug, datetime(2007, 8, 25), True),
+        (offset_lom_sat_aug, datetime(2008, 8, 30), True),
+        (offset_lom_sat_aug, datetime(2009, 8, 29), True),
+        (offset_lom_sat_aug, datetime(2010, 8, 28), True),
+        (offset_lom_sat_aug, datetime(2011, 8, 27), True),
+        (offset_lom_sat_aug, datetime(2012, 8, 25), True),
+        (offset_lom_sat_aug, datetime(2013, 8, 31), True),
+        (offset_lom_sat_aug, datetime(2014, 8, 30), True),
+        (offset_lom_sat_aug, datetime(2015, 8, 29), True),
+        (offset_lom_sat_aug, datetime(2016, 8, 27), True),
+        (offset_lom_sat_aug, datetime(2017, 8, 26), True),
+        (offset_lom_sat_aug, datetime(2018, 8, 25), True),
+        (offset_lom_sat_aug, datetime(2019, 8, 31), True),
+
+        (offset_lom_sat_aug, datetime(2006, 8, 27), False),
+        (offset_lom_sat_aug, datetime(2007, 8, 28), False),
+        (offset_lom_sat_aug, datetime(2008, 8, 31), False),
+        (offset_lom_sat_aug, datetime(2009, 8, 30), False),
+        (offset_lom_sat_aug, datetime(2010, 8, 29), False),
+        (offset_lom_sat_aug, datetime(2011, 8, 28), False),
+
+        (offset_lom_sat_aug, datetime(2006, 8, 25), False),
+        (offset_lom_sat_aug, datetime(2007, 8, 24), False),
+        (offset_lom_sat_aug, datetime(2008, 8, 29), False),
+        (offset_lom_sat_aug, datetime(2009, 8, 28), False),
+        (offset_lom_sat_aug, datetime(2010, 8, 27), False),
+        (offset_lom_sat_aug, datetime(2011, 8, 26), False),
+        (offset_lom_sat_aug, datetime(2019, 8, 30), False),
+
+        # From GMCR (see for example:
+        # http://yahoo.brand.edgar-online.com/Default.aspx?
+        # companyid=3184&formtypeID=7)
+        (offset_lom_sat_sep, datetime(2010, 9, 25), True),
+        (offset_lom_sat_sep, datetime(2011, 9, 24), True),
+        (offset_lom_sat_sep, datetime(2012, 9, 29), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        offset, dt, expected = case
+        assert_onOffset(offset, dt, expected)
+
+    def test_apply(self):
+        offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8,
+                                                   weekday=WeekDay.SAT)
+        offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8,
+                                                     weekday=WeekDay.SAT)
+
+        date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25),
+                                datetime(2008, 8, 30), datetime(2009, 8, 29),
+                                datetime(2010, 8, 28), datetime(2011, 8, 27),
+                                datetime(2012, 8, 25), datetime(2013, 8, 31),
+                                datetime(2014, 8, 30), datetime(2015, 8, 29),
+                                datetime(2016, 8, 27)]
+
+        tests = [
+            (offset_lom_aug_sat, date_seq_lom_aug_sat),
+            (offset_lom_aug_sat_1, date_seq_lom_aug_sat),
+            (offset_lom_aug_sat, [
+                datetime(2006, 8, 25)] + date_seq_lom_aug_sat),
+            (offset_lom_aug_sat_1, [
+                datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]),
+            (makeFY5253LastOfMonth(n=-1, startingMonth=8,
+                                   weekday=WeekDay.SAT),
+             list(reversed(date_seq_lom_aug_sat))),
+        ]
+        for test in tests:
+            offset, data = test
+            current = data[0]
+            for datum in data[1:]:
+                current = current + offset
+                assert current == datum
+
+
+class TestFY5253NearestEndMonth(Base):
+
+    def test_get_target_month_end(self):
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=8, weekday=WeekDay.SAT).get_target_month_end(
+            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=12, weekday=WeekDay.SAT).get_target_month_end(
+            datetime(2013, 1, 1)) == datetime(2013, 12, 31))
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=2, weekday=WeekDay.SAT).get_target_month_end(
+            datetime(2013, 1, 1)) == datetime(2013, 2, 28))
+
+    def test_get_year_end(self):
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=8, weekday=WeekDay.SAT).get_year_end(
+            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=8, weekday=WeekDay.SUN).get_year_end(
+            datetime(2013, 1, 1)) == datetime(2013, 9, 1))
+        assert (makeFY5253NearestEndMonth(
+            startingMonth=8, weekday=WeekDay.FRI).get_year_end(
+            datetime(2013, 1, 1)) == datetime(2013, 8, 30))
+
+        offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+                          variation="nearest")
+        assert (offset_n.get_year_end(datetime(2012, 1, 1)) ==
+                datetime(2013, 1, 1))
+        assert (offset_n.get_year_end(datetime(2012, 1, 10)) ==
+                datetime(2013, 1, 1))
+
+        assert (offset_n.get_year_end(datetime(2013, 1, 1)) ==
+                datetime(2013, 12, 31))
+        assert (offset_n.get_year_end(datetime(2013, 1, 2)) ==
+                datetime(2013, 12, 31))
+        assert (offset_n.get_year_end(datetime(2013, 1, 3)) ==
+                datetime(2013, 12, 31))
+        assert (offset_n.get_year_end(datetime(2013, 1, 10)) ==
+                datetime(2013, 12, 31))
+
+        JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest")
+        assert (JNJ.get_year_end(datetime(2006, 1, 1)) ==
+                datetime(2006, 12, 31))
+
+    offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8,
+                                                   weekday=WeekDay.SAT)
+    offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8,
+                                                   weekday=WeekDay.THU)
+    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+                      variation="nearest")
+
+    on_offset_cases = [
+        # From Wikipedia (see:
+        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar
+        # #Saturday_nearest_the_end_of_month)
+        # 2006-09-02   2006 September 2
+        # 2007-09-01   2007 September 1
+        # 2008-08-30   2008 August 30 (leap year)
+        # 2009-08-29   2009 August 29
+        # 2010-08-28   2010 August 28
+        # 2011-09-03   2011 September 3
+        # 2012-09-01   2012 September 1 (leap year)
+        # 2013-08-31   2013 August 31
+        # 2014-08-30   2014 August 30
+        # 2015-08-29   2015 August 29
+        # 2016-09-03   2016 September 3 (leap year)
+        # 2017-09-02   2017 September 2
+        # 2018-09-01   2018 September 1
+        # 2019-08-31   2019 August 31
+        (offset_lom_aug_sat, datetime(2006, 9, 2), True),
+        (offset_lom_aug_sat, datetime(2007, 9, 1), True),
+        (offset_lom_aug_sat, datetime(2008, 8, 30), True),
+        (offset_lom_aug_sat, datetime(2009, 8, 29), True),
+        (offset_lom_aug_sat, datetime(2010, 8, 28), True),
+        (offset_lom_aug_sat, datetime(2011, 9, 3), True),
+
+        (offset_lom_aug_sat, datetime(2016, 9, 3), True),
+        (offset_lom_aug_sat, datetime(2017, 9, 2), True),
+        (offset_lom_aug_sat, datetime(2018, 9, 1), True),
+        (offset_lom_aug_sat, datetime(2019, 8, 31), True),
+
+        (offset_lom_aug_sat, datetime(2006, 8, 27), False),
+        (offset_lom_aug_sat, datetime(2007, 8, 28), False),
+        (offset_lom_aug_sat, datetime(2008, 8, 31), False),
+        (offset_lom_aug_sat, datetime(2009, 8, 30), False),
+        (offset_lom_aug_sat, datetime(2010, 8, 29), False),
+        (offset_lom_aug_sat, datetime(2011, 8, 28), False),
+
+        (offset_lom_aug_sat, datetime(2006, 8, 25), False),
+        (offset_lom_aug_sat, datetime(2007, 8, 24), False),
+        (offset_lom_aug_sat, datetime(2008, 8, 29), False),
+        (offset_lom_aug_sat, datetime(2009, 8, 28), False),
+        (offset_lom_aug_sat, datetime(2010, 8, 27), False),
+        (offset_lom_aug_sat, datetime(2011, 8, 26), False),
+        (offset_lom_aug_sat, datetime(2019, 8, 30), False),
+
+        # From Micron, see:
+        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
+        (offset_lom_aug_thu, datetime(2012, 8, 30), True),
+        (offset_lom_aug_thu, datetime(2011, 9, 1), True),
+
+        (offset_n, datetime(2012, 12, 31), False),
+        (offset_n, datetime(2013, 1, 1), True),
+        (offset_n, datetime(2013, 1, 2), False)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        offset, dt, expected = case
+        assert_onOffset(offset, dt, expected)
+
+    def test_apply(self):
+        date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1),
+                              datetime(2008, 8, 30), datetime(2009, 8, 29),
+                              datetime(2010, 8, 28), datetime(2011, 9, 3)]
+
+        JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1),
+               datetime(2006, 12, 31), datetime(2007, 12, 30),
+               datetime(2008, 12, 28), datetime(2010, 1, 3),
+               datetime(2011, 1, 2), datetime(2012, 1, 1),
+               datetime(2012, 12, 30)]
+
+        DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5,
+                         variation="nearest")
+
+        tests = [
+            (makeFY5253NearestEndMonth(startingMonth=8,
+                                       weekday=WeekDay.SAT),
+             date_seq_nem_8_sat),
+            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
+                                       weekday=WeekDay.SAT),
+             date_seq_nem_8_sat),
+            (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT),
+             [datetime(2006, 9, 1)] + date_seq_nem_8_sat),
+            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
+                                       weekday=WeekDay.SAT),
+             [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]),
+            (makeFY5253NearestEndMonth(n=-1, startingMonth=8,
+                                       weekday=WeekDay.SAT),
+             list(reversed(date_seq_nem_8_sat))),
+            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+                                       weekday=WeekDay.SUN), JNJ),
+            (makeFY5253NearestEndMonth(n=-1, startingMonth=12,
+                                       weekday=WeekDay.SUN),
+             list(reversed(JNJ))),
+            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+                                       weekday=WeekDay.SUN),
+             [datetime(2005, 1, 2), datetime(2006, 1, 1)]),
+            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
+                                       weekday=WeekDay.SUN),
+             [datetime(2006, 1, 2), datetime(2006, 12, 31)]),
+            (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)])
+        ]
+        for test in tests:
+            offset, data = test
+            current = data[0]
+            for datum in data[1:]:
+                current = current + offset
+                assert current == datum
+
+
+class TestFY5253LastOfMonthQuarter(Base):
+
+    def test_isAnchored(self):
+        assert makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SAT,
+            qtr_with_extra_week=4).isAnchored()
+        assert makeFY5253LastOfMonthQuarter(
+            weekday=WeekDay.SAT, startingMonth=3,
+            qtr_with_extra_week=4).isAnchored()
+        assert not makeFY5253LastOfMonthQuarter(
+            2, startingMonth=1, weekday=WeekDay.SAT,
+            qtr_with_extra_week=4).isAnchored()
+
+    def test_equality(self):
+        assert (makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SAT,
+            qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4))
+        assert (makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SAT,
+            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4))
+        assert (makeFY5253LastOfMonthQuarter(
+            startingMonth=1, weekday=WeekDay.SAT,
+            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
+            startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4))
+
+    def test_offset(self):
+        offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
+                                              weekday=WeekDay.SAT,
+                                              qtr_with_extra_week=4)
+        offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9,
+                                               weekday=WeekDay.SAT,
+                                               qtr_with_extra_week=4)
+        offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9,
+                                               weekday=WeekDay.SAT,
+                                               qtr_with_extra_week=4)
+
+        offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9,
+                                                   weekday=WeekDay.SAT,
+                                                   qtr_with_extra_week=4)
+        offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9,
+                                                   weekday=WeekDay.SAT,
+                                                   qtr_with_extra_week=4)
+
+        GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26),
+                datetime(2010, 9, 25), datetime(2010, 12, 25),
+                datetime(2011, 3, 26), datetime(2011, 6, 25),
+                datetime(2011, 9, 24), datetime(2011, 12, 24),
+                datetime(2012, 3, 24), datetime(2012, 6, 23),
+                datetime(2012, 9, 29), datetime(2012, 12, 29),
+                datetime(2013, 3, 30), datetime(2013, 6, 29)]
+
+        assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1])
+        assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1),
+                            expected=GMCR[0])
+        assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2])
+
+        assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2])
+        assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4])
+
+        assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2])
+        assert_offset_equal(offset_neg1,
+                            base=GMCR[-1] + relativedelta(days=+1),
+                            expected=GMCR[-1])
+        assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3])
+
+        date = GMCR[0] + relativedelta(days=-1)
+        for expected in GMCR:
+            assert_offset_equal(offset, date, expected)
+            date = date + offset
+
+        date = GMCR[-1] + relativedelta(days=+1)
+        for expected in reversed(GMCR):
+            assert_offset_equal(offset_neg1, date, expected)
+            date = date + offset_neg1
+
+    lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8,
+                                                  weekday=WeekDay.SAT,
+                                                  qtr_with_extra_week=4)
+    lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
+                                                  weekday=WeekDay.SAT,
+                                                  qtr_with_extra_week=4)
+
+    on_offset_cases = [
+        # From Wikipedia
+        (lomq_aug_sat_4, datetime(2006, 8, 26), True),
+        (lomq_aug_sat_4, datetime(2007, 8, 25), True),
+        (lomq_aug_sat_4, datetime(2008, 8, 30), True),
+        (lomq_aug_sat_4, datetime(2009, 8, 29), True),
+        (lomq_aug_sat_4, datetime(2010, 8, 28), True),
+        (lomq_aug_sat_4, datetime(2011, 8, 27), True),
+        (lomq_aug_sat_4, datetime(2019, 8, 31), True),
+
+        (lomq_aug_sat_4, datetime(2006, 8, 27), False),
+        (lomq_aug_sat_4, datetime(2007, 8, 28), False),
+        (lomq_aug_sat_4, datetime(2008, 8, 31), False),
+        (lomq_aug_sat_4, datetime(2009, 8, 30), False),
+        (lomq_aug_sat_4, datetime(2010, 8, 29), False),
+        (lomq_aug_sat_4, datetime(2011, 8, 28), False),
+
+        (lomq_aug_sat_4, datetime(2006, 8, 25), False),
+        (lomq_aug_sat_4, datetime(2007, 8, 24), False),
+        (lomq_aug_sat_4, datetime(2008, 8, 29), False),
+        (lomq_aug_sat_4, datetime(2009, 8, 28), False),
+        (lomq_aug_sat_4, datetime(2010, 8, 27), False),
+        (lomq_aug_sat_4, datetime(2011, 8, 26), False),
+        (lomq_aug_sat_4, datetime(2019, 8, 30), False),
+
+        # From GMCR
+        (lomq_sep_sat_4, datetime(2010, 9, 25), True),
+        (lomq_sep_sat_4, datetime(2011, 9, 24), True),
+        (lomq_sep_sat_4, datetime(2012, 9, 29), True),
+
+        (lomq_sep_sat_4, datetime(2013, 6, 29), True),
+        (lomq_sep_sat_4, datetime(2012, 6, 23), True),
+        (lomq_sep_sat_4, datetime(2012, 6, 30), False),
+
+        (lomq_sep_sat_4, datetime(2013, 3, 30), True),
+        (lomq_sep_sat_4, datetime(2012, 3, 24), True),
+
+        (lomq_sep_sat_4, datetime(2012, 12, 29), True),
+        (lomq_sep_sat_4, datetime(2011, 12, 24), True),
+
+        # INTC (extra week in Q1)
+        # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844
+        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                      weekday=WeekDay.SAT,
+                                      qtr_with_extra_week=1),
+         datetime(2011, 4, 2), True),
+
+        # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7
+        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                      weekday=WeekDay.SAT,
+                                      qtr_with_extra_week=1),
+         datetime(2012, 12, 29), True),
+        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                      weekday=WeekDay.SAT,
+                                      qtr_with_extra_week=1),
+         datetime(2011, 12, 31), True),
+        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                      weekday=WeekDay.SAT,
+                                      qtr_with_extra_week=1),
+         datetime(2010, 12, 25), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        offset, dt, expected = case
+        assert_onOffset(offset, dt, expected)
+
+    def test_year_has_extra_week(self):
+        # End of long Q1
+        assert makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2))
+
+        # Start of long Q1
+        assert makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26))
+
+        # End of year before year with long Q1
+        assert not makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25))
+
+        for year in [x
+                     for x in range(1994, 2011 + 1)
+                     if x not in [2011, 2005, 2000, 1994]]:
+            assert not makeFY5253LastOfMonthQuarter(
+                1, startingMonth=12, weekday=WeekDay.SAT,
+                qtr_with_extra_week=1).year_has_extra_week(
+                datetime(year, 4, 2))
+
+        # Other long years
+        assert makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2))
+
+        assert makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2))
+
+        assert makeFY5253LastOfMonthQuarter(
+            1, startingMonth=12, weekday=WeekDay.SAT,
+            qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2))
+
+    def test_get_weeks(self):
+        sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                                 weekday=WeekDay.SAT,
+                                                 qtr_with_extra_week=1)
+        sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
+                                                 weekday=WeekDay.SAT,
+                                                 qtr_with_extra_week=4)
+
+        assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13]
+        assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14]
+        assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13]
+
+
+class TestFY5253NearestEndMonthQuarter(Base):
+
+    offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter(
+        1, startingMonth=8, weekday=WeekDay.SAT,
+        qtr_with_extra_week=4)
+    offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter(
+        1, startingMonth=8, weekday=WeekDay.THU,
+        qtr_with_extra_week=4)
+    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
+                      variation="nearest")
+
+    on_offset_cases = [
+        # From Wikipedia
+        (offset_nem_sat_aug_4, datetime(2006, 9, 2), True),
+        (offset_nem_sat_aug_4, datetime(2007, 9, 1), True),
+        (offset_nem_sat_aug_4, datetime(2008, 8, 30), True),
+        (offset_nem_sat_aug_4, datetime(2009, 8, 29), True),
+        (offset_nem_sat_aug_4, datetime(2010, 8, 28), True),
+        (offset_nem_sat_aug_4, datetime(2011, 9, 3), True),
+
+        (offset_nem_sat_aug_4, datetime(2016, 9, 3), True),
+        (offset_nem_sat_aug_4, datetime(2017, 9, 2), True),
+        (offset_nem_sat_aug_4, datetime(2018, 9, 1), True),
+        (offset_nem_sat_aug_4, datetime(2019, 8, 31), True),
+
+        (offset_nem_sat_aug_4, datetime(2006, 8, 27), False),
+        (offset_nem_sat_aug_4, datetime(2007, 8, 28), False),
+        (offset_nem_sat_aug_4, datetime(2008, 8, 31), False),
+        (offset_nem_sat_aug_4, datetime(2009, 8, 30), False),
+        (offset_nem_sat_aug_4, datetime(2010, 8, 29), False),
+        (offset_nem_sat_aug_4, datetime(2011, 8, 28), False),
+
+        (offset_nem_sat_aug_4, datetime(2006, 8, 25), False),
+        (offset_nem_sat_aug_4, datetime(2007, 8, 24), False),
+        (offset_nem_sat_aug_4, datetime(2008, 8, 29), False),
+        (offset_nem_sat_aug_4, datetime(2009, 8, 28), False),
+        (offset_nem_sat_aug_4, datetime(2010, 8, 27), False),
+        (offset_nem_sat_aug_4, datetime(2011, 8, 26), False),
+        (offset_nem_sat_aug_4, datetime(2019, 8, 30), False),
+
+        # From Micron, see:
+        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
+        (offset_nem_thu_aug_4, datetime(2012, 8, 30), True),
+        (offset_nem_thu_aug_4, datetime(2011, 9, 1), True),
+
+        # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13
+        (offset_nem_thu_aug_4, datetime(2013, 5, 30), True),
+        (offset_nem_thu_aug_4, datetime(2013, 2, 28), True),
+        (offset_nem_thu_aug_4, datetime(2012, 11, 29), True),
+        (offset_nem_thu_aug_4, datetime(2012, 5, 31), True),
+        (offset_nem_thu_aug_4, datetime(2007, 3, 1), True),
+        (offset_nem_thu_aug_4, datetime(1994, 3, 3), True),
+
+        (offset_n, datetime(2012, 12, 31), False),
+        (offset_n, datetime(2013, 1, 1), True),
+        (offset_n, datetime(2013, 1, 2), False)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        offset, dt, expected = case
+        assert_onOffset(offset, dt, expected)
+
+    def test_offset(self):
+        offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8,
+                                                  weekday=WeekDay.THU,
+                                                  qtr_with_extra_week=4)
+
+        MU = [datetime(2012, 5, 31),
+              datetime(2012, 8, 30), datetime(2012, 11, 29),
+              datetime(2013, 2, 28), datetime(2013, 5, 30)]
+
+        date = MU[0] + relativedelta(days=-1)
+        for expected in MU:
+            assert_offset_equal(offset, date, expected)
+            date = date + offset
+
+        assert_offset_equal(offset,
+                            datetime(2012, 5, 31),
+                            datetime(2012, 8, 30))
+        assert_offset_equal(offset,
+                            datetime(2012, 5, 30),
+                            datetime(2012, 5, 31))
+
+        offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last",
+                                qtr_with_extra_week=4)
+
+        assert_offset_equal(offset2,
+                            datetime(2013, 1, 15),
+                            datetime(2013, 3, 30))
datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + apply_cases.append((BDay(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -851,483 +850,469 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2014, 7, 7, 9) - def test_normalize(self): - tests = [] - - tests.append((BusinessHour(normalize=True), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - tests.append((BusinessHour(-1, normalize=True), - {datetime(2014, 7, 1, 8): datetime(2014, 6, 30), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30), - datetime(2014, 7, 1, 0): datetime(2014, 6, 30), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - tests.append((BusinessHour(1, normalize=True, start='17:00', - end='04:00'), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 2), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - for offset, cases in tests: - for dt, expected in compat.iteritems(cases): - assert offset.apply(dt) == expected - - def test_onOffset(self): - tests = [] - - tests.append((BusinessHour(), {datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False})) - - tests.append((BusinessHour(start='10:00', end='15:00'), - {datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) - - tests.append((BusinessHour(start='19:00', end='05:00'), - {datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 
7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False})) - - for offset, cases in tests: - for dt, expected in compat.iteritems(cases): - assert offset.onOffset(dt) == expected - - def test_opening_time(self): - tests = [] - - # opening time should be affected by sign of n, not by n's value and - # end - tests.append(( - [BusinessHour(), BusinessHour(n=2), BusinessHour( - n=4), BusinessHour(end='10:00'), BusinessHour(n=2, end='4:00'), - BusinessHour(n=4, end='15:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 1, 9)), - # if timestamp is on opening time, next opening time is - # as it is - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), datetime( - 2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), datetime( - 2014, 7, 2, 9)), - # 2014-07-05 is saturday - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), datetime( - 2014, 7, 4, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), datetime( - 2014, 7, 7, 9))})) - - tests.append(([BusinessHour(start='11:15'), - BusinessHour(n=2, start='11:15'), - BusinessHour(n=3, start='11:15'), - BusinessHour(start='11:15', end='10:00'), - BusinessHour(n=2, start='11:15', end='4:00'), - BusinessHour(n=3, start='11:15', end='15:00')], - {datetime(2014, 7, 1, 11): (datetime( - 2014, 7, 1, 11, 15), datetime(2014, 6, 30, 11, 15)), - datetime(2014, 7, 1, 18): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 1, 23): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 10): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 11, 15): (datetime( - 2014, 7, 2, 11, 15), datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime( - 2014, 7, 3, 11, 15), datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 4, 10): (datetime( - 2014, 7, 4, 11, 15), datetime(2014, 7, 3, 11, 15)), - datetime(2014, 7, 4, 23): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 6, 10): (datetime( - 2014, 7, 7, 11, 15), datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15))})) - - tests.append(([BusinessHour(-1), BusinessHour(n=-2), - BusinessHour(n=-4), - BusinessHour(n=-1, end='10:00'), - BusinessHour(n=-2, end='4:00'), - BusinessHour(n=-4, end='15:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 
7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 3, 9)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 8, 9))})) - - tests.append(([BusinessHour(start='17:00', end='05:00'), - BusinessHour(n=3, start='17:00', end='03:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), - datetime(2014, 6, 30, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 3, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 17, 1): (datetime( - 2014, 7, 8, 17), datetime(2014, 7, 7, 17)), })) - - tests.append(([BusinessHour(-1, start='17:00', end='05:00'), - BusinessHour(n=-2, start='17:00', end='03:00')], - {datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime( - 2014, 7, 1, 17), datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 17))})) - - for _offsets, cases in tests: - for offset in _offsets: - for dt, (exp_next, exp_prev) in compat.iteritems(cases): - assert offset._next_opening_time(dt) == exp_next - assert offset._prev_opening_time(dt) == exp_prev - - def test_apply(self): - tests = [] - - 
tests.append(( - BusinessHour(), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, - 30)})) - - tests.append((BusinessHour( - 4), {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, - 30)})) - - tests.append( - (BusinessHour(-1), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime( - 2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 1, 9, 30, 15): datetime( - 2014, 6, 30, 16, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, - 30)})) - - tests.append((BusinessHour( - -4), {datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 
7, 4, 18): datetime(2014, 7, 4, 13), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, - 30)})) - - tests.append((BusinessHour(start='13:00', end='16:00'), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, - 13, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) - - tests.append((BusinessHour(n=2, start='13:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), - datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30) - })) - - tests.append((BusinessHour(n=-1, start='13:00', end='16:00'), - {datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, - 15, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), - datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) - - tests.append((BusinessHour(n=-3, start='10:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), - datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30) - })) - - tests.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), - datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), - datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), - datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), - datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30) - })) - - tests.append((BusinessHour(n=-1, start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 
1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30) - })) - - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - def test_apply_large_n(self): - tests = [] + normalize_cases = [] + normalize_cases.append((BusinessHour(normalize=True), { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) + + normalize_cases.append((BusinessHour(-1, normalize=True), { + datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) + + normalize_cases.append((BusinessHour(1, normalize=True, start='17:00', + end='04:00'), { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) + + @pytest.mark.parametrize('case', normalize_cases) + def test_normalize(self, case): + offset, cases = case + for dt, expected in compat.iteritems(cases): + assert offset.apply(dt) == expected + + on_offset_cases = [] + on_offset_cases.append((BusinessHour(), { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False})) + + on_offset_cases.append((BusinessHour(start='10:00', end='15:00'), { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False})) + + on_offset_cases.append((BusinessHour(start='19:00', end='05:00'), { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 
1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False})) - tests.append( - (BusinessHour(40), # A week later - {datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), - datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), - datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), - datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), - datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), - datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), - datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), - datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, - 30)})) + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, cases = case + for dt, expected in compat.iteritems(cases): + assert offset.onOffset(dt) == expected - tests.append( - (BusinessHour(-25), # 3 days and 1 hour before - {datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), - datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, - 30)})) - - # 5 days and 3 hours later - tests.append((BusinessHour(28, start='21:00', end='02:00'), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), - datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), - datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, - 30)})) + opening_time_cases = [] + # opening time should be affected by sign of n, not by n's value and + # end + opening_time_cases.append(([BusinessHour(), BusinessHour(n=2), + BusinessHour(n=4), BusinessHour(end='10:00'), + BusinessHour(n=2, end='4:00'), + BusinessHour(n=4, end='15:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + 
datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9)), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9)), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9))})) + + opening_time_cases.append(([BusinessHour(start='11:15'), + BusinessHour(n=2, start='11:15'), + BusinessHour(n=3, start='11:15'), + BusinessHour(start='11:15', end='10:00'), + BusinessHour(n=2, start='11:15', end='4:00'), + BusinessHour(n=3, start='11:15', + end='15:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15)), + datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15))})) + + opening_time_cases.append(([BusinessHour(-1), BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end='10:00'), + BusinessHour(n=-2, end='4:00'), + BusinessHour(n=-4, end='15:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9): 
(datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9))})) + + opening_time_cases.append(([BusinessHour(start='17:00', end='05:00'), + BusinessHour(n=3, start='17:00', + end='03:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17)), })) + + opening_time_cases.append(([BusinessHour(-1, start='17:00', end='05:00'), + BusinessHour(n=-2, start='17:00', + end='03:00')], { + datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17)), + datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17))})) + + @pytest.mark.parametrize('case', opening_time_cases) + def test_opening_time(self, case): + _offsets, cases = case + for offset in _offsets: + for dt, (exp_next, exp_prev) in compat.iteritems(cases): + assert offset._next_opening_time(dt) == exp_next + assert offset._prev_opening_time(dt) == exp_prev + + apply_cases = [] + apply_cases.append((BusinessHour(), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + 
datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) + + apply_cases.append((BusinessHour(4), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) + + apply_cases.append((BusinessHour(-1), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)})) + + apply_cases.append((BusinessHour(-4), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)})) + + apply_cases.append((BusinessHour(start='13:00', end='16:00'), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) + + apply_cases.append((BusinessHour(n=2, start='13:00', end='16:00'), { + datetime(2014, 7, 1, 
17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)})) + + apply_cases.append((BusinessHour(n=-1, start='13:00', end='16:00'), { + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) + + apply_cases.append((BusinessHour(n=-3, start='10:00', end='16:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)})) + + apply_cases.append((BusinessHour(start='19:00', end='05:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)})) + + apply_cases.append((BusinessHour(n=-1, start='19:00', end='05:00'), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)})) + + 
@pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + apply_large_n_cases = [] + # A week later + apply_large_n_cases.append((BusinessHour(40), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)})) + + # 3 days and 1 hour before + apply_large_n_cases.append((BusinessHour(-25), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + + # 5 days and 3 hours later + apply_large_n_cases.append((BusinessHour(28, start='21:00', end='02:00'), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + + @pytest.mark.parametrize('case', apply_large_n_cases) + def test_apply_large_n(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_nanoseconds(self): tests = [] @@ -1743,58 +1728,58 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CDay(), datetime(2008, 1, 1), True), - (CDay(), datetime(2008, 1, 5), False)] - - for offset, d, expected in tests: - assert_onOffset(offset, d, expected) - - def test_apply(self): - tests = [] + on_offset_cases = [(CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False)] - 
tests.append((CDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)} - )) - - tests.append((-CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)} - )) - - tests.append((-2 * CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)} - )) - - tests.append((CDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, d, expected = case + assert_onOffset(offset, d, expected) + + apply_cases = [] + apply_cases.append((CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + apply_cases.append((2 * CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + apply_cases.append((-CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + apply_cases.append((-2 * CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + apply_cases.append((CDay(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -1988,37 +1973,40 @@ def test_roll_date_object(self): result = 
offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CBMonthEnd(), datetime(2008, 1, 31), True), - (CBMonthEnd(), datetime(2008, 1, 1), False)] - - for offset, d, expected in tests: - assert_onOffset(offset, d, expected) + on_offset_cases = [(CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False)] - def test_apply(self): - cbm = CBMonthEnd() - tests = [] + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, d, expected = case + assert_onOffset(offset, d, expected) - tests.append((cbm, {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) + apply_cases = [] + apply_cases.append((CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) - tests.append((2 * cbm, {datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 2, 7): datetime(2008, 3, 31)})) + apply_cases.append((2 * CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31)})) - tests.append((-cbm, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 2, 8): datetime(2008, 1, 31)})) + apply_cases.append((-CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31)})) - tests.append((-2 * cbm, {datetime(2008, 1, 1): datetime(2007, 11, 30), - datetime(2008, 2, 9): datetime(2007, 12, 31)} - )) + apply_cases.append((-2 * CBMonthEnd(), { + datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31)})) - tests.append((CBMonthEnd(0), - {datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) + apply_cases.append((CBMonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_apply_large_n(self): dt = datetime(2012, 10, 23) @@ -2102,36 +2090,40 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): - tests = [(CBMonthBegin(), datetime(2008, 1, 1), True), - (CBMonthBegin(), datetime(2008, 1, 31), False)] - - for offset, dt, expected in tests: - assert_onOffset(offset, dt, expected) + on_offset_cases = [(CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False)] - def test_apply(self): - cbm = CBMonthBegin() - tests = [] + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) - tests.append((cbm, {datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 2, 7): datetime(2008, 3, 3)})) + apply_cases = [] + apply_cases.append((CBMonthBegin(), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3)})) - tests.append((2 * cbm, {datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 2, 7): datetime(2008, 4, 1)})) + apply_cases.append((2 * CBMonthBegin(), { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 2, 7): datetime(2008, 4, 1)})) - tests.append((-cbm, {datetime(2008, 1, 1): datetime(2007, 12, 3), - 
datetime(2008, 2, 8): datetime(2008, 2, 1)}))
+    apply_cases.append((-CBMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2007, 12, 3),
+        datetime(2008, 2, 8): datetime(2008, 2, 1)}))

-        tests.append((-2 * cbm, {datetime(2008, 1, 1): datetime(2007, 11, 1),
-                                 datetime(2008, 2, 9): datetime(2008, 1, 1)}))
+    apply_cases.append((-2 * CBMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2007, 11, 1),
+        datetime(2008, 2, 9): datetime(2008, 1, 1)}))

-        tests.append((CBMonthBegin(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 7): datetime(2008, 2, 1)}))
+    apply_cases.append((CBMonthBegin(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 7): datetime(2008, 2, 1)}))

-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
+    @pytest.mark.parametrize('case', apply_cases)
+    def test_apply(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)

     def test_apply_large_n(self):
         dt = datetime(2012, 10, 23)
@@ -2189,38 +2181,42 @@ def test_isAnchored(self):
         assert not Week(2, weekday=2).isAnchored()
         assert not Week(2).isAnchored()

-    def test_offset(self):
-        tests = []
-
-        tests.append((Week(),  # not business week
-                      {datetime(2008, 1, 1): datetime(2008, 1, 8),
-                       datetime(2008, 1, 4): datetime(2008, 1, 11),
-                       datetime(2008, 1, 5): datetime(2008, 1, 12),
-                       datetime(2008, 1, 6): datetime(2008, 1, 13),
-                       datetime(2008, 1, 7): datetime(2008, 1, 14)}))
-
-        tests.append((Week(weekday=0),  # Mon
-                      {datetime(2007, 12, 31): datetime(2008, 1, 7),
-                       datetime(2008, 1, 4): datetime(2008, 1, 7),
-                       datetime(2008, 1, 5): datetime(2008, 1, 7),
-                       datetime(2008, 1, 6): datetime(2008, 1, 7),
-                       datetime(2008, 1, 7): datetime(2008, 1, 14)}))
-
-        tests.append((Week(0, weekday=0),  # n=0 -> roll forward. Mon
-                      {datetime(2007, 12, 31): datetime(2007, 12, 31),
-                       datetime(2008, 1, 4): datetime(2008, 1, 7),
-                       datetime(2008, 1, 5): datetime(2008, 1, 7),
-                       datetime(2008, 1, 6): datetime(2008, 1, 7),
-                       datetime(2008, 1, 7): datetime(2008, 1, 7)}))
-
-        tests.append((Week(-2, weekday=1),  # n=0 -> roll forward. Mon
-                      {datetime(2010, 4, 6): datetime(2010, 3, 23),
-                       datetime(2010, 4, 8): datetime(2010, 3, 30),
-                       datetime(2010, 4, 5): datetime(2010, 3, 23)}))
+    offset_cases = []
+    # not business week
+    offset_cases.append((Week(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 8),
+        datetime(2008, 1, 4): datetime(2008, 1, 11),
+        datetime(2008, 1, 5): datetime(2008, 1, 12),
+        datetime(2008, 1, 6): datetime(2008, 1, 13),
+        datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+    # Mon
+    offset_cases.append((Week(weekday=0), {
+        datetime(2007, 12, 31): datetime(2008, 1, 7),
+        datetime(2008, 1, 4): datetime(2008, 1, 7),
+        datetime(2008, 1, 5): datetime(2008, 1, 7),
+        datetime(2008, 1, 6): datetime(2008, 1, 7),
+        datetime(2008, 1, 7): datetime(2008, 1, 14)}))
+
+    # n=0 -> roll forward. Mon
+    offset_cases.append((Week(0, weekday=0), {
+        datetime(2007, 12, 31): datetime(2007, 12, 31),
+        datetime(2008, 1, 4): datetime(2008, 1, 7),
+        datetime(2008, 1, 5): datetime(2008, 1, 7),
+        datetime(2008, 1, 6): datetime(2008, 1, 7),
+        datetime(2008, 1, 7): datetime(2008, 1, 7)}))
+
+    # n=-2 -> roll backward two weeks; weekday=1 is Tue, not 
Mon + offset_cases.append((Week(-2, weekday=1), { + datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23)})) - for offset, cases in tests: - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) def test_onOffset(self): for weekday in range(7): @@ -2300,8 +2296,7 @@ def test_offset(self): (2, 2, 1, date1, datetime(2011, 2, 15)), (2, 2, 1, date2, datetime(2011, 2, 15)), (2, 2, 1, date3, datetime(2011, 3, 15)), - (2, 2, 1, date4, datetime(2011, 3, 15)), - ] + (2, 2, 1, date4, datetime(2011, 3, 15))] for n, week, weekday, dt, expected in test_cases: offset = WeekOfMonth(n, week=week, weekday=weekday) @@ -2314,19 +2309,18 @@ def test_offset(self): result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) assert result == datetime(2011, 2, 2) - def test_onOffset(self): - test_cases = [ - (0, 0, datetime(2011, 2, 7), True), - (0, 0, datetime(2011, 2, 6), False), - (0, 0, datetime(2011, 2, 14), False), - (1, 0, datetime(2011, 2, 14), True), - (0, 1, datetime(2011, 2, 1), True), - (0, 1, datetime(2011, 2, 8), False), - ] - - for week, weekday, dt, expected in test_cases: - offset = WeekOfMonth(week=week, weekday=weekday) - assert offset.onOffset(dt) == expected + on_offset_cases = [(0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + week, weekday, dt, expected = case + offset = WeekOfMonth(week=week, weekday=weekday) + assert offset.onOffset(dt) == expected class TestLastWeekOfMonth(Base): @@ -2380,346 +2374,32 @@ def test_offset(self): offset_sunday = LastWeekOfMonth(n=1, weekday=WeekDay.SUN) assert datetime(2013, 7, 31) + offset_sunday == datetime(2013, 8, 25) - def test_onOffset(self): - test_cases = [ - (WeekDay.SUN, datetime(2013, 1, 27), True), - (WeekDay.SAT, datetime(2013, 3, 30), True), - (WeekDay.MON, datetime(2013, 2, 18), False), # Not the last Mon - (WeekDay.SUN, datetime(2013, 2, 25), False), # Not a SUN - (WeekDay.MON, datetime(2013, 2, 25), True), - (WeekDay.SAT, datetime(2013, 11, 30), True), - - (WeekDay.SAT, datetime(2006, 8, 26), True), - (WeekDay.SAT, datetime(2007, 8, 25), True), - (WeekDay.SAT, datetime(2008, 8, 30), True), - (WeekDay.SAT, datetime(2009, 8, 29), True), - (WeekDay.SAT, datetime(2010, 8, 28), True), - (WeekDay.SAT, datetime(2011, 8, 27), True), - (WeekDay.SAT, datetime(2019, 8, 31), True), - ] - - for weekday, dt, expected in test_cases: - offset = LastWeekOfMonth(weekday=weekday) - assert offset.onOffset(dt) == expected - - -class TestBMonthBegin(Base): - _offset = BMonthBegin - - def test_offset(self): - tests = [] - - tests.append((BMonthBegin(), - {datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 1): datetime(2006, 10, 2), - datetime(2007, 1, 1): datetime(2007, 2, 1), - datetime(2006, 12, 1): datetime(2007, 1, 1)})) - - tests.append((BMonthBegin(0), - {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2006, 10, 
-                       datetime(2006, 10, 2): datetime(2006, 10, 2),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2006, 9, 15): datetime(2006, 10, 2)}))
-
-        tests.append((BMonthBegin(2),
-                      {datetime(2008, 1, 1): datetime(2008, 3, 3),
-                       datetime(2008, 1, 15): datetime(2008, 3, 3),
-                       datetime(2006, 12, 29): datetime(2007, 2, 1),
-                       datetime(2006, 12, 31): datetime(2007, 2, 1),
-                       datetime(2007, 1, 1): datetime(2007, 3, 1),
-                       datetime(2006, 11, 1): datetime(2007, 1, 1)}))
-
-        tests.append((BMonthBegin(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 1),
-                       datetime(2008, 6, 30): datetime(2008, 6, 2),
-                       datetime(2008, 6, 1): datetime(2008, 5, 1),
-                       datetime(2008, 3, 10): datetime(2008, 3, 3),
-                       datetime(2008, 12, 31): datetime(2008, 12, 1),
-                       datetime(2006, 12, 29): datetime(2006, 12, 1),
-                       datetime(2006, 12, 30): datetime(2006, 12, 1),
-                       datetime(2007, 1, 1): datetime(2006, 12, 1)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_onOffset(self):
-
-        tests = [(BMonthBegin(), datetime(2007, 12, 31), False),
-                 (BMonthBegin(), datetime(2008, 1, 1), True),
-                 (BMonthBegin(), datetime(2001, 4, 2), True),
-                 (BMonthBegin(), datetime(2008, 3, 3), True)]
-
-        for offset, dt, expected in tests:
-            assert_onOffset(offset, dt, expected)
-
-    def test_offsets_compare_equal(self):
-        # root cause of #456
-        offset1 = BMonthBegin()
-        offset2 = BMonthBegin()
-        assert not offset1 != offset2
-
-
-class TestBMonthEnd(Base):
-    _offset = BMonthEnd
-
-    def test_offset(self):
-        tests = []
-
-        tests.append((BMonthEnd(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 29),
-                       datetime(2006, 12, 29): datetime(2007, 1, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31),
-                       datetime(2006, 12, 1): datetime(2006, 12, 29)}))
-
-        tests.append((BMonthEnd(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 29),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31)}))
-
-        tests.append((BMonthEnd(2),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                       datetime(2008, 1, 31): datetime(2008, 3, 31),
-                       datetime(2006, 12, 29): datetime(2007, 2, 28),
-                       datetime(2006, 12, 31): datetime(2007, 2, 28),
-                       datetime(2007, 1, 1): datetime(2007, 2, 28),
-                       datetime(2006, 11, 1): datetime(2006, 12, 29)}))
-
-        tests.append((BMonthEnd(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 29),
-                       datetime(2008, 6, 30): datetime(2008, 5, 30),
-                       datetime(2008, 12, 31): datetime(2008, 11, 28),
-                       datetime(2006, 12, 29): datetime(2006, 11, 30),
-                       datetime(2006, 12, 30): datetime(2006, 12, 29),
-                       datetime(2007, 1, 1): datetime(2006, 12, 29)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_normalize(self):
-        dt = datetime(2007, 1, 1, 3)
-
-        result = dt + BMonthEnd(normalize=True)
-        expected = dt.replace(hour=0) + BMonthEnd()
-        assert result == expected
-
-    def test_onOffset(self):
-
-        tests = [(BMonthEnd(), datetime(2007, 12, 31), True),
-                 (BMonthEnd(), datetime(2008, 1, 1), False)]
-
-        for offset, dt, expected in tests:
-            assert_onOffset(offset, dt, expected)
-
-    def test_offsets_compare_equal(self):
-        # root cause of #456
-        offset1 = BMonthEnd()
-        offset2 = BMonthEnd()
-        assert not offset1 != offset2
-
-
-class TestMonthBegin(Base):
-    _offset = MonthBegin
-
-    def test_offset(self):
-        tests = []
-
-        # NOTE: I'm not entirely happy with the logic here for Begin -ss
-        # see thread 'offset conventions' on the ML
-        tests.append((MonthBegin(),
-                      {datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2008, 2, 1): datetime(2008, 3, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2006, 12, 1): datetime(2007, 1, 1),
-                       datetime(2007, 1, 31): datetime(2007, 2, 1)}))
-
-        tests.append((MonthBegin(0),
-                      {datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2006, 12, 3): datetime(2007, 1, 1),
-                       datetime(2007, 1, 31): datetime(2007, 2, 1)}))
-
-        tests.append((MonthBegin(2),
-                      {datetime(2008, 2, 29): datetime(2008, 4, 1),
-                       datetime(2008, 1, 31): datetime(2008, 3, 1),
-                       datetime(2006, 12, 31): datetime(2007, 2, 1),
-                       datetime(2007, 12, 28): datetime(2008, 2, 1),
-                       datetime(2007, 1, 1): datetime(2007, 3, 1),
-                       datetime(2006, 11, 1): datetime(2007, 1, 1)}))
-
-        tests.append((MonthBegin(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 1),
-                       datetime(2008, 5, 31): datetime(2008, 5, 1),
-                       datetime(2008, 12, 31): datetime(2008, 12, 1),
-                       datetime(2006, 12, 29): datetime(2006, 12, 1),
-                       datetime(2006, 1, 2): datetime(2006, 1, 1)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-
-class TestMonthEnd(Base):
-    _offset = MonthEnd
-
-    def test_offset(self):
-        tests = []
-
-        tests.append((MonthEnd(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 29),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31),
-                       datetime(2006, 12, 1): datetime(2006, 12, 31)}))
-
-        tests.append((MonthEnd(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31)}))
-
-        tests.append((MonthEnd(2),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                       datetime(2008, 1, 31): datetime(2008, 3, 31),
-                       datetime(2006, 12, 29): datetime(2007, 1, 31),
-                       datetime(2006, 12, 31): datetime(2007, 2, 28),
-                       datetime(2007, 1, 1): datetime(2007, 2, 28),
-                       datetime(2006, 11, 1): datetime(2006, 12, 31)}))
-
-        tests.append((MonthEnd(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 5, 31),
-                       datetime(2008, 12, 31): datetime(2008, 11, 30),
-                       datetime(2006, 12, 29): datetime(2006, 11, 30),
-                       datetime(2006, 12, 30): datetime(2006, 11, 30),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        for offset, cases in tests:
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_day_of_month(self):
-        dt = datetime(2007, 1, 1)
-        offset = MonthEnd()
-
-        result = dt + offset
-        assert result == Timestamp(2007, 1, 31)
-
-        result = result + offset
-        assert result == Timestamp(2007, 2, 28)
-
-    def test_normalize(self):
-        dt = datetime(2007, 1, 1, 3)
-
-        result = dt + MonthEnd(normalize=True)
-        expected = dt.replace(hour=0) + MonthEnd()
-        assert result == expected
-
-    def test_onOffset(self):
-
-        tests = [(MonthEnd(), datetime(2007, 12, 31), True),
-                 (MonthEnd(), datetime(2008, 1, 1), False)]
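+    # (weekday, dt, expected) triples: expected is True exactly when dt is
+    # the last occurrence of that weekday in its month.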
+    on_offset_cases = [
+        (WeekDay.SUN, datetime(2013, 1, 27), True),
+        (WeekDay.SAT, datetime(2013, 3, 30), True),
+        (WeekDay.MON, datetime(2013, 2, 18), False),  # Not the last Mon
+        (WeekDay.SUN, datetime(2013, 2, 25), False),  # Not a SUN
+        (WeekDay.MON, datetime(2013, 2, 25), True),
+        (WeekDay.SAT, datetime(2013, 11, 30), True),
+
+        (WeekDay.SAT, datetime(2006, 8, 26), True),
+        (WeekDay.SAT, datetime(2007, 8, 25), True),
+        (WeekDay.SAT, datetime(2008, 8, 30), True),
+        (WeekDay.SAT, datetime(2009, 8, 29), True),
+        (WeekDay.SAT, datetime(2010, 8, 28), True),
+        (WeekDay.SAT, datetime(2011, 8, 27), True),
+        (WeekDay.SAT, datetime(2019, 8, 31), True)]
 
-        for offset, dt, expected in tests:
-            assert_onOffset(offset, dt, expected)
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        weekday, dt, expected = case
+        offset = LastWeekOfMonth(weekday=weekday)
+        assert offset.onOffset(dt) == expected
 
 
 class TestSemiMonthEnd(Base):
     _offset = SemiMonthEnd
 
-    def _get_tests(self):
-        tests = []
-
-        tests.append((SemiMonthEnd(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 15): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 15),
-                       datetime(2006, 12, 14): datetime(2006, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 15),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15),
-                       datetime(2006, 12, 1): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(day_of_month=20),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 20),
-                       datetime(2008, 1, 15): datetime(2008, 1, 20),
-                       datetime(2008, 1, 21): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 20),
-                       datetime(2006, 12, 14): datetime(2006, 12, 20),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2007, 1, 20),
-                       datetime(2007, 1, 1): datetime(2007, 1, 20),
-                       datetime(2006, 12, 1): datetime(2006, 12, 20),
-                       datetime(2006, 12, 15): datetime(2006, 12, 20)}))
-
-        tests.append((SemiMonthEnd(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 16): datetime(2008, 1, 31),
-                       datetime(2008, 1, 15): datetime(2008, 1, 15),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15)}))
-
-        tests.append((SemiMonthEnd(0, day_of_month=16),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 16),
-                       datetime(2008, 1, 16): datetime(2008, 1, 16),
-                       datetime(2008, 1, 15): datetime(2008, 1, 16),
-                       datetime(2008, 1, 31): datetime(2008, 1, 31),
-                       datetime(2006, 12, 29): datetime(2006, 12, 31),
-                       datetime(2006, 12, 31): datetime(2006, 12, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 16)}))
-
-        tests.append((SemiMonthEnd(2),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                       datetime(2008, 1, 31): datetime(2008, 2, 29),
-                       datetime(2006, 12, 29): datetime(2007, 1, 15),
-                       datetime(2006, 12, 31): datetime(2007, 1, 31),
-                       datetime(2007, 1, 1): datetime(2007, 1, 31),
-                       datetime(2007, 1, 16): datetime(2007, 2, 15),
-                       datetime(2006, 11, 1): datetime(2006, 11, 30)}))
-
-        tests.append((SemiMonthEnd(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 6, 15),
-                       datetime(2008, 12, 31): datetime(2008, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 15),
-                       datetime(2006, 12, 30): datetime(2006, 12, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(-1, day_of_month=4),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 31),
-                       datetime(2007, 1, 4): datetime(2006, 12, 31),
-                       datetime(2008, 6, 30): datetime(2008, 6, 4),
-                       datetime(2008, 12, 31): datetime(2008, 12, 4),
-                       datetime(2006, 12, 5): datetime(2006, 12, 4),
-                       datetime(2006, 12, 30): datetime(2006, 12, 4),
-                       datetime(2007, 1, 1): datetime(2006, 12, 31)}))
-
-        tests.append((SemiMonthEnd(-2),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 15),
-                       datetime(2008, 6, 30): datetime(2008, 5, 31),
-                       datetime(2008, 3, 15): datetime(2008, 2, 15),
-                       datetime(2008, 12, 31): datetime(2008, 11, 30),
-                       datetime(2006, 12, 29): datetime(2006, 11, 30),
-                       datetime(2006, 12, 14): datetime(2006, 11, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 15)}))
-
-        return tests
-
     def test_offset_whole_year(self):
         dates = (datetime(2007, 12, 31),
                  datetime(2008, 1, 15),
@@ -2761,28 +2441,107 @@ def test_offset_whole_year(self):
         exp = DatetimeIndex(dates)
         tm.assert_index_equal(result, exp)
 
-    def test_offset(self):
-        for offset, cases in self._get_tests():
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
-
-    def test_apply_index(self):
-        for offset, cases in self._get_tests():
-            s = DatetimeIndex(cases.keys())
-            result = offset.apply_index(s)
-            exp = DatetimeIndex(cases.values())
-            tm.assert_index_equal(result, exp)
-
-    def test_onOffset(self):
-
-        tests = [(datetime(2007, 12, 31), True),
-                 (datetime(2007, 12, 15), True),
-                 (datetime(2007, 12, 14), False),
-                 (datetime(2007, 12, 1), False),
-                 (datetime(2008, 2, 29), True)]
+    offset_cases = []
+    offset_cases.append((SemiMonthEnd(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 15): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 15),
+        datetime(2006, 12, 14): datetime(2006, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2007, 1, 15),
+        datetime(2007, 1, 1): datetime(2007, 1, 15),
+        datetime(2006, 12, 1): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(day_of_month=20), {
+        datetime(2008, 1, 1): datetime(2008, 1, 20),
+        datetime(2008, 1, 15): datetime(2008, 1, 20),
+        datetime(2008, 1, 21): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 20),
+        datetime(2006, 12, 14): datetime(2006, 12, 20),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2007, 1, 20),
+        datetime(2007, 1, 1): datetime(2007, 1, 20),
+        datetime(2006, 12, 1): datetime(2006, 12, 20),
+        datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+    offset_cases.append((SemiMonthEnd(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 16): datetime(2008, 1, 31),
+        datetime(2008, 1, 15): datetime(2008, 1, 15),
+        datetime(2008, 1, 31): datetime(2008, 1, 31),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2006, 12, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 15)}))
+
+    offset_cases.append((SemiMonthEnd(0, day_of_month=16), {
+        datetime(2008, 1, 1): datetime(2008, 1, 16),
+        datetime(2008, 1, 16): datetime(2008, 1, 16),
+        datetime(2008, 1, 15): datetime(2008, 1, 16),
+        datetime(2008, 1, 31): datetime(2008, 1, 31),
+        datetime(2006, 12, 29): datetime(2006, 12, 31),
+        datetime(2006, 12, 31): datetime(2006, 12, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 16)}))
+
+    offset_cases.append((SemiMonthEnd(2), {
+        datetime(2008, 1, 1): datetime(2008, 1, 31),
+        datetime(2008, 1, 31): datetime(2008, 2, 29),
+        datetime(2006, 12, 29): datetime(2007, 1, 15),
+        datetime(2006, 12, 31): datetime(2007, 1, 31),
+        datetime(2007, 1, 1): datetime(2007, 1, 31),
+        datetime(2007, 1, 16): datetime(2007, 2, 15),
+        datetime(2006, 11, 1): datetime(2006, 11, 30)}))
+
+    offset_cases.append((SemiMonthEnd(-1), {
+        datetime(2007, 1, 1): datetime(2006, 12, 31),
+        datetime(2008, 6, 30): datetime(2008, 6, 15),
+        datetime(2008, 12, 31): datetime(2008, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 15),
+        datetime(2006, 12, 30): datetime(2006, 12, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(-1, day_of_month=4), {
+        datetime(2007, 1, 1): datetime(2006, 12, 31),
+        datetime(2007, 1, 4): datetime(2006, 12, 31),
+        datetime(2008, 6, 30): datetime(2008, 6, 4),
+        datetime(2008, 12, 31): datetime(2008, 12, 4),
+        datetime(2006, 12, 5): datetime(2006, 12, 4),
+        datetime(2006, 12, 30): datetime(2006, 12, 4),
+        datetime(2007, 1, 1): datetime(2006, 12, 31)}))
+
+    offset_cases.append((SemiMonthEnd(-2), {
+        datetime(2007, 1, 1): datetime(2006, 12, 15),
+        datetime(2008, 6, 30): datetime(2008, 5, 31),
+        datetime(2008, 3, 15): datetime(2008, 2, 15),
+        datetime(2008, 12, 31): datetime(2008, 11, 30),
+        datetime(2006, 12, 29): datetime(2006, 11, 30),
+        datetime(2006, 12, 14): datetime(2006, 11, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 15)}))
 
-        for dt, expected in tests:
-            assert_onOffset(SemiMonthEnd(), dt, expected)
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)
+
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_apply_index(self, case):
+        offset, cases = case
+        s = DatetimeIndex(cases.keys())
+        result = offset.apply_index(s)
+        exp = DatetimeIndex(cases.values())
+        tm.assert_index_equal(result, exp)
+
+    on_offset_cases = [(datetime(2007, 12, 31), True),
+                       (datetime(2007, 12, 15), True),
+                       (datetime(2007, 12, 14), False),
+                       (datetime(2007, 12, 1), False),
+                       (datetime(2008, 2, 29), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        dt, expected = case
+        assert_onOffset(SemiMonthEnd(), dt, expected)
 
     @pytest.mark.parametrize('klass,assert_func',
                              [(Series, tm.assert_series_equal),
@@ -2811,91 +2570,6 @@ def test_vectorized_offset_addition(self, klass, assert_func):
 class TestSemiMonthBegin(Base):
     _offset = SemiMonthBegin
 
-    def _get_tests(self):
-        tests = []
-
-        tests.append((SemiMonthBegin(),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 15),
-                       datetime(2008, 1, 15): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 14): datetime(2006, 12, 15),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 1): datetime(2007, 1, 15),
-                       datetime(2006, 12, 1): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(day_of_month=20),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 20),
-                       datetime(2008, 1, 15): datetime(2008, 1, 20),
-                       datetime(2008, 1, 21): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 14): datetime(2006, 12, 20),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 1): datetime(2007, 1, 20),
-                       datetime(2006, 12, 1): datetime(2006, 12, 20),
-                       datetime(2006, 12, 15): datetime(2006, 12, 20)}))
-
-        tests.append((SemiMonthBegin(0),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 16): datetime(2008, 2, 1),
-                       datetime(2008, 1, 15): datetime(2008, 1, 15),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 2): datetime(2006, 12, 15),
-                       datetime(2007, 1, 1): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(0, day_of_month=16),
-                      {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                       datetime(2008, 1, 16): datetime(2008, 1, 16),
-                       datetime(2008, 1, 15): datetime(2008, 1, 16),
-                       datetime(2008, 1, 31): datetime(2008, 2, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 1),
-                       datetime(2006, 12, 31): datetime(2007, 1, 1),
-                       datetime(2007, 1, 5): datetime(2007, 1, 16),
-                       datetime(2007, 1, 1): datetime(2007, 1, 1)}))
-
-        tests.append((SemiMonthBegin(2),
-                      {datetime(2008, 1, 1): datetime(2008, 2, 1),
-                       datetime(2008, 1, 31): datetime(2008, 2, 15),
-                       datetime(2006, 12, 1): datetime(2007, 1, 1),
-                       datetime(2006, 12, 29): datetime(2007, 1, 15),
-                       datetime(2006, 12, 15): datetime(2007, 1, 15),
-                       datetime(2007, 1, 1): datetime(2007, 2, 1),
-                       datetime(2007, 1, 16): datetime(2007, 2, 15),
-                       datetime(2006, 11, 1): datetime(2006, 12, 1)}))
-
-        tests.append((SemiMonthBegin(-1),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 15),
-                       datetime(2008, 6, 30): datetime(2008, 6, 15),
-                       datetime(2008, 6, 14): datetime(2008, 6, 1),
-                       datetime(2008, 12, 31): datetime(2008, 12, 15),
-                       datetime(2006, 12, 29): datetime(2006, 12, 15),
-                       datetime(2006, 12, 15): datetime(2006, 12, 1),
-                       datetime(2007, 1, 1): datetime(2006, 12, 15)}))
-
-        tests.append((SemiMonthBegin(-1, day_of_month=4),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 4),
-                       datetime(2007, 1, 4): datetime(2007, 1, 1),
-                       datetime(2008, 6, 30): datetime(2008, 6, 4),
-                       datetime(2008, 12, 31): datetime(2008, 12, 4),
-                       datetime(2006, 12, 5): datetime(2006, 12, 4),
-                       datetime(2006, 12, 30): datetime(2006, 12, 4),
-                       datetime(2006, 12, 2): datetime(2006, 12, 1),
-                       datetime(2007, 1, 1): datetime(2006, 12, 4)}))
-
-        tests.append((SemiMonthBegin(-2),
-                      {datetime(2007, 1, 1): datetime(2006, 12, 1),
-                       datetime(2008, 6, 30): datetime(2008, 6, 1),
-                       datetime(2008, 6, 14): datetime(2008, 5, 15),
-                       datetime(2008, 12, 31): datetime(2008, 12, 1),
-                       datetime(2006, 12, 29): datetime(2006, 12, 1),
-                       datetime(2006, 12, 15): datetime(2006, 11, 15),
-                       datetime(2007, 1, 1): datetime(2006, 12, 1)}))
-
-        return tests
-
     def test_offset_whole_year(self):
         dates = (datetime(2007, 12, 15),
                  datetime(2008, 1, 1),
@@ -2937,27 +2611,111 @@ def test_offset_whole_year(self):
         exp = DatetimeIndex(dates)
         tm.assert_index_equal(result, exp)
 
-    def test_offset(self):
-        for offset, cases in self._get_tests():
-            for base, expected in compat.iteritems(cases):
-                assert_offset_equal(offset, base, expected)
+    offset_cases = []
+    offset_cases.append((SemiMonthBegin(), {
+        datetime(2008, 1, 1): datetime(2008, 1, 15),
+        datetime(2008, 1, 15): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 14): datetime(2006, 12, 15),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 1): datetime(2007, 1, 15),
+        datetime(2006, 12, 1): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2007, 1, 1)}))
+
+    offset_cases.append((SemiMonthBegin(day_of_month=20), {
+        datetime(2008, 1, 1): datetime(2008, 1, 20),
+        datetime(2008, 1, 15): datetime(2008, 1, 20),
+        datetime(2008, 1, 21): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 14): datetime(2006, 12, 20),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 1): datetime(2007, 1, 20),
+        datetime(2006, 12, 1): datetime(2006, 12, 20),
+        datetime(2006, 12, 15): datetime(2006, 12, 20)}))
+
+    offset_cases.append((SemiMonthBegin(0), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 16): datetime(2008, 2, 1),
+        datetime(2008, 1, 15): datetime(2008, 1, 15),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 2): datetime(2006, 12, 15),
+        datetime(2007, 1, 1): datetime(2007, 1, 1)}))
 
-    def test_apply_index(self):
-        for offset, cases in self._get_tests():
-            s = DatetimeIndex(cases.keys())
-            result = offset.apply_index(s)
-            exp = DatetimeIndex(cases.values())
-            tm.assert_index_equal(result, exp)
+    offset_cases.append((SemiMonthBegin(0, day_of_month=16), {
+        datetime(2008, 1, 1): datetime(2008, 1, 1),
+        datetime(2008, 1, 16): datetime(2008, 1, 16),
+        datetime(2008, 1, 15): datetime(2008, 1, 16),
+        datetime(2008, 1, 31): datetime(2008, 2, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 1),
+        datetime(2006, 12, 31): datetime(2007, 1, 1),
+        datetime(2007, 1, 5): datetime(2007, 1, 16),
+        datetime(2007, 1, 1): datetime(2007, 1, 1)}))
 
-    def test_onOffset(self):
-        tests = [(datetime(2007, 12, 1), True),
-                 (datetime(2007, 12, 15), True),
-                 (datetime(2007, 12, 14), False),
-                 (datetime(2007, 12, 31), False),
-                 (datetime(2008, 2, 15), True)]
+    offset_cases.append((SemiMonthBegin(2), {
+        datetime(2008, 1, 1): datetime(2008, 2, 1),
+        datetime(2008, 1, 31): datetime(2008, 2, 15),
+        datetime(2006, 12, 1): datetime(2007, 1, 1),
+        datetime(2006, 12, 29): datetime(2007, 1, 15),
+        datetime(2006, 12, 15): datetime(2007, 1, 15),
+        datetime(2007, 1, 1): datetime(2007, 2, 1),
+        datetime(2007, 1, 16): datetime(2007, 2, 15),
+        datetime(2006, 11, 1): datetime(2006, 12, 1)}))
+
+    offset_cases.append((SemiMonthBegin(-1), {
+        datetime(2007, 1, 1): datetime(2006, 12, 15),
+        datetime(2008, 6, 30): datetime(2008, 6, 15),
+        datetime(2008, 6, 14): datetime(2008, 6, 1),
+        datetime(2008, 12, 31): datetime(2008, 12, 15),
+        datetime(2006, 12, 29): datetime(2006, 12, 15),
+        datetime(2006, 12, 15): datetime(2006, 12, 1),
+        datetime(2007, 1, 1): datetime(2006, 12, 15)}))
+
+    offset_cases.append((SemiMonthBegin(-1, day_of_month=4), {
+        datetime(2007, 1, 1): datetime(2006, 12, 4),
+        datetime(2007, 1, 4): datetime(2007, 1, 1),
+        datetime(2008, 6, 30): datetime(2008, 6, 4),
+        datetime(2008, 12, 31): datetime(2008, 12, 4),
+        datetime(2006, 12, 5): datetime(2006, 12, 4),
+        datetime(2006, 12, 30): datetime(2006, 12, 4),
+        datetime(2006, 12, 2): datetime(2006, 12, 1),
+        datetime(2007, 1, 1): datetime(2006, 12, 4)}))
+
+    offset_cases.append((SemiMonthBegin(-2), {
+        datetime(2007, 1, 1): datetime(2006, 12, 1),
+        datetime(2008, 6, 30): datetime(2008, 6, 1),
+        datetime(2008, 6, 14): datetime(2008, 5, 15),
+        datetime(2008, 12, 31): datetime(2008, 12, 1),
+        datetime(2006, 12, 29): datetime(2006, 12, 1),
+        datetime(2006, 12, 15): datetime(2006, 11, 15),
+        datetime(2007, 1, 1): datetime(2006, 12, 1)}))
 
-        for dt, expected in tests:
-            assert_onOffset(SemiMonthBegin(), dt, expected)
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in compat.iteritems(cases):
+            assert_offset_equal(offset, base, expected)
+
+    @pytest.mark.parametrize('case', offset_cases)
+    def test_apply_index(self, case):
+        offset, cases = case
+        s = DatetimeIndex(cases.keys())
+        result = offset.apply_index(s)
+        exp = DatetimeIndex(cases.values())
+        tm.assert_index_equal(result, exp)
+
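+    # (dt, expected) pairs checked against SemiMonthBegin().onOffset below.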
+    on_offset_cases = [(datetime(2007, 12, 1), True),
+                       (datetime(2007, 12, 15), True),
+                       (datetime(2007, 12, 14), False),
+                       (datetime(2007, 12, 31), False),
+                       (datetime(2008, 2, 15), True)]
+
+    @pytest.mark.parametrize('case', on_offset_cases)
+    def test_onOffset(self, case):
+        dt, expected = case
+        assert_onOffset(SemiMonthBegin(), dt, expected)
 
     @pytest.mark.parametrize('klass,assert_func',
                              [(Series, tm.assert_series_equal),
@@ -2982,1276 +2740,6 @@ def test_vectorized_offset_addition(self, klass, assert_func):
         assert_func(result2, exp)
 
 
-class TestBQuarterBegin(Base):
-    _offset = BQuarterBegin
-
-    def test_repr(self):
-        assert (repr(BQuarterBegin()) ==
-                "<BusinessQuarterBegin: startingMonth=3>")
-        assert (repr(BQuarterBegin(startingMonth=3)) ==
-                "<BusinessQuarterBegin: startingMonth=3>")
-        assert (repr(BQuarterBegin(startingMonth=1)) ==
-                "<BusinessQuarterBegin: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert BQuarterBegin(startingMonth=1).isAnchored()
-        assert BQuarterBegin().isAnchored()
-        assert not BQuarterBegin(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((BQuarterBegin(startingMonth=1), {
-        datetime(2008, 1, 1): datetime(2008, 4, 1),
-        datetime(2008, 1, 31): datetime(2008, 4, 1),
-        datetime(2008, 2, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 29): datetime(2008, 4, 1),
-        datetime(2008, 3, 15): datetime(2008, 4, 1),
-        datetime(2008, 3, 31): datetime(2008, 4, 1),
-        datetime(2008, 4, 15): datetime(2008, 7, 1),
-        datetime(2007, 3, 15): datetime(2007, 4, 2),
-        datetime(2007, 2, 28): datetime(2007, 4, 2),
-        datetime(2007, 1, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 15): datetime(2007, 7, 2),
-        datetime(2007, 7, 1): datetime(2007, 7, 2),
-        datetime(2007, 4, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 2): datetime(2007, 7, 2),
-        datetime(2008, 4, 30): datetime(2008, 7, 1)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=2), {
-        datetime(2008, 1, 1): datetime(2008, 2, 1),
-        datetime(2008, 1, 31): datetime(2008, 2, 1),
-        datetime(2008, 1, 15): datetime(2008, 2, 1),
-        datetime(2008, 2, 29): datetime(2008, 5, 1),
-        datetime(2008, 3, 15): datetime(2008, 5, 1),
-        datetime(2008, 3, 31): datetime(2008, 5, 1),
-        datetime(2008, 4, 15): datetime(2008, 5, 1),
-        datetime(2008, 8, 15): datetime(2008, 11, 3),
-        datetime(2008, 9, 15): datetime(2008, 11, 3),
-        datetime(2008, 11, 1): datetime(2008, 11, 3),
-        datetime(2008, 4, 30): datetime(2008, 5, 1)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=0), {
-        datetime(2008, 1, 1): datetime(2008, 1, 1),
-        datetime(2007, 12, 31): datetime(2008, 1, 1),
-        datetime(2008, 2, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 29): datetime(2008, 4, 1),
-        datetime(2008, 1, 15): datetime(2008, 4, 1),
-        datetime(2008, 2, 27): datetime(2008, 4, 1),
-        datetime(2008, 3, 15): datetime(2008, 4, 1),
-        datetime(2007, 4, 1): datetime(2007, 4, 2),
-        datetime(2007, 4, 2): datetime(2007, 4, 2),
-        datetime(2007, 7, 1): datetime(2007, 7, 2),
-        datetime(2007, 4, 15): datetime(2007, 7, 2),
-        datetime(2007, 7, 2): datetime(2007, 7, 2)}))
-
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), {
-        datetime(2008, 1, 1): datetime(2007, 10, 1),
-        datetime(2008, 1, 31): datetime(2008, 1, 1),
-        datetime(2008, 2, 15): datetime(2008, 1, 1),
-        datetime(2008, 2, 29): datetime(2008, 1, 1),
-        datetime(2008, 3, 15): datetime(2008, 1, 1),
-        datetime(2008, 3, 31): datetime(2008, 1, 1),
-        datetime(2008, 4, 15): datetime(2008, 4, 1),
-        datetime(2007, 7, 3): datetime(2007, 7, 2),
-        datetime(2007, 4, 3): datetime(2007, 4, 2),
-        datetime(2007, 7, 2): datetime(2007, 4, 2),
-        datetime(2008, 4, 1): datetime(2008, 1, 1)}))
-    offset_cases.append((BQuarterBegin(startingMonth=1, n=2), {
-        datetime(2008, 1, 1): datetime(2008, 7, 1),
-        datetime(2008, 1, 15): datetime(2008, 7, 1),
-        datetime(2008, 2, 29): datetime(2008, 7, 1),
-        datetime(2008, 3, 15): datetime(2008, 7, 1),
-        datetime(2007, 3, 31): datetime(2007, 7, 2),
-        datetime(2007, 4, 15): datetime(2007, 10, 1),
-        datetime(2008, 4, 30): datetime(2008, 10, 1)}))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = BQuarterBegin(n=-1, startingMonth=1)
-        assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2)
-
-
-class TestBQuarterEnd(Base):
-    _offset = BQuarterEnd
-
-    def test_repr(self):
-        assert (repr(BQuarterEnd()) ==
-                "<BusinessQuarterEnd: startingMonth=3>")
-        assert (repr(BQuarterEnd(startingMonth=3)) ==
-                "<BusinessQuarterEnd: startingMonth=3>")
-        assert (repr(BQuarterEnd(startingMonth=1)) ==
-                "<BusinessQuarterEnd: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert BQuarterEnd(startingMonth=1).isAnchored()
-        assert BQuarterEnd().isAnchored()
-        assert not BQuarterEnd(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((BQuarterEnd(startingMonth=1),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 4, 30),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 7, 31), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                          datetime(2008, 1, 31): datetime(2008, 2, 29),
-                          datetime(2008, 2, 15): datetime(2008, 2, 29),
-                          datetime(2008, 2, 29): datetime(2008, 5, 30),
-                          datetime(2008, 3, 15): datetime(2008, 5, 30),
-                          datetime(2008, 3, 31): datetime(2008, 5, 30),
-                          datetime(2008, 4, 15): datetime(2008, 5, 30),
-                          datetime(2008, 4, 30): datetime(2008, 5, 30), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 1, 31),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 4, 30), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 31),
-                          datetime(2008, 1, 31): datetime(2007, 10, 31),
-                          datetime(2008, 2, 15): datetime(2008, 1, 31),
-                          datetime(2008, 2, 29): datetime(2008, 1, 31),
-                          datetime(2008, 3, 15): datetime(2008, 1, 31),
-                          datetime(2008, 3, 31): datetime(2008, 1, 31),
-                          datetime(2008, 4, 15): datetime(2008, 1, 31),
-                          datetime(2008, 4, 30): datetime(2008, 1, 31), }))
-
-    offset_cases.append((BQuarterEnd(startingMonth=1, n=2),
-                         {datetime(2008, 1, 31): datetime(2008, 7, 31),
-                          datetime(2008, 2, 15): datetime(2008, 7, 31),
-                          datetime(2008, 2, 29): datetime(2008, 7, 31),
-                          datetime(2008, 3, 15): datetime(2008, 7, 31),
-                          datetime(2008, 3, 31): datetime(2008, 7, 31),
-                          datetime(2008, 4, 15): datetime(2008, 7, 31),
-                          datetime(2008, 4, 30): datetime(2008, 10, 31), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = BQuarterEnd(n=-1, startingMonth=1)
-        assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29)
-
-    on_offset_cases = [
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
-        (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
-        (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
-        (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True),
-        (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-
-def makeFY5253LastOfMonthQuarter(*args, **kwds):
-    return FY5253Quarter(*args, variation="last", **kwds)
-
-
-def makeFY5253NearestEndMonthQuarter(*args, **kwds):
-    return FY5253Quarter(*args, variation="nearest", **kwds)
-
-
-def makeFY5253NearestEndMonth(*args, **kwds):
-    return FY5253(*args, variation="nearest", **kwds)
-
-
-def makeFY5253LastOfMonth(*args, **kwds):
-    return FY5253(*args, variation="last", **kwds)
-
-
-class TestFY5253LastOfMonth(Base):
-    offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8,
-                                               weekday=WeekDay.SAT)
-    offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9,
-                                               weekday=WeekDay.SAT)
-
-    on_offset_cases = [
-        # From Wikipedia (see:
-        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end)
-        (offset_lom_sat_aug, datetime(2006, 8, 26), True),
-        (offset_lom_sat_aug, datetime(2007, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2008, 8, 30), True),
-        (offset_lom_sat_aug, datetime(2009, 8, 29), True),
-        (offset_lom_sat_aug, datetime(2010, 8, 28), True),
-        (offset_lom_sat_aug, datetime(2011, 8, 27), True),
-        (offset_lom_sat_aug, datetime(2012, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2013, 8, 31), True),
-        (offset_lom_sat_aug, datetime(2014, 8, 30), True),
-        (offset_lom_sat_aug, datetime(2015, 8, 29), True),
-        (offset_lom_sat_aug, datetime(2016, 8, 27), True),
-        (offset_lom_sat_aug, datetime(2017, 8, 26), True),
-        (offset_lom_sat_aug, datetime(2018, 8, 25), True),
-        (offset_lom_sat_aug, datetime(2019, 8, 31), True),
-
-        (offset_lom_sat_aug, datetime(2006, 8, 27), False),
-        (offset_lom_sat_aug, datetime(2007, 8, 28), False),
-        (offset_lom_sat_aug, datetime(2008, 8, 31), False),
-        (offset_lom_sat_aug, datetime(2009, 8, 30), False),
-        (offset_lom_sat_aug, datetime(2010, 8, 29), False),
-        (offset_lom_sat_aug, datetime(2011, 8, 28), False),
-
-        (offset_lom_sat_aug, datetime(2006, 8, 25), False),
-        (offset_lom_sat_aug, datetime(2007, 8, 24), False),
-        (offset_lom_sat_aug, datetime(2008, 8, 29), False),
-        (offset_lom_sat_aug, datetime(2009, 8, 28), False),
-        (offset_lom_sat_aug, datetime(2010, 8, 27), False),
-        (offset_lom_sat_aug, datetime(2011, 8, 26), False),
-        (offset_lom_sat_aug, datetime(2019, 8, 30), False),
-
-        # From GMCR (see for example:
-        # http://yahoo.brand.edgar-online.com/Default.aspx?
-        # companyid=3184&formtypeID=7)
-        (offset_lom_sat_sep, datetime(2010, 9, 25), True),
-        (offset_lom_sat_sep, datetime(2011, 9, 24), True),
-        (offset_lom_sat_sep, datetime(2012, 9, 29), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_apply(self):
-        offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8,
-                                                   weekday=WeekDay.SAT)
-        offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8,
-                                                     weekday=WeekDay.SAT)
-
-        date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25),
-                                datetime(2008, 8, 30), datetime(2009, 8, 29),
-                                datetime(2010, 8, 28), datetime(2011, 8, 27),
-                                datetime(2012, 8, 25), datetime(2013, 8, 31),
-                                datetime(2014, 8, 30), datetime(2015, 8, 29),
-                                datetime(2016, 8, 27)]
-
-        tests = [
-            (offset_lom_aug_sat, date_seq_lom_aug_sat),
-            (offset_lom_aug_sat_1, date_seq_lom_aug_sat),
-            (offset_lom_aug_sat, [
-                datetime(2006, 8, 25)] + date_seq_lom_aug_sat),
-            (offset_lom_aug_sat_1, [
-                datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]),
-            (makeFY5253LastOfMonth(n=-1, startingMonth=8,
-                                   weekday=WeekDay.SAT),
-             list(reversed(date_seq_lom_aug_sat))),
-        ]
-        for test in tests:
-            offset, data = test
-            current = data[0]
-            for datum in data[1:]:
-                current = current + offset
-                assert current == datum
-
-
-class TestFY5253NearestEndMonth(Base):
-
-    def test_get_target_month_end(self):
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=12, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 12, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=2, weekday=WeekDay.SAT).get_target_month_end(
-            datetime(2013, 1, 1)) == datetime(2013, 2, 28))
-
-    def test_get_year_end(self):
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SAT).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 31))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.SUN).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 9, 1))
-        assert (makeFY5253NearestEndMonth(
-            startingMonth=8, weekday=WeekDay.FRI).get_year_end(
-            datetime(2013, 1, 1)) == datetime(2013, 8, 30))
-
-        offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                          variation="nearest")
-        assert (offset_n.get_year_end(datetime(2012, 1, 1)) ==
-                datetime(2013, 1, 1))
-        assert (offset_n.get_year_end(datetime(2012, 1, 10)) ==
-                datetime(2013, 1, 1))
-
-        assert (offset_n.get_year_end(datetime(2013, 1, 1)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 2)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 3)) ==
-                datetime(2013, 12, 31))
-        assert (offset_n.get_year_end(datetime(2013, 1, 10)) ==
-                datetime(2013, 12, 31))
-
-        JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest")
-        assert (JNJ.get_year_end(datetime(2006, 1, 1)) ==
-                datetime(2006, 12, 31))
-
-    offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8,
-                                                   weekday=WeekDay.SAT)
-    offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8,
-                                                   weekday=WeekDay.THU)
-    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                      variation="nearest")
-
-    on_offset_cases = [
-        # From Wikipedia (see:
-        # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar
-        # #Saturday_nearest_the_end_of_month)
-        # 2006-09-02   2006 September 2
-        # 2007-09-01   2007 September 1
-        # 2008-08-30   2008 August 30    (leap year)
-        # 2009-08-29   2009 August 29
-        # 2010-08-28   2010 August 28
-        # 2011-09-03   2011 September 3
-        # 2012-09-01   2012 September 1  (leap year)
-        # 2013-08-31   2013 August 31
-        # 2014-08-30   2014 August 30
-        # 2015-08-29   2015 August 29
-        # 2016-09-03   2016 September 3  (leap year)
-        # 2017-09-02   2017 September 2
-        # 2018-09-01   2018 September 1
-        # 2019-08-31   2019 August 31
-        (offset_lom_aug_sat, datetime(2006, 9, 2), True),
-        (offset_lom_aug_sat, datetime(2007, 9, 1), True),
-        (offset_lom_aug_sat, datetime(2008, 8, 30), True),
-        (offset_lom_aug_sat, datetime(2009, 8, 29), True),
-        (offset_lom_aug_sat, datetime(2010, 8, 28), True),
-        (offset_lom_aug_sat, datetime(2011, 9, 3), True),
-
-        (offset_lom_aug_sat, datetime(2016, 9, 3), True),
-        (offset_lom_aug_sat, datetime(2017, 9, 2), True),
-        (offset_lom_aug_sat, datetime(2018, 9, 1), True),
-        (offset_lom_aug_sat, datetime(2019, 8, 31), True),
-
-        (offset_lom_aug_sat, datetime(2006, 8, 27), False),
-        (offset_lom_aug_sat, datetime(2007, 8, 28), False),
-        (offset_lom_aug_sat, datetime(2008, 8, 31), False),
-        (offset_lom_aug_sat, datetime(2009, 8, 30), False),
-        (offset_lom_aug_sat, datetime(2010, 8, 29), False),
-        (offset_lom_aug_sat, datetime(2011, 8, 28), False),
-
-        (offset_lom_aug_sat, datetime(2006, 8, 25), False),
-        (offset_lom_aug_sat, datetime(2007, 8, 24), False),
-        (offset_lom_aug_sat, datetime(2008, 8, 29), False),
-        (offset_lom_aug_sat, datetime(2009, 8, 28), False),
-        (offset_lom_aug_sat, datetime(2010, 8, 27), False),
-        (offset_lom_aug_sat, datetime(2011, 8, 26), False),
-        (offset_lom_aug_sat, datetime(2019, 8, 30), False),
-
-        # From Micron, see:
-        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
-        (offset_lom_aug_thu, datetime(2012, 8, 30), True),
-        (offset_lom_aug_thu, datetime(2011, 9, 1), True),
-
-        (offset_n, datetime(2012, 12, 31), False),
-        (offset_n, datetime(2013, 1, 1), True),
-        (offset_n, datetime(2013, 1, 2), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_apply(self):
-        date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1),
-                              datetime(2008, 8, 30), datetime(2009, 8, 29),
-                              datetime(2010, 8, 28), datetime(2011, 9, 3)]
-
-        JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1),
-               datetime(2006, 12, 31), datetime(2007, 12, 30),
-               datetime(2008, 12, 28), datetime(2010, 1, 3),
-               datetime(2011, 1, 2), datetime(2012, 1, 1),
-               datetime(2012, 12, 30)]
-        DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5,
-                         variation="nearest")
-
-        tests = [
-            (makeFY5253NearestEndMonth(startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT),
-             [datetime(2006, 9, 1)] + date_seq_nem_8_sat),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]),
-            (makeFY5253NearestEndMonth(n=-1, startingMonth=8,
-                                       weekday=WeekDay.SAT),
-             list(reversed(date_seq_nem_8_sat))),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN), JNJ),
-            (makeFY5253NearestEndMonth(n=-1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             list(reversed(JNJ))),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             [datetime(2005, 1, 2), datetime(2006, 1, 1)]),
-            (makeFY5253NearestEndMonth(n=1, startingMonth=12,
-                                       weekday=WeekDay.SUN),
-             [datetime(2006, 1, 2), datetime(2006, 12, 31)]),
-            (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)])
-        ]
-        for test in tests:
-            offset, data = test
-            current = data[0]
-            for datum in data[1:]:
-                current = current + offset
-                assert current == datum
-
-
-class TestFY5253LastOfMonthQuarter(Base):
-
-    def test_isAnchored(self):
-        assert makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4).isAnchored()
-        assert makeFY5253LastOfMonthQuarter(
-            weekday=WeekDay.SAT, startingMonth=3,
-            qtr_with_extra_week=4).isAnchored()
-        assert not makeFY5253LastOfMonthQuarter(
-            2, startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4).isAnchored()
-
-    def test_equality(self):
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4))
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4))
-        assert (makeFY5253LastOfMonthQuarter(
-            startingMonth=1, weekday=WeekDay.SAT,
-            qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter(
-            startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4))
-
-    def test_offset(self):
-        offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
-                                              weekday=WeekDay.SAT,
-                                              qtr_with_extra_week=4)
-        offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9,
-                                               weekday=WeekDay.SAT,
-                                               qtr_with_extra_week=4)
-        offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9,
-                                               weekday=WeekDay.SAT,
-                                               qtr_with_extra_week=4)
-
-        offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9,
-                                                   weekday=WeekDay.SAT,
-                                                   qtr_with_extra_week=4)
-        offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9,
-                                                   weekday=WeekDay.SAT,
-                                                   qtr_with_extra_week=4)
-
-        GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26),
-                datetime(2010, 9, 25), datetime(2010, 12, 25),
-                datetime(2011, 3, 26), datetime(2011, 6, 25),
-                datetime(2011, 9, 24), datetime(2011, 12, 24),
-                datetime(2012, 3, 24), datetime(2012, 6, 23),
-                datetime(2012, 9, 29), datetime(2012, 12, 29),
-                datetime(2013, 3, 30), datetime(2013, 6, 29)]
-
-        assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1])
-        assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1),
-                            expected=GMCR[0])
-        assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2])
-
-        assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2])
-        assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4])
-
-        assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2])
-        assert_offset_equal(offset_neg1,
-                            base=GMCR[-1] + relativedelta(days=+1),
-                            expected=GMCR[-1])
-        assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3])
-
-        date = GMCR[0] + relativedelta(days=-1)
-        for expected in GMCR:
-            assert_offset_equal(offset, date, expected)
-            date = date + offset
-
-        date = GMCR[-1] + relativedelta(days=+1)
-        for expected in reversed(GMCR):
-            assert_offset_equal(offset_neg1, date, expected)
-            date = date + offset_neg1
-
-    lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8,
-                                                  weekday=WeekDay.SAT,
-                                                  qtr_with_extra_week=4)
-    lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9,
-                                                  weekday=WeekDay.SAT,
-                                                  qtr_with_extra_week=4)
-
-    on_offset_cases = [
-        # From Wikipedia
-        (lomq_aug_sat_4, datetime(2006, 8, 26), True),
-        (lomq_aug_sat_4, datetime(2007, 8, 25), True),
-        (lomq_aug_sat_4, datetime(2008, 8, 30), True),
-        (lomq_aug_sat_4, datetime(2009, 8, 29), True),
-        (lomq_aug_sat_4, datetime(2010, 8, 28), True),
-        (lomq_aug_sat_4, datetime(2011, 8, 27), True),
-        (lomq_aug_sat_4, datetime(2019, 8, 31), True),
-
-        (lomq_aug_sat_4, datetime(2006, 8, 27), False),
-        (lomq_aug_sat_4, datetime(2007, 8, 28), False),
-        (lomq_aug_sat_4, datetime(2008, 8, 31), False),
-        (lomq_aug_sat_4, datetime(2009, 8, 30), False),
-        (lomq_aug_sat_4, datetime(2010, 8, 29), False),
-        (lomq_aug_sat_4, datetime(2011, 8, 28), False),
-
-        (lomq_aug_sat_4, datetime(2006, 8, 25), False),
-        (lomq_aug_sat_4, datetime(2007, 8, 24), False),
-        (lomq_aug_sat_4, datetime(2008, 8, 29), False),
-        (lomq_aug_sat_4, datetime(2009, 8, 28), False),
-        (lomq_aug_sat_4, datetime(2010, 8, 27), False),
-        (lomq_aug_sat_4, datetime(2011, 8, 26), False),
-        (lomq_aug_sat_4, datetime(2019, 8, 30), False),
-
-        # From GMCR
-        (lomq_sep_sat_4, datetime(2010, 9, 25), True),
-        (lomq_sep_sat_4, datetime(2011, 9, 24), True),
-        (lomq_sep_sat_4, datetime(2012, 9, 29), True),
-
-        (lomq_sep_sat_4, datetime(2013, 6, 29), True),
-        (lomq_sep_sat_4, datetime(2012, 6, 23), True),
-        (lomq_sep_sat_4, datetime(2012, 6, 30), False),
-
-        (lomq_sep_sat_4, datetime(2013, 3, 30), True),
-        (lomq_sep_sat_4, datetime(2012, 3, 24), True),
-
-        (lomq_sep_sat_4, datetime(2012, 12, 29), True),
-        (lomq_sep_sat_4, datetime(2011, 12, 24), True),
-
-        # INTC (extra week in Q1)
-        # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2011, 4, 2), True),
-
-        # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2012, 12, 29), True),
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2011, 12, 31), True),
-        (makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                      weekday=WeekDay.SAT,
-                                      qtr_with_extra_week=1),
-         datetime(2010, 12, 25), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_year_has_extra_week(self):
-        # End of long Q1
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2))
-
-        # Start of long Q1
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26))
-
-        # End of year before year with long Q1
-        assert not makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25))
-
-        for year in [x
-                     for x in range(1994, 2011 + 1)
-                     if x not in [2011, 2005, 2000, 1994]]:
-            assert not makeFY5253LastOfMonthQuarter(
-                1, startingMonth=12, weekday=WeekDay.SAT,
-                qtr_with_extra_week=1).year_has_extra_week(
-                    datetime(year, 4, 2))
-
-        # Other long years
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2))
-
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2))
-
-        assert makeFY5253LastOfMonthQuarter(
-            1, startingMonth=12, weekday=WeekDay.SAT,
-            qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2))
-
-    def test_get_weeks(self):
-        sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                                 weekday=WeekDay.SAT,
-                                                 qtr_with_extra_week=1)
-        sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12,
-                                                 weekday=WeekDay.SAT,
-                                                 qtr_with_extra_week=4)
-
-        assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13]
-        assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14]
-        assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13]
-
-
-class TestFY5253NearestEndMonthQuarter(Base):
-
-    offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter(
-        1, startingMonth=8, weekday=WeekDay.SAT,
-        qtr_with_extra_week=4)
-    offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter(
-        1, startingMonth=8, weekday=WeekDay.THU,
-        qtr_with_extra_week=4)
-    offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12,
-                      variation="nearest")
-
-    on_offset_cases = [
-        # From Wikipedia
-        (offset_nem_sat_aug_4, datetime(2006, 9, 2), True),
-        (offset_nem_sat_aug_4, datetime(2007, 9, 1), True),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 30), True),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 29), True),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 28), True),
-        (offset_nem_sat_aug_4, datetime(2011, 9, 3), True),
-
-        (offset_nem_sat_aug_4, datetime(2016, 9, 3), True),
-        (offset_nem_sat_aug_4, datetime(2017, 9, 2), True),
-        (offset_nem_sat_aug_4, datetime(2018, 9, 1), True),
-        (offset_nem_sat_aug_4, datetime(2019, 8, 31), True),
-
-        (offset_nem_sat_aug_4, datetime(2006, 8, 27), False),
-        (offset_nem_sat_aug_4, datetime(2007, 8, 28), False),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 31), False),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 30), False),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 29), False),
-        (offset_nem_sat_aug_4, datetime(2011, 8, 28), False),
-
-        (offset_nem_sat_aug_4, datetime(2006, 8, 25), False),
-        (offset_nem_sat_aug_4, datetime(2007, 8, 24), False),
-        (offset_nem_sat_aug_4, datetime(2008, 8, 29), False),
-        (offset_nem_sat_aug_4, datetime(2009, 8, 28), False),
-        (offset_nem_sat_aug_4, datetime(2010, 8, 27), False),
-        (offset_nem_sat_aug_4, datetime(2011, 8, 26), False),
-        (offset_nem_sat_aug_4, datetime(2019, 8, 30), False),
-
-        # From Micron, see:
-        # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7
-        (offset_nem_thu_aug_4, datetime(2012, 8, 30), True),
-        (offset_nem_thu_aug_4, datetime(2011, 9, 1), True),
-
-        # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13
-        (offset_nem_thu_aug_4, datetime(2013, 5, 30), True),
-        (offset_nem_thu_aug_4, datetime(2013, 2, 28), True),
-        (offset_nem_thu_aug_4, datetime(2012, 11, 29), True),
-        (offset_nem_thu_aug_4, datetime(2012, 5, 31), True),
-        (offset_nem_thu_aug_4, datetime(2007, 3, 1), True),
-        (offset_nem_thu_aug_4, datetime(1994, 3, 3), True),
-
-        (offset_n, datetime(2012, 12, 31), False),
-        (offset_n, datetime(2013, 1, 1), True),
-        (offset_n, datetime(2013, 1, 2), False)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-    def test_offset(self):
-        offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8,
-                                                  weekday=WeekDay.THU,
-                                                  qtr_with_extra_week=4)
-
-        MU = [datetime(2012, 5, 31),
-              datetime(2012, 8, 30), datetime(2012, 11, 29),
-              datetime(2013, 2, 28), datetime(2013, 5, 30)]
-
-        date = MU[0] + relativedelta(days=-1)
-        for expected in MU:
-            assert_offset_equal(offset, date, expected)
-            date = date + offset
-
-        assert_offset_equal(offset,
-                            datetime(2012, 5, 31),
-                            datetime(2012, 8, 30))
-        assert_offset_equal(offset,
-                            datetime(2012, 5, 30),
-                            datetime(2012, 5, 31))
-
-        offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last",
-                                qtr_with_extra_week=4)
-
-        assert_offset_equal(offset2,
                            datetime(2013, 1, 15),
-                            datetime(2013, 3, 30))
-
-
-class TestQuarterBegin(Base):
-
-    def test_repr(self):
-        assert (repr(QuarterBegin()) ==
-                "<QuarterBegin: startingMonth=3>")
-        assert (repr(QuarterBegin(startingMonth=3)) ==
-                "<QuarterBegin: startingMonth=3>")
-        assert (repr(QuarterBegin(startingMonth=1)) ==
-                "<QuarterBegin: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert QuarterBegin(startingMonth=1).isAnchored()
-        assert QuarterBegin().isAnchored()
-        assert not QuarterBegin(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((QuarterBegin(startingMonth=1),
-                         {datetime(2007, 12, 1): datetime(2008, 1, 1),
-                          datetime(2008, 1, 1): datetime(2008, 4, 1),
-                          datetime(2008, 2, 15): datetime(2008, 4, 1),
-                          datetime(2008, 2, 29): datetime(2008, 4, 1),
-                          datetime(2008, 3, 15): datetime(2008, 4, 1),
-                          datetime(2008, 3, 31): datetime(2008, 4, 1),
-                          datetime(2008, 4, 15): datetime(2008, 7, 1),
-                          datetime(2008, 4, 1): datetime(2008, 7, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 1),
-                          datetime(2008, 1, 31): datetime(2008, 2, 1),
-                          datetime(2008, 1, 15): datetime(2008, 2, 1),
-                          datetime(2008, 2, 29): datetime(2008, 5, 1),
-                          datetime(2008, 3, 15): datetime(2008, 5, 1),
-                          datetime(2008, 3, 31): datetime(2008, 5, 1),
-                          datetime(2008, 4, 15): datetime(2008, 5, 1),
-                          datetime(2008, 4, 30): datetime(2008, 5, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 12, 1): datetime(2009, 1, 1),
-                          datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 2, 15): datetime(2008, 4, 1),
-                          datetime(2008, 2, 29): datetime(2008, 4, 1),
-                          datetime(2008, 3, 15): datetime(2008, 4, 1),
-                          datetime(2008, 3, 31): datetime(2008, 4, 1),
-                          datetime(2008, 4, 15): datetime(2008, 7, 1),
-                          datetime(2008, 4, 30): datetime(2008, 7, 1), }))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 1),
-                          datetime(2008, 1, 31): datetime(2008, 1, 1),
-                          datetime(2008, 2, 15): datetime(2008, 1, 1),
-                          datetime(2008, 2, 29): datetime(2008, 1, 1),
-                          datetime(2008, 3, 15): datetime(2008, 1, 1),
-                          datetime(2008, 3, 31): datetime(2008, 1, 1),
-                          datetime(2008, 4, 15): datetime(2008, 4, 1),
-                          datetime(2008, 7, 1): datetime(2008, 4, 1)}))
-
-    offset_cases.append((QuarterBegin(startingMonth=1, n=2),
-                         {datetime(2008, 1, 1): datetime(2008, 7, 1),
-                          datetime(2008, 2, 15): datetime(2008, 7, 1),
-                          datetime(2008, 2, 29): datetime(2008, 7, 1),
-                          datetime(2008, 3, 15): datetime(2008, 7, 1),
-                          datetime(2008, 3, 31): datetime(2008, 7, 1),
-                          datetime(2008, 4, 15): datetime(2008, 10, 1),
-                          datetime(2008, 4, 1): datetime(2008, 10, 1), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = QuarterBegin(n=-1, startingMonth=1)
-        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
-
-
-class TestQuarterEnd(Base):
-    _offset = QuarterEnd
-
-    def test_repr(self):
-        assert (repr(QuarterEnd()) ==
-                "<QuarterEnd: startingMonth=3>")
-        assert (repr(QuarterEnd(startingMonth=3)) ==
-                "<QuarterEnd: startingMonth=3>")
-        assert (repr(QuarterEnd(startingMonth=1)) ==
-                "<QuarterEnd: startingMonth=1>")
-
-    def test_isAnchored(self):
-        assert QuarterEnd(startingMonth=1).isAnchored()
-        assert QuarterEnd().isAnchored()
-        assert not QuarterEnd(2, startingMonth=1).isAnchored()
-
-    offset_cases = []
-    offset_cases.append((QuarterEnd(startingMonth=1),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 4, 30),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 7, 31), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=2),
-                         {datetime(2008, 1, 1): datetime(2008, 2, 29),
-                          datetime(2008, 1, 31): datetime(2008, 2, 29),
-                          datetime(2008, 2, 15): datetime(2008, 2, 29),
-                          datetime(2008, 2, 29): datetime(2008, 5, 31),
-                          datetime(2008, 3, 15): datetime(2008, 5, 31),
-                          datetime(2008, 3, 31): datetime(2008, 5, 31),
-                          datetime(2008, 4, 15): datetime(2008, 5, 31),
-                          datetime(2008, 4, 30): datetime(2008, 5, 31), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 31),
-                          datetime(2008, 1, 31): datetime(2008, 1, 31),
-                          datetime(2008, 2, 15): datetime(2008, 4, 30),
-                          datetime(2008, 2, 29): datetime(2008, 4, 30),
-                          datetime(2008, 3, 15): datetime(2008, 4, 30),
-                          datetime(2008, 3, 31): datetime(2008, 4, 30),
-                          datetime(2008, 4, 15): datetime(2008, 4, 30),
-                          datetime(2008, 4, 30): datetime(2008, 4, 30), }))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=-1),
-                         {datetime(2008, 1, 1): datetime(2007, 10, 31),
-                          datetime(2008, 1, 31): datetime(2007, 10, 31),
-                          datetime(2008, 2, 15): datetime(2008, 1, 31),
-                          datetime(2008, 2, 29): datetime(2008, 1, 31),
-                          datetime(2008, 3, 15): datetime(2008, 1, 31),
-                          datetime(2008, 3, 31): datetime(2008, 1, 31),
-                          datetime(2008, 4, 15): datetime(2008, 1, 31),
-                          datetime(2008, 4, 30): datetime(2008, 1, 31),
-                          datetime(2008, 7, 1): datetime(2008, 4, 30)}))
-
-    offset_cases.append((QuarterEnd(startingMonth=1, n=2),
-                         {datetime(2008, 1, 31): datetime(2008, 7, 31),
-                          datetime(2008, 2, 15): datetime(2008, 7, 31),
-                          datetime(2008, 2, 29): datetime(2008, 7, 31),
-                          datetime(2008, 3, 15): datetime(2008, 7, 31),
-                          datetime(2008, 3, 31): datetime(2008, 7, 31),
-                          datetime(2008, 4, 15): datetime(2008, 7, 31),
-                          datetime(2008, 4, 30): datetime(2008, 10, 31), }))
-
-    @pytest.mark.parametrize('case', offset_cases)
-    def test_offset(self, case):
-        offset, cases = case
-        for base, expected in compat.iteritems(cases):
-            assert_offset_equal(offset, base, expected)
-
-    def test_offset_corner_case(self):
-        # corner
-        offset = QuarterEnd(n=-1, startingMonth=1)
-        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
-
-    on_offset_cases = [
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
-        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]
-
-    @pytest.mark.parametrize('case', on_offset_cases)
-    def test_onOffset(self, case):
-        offset, dt, expected = case
-        assert_onOffset(offset, dt, expected)
-
-
-class TestBYearBegin(Base):
-    _offset = BYearBegin
-
-    def test_misspecified(self):
-        pytest.raises(ValueError, BYearBegin, month=13)
-        pytest.raises(ValueError, BYearEnd, month=13)
-
-    offset_cases = []
-    offset_cases.append((BYearBegin(),
-                         {datetime(2008, 1, 1): datetime(2009, 1, 1),
-                          datetime(2008, 6, 30): datetime(2009, 1, 1),
-                          datetime(2008, 12, 31): datetime(2009, 1, 1),
-                          datetime(2011, 1, 1): datetime(2011, 1, 3),
-                          datetime(2011, 1, 3): datetime(2012, 1, 2),
-                          datetime(2005, 12, 30): datetime(2006, 1, 2),
-                          datetime(2005, 12, 31): datetime(2006, 1, 2)}))
-
-    offset_cases.append((BYearBegin(0),
-                         {datetime(2008, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 6, 30): datetime(2009, 1, 1),
-                          datetime(2008, 12, 31): datetime(2009, 1, 1),
-                          datetime(2005, 12, 30): datetime(2006, 1, 2),
-                          datetime(2005, 12, 31): datetime(2006, 1, 2), }))
-
-    offset_cases.append((BYearBegin(-1),
-                         {datetime(2007, 1, 1): datetime(2006, 1, 2),
-                          datetime(2009, 1, 4): datetime(2009, 1, 1),
-                          datetime(2009, 1, 1): datetime(2008, 1, 1),
-                          datetime(2008, 6, 30): datetime(2008, 1, 1),
-                          datetime(2008, 12, 31): datetime(2008, 1, 1),
-                          datetime(2006, 12, 29): datetime(2006, 1, 2),
datetime(2006, 12, 30): datetime(2006, 1, 2), - datetime(2006, 1, 1): datetime(2005, 1, 3), })) - - offset_cases.append((BYearBegin(-2), - {datetime(2007, 1, 1): datetime(2005, 1, 3), - datetime(2007, 6, 30): datetime(2006, 1, 2), - datetime(2008, 12, 31): datetime(2007, 1, 1), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - -class TestYearBegin(Base): - _offset = YearBegin - - def test_misspecified(self): - pytest.raises(ValueError, YearBegin, month=13) - - offset_cases = [] - offset_cases.append((YearBegin(), - {datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(0), - {datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(3), - {datetime(2008, 1, 1): datetime(2011, 1, 1), - datetime(2008, 6, 30): datetime(2011, 1, 1), - datetime(2008, 12, 31): datetime(2011, 1, 1), - datetime(2005, 12, 30): datetime(2008, 1, 1), - datetime(2005, 12, 31): datetime(2008, 1, 1), })) - - offset_cases.append((YearBegin(-1), - {datetime(2007, 1, 1): datetime(2006, 1, 1), - datetime(2007, 1, 15): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): datetime(2006, 1, 1), - datetime(2006, 12, 30): datetime(2006, 1, 1), - datetime(2007, 1, 1): datetime(2006, 1, 1), })) - - offset_cases.append((YearBegin(-2), - {datetime(2007, 1, 1): datetime(2005, 1, 1), - datetime(2008, 6, 30): datetime(2007, 1, 1), - datetime(2008, 12, 31): datetime(2007, 1, 1), })) - - offset_cases.append((YearBegin(month=4), - {datetime(2007, 4, 1): datetime(2008, 4, 1), - datetime(2007, 4, 15): datetime(2008, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1), })) - - offset_cases.append((YearBegin(0, month=4), - {datetime(2007, 4, 1): datetime(2007, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1), })) - - offset_cases.append((YearBegin(4, month=4), - {datetime(2007, 4, 1): datetime(2011, 4, 1), - datetime(2007, 4, 15): datetime(2011, 4, 1), - datetime(2007, 3, 1): datetime(2010, 4, 1), - datetime(2007, 12, 15): datetime(2011, 4, 1), - datetime(2012, 1, 31): datetime(2015, 4, 1), })) - - offset_cases.append((YearBegin(-1, month=4), - {datetime(2007, 4, 1): datetime(2006, 4, 1), - datetime(2007, 3, 1): datetime(2006, 4, 1), - datetime(2007, 12, 15): datetime(2007, 4, 1), - datetime(2012, 1, 31): datetime(2011, 4, 1), })) - - offset_cases.append((YearBegin(-3, month=4), - {datetime(2007, 4, 1): datetime(2004, 4, 1), - datetime(2007, 3, 1): datetime(2004, 4, 1), - datetime(2007, 12, 15): datetime(2005, 4, 1), - datetime(2012, 1, 31): datetime(2009, 4, 1), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases 
= [(YearBegin(), datetime(2007, 1, 3), False), - (YearBegin(), datetime(2008, 1, 1), True), - (YearBegin(), datetime(2006, 12, 31), False), - (YearBegin(), datetime(2006, 1, 2), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestBYearEndLagged(Base): - - def test_bad_month_fail(self): - pytest.raises(Exception, BYearEnd, month=13) - pytest.raises(Exception, BYearEnd, month=0) - - offset_cases = [] - offset_cases.append((BYearEnd(month=6), - {datetime(2008, 1, 1): datetime(2008, 6, 30), - datetime(2007, 6, 30): datetime(2008, 6, 30)}, )) - - offset_cases.append((BYearEnd(n=-1, month=6), - {datetime(2008, 1, 1): datetime(2007, 6, 29), - datetime(2007, 6, 30): datetime(2007, 6, 29)}, )) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert base + offset == expected - - def test_roll(self): - offset = BYearEnd(month=6) - date = datetime(2009, 11, 30) - - assert offset.rollforward(date) == datetime(2010, 6, 30) - assert offset.rollback(date) == datetime(2009, 6, 30) - - on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), - (BYearEnd(month=6), datetime(2007, 6, 30), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestBYearEnd(Base): - _offset = BYearEnd - - offset_cases = [] - offset_cases.append((BYearEnd(), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2006, 12, 29), - datetime(2005, 12, 31): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(0), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29), })) - - offset_cases.append((BYearEnd(-2), - {datetime(2007, 1, 1): datetime(2005, 12, 30), - datetime(2008, 6, 30): datetime(2006, 12, 29), - datetime(2008, 12, 31): datetime(2006, 12, 29), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), - (BYearEnd(), datetime(2008, 1, 1), False), - (BYearEnd(), datetime(2006, 12, 31), False), - (BYearEnd(), datetime(2006, 12, 29), True)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestYearEnd(Base): - _offset = YearEnd - - def test_misspecified(self): - pytest.raises(ValueError, YearEnd, month=13) - - offset_cases = [] - offset_cases.append((YearEnd(), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): 
datetime(2005, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 31), })) - - offset_cases.append((YearEnd(0), - {datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31), })) - - offset_cases.append((YearEnd(-1), - {datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 31), - datetime(2006, 12, 30): datetime(2005, 12, 31), - datetime(2007, 1, 1): datetime(2006, 12, 31), })) - - offset_cases.append((YearEnd(-2), - {datetime(2007, 1, 1): datetime(2005, 12, 31), - datetime(2008, 6, 30): datetime(2006, 12, 31), - datetime(2008, 12, 31): datetime(2006, 12, 31), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), - (YearEnd(), datetime(2008, 1, 1), False), - (YearEnd(), datetime(2006, 12, 31), True), - (YearEnd(), datetime(2006, 12, 29), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - -class TestYearEndDiffMonth(Base): - - offset_cases = [] - offset_cases.append((YearEnd(month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 15): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2009, 3, 31), - datetime(2008, 3, 30): datetime(2008, 3, 31), - datetime(2005, 3, 31): datetime(2006, 3, 31), - datetime(2006, 7, 30): datetime(2007, 3, 31)})) - - offset_cases.append((YearEnd(0, month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 28): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2008, 3, 31), - datetime(2005, 3, 30): datetime(2005, 3, 31), })) - - offset_cases.append((YearEnd(-1, month=3), - {datetime(2007, 1, 1): datetime(2006, 3, 31), - datetime(2008, 2, 28): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2007, 3, 31), - datetime(2006, 3, 29): datetime(2005, 3, 31), - datetime(2006, 3, 30): datetime(2005, 3, 31), - datetime(2007, 3, 1): datetime(2006, 3, 31), })) - - offset_cases.append((YearEnd(-2, month=3), - {datetime(2007, 1, 1): datetime(2005, 3, 31), - datetime(2008, 6, 30): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2006, 3, 31), })) - - @pytest.mark.parametrize('case', offset_cases) - def test_offset(self, case): - offset, cases = case - for base, expected in compat.iteritems(cases): - assert_offset_equal(offset, base, expected) - - on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), - (YearEnd(month=3), datetime(2008, 1, 1), False), - (YearEnd(month=3), datetime(2006, 3, 31), True), - (YearEnd(month=3), datetime(2006, 3, 29), False)] - - @pytest.mark.parametrize('case', on_offset_cases) - def test_onOffset(self, case): - offset, dt, expected = case - assert_onOffset(offset, dt, expected) - - def test_Easter(): assert_offset_equal(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)) assert_offset_equal(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)) @@ -4285,12 +2773,6 @@ def test_get_offset_name(self): assert Week(weekday=4).freqstr == 'W-FRI' assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN" - assert (makeFY5253LastOfMonthQuarter( - weekday=1, 
startingMonth=3, - qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4") - assert (makeFY5253NearestEndMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3") def test_get_offset(): @@ -4303,17 +2785,7 @@ def test_get_offset(): ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)), - ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4)), - ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, - startingMonth=12)), - ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), - ("REQ-L-MAR-TUE-4", makeFY5253LastOfMonthQuarter( - weekday=1, startingMonth=3, qtr_with_extra_week=4)), - ("REQ-L-DEC-MON-3", makeFY5253LastOfMonthQuarter( - weekday=0, startingMonth=12, qtr_with_extra_week=3)), - ("REQ-N-DEC-MON-3", makeFY5253NearestEndMonthQuarter( - weekday=0, startingMonth=12, qtr_with_extra_week=3)), - ] + ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4))] for name, expected in pairs: offset = get_offset(name) @@ -4381,16 +2853,6 @@ def test_get_standard_freq(): assert fstr == get_standard_freq(('q', 5)) -def test_quarterly_dont_normalize(): - date = datetime(2012, 3, 31, 5, 30) - - offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) - - for klass in offsets: - result = date + klass() - assert (result.time() == date.time()) - - class TestOffsetAliases(object): def setup_method(self, method): diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py new file mode 100644 index 0000000000000..1d47cf67c6e55 --- /dev/null +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -0,0 +1,994 @@ +# -*- coding: utf-8 -*- +""" +Tests for Year, Quarter, and Month-based DateOffset subclasses +""" +from datetime import datetime + +import pytest + +from pandas import Timestamp +from pandas import compat + +from pandas.tseries.offsets import (BMonthBegin, BMonthEnd, + MonthBegin, MonthEnd, + YearEnd, YearBegin, BYearEnd, BYearBegin, + QuarterEnd, QuarterBegin, + BQuarterEnd, BQuarterBegin) + +from .test_offsets import Base +from .common import assert_offset_equal, assert_onOffset + + +# -------------------------------------------------------------------- +# Misc + +def test_quarterly_dont_normalize(): + date = datetime(2012, 3, 31, 5, 30) + + offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) + + for klass in offsets: + result = date + klass() + assert (result.time() == date.time()) + + +# -------------------------------------------------------------------- +# Months + +class TestMonthBegin(Base): + _offset = MonthBegin + + offset_cases = [] + # NOTE: I'm not entirely happy with the logic here for Begin -ss + # see thread 'offset conventions' on the ML + offset_cases.append((MonthBegin(), { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + offset_cases.append((MonthBegin(0), { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + offset_cases.append((MonthBegin(2), { + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 
2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + offset_cases.append((MonthBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + +class TestMonthEnd(Base): + _offset = MonthEnd + + def test_day_of_month(self): + dt = datetime(2007, 1, 1) + offset = MonthEnd() + + result = dt + offset + assert result == Timestamp(2007, 1, 31) + + result = result + offset + assert result == Timestamp(2007, 2, 28) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + MonthEnd(normalize=True) + expected = dt.replace(hour=0) + MonthEnd() + assert result == expected + + offset_cases = [] + offset_cases.append((MonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31)})) + + offset_cases.append((MonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + offset_cases.append((MonthEnd(2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31)})) + + offset_cases.append((MonthEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBMonthBegin(Base): + _offset = BMonthBegin + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthBegin() + offset2 = BMonthBegin() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append((BMonthBegin(), { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1)})) + + offset_cases.append((BMonthBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + 
datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2)})) + + offset_cases.append((BMonthBegin(2), { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + offset_cases.append((BMonthBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBMonthEnd(Base): + _offset = BMonthEnd + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + BMonthEnd(normalize=True) + expected = dt.replace(hour=0) + BMonthEnd() + assert result == expected + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthEnd() + offset2 = BMonthEnd() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append((BMonthEnd(), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BMonthEnd(0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + offset_cases.append((BMonthEnd(2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BMonthEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False)] + + @pytest.mark.parametrize('case', 
on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+# --------------------------------------------------------------------
+# Quarters
+
+
+class TestQuarterBegin(Base):
+
+ def test_repr(self):
+ expected = "<QuarterBegin: startingMonth=3>"
+ assert repr(QuarterBegin()) == expected
+ expected = "<QuarterBegin: startingMonth=3>"
+ assert repr(QuarterBegin(startingMonth=3)) == expected
+ expected = "<QuarterBegin: startingMonth=1>"
+ assert repr(QuarterBegin(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert QuarterBegin(startingMonth=1).isAnchored()
+ assert QuarterBegin().isAnchored()
+ assert not QuarterBegin(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner
+ offset = QuarterBegin(n=-1, startingMonth=1)
+ assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
+
+ offset_cases = []
+ offset_cases.append((QuarterBegin(startingMonth=1), {
+ datetime(2007, 12, 1): datetime(2008, 1, 1),
+ datetime(2008, 1, 1): datetime(2008, 4, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2008, 4, 1): datetime(2008, 7, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 1, 15): datetime(2008, 2, 1),
+ datetime(2008, 2, 29): datetime(2008, 5, 1),
+ datetime(2008, 3, 15): datetime(2008, 5, 1),
+ datetime(2008, 3, 31): datetime(2008, 5, 1),
+ datetime(2008, 4, 15): datetime(2008, 5, 1),
+ datetime(2008, 4, 30): datetime(2008, 5, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 12, 1): datetime(2009, 1, 1),
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2008, 4, 30): datetime(2008, 7, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 1),
+ datetime(2008, 1, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 1, 1),
+ datetime(2008, 2, 29): datetime(2008, 1, 1),
+ datetime(2008, 3, 15): datetime(2008, 1, 1),
+ datetime(2008, 3, 31): datetime(2008, 1, 1),
+ datetime(2008, 4, 15): datetime(2008, 4, 1),
+ datetime(2008, 4, 30): datetime(2008, 4, 1),
+ datetime(2008, 7, 1): datetime(2008, 4, 1)}))
+
+ offset_cases.append((QuarterBegin(startingMonth=1, n=2), {
+ datetime(2008, 1, 1): datetime(2008, 7, 1),
+ datetime(2008, 2, 15): datetime(2008, 7, 1),
+ datetime(2008, 2, 29): datetime(2008, 7, 1),
+ datetime(2008, 3, 15): datetime(2008, 7, 1),
+ datetime(2008, 3, 31): datetime(2008, 7, 1),
+ datetime(2008, 4, 15): datetime(2008, 10, 1),
+ datetime(2008, 4, 1): datetime(2008, 10, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestQuarterEnd(Base):
+ _offset = QuarterEnd
+
+ def test_repr(self):
+ expected = "<QuarterEnd: startingMonth=3>"
+ assert repr(QuarterEnd()) == expected
+ expected = "<QuarterEnd: startingMonth=3>"
+ assert repr(QuarterEnd(startingMonth=3)) == expected
+ expected = "<QuarterEnd: startingMonth=1>"
+ assert repr(QuarterEnd(startingMonth=1)) == expected
+
+ def
test_isAnchored(self): + assert QuarterEnd(startingMonth=1).isAnchored() + assert QuarterEnd().isAnchored() + assert not QuarterEnd(2, startingMonth=1).isAnchored() + + def test_offset_corner_case(self): + # corner + offset = QuarterEnd(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) + + offset_cases = [] + offset_cases.append((QuarterEnd(startingMonth=1), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31)})) + + offset_cases.append((QuarterEnd(startingMonth=2), { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=0), { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30)})) + + offset_cases.append((QuarterEnd(startingMonth=1, n=2), { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=2), 
datetime(2007, 12, 31), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
+ (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
+ (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]
+
+ @pytest.mark.parametrize('case', on_offset_cases)
+ def test_onOffset(self, case):
+ offset, dt, expected = case
+ assert_onOffset(offset, dt, expected)
+
+
+class TestBQuarterBegin(Base):
+ _offset = BQuarterBegin
+
+ def test_repr(self):
+ expected = "<BusinessQuarterBegin: startingMonth=3>"
+ assert repr(BQuarterBegin()) == expected
+ expected = "<BusinessQuarterBegin: startingMonth=3>"
+ assert repr(BQuarterBegin(startingMonth=3)) == expected
+ expected = "<BusinessQuarterBegin: startingMonth=1>"
+ assert repr(BQuarterBegin(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert BQuarterBegin(startingMonth=1).isAnchored()
+ assert BQuarterBegin().isAnchored()
+ assert not BQuarterBegin(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner
+ offset = BQuarterBegin(n=-1, startingMonth=1)
+ assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2)
+
+ offset_cases = []
+ offset_cases.append((BQuarterBegin(startingMonth=1), {
+ datetime(2008, 1, 1): datetime(2008, 4, 1),
+ datetime(2008, 1, 31): datetime(2008, 4, 1),
+ datetime(2008, 2, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2008, 3, 31): datetime(2008, 4, 1),
+ datetime(2008, 4, 15): datetime(2008, 7, 1),
+ datetime(2007, 3, 15): datetime(2007, 4, 2),
+ datetime(2007, 2, 28): datetime(2007, 4, 2),
+ datetime(2007, 1, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 15): datetime(2007, 7, 2),
+ datetime(2007, 7, 1): datetime(2007, 7, 2),
+ datetime(2007, 4, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 2): datetime(2007, 7, 2),
+ datetime(2008, 4, 30): datetime(2008, 7, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 1),
+ datetime(2008, 1, 31): datetime(2008, 2, 1),
+ datetime(2008, 1, 15): datetime(2008, 2, 1),
+ datetime(2008, 2, 29): datetime(2008, 5, 1),
+ datetime(2008, 3, 15): datetime(2008, 5, 1),
+ datetime(2008, 3, 31): datetime(2008, 5, 1),
+ datetime(2008, 4, 15): datetime(2008, 5, 1),
+ datetime(2008, 8, 15): datetime(2008, 11, 3),
+ datetime(2008, 9, 15): datetime(2008, 11, 3),
+ datetime(2008, 11, 1): datetime(2008, 11, 3),
+ datetime(2008, 4, 30): datetime(2008, 5, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 1),
+ datetime(2007, 12, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15):
datetime(2008, 4, 1),
+ datetime(2008, 2, 29): datetime(2008, 4, 1),
+ datetime(2008, 1, 15): datetime(2008, 4, 1),
+ datetime(2008, 2, 27): datetime(2008, 4, 1),
+ datetime(2008, 3, 15): datetime(2008, 4, 1),
+ datetime(2007, 4, 1): datetime(2007, 4, 2),
+ datetime(2007, 4, 2): datetime(2007, 4, 2),
+ datetime(2007, 7, 1): datetime(2007, 7, 2),
+ datetime(2007, 4, 15): datetime(2007, 7, 2),
+ datetime(2007, 7, 2): datetime(2007, 7, 2)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), {
+ datetime(2008, 1, 1): datetime(2007, 10, 1),
+ datetime(2008, 1, 31): datetime(2008, 1, 1),
+ datetime(2008, 2, 15): datetime(2008, 1, 1),
+ datetime(2008, 2, 29): datetime(2008, 1, 1),
+ datetime(2008, 3, 15): datetime(2008, 1, 1),
+ datetime(2008, 3, 31): datetime(2008, 1, 1),
+ datetime(2008, 4, 15): datetime(2008, 4, 1),
+ datetime(2007, 7, 3): datetime(2007, 7, 2),
+ datetime(2007, 4, 3): datetime(2007, 4, 2),
+ datetime(2007, 7, 2): datetime(2007, 4, 2),
+ datetime(2008, 4, 1): datetime(2008, 1, 1)}))
+
+ offset_cases.append((BQuarterBegin(startingMonth=1, n=2), {
+ datetime(2008, 1, 1): datetime(2008, 7, 1),
+ datetime(2008, 1, 15): datetime(2008, 7, 1),
+ datetime(2008, 2, 29): datetime(2008, 7, 1),
+ datetime(2008, 3, 15): datetime(2008, 7, 1),
+ datetime(2007, 3, 31): datetime(2007, 7, 2),
+ datetime(2007, 4, 15): datetime(2007, 10, 1),
+ datetime(2008, 4, 30): datetime(2008, 10, 1)}))
+
+ @pytest.mark.parametrize('case', offset_cases)
+ def test_offset(self, case):
+ offset, cases = case
+ for base, expected in compat.iteritems(cases):
+ assert_offset_equal(offset, base, expected)
+
+
+class TestBQuarterEnd(Base):
+ _offset = BQuarterEnd
+
+ def test_repr(self):
+ expected = "<BusinessQuarterEnd: startingMonth=3>"
+ assert repr(BQuarterEnd()) == expected
+ expected = "<BusinessQuarterEnd: startingMonth=3>"
+ assert repr(BQuarterEnd(startingMonth=3)) == expected
+ expected = "<BusinessQuarterEnd: startingMonth=1>"
+ assert repr(BQuarterEnd(startingMonth=1)) == expected
+
+ def test_isAnchored(self):
+ assert BQuarterEnd(startingMonth=1).isAnchored()
+ assert BQuarterEnd().isAnchored()
+ assert not BQuarterEnd(2, startingMonth=1).isAnchored()
+
+ def test_offset_corner_case(self):
+ # corner
+ offset = BQuarterEnd(n=-1, startingMonth=1)
+ assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29)
+
+ offset_cases = []
+ offset_cases.append((BQuarterEnd(startingMonth=1), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 4, 30),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+ datetime(2008, 4, 30): datetime(2008, 7, 31)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=2), {
+ datetime(2008, 1, 1): datetime(2008, 2, 29),
+ datetime(2008, 1, 31): datetime(2008, 2, 29),
+ datetime(2008, 2, 15): datetime(2008, 2, 29),
+ datetime(2008, 2, 29): datetime(2008, 5, 30),
+ datetime(2008, 3, 15): datetime(2008, 5, 30),
+ datetime(2008, 3, 31): datetime(2008, 5, 30),
+ datetime(2008, 4, 15): datetime(2008, 5, 30),
+ datetime(2008, 4, 30): datetime(2008, 5, 30)}))
+
+ offset_cases.append((BQuarterEnd(startingMonth=1, n=0), {
+ datetime(2008, 1, 1): datetime(2008, 1, 31),
+ datetime(2008, 1, 31): datetime(2008, 1, 31),
+ datetime(2008, 2, 15): datetime(2008, 4, 30),
+ datetime(2008, 2, 29): datetime(2008, 4, 30),
+ datetime(2008, 3, 15): datetime(2008, 4, 30),
+ datetime(2008, 3, 31): datetime(2008, 4, 30),
+ datetime(2008, 4, 15): datetime(2008, 4, 30),
+
datetime(2008, 4, 30): datetime(2008, 4, 30)})) + + offset_cases.append((BQuarterEnd(startingMonth=1, n=-1), { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31)})) + + offset_cases.append((BQuarterEnd(startingMonth=1, n=2), { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + +# -------------------------------------------------------------------- +# Years + + +class TestYearBegin(Base): + _offset = YearBegin + + def test_misspecified(self): + pytest.raises(ValueError, YearBegin, month=13) + + offset_cases = [] + offset_cases.append((YearBegin(), { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + 
datetime(2005, 12, 31): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(3), { + datetime(2008, 1, 1): datetime(2011, 1, 1), + datetime(2008, 6, 30): datetime(2011, 1, 1), + datetime(2008, 12, 31): datetime(2011, 1, 1), + datetime(2005, 12, 30): datetime(2008, 1, 1), + datetime(2005, 12, 31): datetime(2008, 1, 1)})) + + offset_cases.append((YearBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1)})) + + offset_cases.append((YearBegin(-2), { + datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1)})) + + offset_cases.append((YearBegin(month=4), { + datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1)})) + + offset_cases.append((YearBegin(0, month=4), { + datetime(2007, 4, 1): datetime(2007, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1)})) + + offset_cases.append((YearBegin(4, month=4), { + datetime(2007, 4, 1): datetime(2011, 4, 1), + datetime(2007, 4, 15): datetime(2011, 4, 1), + datetime(2007, 3, 1): datetime(2010, 4, 1), + datetime(2007, 12, 15): datetime(2011, 4, 1), + datetime(2012, 1, 31): datetime(2015, 4, 1)})) + + offset_cases.append((YearBegin(-1, month=4), { + datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1)})) + + offset_cases.append((YearBegin(-3, month=4), { + datetime(2007, 4, 1): datetime(2004, 4, 1), + datetime(2007, 3, 1): datetime(2004, 4, 1), + datetime(2007, 12, 15): datetime(2005, 4, 1), + datetime(2012, 1, 31): datetime(2009, 4, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestYearEnd(Base): + _offset = YearEnd + + def test_misspecified(self): + pytest.raises(ValueError, YearEnd, month=13) + + offset_cases = [] + offset_cases.append((YearEnd(), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31)})) + + offset_cases.append((YearEnd(0), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + 
datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31)})) + + offset_cases.append((YearEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + offset_cases.append((YearEnd(-2), { + datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestYearEndDiffMonth(Base): + offset_cases = [] + offset_cases.append((YearEnd(month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31)})) + + offset_cases.append((YearEnd(0, month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31)})) + + offset_cases.append((YearEnd(-1, month=3), + {datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31)})) + + offset_cases.append((YearEnd(-2, month=3), + {datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBYearBegin(Base): + _offset = BYearBegin + + def test_misspecified(self): + pytest.raises(ValueError, BYearBegin, month=13) + pytest.raises(ValueError, BYearEnd, month=13) + + offset_cases = [] + offset_cases.append((BYearBegin(), { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2)})) 
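+
+ # BYearBegin rolls to the first *business* day of the year: 2006-01-01
+ # fell on a Sunday, so 2005-12-30 and 2005-12-31 both map to Monday
+ # 2006-01-02 above, and 2011-01-01 (a Saturday) maps to Monday
+ # 2011-01-03.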
+ + offset_cases.append((BYearBegin(0), { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2)})) + + offset_cases.append((BYearBegin(-1), { + datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 3)})) + + offset_cases.append((BYearBegin(-2), { + datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + +class TestBYearEnd(Base): + _offset = BYearEnd + + offset_cases = [] + offset_cases.append((BYearEnd(), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(0), { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(-1), { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + offset_cases.append((BYearEnd(-2), { + datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) + + +class TestBYearEndLagged(Base): + _offset = BYearEnd + + def test_bad_month_fail(self): + pytest.raises(Exception, BYearEnd, month=13) + pytest.raises(Exception, BYearEnd, month=0) + + offset_cases = [] + offset_cases.append((BYearEnd(month=6), { + datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30)})) + + offset_cases.append((BYearEnd(n=-1, month=6), { + datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29)})) + + @pytest.mark.parametrize('case', offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in compat.iteritems(cases): + assert_offset_equal(offset, base, expected) + + def test_roll(self): + offset = BYearEnd(month=6) + date = datetime(2009, 
11, 30) + + assert offset.rollforward(date) == datetime(2010, 6, 30) + assert offset.rollback(date) == datetime(2009, 6, 30) + + on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False)] + + @pytest.mark.parametrize('case', on_offset_cases) + def test_onOffset(self, case): + offset, dt, expected = case + assert_onOffset(offset, dt, expected) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 7b699349c3f07..a307b7e5817a8 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -985,6 +985,162 @@ class BusinessMonthBegin(MonthOffset): _day_opt = 'business_start' +class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): + """ + DateOffset subclass representing one custom business month, incrementing + between end of month dates + + Parameters + ---------- + n : int, default 1 + offset : timedelta, default timedelta(0) + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar + """ + + _cacheable = False + _prefix = 'CBM' + + onOffset = DateOffset.onOffset # override MonthOffset method + + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, offset=timedelta(0)): + self.n = self._validate_n(n) + self.normalize = normalize + self._offset = offset + self.kwds = {} + + calendar, holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['calendar'] = self.calendar = calendar + self.kwds['offset'] = offset + + @cache_readonly + def cbday(self): + kwds = self.kwds + return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) + + @cache_readonly + def m_offset(self): + kwds = self.kwds + kwds = {key: kwds[key] for key in kwds + if key not in ['calendar', 'weekmask', 'holidays', 'offset']} + return MonthEnd(n=1, normalize=self.normalize, **kwds) + + @apply_wraps + def apply(self, other): + n = self.n + + # First move to month offset + cur_mend = self.m_offset.rollforward(other) + + # Find this custom month offset + cur_cmend = self.cbday.rollback(cur_mend) + + # handle zero case. 
arbitrarily rollforward + if n == 0 and other != cur_cmend: + n += 1 + + if other < cur_cmend and n >= 1: + n -= 1 + elif other > cur_cmend and n <= -1: + n += 1 + + new = cur_mend + n * self.m_offset + result = self.cbday.rollback(new) + return result + + +class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): + """ + DateOffset subclass representing one custom business month, incrementing + between beginning of month dates + + Parameters + ---------- + n : int, default 1 + offset : timedelta, default timedelta(0) + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar + """ + + _cacheable = False + _prefix = 'CBMS' + + onOffset = DateOffset.onOffset # override MonthOffset method + + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, offset=timedelta(0)): + self.n = self._validate_n(n) + self.normalize = normalize + self._offset = offset + self.kwds = {} + + # _get_calendar does validation and possible transformation + # of calendar and holidays. + calendar, holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + self.kwds['calendar'] = self.calendar = calendar + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['offset'] = offset + + @cache_readonly + def cbday(self): + kwds = self.kwds + return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) + + @cache_readonly + def m_offset(self): + kwds = self.kwds + kwds = {key: kwds[key] for key in kwds + if key not in ['calendar', 'weekmask', 'holidays', 'offset']} + return MonthBegin(n=1, normalize=self.normalize, **kwds) + + @apply_wraps + def apply(self, other): + n = self.n + dt_in = other + + # First move to month offset + cur_mbegin = self.m_offset.rollback(dt_in) + + # Find this custom month offset + cur_cmbegin = self.cbday.rollforward(cur_mbegin) + + # handle zero case. 
arbitrarily rollforward + if n == 0 and dt_in != cur_cmbegin: + n += 1 + + if dt_in > cur_cmbegin and n <= -1: + n += 1 + elif dt_in < cur_cmbegin and n >= 1: + n -= 1 + + new = cur_mbegin + n * self.m_offset + result = self.cbday.rollforward(new) + return result + + +# --------------------------------------------------------------------- +# Semi-Month Based Offset Classes + class SemiMonthOffset(DateOffset): _adjust_dst = True _default_day_of_month = 15 @@ -1185,155 +1341,6 @@ def _apply_index_days(self, i, roll): return i + (roll % 2) * Timedelta(days=self.day_of_month - 1).value -class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): - """ - DateOffset subclass representing one custom business month, incrementing - between end of month dates - - Parameters - ---------- - n : int, default 1 - offset : timedelta, default timedelta(0) - normalize : bool, default False - Normalize start/end dates to midnight before generating date range - weekmask : str, Default 'Mon Tue Wed Thu Fri' - weekmask of valid business days, passed to ``numpy.busdaycalendar`` - holidays : list - list/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar`` - calendar : pd.HolidayCalendar or np.busdaycalendar - """ - - _cacheable = False - _prefix = 'CBM' - - onOffset = DateOffset.onOffset # override MonthOffset method - - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset - self.kwds = {} - - calendar, holidays = _get_calendar(weekmask=weekmask, - holidays=holidays, - calendar=calendar) - self.kwds['weekmask'] = self.weekmask = weekmask - self.kwds['holidays'] = self.holidays = holidays - self.kwds['calendar'] = self.calendar = calendar - self.kwds['offset'] = offset - - @cache_readonly - def cbday(self): - kwds = self.kwds - return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) - - @cache_readonly - def m_offset(self): - kwds = self.kwds - kwds = {key: kwds[key] for key in kwds - if key not in ['calendar', 'weekmask', 'holidays', 'offset']} - return MonthEnd(n=1, normalize=self.normalize, **kwds) - - @apply_wraps - def apply(self, other): - n = self.n - # First move to month offset - cur_mend = self.m_offset.rollforward(other) - # Find this custom month offset - cur_cmend = self.cbday.rollback(cur_mend) - - # handle zero case. 
arbitrarily rollforward - if n == 0 and other != cur_cmend: - n += 1 - - if other < cur_cmend and n >= 1: - n -= 1 - elif other > cur_cmend and n <= -1: - n += 1 - - new = cur_mend + n * self.m_offset - result = self.cbday.rollback(new) - return result - - -class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): - """ - DateOffset subclass representing one custom business month, incrementing - between beginning of month dates - - Parameters - ---------- - n : int, default 1 - offset : timedelta, default timedelta(0) - normalize : bool, default False - Normalize start/end dates to midnight before generating date range - weekmask : str, Default 'Mon Tue Wed Thu Fri' - weekmask of valid business days, passed to ``numpy.busdaycalendar`` - holidays : list - list/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar`` - calendar : pd.HolidayCalendar or np.busdaycalendar - """ - - _cacheable = False - _prefix = 'CBMS' - - onOffset = DateOffset.onOffset # override MonthOffset method - - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): - self.n = self._validate_n(n) - self.normalize = normalize - self._offset = offset - self.kwds = {} - - # _get_calendar does validation and possible transformation - # of calendar and holidays. - calendar, holidays = _get_calendar(weekmask=weekmask, - holidays=holidays, - calendar=calendar) - self.kwds['calendar'] = self.calendar = calendar - self.kwds['weekmask'] = self.weekmask = weekmask - self.kwds['holidays'] = self.holidays = holidays - self.kwds['offset'] = offset - - @cache_readonly - def cbday(self): - kwds = self.kwds - return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) - - @cache_readonly - def m_offset(self): - kwds = self.kwds - kwds = {key: kwds[key] for key in kwds - if key not in ['calendar', 'weekmask', 'holidays', 'offset']} - return MonthBegin(n=1, normalize=self.normalize, **kwds) - - @apply_wraps - def apply(self, other): - n = self.n - dt_in = other - # First move to month offset - cur_mbegin = self.m_offset.rollback(dt_in) - # Find this custom month offset - cur_cmbegin = self.cbday.rollforward(cur_mbegin) - - # handle zero case. 
arbitrarily rollforward - if n == 0 and dt_in != cur_cmbegin: - n += 1 - - if dt_in > cur_cmbegin and n <= -1: - n += 1 - elif dt_in < cur_cmbegin and n >= 1: - n -= 1 - - new = cur_mbegin + n * self.m_offset - result = self.cbday.rollforward(new) - return result - - # --------------------------------------------------------------------- # Week-Based Offset Classes From 29206ee4c51e158643f79fc270419799da64bfbc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Nov 2017 07:08:19 -0800 Subject: [PATCH 40/98] fix missing arg in asvs (#18503) --- asv_bench/benchmarks/timestamp.py | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index b8ef309e6a464..fc5e6dc8c06d6 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -13,55 +13,55 @@ class TimestampProperties(object): def setup(self, tz): self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz) - def time_tz(self): + def time_tz(self, tz): self.ts.tz - def time_offset(self): + def time_offset(self, tz): self.ts.offset - def time_dayofweek(self): + def time_dayofweek(self, tz): self.ts.dayofweek - def time_weekday_name(self): + def time_weekday_name(self, tz): self.ts.weekday_name - def time_dayofyear(self): + def time_dayofyear(self, tz): self.ts.dayofyear - def time_week(self): + def time_week(self, tz): self.ts.week - def time_quarter(self): + def time_quarter(self, tz): self.ts.quarter - def time_days_in_month(self): + def time_days_in_month(self, tz): self.ts.days_in_month - def time_freqstr(self): + def time_freqstr(self, tz): self.ts.freqstr - def time_is_month_start(self): + def time_is_month_start(self, tz): self.ts.is_month_start - def time_is_month_end(self): + def time_is_month_end(self, tz): self.ts.is_month_end - def time_is_quarter_start(self): + def time_is_quarter_start(self, tz): self.ts.is_quarter_start - def time_is_quarter_end(self): + def time_is_quarter_end(self, tz): self.ts.is_quarter_end - def time_is_year_start(self): + def time_is_year_start(self, tz): self.ts.is_quarter_end - def time_is_year_end(self): + def time_is_year_end(self, tz): self.ts.is_quarter_end - def time_is_leap_year(self): + def time_is_leap_year(self, tz): self.ts.is_quarter_end - def time_microsecond(self): + def time_microsecond(self, tz): self.ts.microsecond @@ -74,13 +74,13 @@ class TimestampOps(object): def setup(self, tz): self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) - def time_replace_tz(self): + def time_replace_tz(self, tz): self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) - def time_replace_None(self): + def time_replace_None(self, tz): self.ts.replace(tzinfo=None) - def time_to_pydatetime(self): + def time_to_pydatetime(self, tz): self.ts.to_pydatetime() From f6fe089e9f1441f18bbdcf00347812a4ec264f57 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 26 Nov 2017 15:09:25 +0000 Subject: [PATCH 41/98] EHN: Improve from_items error message (#17312) (#17881) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/frame.py | 27 +++++++++++++++++++++---- pandas/tests/frame/test_constructors.py | 20 +++++++++++++++--- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 347f6047e0b48..56521df5fcce4 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -80,6 +80,7 @@ Other API Changes - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts 
with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). +- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e82eb8635d4c7..d3561f8a0eadf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -54,6 +54,7 @@ _ensure_int64, _ensure_platform_int, is_list_like, + is_nested_list_like, is_iterator, is_sequence, is_named_tuple) @@ -1271,16 +1272,34 @@ def from_items(cls, items, columns=None, orient='columns'): columns = _ensure_index(keys) arrays = values - return cls._from_arrays(arrays, columns, None) + # GH 17312 + # Provide more informative error msg when scalar values passed + try: + return cls._from_arrays(arrays, columns, None) + + except ValueError: + if not is_nested_list_like(values): + raise ValueError('The value in each (key, value) pair ' + 'must be an array, Series, or dict') + elif orient == 'index': if columns is None: raise TypeError("Must pass columns with orient='index'") keys = _ensure_index(keys) - arr = np.array(values, dtype=object).T - data = [lib.maybe_convert_objects(v) for v in arr] - return cls._from_arrays(data, columns, keys) + # GH 17312 + # Provide more informative error msg when scalar values passed + try: + arr = np.array(values, dtype=object).T + data = [lib.maybe_convert_objects(v) for v in arr] + return cls._from_arrays(data, columns, keys) + + except TypeError: + if not is_nested_list_like(values): + raise ValueError('The value in each (key, value) pair ' + 'must be an array, Series, or dict') + else: # pragma: no cover raise ValueError("'orient' must be either 'columns' or 'index'") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 2f947527ce95b..b6090a13c8d38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -268,13 +268,14 @@ def test_constructor_dict(self): # GH10856 # dict with scalar values should raise error, even if columns passed - with pytest.raises(ValueError): + msg = 'If using all scalar values, you must pass an index' + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}) - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['a']) - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['b']) def test_constructor_multi_index(self): @@ -1204,6 +1205,19 @@ def test_constructor_from_items(self): columns=['one', 'two', 'three']) tm.assert_frame_equal(rs, xp) + def test_constructor_from_items_scalars(self): + # GH 17312 + with tm.assert_raises_regex(ValueError, + 'The value in each \(key, value\) ' + 'pair must be an array, Series, or dict'): + DataFrame.from_items([('A', 1), ('B', 4)]) + + with tm.assert_raises_regex(ValueError, + 'The value in each \(key, value\) ' + 'pair must be an array, Series, or dict'): + DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'], + orient='index') + def 
test_constructor_mix_series_nonseries(self): df = DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])}, columns=['A', 'B']) From 5f7d86c244260d7eeae4458306f3b1d28913bb14 Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Sun, 26 Nov 2017 10:13:24 -0500 Subject: [PATCH 42/98] Improved description of seaborn (#18495) --- doc/source/ecosystem.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index f7d1edff15cfb..8ed647c2a19bc 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -53,6 +53,18 @@ the latest web technologies. Its goal is to provide elegant, concise constructio graphics in the style of Protovis/D3, while delivering high-performance interactivity over large data to thin clients. +`seaborn `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Seaborn is a Python visualization library based on `matplotlib +`__. It provides a high-level, dataset-oriented +interface for creating attractive statistical graphics. The plotting functions +in seaborn understand pandas objects and leverage pandas grouping operations +internally to support concise specification of complex visualizations. Seaborn +also goes beyond matplotlib and pandas with the option to perform statistical +estimation while plotting, aggregating across observations and visualizing the +fit of statistical models to emphasize patterns in a dataset. + `yhat/ggplot `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,15 +76,6 @@ but a faithful implementation for python users has long been missing. Although s (as of Jan-2014), the `yhat/ggplot `__ project has been progressing quickly in that direction. -`Seaborn `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Although pandas has quite a bit of "just plot it" functionality built-in, visualization and -in particular statistical graphics is a vast field with a long tradition and lots of ground -to cover. The `Seaborn `__ project builds on top of pandas -and `matplotlib `__ to provide easy plotting of data which extends to -more advanced types of plots then those offered by pandas. - `Vincent `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 68b66ab6cd5215f74abf5dbd0e6bd2dcb0368997 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 26 Nov 2017 10:14:51 -0500 Subject: [PATCH 43/98] COMPAT: map infers all-nan / empty correctly (#18491) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/base.py | 24 +++++++++++++++-- pandas/tests/indexes/common.py | 38 ++++++++++++++------------- pandas/tests/indexes/datetimelike.py | 3 +-- pandas/tests/indexes/test_base.py | 31 +++++++++++----------- pandas/tests/indexes/test_interval.py | 4 --- 6 files changed, 59 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 56521df5fcce4..78b8ca8d5a480 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -75,7 +75,7 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) -- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`). 
+- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`). - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2696f9f94375d..f4332ac244af4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2910,7 +2910,10 @@ def map(self, mapper, na_action=None): from .multi import MultiIndex new_values = super(Index, self)._map_values( mapper, na_action=na_action) + attributes = self._get_attributes_dict() + + # we can return a MultiIndex if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): names = self.names @@ -2923,8 +2926,25 @@ def map(self, mapper, na_action=None): attributes['copy'] = False - # we infer the result types based on the - # returned values + # we want to try to return our original dtype + # ints infer to integer, but if we have + # uints, would prefer to return these + if is_unsigned_integer_dtype(self.dtype): + inferred = lib.infer_dtype(new_values) + if inferred == 'integer': + attributes['dtype'] = self.dtype + + elif not new_values.size: + # empty + attributes['dtype'] = self.dtype + elif isna(new_values).all(): + # all nan + inferred = lib.infer_dtype(self) + if inferred in ['datetime', 'datetime64', + 'timedelta', 'timedelta64', + 'period']: + new_values = [libts.NaT] * len(new_values) + return Index(new_values, **attributes) def isin(self, values, level=None): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index ba7795d005721..99bdaf02e25ff 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1007,31 +1007,33 @@ def test_searchsorted_monotonic(self, indices): indices._searchsorted_monotonic(value, side='left') def test_map(self): + # callable index = self.create_index() + expected = index + result = index.map(lambda x: x) + tm.assert_index_equal(result, expected) - # From output of UInt64Index mapping can't infer that we - # shouldn't default to Int64 - if isinstance(index, UInt64Index): - expected = Index(index.values.tolist()) - else: - expected = index + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): - tm.assert_index_equal(index.map(lambda x: x), expected) + index = self.create_index() + if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)): + pytest.skip("skipping tests for {}".format(type(index))) - identity_dict = {x: x for x in index} - tm.assert_index_equal(index.map(identity_dict), expected) + expected = index - # Use values to work around MultiIndex instantiation of series - identity_series = Series(expected.values, index=index) - tm.assert_index_equal(index.map(identity_series), expected) + identity = mapper(index.values, index) + result = index.map(identity) + tm.assert_index_equal(result, expected) # empty mappable - nan_index = pd.Index([np.nan] * len(index)) - series_map = pd.Series() - tm.assert_index_equal(index.map(series_map), nan_index) - - 
dict_map = {} - tm.assert_index_equal(index.map(dict_map), nan_index) + expected = pd.Index([np.nan] * len(index)) + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) def test_putmask_with_wrong_mask(self): # GH18368 diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 839fccc1441e5..a01c60a47c0f9 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,7 +1,6 @@ """ generic datetimelike tests """ import pytest import pandas as pd -import numpy as np from .common import Base import pandas.util.testing as tm @@ -73,6 +72,6 @@ def test_map_dictlike(self, mapper): # empty map; these map to np.nan because we cannot know # to re-infer things - expected = pd.Index([np.nan] * len(self.index)) + expected = pd.Index([pd.NaT] * len(self.index)) result = self.index.map(mapper([], [])) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7dfd1511da292..372c11b296d9e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -852,11 +852,15 @@ def test_map_tseries_indices_return_index(self): exp = Index(range(24), name='hourly') tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) - def test_map_with_dict_and_series(self): + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index)]) + def test_map_dictlike(self, mapper): # GH 12756 expected = Index(['foo', 'bar', 'baz']) - mapper = Series(expected.values, index=[0, 1, 2]) - result = tm.makeIntIndex(3).map(mapper) + result = tm.makeIntIndex(3).map(mapper(expected.values, [0, 1, 2])) tm.assert_index_equal(result, expected) for name in self.indices.keys(): @@ -867,21 +871,16 @@ def test_map_with_dict_and_series(self): # Cannot map duplicated index continue - cur_index = self.indices[name] - expected = Index(np.arange(len(cur_index), 0, -1)) - mapper = pd.Series(expected, index=cur_index) - result = cur_index.map(mapper) - - tm.assert_index_equal(result, expected) + index = self.indices[name] + expected = Index(np.arange(len(index), 0, -1)) - # If the mapper is empty the expected index type is Int64Index - # but the output defaults to Float64 so I treat it independently - mapper = {o: n for o, n in - zip(cur_index, expected)} + # to match proper result coercion for uints + if name == 'uintIndex': + expected = expected.astype('uint64') + elif name == 'empty': + expected = Index([]) - result = cur_index.map(mapper) - if not mapper: - expected = Float64Index([]) + result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) def test_map_with_non_function_missing_values(self): diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 815d5fcde1400..dc06e51c6d8e7 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -584,10 +584,6 @@ def test_repr_max_seq_item_setting(self): def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() - @pytest.mark.xfail(reason='get_indexer behavior does not currently work') - def test_map(self): - super(TestIntervalIndex, self).test_map() - def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) From d1010643fea058ba43c2c7124af75cc462ccf242 Mon Sep 17 00:00:00 2001 From: Alexander Michael Schade 
<3345464+aschade@users.noreply.github.com> Date: Sun, 26 Nov 2017 10:19:13 -0500 Subject: [PATCH 44/98] Fix tzaware dates mismatch but no exception raised (#18488) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/_libs/tslibs/timezones.pyx | 7 +++---- .../tests/indexes/datetimes/test_date_range.py | 16 ++++++++++++++++ pandas/tests/tseries/test_timezones.py | 2 +- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index f8274bda546f7..4c6cdb9846305 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -64,7 +64,7 @@ Conversion - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Bug in :meth:`IntervalIndex.copy` when copying and ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`) - Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising``TypeError` (:issue:`18372`) -- +- Bug in :class:`DateTimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an err if ``end.tzinfo`` is None (:issue:`18431`) - Indexing diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7fb48e7c66f47..d326f2cb68f24 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -284,10 +284,9 @@ cdef object get_dst_info(object tz): def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo - if end.tzinfo: - if not (get_timezone(tz) == get_timezone(end.tzinfo)): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + if not (get_timezone(tz) == get_timezone(end.tzinfo)): + msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' + raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index edcee0479827f..826e20b8b0586 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -290,6 +290,22 @@ def test_precision_finer_than_offset(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + dt1, dt2 = '2017-01-01', '2017-01-01' + tz1, tz2 = 'US/Eastern', 'Europe/London' + + @pytest.mark.parametrize("start,end", [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) + ]) + def test_mismatching_tz_raises_err(self, start, end): + # issue 18488 + with pytest.raises(TypeError): + pd.date_range(start, end) + with pytest.raises(TypeError): + pd.DatetimeIndex(start, end, freq=BDay()) + class TestBusinessDateRange(object): diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 3dfad2d4af75e..a01166daf6be1 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -424,7 +424,7 @@ def test_with_tz(self): # datetimes with tzinfo set dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - '1/1/2009', tz=pytz.utc) + datetime(2009, 1, 1, tzinfo=pytz.utc)) pytest.raises(Exception, bdate_range, datetime(2005, 1, 1, 
tzinfo=pytz.utc), '1/1/2009', From 982ad07cf38ba4567ddf17d3cfe3e986d1adaae1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 26 Nov 2017 16:46:51 -0500 Subject: [PATCH 45/98] TST: move gbq back to 3.5 build and remove from BUILD_TEST (#18506) --- ci/requirements-3.5.pip | 1 + ci/requirements-3.6_BUILD_TEST.pip | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 0d9e44cf39fa4..c9565f2173070 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1 +1,2 @@ xarray==0.9.1 +pandas_gbq diff --git a/ci/requirements-3.6_BUILD_TEST.pip b/ci/requirements-3.6_BUILD_TEST.pip index a0fc77c40bc00..f4617133cad5b 100644 --- a/ci/requirements-3.6_BUILD_TEST.pip +++ b/ci/requirements-3.6_BUILD_TEST.pip @@ -1,7 +1,6 @@ xarray geopandas seaborn -pandas_gbq pandas_datareader statsmodels scikit-learn From 674fb96b33c07c680844f674fcdf0767b6e3c2f9 Mon Sep 17 00:00:00 2001 From: Bob Haffner Date: Sun, 26 Nov 2017 17:42:48 -0600 Subject: [PATCH 46/98] BUG fixes tuple agg issue 18079 (#18354) --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/groupby.py | 7 +++--- pandas/tests/groupby/test_aggregate.py | 35 +++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 78b8ca8d5a480..bd3fe7750a0a1 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -169,7 +169,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) - - diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba180cc98cb08..69de7630ede2c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2299,8 +2299,7 @@ def _aggregate_series_pure_python(self, obj, func): for label, group in splitter: res = func(group) if result is None: - if (isinstance(res, (Series, Index, np.ndarray)) or - isinstance(res, list)): + if (isinstance(res, (Series, Index, np.ndarray))): raise ValueError('Function does not reduce') result = np.empty(ngroups, dtype='O') @@ -3022,7 +3021,9 @@ def aggregate(self, func_or_funcs, *args, **kwargs): if isinstance(func_or_funcs, compat.string_types): return getattr(self, func_or_funcs)(*args, **kwargs) - if hasattr(func_or_funcs, '__iter__'): + if isinstance(func_or_funcs, collections.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. 
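+            # e.g. isinstance((min, max), collections.Iterable) is True,
+            # while isinstance(tuple, collections.Iterable) is False;
+            # the old hasattr(func_or_funcs, '__iter__') check was True
+            # even for the tuple class itself (GH 18079).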
ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) else: diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 913d3bcc09869..3d27df31cee6e 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -637,7 +637,7 @@ def test_agg_consistency(self): def P1(a): try: return np.percentile(a.dropna(), q=1) - except: + except Exception: return np.nan import datetime as dt @@ -892,3 +892,36 @@ def test_sum_uint64_overflow(self): expected.index.name = 0 result = df.groupby(0).sum() tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("structure, expected", [ + (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), + (3, 4): (3, 4, 4)}})), + (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], + (3, 4): [3, 4, 4]}})) + ]) + def test_agg_structs_dataframe(self, structure, expected): + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby(['A', 'B']).aggregate(structure) + expected.index.names = ['A', 'B'] + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("structure, expected", [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], + index=[1, 3], name='C')), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], + index=[1, 3], name='C')) + ]) + def test_agg_structs_series(self, structure, expected): + # Issue #18079 + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby('A')['C'].aggregate(structure) + expected.index.name = 'A' + assert_series_equal(result, expected) From 49ddcd59fb2d02c952cad146ad59c0020c74f427 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Nov 2017 16:35:41 -0800 Subject: [PATCH 47/98] simplify skiplist inclusion/cimport to be more cythonize-friendly (#18420) --- pandas/_libs/skiplist.pxd | 48 +++++++++++++++++++++++++++++ pandas/_libs/{src => }/skiplist.pyx | 18 +++++------ pandas/_libs/src/skiplist.pxd | 22 ------------- pandas/_libs/window.pyx | 17 ++++------ setup.py | 8 +++-- 5 files changed, 68 insertions(+), 45 deletions(-) create mode 100644 pandas/_libs/skiplist.pxd rename pandas/_libs/{src => }/skiplist.pyx (95%) delete mode 100644 pandas/_libs/src/skiplist.pxd diff --git a/pandas/_libs/skiplist.pxd b/pandas/_libs/skiplist.pxd new file mode 100644 index 0000000000000..82a0862112199 --- /dev/null +++ b/pandas/_libs/skiplist.pxd @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +from cython cimport Py_ssize_t + +from numpy cimport double_t + + +cdef extern from "src/skiplist.h": + ctypedef struct node_t: + node_t **next + int *width + double value + int is_nil + int levels + int ref_count + + ctypedef struct skiplist_t: + node_t *head + node_t **tmp_chain + int *tmp_steps + int size + int maxlevels + + skiplist_t* skiplist_init(int) nogil + void skiplist_destroy(skiplist_t*) nogil + double skiplist_get(skiplist_t*, int, int*) nogil + int skiplist_insert(skiplist_t*, double) nogil + int skiplist_remove(skiplist_t*, double) nogil + + +# Note: Node is declared here so that IndexableSkiplist can be exposed; +# Node itself not intended to be exposed. 
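+#
+# With a .pxd present, the attribute declarations must live here rather
+# than in the .pyx (hence they are commented out there); this also lets
+# other extension modules cimport the class, as window.pyx does via
+# ``from skiplist cimport IndexableSkiplist``.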
+cdef class Node: + cdef public: + double_t value + list next + list width + + +cdef class IndexableSkiplist: + cdef: + Py_ssize_t size, maxlevels + Node head + + cpdef get(self, Py_ssize_t i) + cpdef insert(self, double value) + cpdef remove(self, double value) diff --git a/pandas/_libs/src/skiplist.pyx b/pandas/_libs/skiplist.pyx similarity index 95% rename from pandas/_libs/src/skiplist.pyx rename to pandas/_libs/skiplist.pyx index 1524dca38d0e0..c96413edfb0f2 100644 --- a/pandas/_libs/src/skiplist.pyx +++ b/pandas/_libs/skiplist.pyx @@ -6,8 +6,7 @@ # Cython version: Wes McKinney -cdef extern from "math.h": - double log(double x) +from libc.math cimport log # MSVC does not have log2! @@ -16,6 +15,7 @@ cdef double Log2(double x): cimport numpy as np import numpy as np +from numpy cimport double_t from random import random @@ -25,10 +25,10 @@ np.import_array() # TODO: optimize this, make less messy cdef class Node: - cdef public: - double_t value - list next - list width + # cdef public: + # double_t value + # list next + # list width def __init__(self, double_t value, list next, list width): self.value = value @@ -43,9 +43,9 @@ cdef class IndexableSkiplist: Sorted collection supporting O(lg n) insertion, removal, and lookup by rank. """ - cdef: - Py_ssize_t size, maxlevels - Node head + # cdef: + # Py_ssize_t size, maxlevels + # Node head def __init__(self, expected_size=100): self.size = 0 diff --git a/pandas/_libs/src/skiplist.pxd b/pandas/_libs/src/skiplist.pxd deleted file mode 100644 index 214aa1c7aeaf0..0000000000000 --- a/pandas/_libs/src/skiplist.pxd +++ /dev/null @@ -1,22 +0,0 @@ -cdef extern from "skiplist.h": - ctypedef struct node_t: - node_t **next - int *width - double value - int is_nil - int levels - int ref_count - - ctypedef struct skiplist_t: - node_t *head - node_t **tmp_chain - int *tmp_steps - int size - int maxlevels - - skiplist_t* skiplist_init(int) nogil - void skiplist_destroy(skiplist_t*) nogil - double skiplist_get(skiplist_t*, int, int*) nogil - int skiplist_insert(skiplist_t*, double) nogil - int skiplist_remove(skiplist_t*, double) nogil - diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 95df5a07a390b..ecce45742afa7 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -14,9 +14,13 @@ cimport util from libc.stdlib cimport malloc, free - from numpy cimport ndarray, double_t, int64_t, float64_t +from skiplist cimport (IndexableSkiplist, + node_t, skiplist_t, + skiplist_init, skiplist_destroy, + skiplist_get, skiplist_insert, skiplist_remove) + cdef np.float32_t MINfloat32 = np.NINF cdef np.float64_t MINfloat64 = np.NINF @@ -30,19 +34,10 @@ cdef inline int int_min(int a, int b): return a if a <= b else b from util cimport numeric -from skiplist cimport ( - skiplist_t, - skiplist_init, - skiplist_destroy, - skiplist_get, - skiplist_insert, - skiplist_remove) - cdef extern from "../src/headers/math.h": - double sqrt(double x) nogil int signbit(double) nogil + double sqrt(double x) nogil -include "skiplist.pyx" # Cython implementations of rolling sum, mean, variance, skewness, # other statistical moment functions diff --git a/setup.py b/setup.py index 7e56298d1b20b..37be0b696503d 100755 --- a/setup.py +++ b/setup.py @@ -341,6 +341,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/missing.pyx', 'pandas/_libs/testing.pyx', 'pandas/_libs/window.pyx', + 'pandas/_libs/skiplist.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', 'pandas/_libs/tslibs/strptime.pyx', @@ -544,6 +545,9 @@ def pxd(name): '_libs.reshape': { 
'pyxfile': '_libs/reshape', 'depends': _pxi_dep['reshape']}, + '_libs.skiplist': { + 'pyxfile': '_libs/skiplist', + 'depends': ['pandas/_libs/src/skiplist.h']}, '_libs.sparse': { 'pyxfile': '_libs/sparse', 'depends': _pxi_dep['sparse']}, @@ -629,9 +633,7 @@ def pxd(name): 'pyxfile': '_libs/testing'}, '_libs.window': { 'pyxfile': '_libs/window', - 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], - 'depends': ['pandas/_libs/src/skiplist.pyx', - 'pandas/_libs/src/skiplist.h']}, + 'pxdfiles': ['_libs/skiplist', '_libs/src/util']}, 'io.sas._sas': { 'pyxfile': 'io/sas/sas'}} From f745e52e168790bff06a55928c3491cdea389508 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Nov 2017 16:59:26 -0800 Subject: [PATCH 48/98] Implement business_start/end cases for shift_months (#18489) --- asv_bench/benchmarks/offset.py | 19 ++++++ doc/source/whatsnew/v0.22.0.txt | 1 + pandas/_libs/tslibs/offsets.pyx | 52 +++++++++++++++- .../tests/tseries/offsets/test_yqm_offsets.py | 30 ++++++++++ pandas/tseries/offsets.py | 60 ++++++++----------- 5 files changed, 126 insertions(+), 36 deletions(-) diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index ea826e8270ace..849776bf9a591 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -38,6 +38,25 @@ def time_apply_series(self, param): self.ser + self.offset +class OnOffset(object): + goal_time = 0.2 + + params = [pd.offsets.QuarterBegin(), pd.offsets.QuarterEnd(), + pd.offsets.BQuarterBegin(), pd.offsets.BQuarterEnd()] + param_names = ['offset'] + + def setup(self, offset): + self.offset = offset + self.dates = [datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31)] + + def time_on_offset(self, offset): + for date in self.dates: + self.offset.onOffset(date) + + class DatetimeIndexArithmetic(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index bd3fe7750a0a1..8e6382c18343e 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -116,6 +116,7 @@ Performance Improvements - ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) - Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) - Improved performance of ``IntervalIndex.symmetric_difference()`` (:issue:`18475`) +- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) .. 
_whatsnew_0220.docs: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4ed4d4a9b7b99..654c51f0ca842 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -554,8 +554,58 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): dts.day = get_days_in_month(dts.year, dts.month) out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_start': + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + wkday, days_in_month = monthrange(dts.year, dts.month) + compare_day = get_firstbday(wkday, days_in_month) + + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 + + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) + + wkday, days_in_month = monthrange(dts.year, dts.month) + dts.day = get_firstbday(wkday, days_in_month) + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_end': + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + wkday, days_in_month = monthrange(dts.year, dts.month) + compare_day = get_lastbday(wkday, days_in_month) + + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 + + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) + + wkday, days_in_month = monthrange(dts.year, dts.month) + dts.day = get_lastbday(wkday, days_in_month) + out[i] = dtstruct_to_dt64(&dts) + else: - raise ValueError("day must be None, 'start' or 'end'") + raise ValueError("day must be None, 'start', 'end', " + "'business_start', or 'business_end'") return np.asarray(out) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index 1d47cf67c6e55..292dd5eba938e 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -6,6 +6,7 @@ import pytest +import pandas as pd from pandas import Timestamp from pandas import compat @@ -32,6 +33,35 @@ def test_quarterly_dont_normalize(): assert (result.time() == date.time()) +@pytest.mark.parametrize('offset', [MonthBegin(), MonthEnd(), + BMonthBegin(), BMonthEnd()]) +def test_apply_index(offset): + rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') + ser = pd.Series(rng) + + res = rng + offset + res_v2 = offset.apply_index(rng) + assert (res == res_v2).all() + assert res[0] == rng[0] + offset + assert res[-1] == rng[-1] + offset + res2 = ser + offset + # apply_index is only for indexes, not series, so no res2_v2 + assert res2.iloc[0] == ser.iloc[0] + offset + assert res2.iloc[-1] == ser.iloc[-1] + offset + + +@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(), + BQuarterBegin(), BQuarterEnd()]) +def test_on_offset(offset): + dates = [datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)] + for date in dates: + res = offset.onOffset(date) + slow_version = date == (date + offset) - offset + assert res == slow_version + + # -------------------------------------------------------------------- # Months diff --git a/pandas/tseries/offsets.py 
b/pandas/tseries/offsets.py index a307b7e5817a8..8e1ead5dfbe9e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -929,8 +929,9 @@ def name(self): if self.isAnchored: return self.rule_code else: + month = liboffsets._int_to_month[self.n] return "{code}-{month}".format(code=self.rule_code, - month=_int_to_month[self.n]) + month=month) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -950,28 +951,23 @@ def apply(self, other): return shift_month(other, n, self._day_opt) + @apply_index_wraps + def apply_index(self, i): + shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt) + return i._shallow_copy(shifted) + class MonthEnd(MonthOffset): """DateOffset of one month end""" _prefix = 'M' _day_opt = 'end' - @apply_index_wraps - def apply_index(self, i): - shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt) - return i._shallow_copy(shifted) - class MonthBegin(MonthOffset): """DateOffset of one month at beginning""" _prefix = 'MS' _day_opt = 'start' - @apply_index_wraps - def apply_index(self, i): - shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt) - return i._shallow_copy(shifted) - class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" @@ -1008,6 +1004,7 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): _prefix = 'CBM' onOffset = DateOffset.onOffset # override MonthOffset method + apply_index = DateOffset.apply_index # override MonthOffset method def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): @@ -1083,6 +1080,7 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): _prefix = 'CBMS' onOffset = DateOffset.onOffset # override MonthOffset method + apply_index = DateOffset.apply_index # override MonthOffset method def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): @@ -1603,7 +1601,7 @@ def isAnchored(self): def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['startingMonth'] = _month_to_int[suffix] + kwargs['startingMonth'] = liboffsets._month_to_int[suffix] else: if cls._from_name_startingMonth is not None: kwargs['startingMonth'] = cls._from_name_startingMonth @@ -1611,7 +1609,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - month = _int_to_month[self.startingMonth] + month = liboffsets._int_to_month[self.startingMonth] return '{prefix}-{month}'.format(prefix=self._prefix, month=month) @apply_wraps @@ -1631,6 +1629,12 @@ def apply(self, other): return shift_month(other, 3 * n - months_since, self._day_opt) + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + modMonth = (dt.month - self.startingMonth) % 3 + return modMonth == 0 and dt.day == self._get_offset_day(dt) + class BQuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates @@ -1644,16 +1648,6 @@ class BQuarterEnd(QuarterOffset): _prefix = 'BQ' _day_opt = 'business_end' - def onOffset(self, dt): - if self.normalize and not _is_normalized(dt): - return False - modMonth = (dt.month - self.startingMonth) % 3 - return modMonth == 0 and dt.day == self._get_offset_day(dt) - - -_int_to_month = tslib._MONTH_ALIASES -_month_to_int = {v: k for k, v in _int_to_month.items()} - # TODO: This is basically the same as BQuarterEnd class BQuarterBegin(QuarterOffset): @@ -1680,12 +1674,6 @@ class QuarterEnd(EndMixin, QuarterOffset): def apply_index(self, i): return 
self._end_apply_index(i, self.freqstr) - def onOffset(self, dt): - if self.normalize and not _is_normalized(dt): - return False - modMonth = (dt.month - self.startingMonth) % 3 - return modMonth == 0 and dt.day == self._get_offset_day(dt) - class QuarterBegin(BeginMixin, QuarterOffset): _outputName = 'QuarterBegin' @@ -1697,7 +1685,8 @@ class QuarterBegin(BeginMixin, QuarterOffset): @apply_index_wraps def apply_index(self, i): freq_month = 12 if self.startingMonth == 1 else self.startingMonth - 1 - freqstr = 'Q-{month}'.format(month=_int_to_month[freq_month]) + month = liboffsets._int_to_month[freq_month] + freqstr = 'Q-{month}'.format(month=month) return self._beg_apply_index(i, freqstr) @@ -1738,12 +1727,12 @@ def __init__(self, n=1, normalize=False, month=None): def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['month'] = _month_to_int[suffix] + kwargs['month'] = liboffsets._month_to_int[suffix] return cls(**kwargs) @property def rule_code(self): - month = _int_to_month[self.month] + month = liboffsets._int_to_month[self.month] return '{prefix}-{month}'.format(prefix=self._prefix, month=month) @@ -1784,7 +1773,8 @@ class YearBegin(BeginMixin, YearOffset): @apply_index_wraps def apply_index(self, i): freq_month = 12 if self.month == 1 else self.month - 1 - freqstr = 'A-{month}'.format(month=_int_to_month[freq_month]) + month = liboffsets._int_to_month[freq_month] + freqstr = 'A-{month}'.format(month=month) return self._beg_apply_index(i, freqstr) @@ -1969,7 +1959,7 @@ def _get_suffix_prefix(self): def get_rule_code_suffix(self): prefix = self._get_suffix_prefix() - month = _int_to_month[self.startingMonth] + month = liboffsets._int_to_month[self.startingMonth] weekday = _int_to_weekday[self.weekday] return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month, weekday=weekday) @@ -1984,7 +1974,7 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): raise ValueError("Unable to parse varion_code: " "{code}".format(code=varion_code)) - startingMonth = _month_to_int[startingMonth_code] + startingMonth = liboffsets._month_to_int[startingMonth_code] weekday = _weekday_to_int[weekday_code] return {"weekday": weekday, From 1043a46bcdebf974e1e656837e46aab9b31da5f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Nov 2017 03:26:47 -0800 Subject: [PATCH 49/98] move monthrange inside get_first/last_bday, allows nogil (#18512) --- pandas/_libs/tslibs/offsets.pyx | 128 +++++++++--------- .../tests/tseries/offsets/test_liboffsets.py | 13 +- 2 files changed, 67 insertions(+), 74 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 654c51f0ca842..6f5ad2ae45f50 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -24,7 +24,7 @@ from frequencies cimport get_freq_code from nattype cimport NPY_NAT from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct, - is_leapyear, days_per_month_table) + is_leapyear, days_per_month_table, dayofweek) # --------------------------------------------------------------------- # Constants @@ -145,45 +145,44 @@ def apply_index_wraps(func): # --------------------------------------------------------------------- # Business Helpers -cpdef int get_lastbday(int wkday, int days_in_month): +cpdef int get_lastbday(int year, int month) nogil: """ Find the last day of the month that is a business day. 
- (wkday, days_in_month) is the output from monthrange(year, month) - Parameters ---------- - wkday : int - days_in_month : int + year : int + month : int Returns ------- last_bday : int """ + cdef: + int wkday, days_in_month + + wkday = dayofweek(year, month, 1) + days_in_month = get_days_in_month(year, month) return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) -cpdef int get_firstbday(int wkday, int days_in_month=0): +cpdef int get_firstbday(int year, int month) nogil: """ Find the first day of the month that is a business day. - (wkday, days_in_month) is the output from monthrange(year, month) - Parameters ---------- - wkday : int - days_in_month : int, default 0 + year : int + month : int Returns ------- first_bday : int - - Notes - ----- - `days_in_month` arg is a dummy so that this has the same signature as - `get_lastbday`. """ - cdef int first + cdef: + int first, wkday + + wkday = dayofweek(year, month, 1) first = 1 if wkday == 5: # on Saturday first = 3 @@ -556,52 +555,50 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): out[i] = dtstruct_to_dt64(&dts) elif day == 'business_start': - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - wkday, days_in_month = monthrange(dts.year, dts.month) - compare_day = get_firstbday(wkday, days_in_month) + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + compare_day = get_firstbday(dts.year, dts.month) - if months_to_roll > 0 and dts.day < compare_day: - months_to_roll -= 1 - elif months_to_roll <= 0 and dts.day > compare_day: - # as if rolled forward already - months_to_roll += 1 + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) - wkday, days_in_month = monthrange(dts.year, dts.month) - dts.day = get_firstbday(wkday, days_in_month) - out[i] = dtstruct_to_dt64(&dts) + dts.day = get_firstbday(dts.year, dts.month) + out[i] = dtstruct_to_dt64(&dts) elif day == 'business_end': - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - wkday, days_in_month = monthrange(dts.year, dts.month) - compare_day = get_lastbday(wkday, days_in_month) + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + compare_day = get_lastbday(dts.year, dts.month) - if months_to_roll > 0 and dts.day < compare_day: - months_to_roll -= 1 - elif months_to_roll <= 0 and dts.day > compare_day: - # as if rolled forward already - months_to_roll += 1 + if months_to_roll > 0 and dts.day < compare_day: + months_to_roll -= 1 + elif months_to_roll <= 0 and dts.day > compare_day: + # as if rolled forward already + months_to_roll += 1 - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) - wkday, days_in_month = monthrange(dts.year, dts.month) - dts.day = get_lastbday(wkday, 
days_in_month) - out[i] = dtstruct_to_dt64(&dts) + dts.day = get_lastbday(dts.year, dts.month) + out[i] = dtstruct_to_dt64(&dts) else: raise ValueError("day must be None, 'start', 'end', " @@ -635,7 +632,7 @@ cpdef datetime shift_month(datetime stamp, int months, object day_opt=None): """ cdef: int year, month, day - int wkday, days_in_month, dy + int days_in_month, dy dy = (stamp.month + months) // 12 month = (stamp.month + months) % 12 @@ -645,20 +642,21 @@ cpdef datetime shift_month(datetime stamp, int months, object day_opt=None): dy -= 1 year = stamp.year + dy - wkday, days_in_month = monthrange(year, month) if day_opt is None: + days_in_month = get_days_in_month(year, month) day = min(stamp.day, days_in_month) elif day_opt == 'start': day = 1 elif day_opt == 'end': - day = days_in_month + day = get_days_in_month(year, month) elif day_opt == 'business_start': # first business day of month - day = get_firstbday(wkday, days_in_month) + day = get_firstbday(year, month) elif day_opt == 'business_end': # last business day of month - day = get_lastbday(wkday, days_in_month) + day = get_lastbday(year, month) elif is_integer_object(day_opt): + days_in_month = get_days_in_month(year, month) day = min(day_opt, days_in_month) else: raise ValueError(day_opt) @@ -691,22 +689,22 @@ cpdef int get_day_of_month(datetime other, day_opt) except? -1: """ cdef: - int wkday, days_in_month + int days_in_month if day_opt == 'start': return 1 - - wkday, days_in_month = monthrange(other.year, other.month) - if day_opt == 'end': + elif day_opt == 'end': + days_in_month = get_days_in_month(other.year, other.month) return days_in_month elif day_opt == 'business_start': # first business day of month - return get_firstbday(wkday, days_in_month) + return get_firstbday(other.year, other.month) elif day_opt == 'business_end': # last business day of month - return get_lastbday(wkday, days_in_month) + return get_lastbday(other.year, other.month) elif is_integer_object(day_opt): - day = min(day_opt, days_in_month) + days_in_month = get_days_in_month(other.year, other.month) + return min(day_opt, days_in_month) elif day_opt is None: # Note: unlike `shift_month`, get_day_of_month does not # allow day_opt = None diff --git a/pandas/tests/tseries/offsets/test_liboffsets.py b/pandas/tests/tseries/offsets/test_liboffsets.py index 321104222936b..8aa32bc600ee6 100644 --- a/pandas/tests/tseries/offsets/test_liboffsets.py +++ b/pandas/tests/tseries/offsets/test_liboffsets.py @@ -6,7 +6,6 @@ import pytest -from pandas._libs import tslib from pandas import Timestamp import pandas._libs.tslibs.offsets as liboffsets @@ -15,25 +14,21 @@ def test_get_lastbday(): dt = datetime(2017, 11, 30) assert dt.weekday() == 3 # i.e. this is a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_lastbday(wkday, days_in_month) == 30 + assert liboffsets.get_lastbday(dt.year, dt.month) == 30 dt = datetime(1993, 10, 31) assert dt.weekday() == 6 # i.e. this is not a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_lastbday(wkday, days_in_month) == 29 + assert liboffsets.get_lastbday(dt.year, dt.month) == 29 def test_get_firstbday(): dt = datetime(2017, 4, 1) assert dt.weekday() == 5 # i.e. not a weekday - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_firstbday(wkday, days_in_month) == 3 + assert liboffsets.get_firstbday(dt.year, dt.month) == 3 dt = datetime(1993, 10, 1) assert dt.weekday() == 4 # i.e. 
a business day - wkday, days_in_month = tslib.monthrange(dt.year, dt.month) - assert liboffsets.get_firstbday(wkday, days_in_month) == 1 + assert liboffsets.get_firstbday(dt.year, dt.month) == 1 def test_shift_month(): From f7c79be4d5bc966a631c9876e272d19a54fd8edf Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 27 Nov 2017 11:28:13 +0000 Subject: [PATCH 50/98] Added repr string for Grouper and TimeGrouper (#18203) --- doc/source/whatsnew/v0.21.1.txt | 2 +- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/groupby.py | 16 +++++++++++++--- pandas/core/resample.py | 19 +++++++------------ pandas/tests/groupby/test_groupby.py | 9 +++++++++ pandas/tests/test_resample.py | 8 ++++++++ 6 files changed, 39 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4c6cdb9846305..4389dbcff280d 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- +- :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). - .. _whatsnew_0211.deprecations: diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 8e6382c18343e..52ca05d9a76a9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -100,7 +100,7 @@ Removal of prior version deprecations/changes - The ``levels`` and ``labels`` attributes of a ``MultiIndex`` can no longer be set directly (:issue:`4039`). - ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) - ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) -- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd..merge_ordered`` instead (:issue:`18459`) +- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) .. 
_whatsnew_0220.performance: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 69de7630ede2c..0e8368e5a4533 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -206,12 +206,13 @@ class Grouper(object): sort : boolean, default to False whether to sort the resulting labels - additional kwargs to control time-like groupers (when freq is passed) + additional kwargs to control time-like groupers (when ``freq`` is passed) - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If grouper is PeriodIndex + base, loffset Returns ------- @@ -233,6 +234,7 @@ class Grouper(object): >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ + _attributes = ('key', 'level', 'freq', 'axis', 'sort') def __new__(cls, *args, **kwargs): if kwargs.get('freq') is not None: @@ -333,6 +335,14 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups + def __repr__(self): + attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None] + attrs = ", ".join(attrs_list) + cls_name = self.__class__.__name__ + return "{}({})".format(cls_name, attrs) + class GroupByPlot(PandasObject): """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6988528af415f..bd441a8248841 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1014,22 +1014,18 @@ class TimeGrouper(Grouper): Parameters ---------- freq : pandas date offset or offset alias for identifying bin edges - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right - nperiods : optional, integer + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex - - Notes - ----- - Use begin, end, nperiods to generate intervals that cannot be derived - directly from the associated object """ + _attributes = Grouper._attributes + ('closed', 'label', 'how', + 'loffset', 'kind', 'convention', + 'base') def __init__(self, freq='Min', closed=None, label=None, how='mean', - nperiods=None, axis=0, - fill_method=None, limit=None, loffset=None, kind=None, - convention=None, base=0, **kwargs): + axis=0, fill_method=None, limit=None, loffset=None, + kind=None, convention=None, base=0, **kwargs): freq = to_offset(freq) end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) @@ -1048,7 +1044,6 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.closed = closed self.label = label - self.nperiods = nperiods self.kind = kind self.convention = convention or 'E' diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 81153e83471cd..3436dd9169081 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -25,6 +25,15 @@ from .common import MixIn +class TestGrouper(object): + + def test_repr(self): + # GH18203 + result = repr(pd.Grouper(key='A', level='B')) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected + + class TestGroupBy(MixIn): def test_basic(self): diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c9e40074c06ad..bf1cac3112c46 100644 --- 
a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3416,3 +3416,11 @@ def test_aggregate_with_nat(self): # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet + + def test_repr(self): + # GH18203 + result = repr(TimeGrouper(key='A', freq='H')) + expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', base=0)") + assert result == expected From 4fd104a72a825914851820fee623fbcdf1a989a7 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Mon, 27 Nov 2017 05:34:56 -0600 Subject: [PATCH 51/98] COMPAT: reading json with lines=True from s3, xref #17200 (#17201) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/io/json/json.py | 20 ++++--- pandas/tests/io/conftest.py | 74 +++++++++++++++++++++++++ pandas/tests/io/json/test_pandas.py | 65 +++++++++++++++++++++- pandas/tests/io/parser/data/items.jsonl | 2 + pandas/tests/io/parser/test_network.py | 48 ---------------- 6 files changed, 152 insertions(+), 59 deletions(-) create mode 100644 pandas/tests/io/conftest.py create mode 100644 pandas/tests/io/parser/data/items.jsonl diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4389dbcff280d..d1c3a4ba32603 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -88,7 +88,7 @@ I/O - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) - +- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 11bf3a9363953..21736673350d8 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -5,7 +5,7 @@ import pandas._libs.json as json from pandas._libs.tslib import iNaT -from pandas.compat import StringIO, long, u +from pandas.compat import StringIO, long, u, to_str from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, @@ -458,8 +458,10 @@ def read(self): if self.lines and self.chunksize: obj = concat(self) elif self.lines: + + data = to_str(self.data) obj = self._get_object_parser( - self._combine_lines(self.data.split('\n')) + self._combine_lines(data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: dtype = np.dtype(dtype) return data.astype(dtype), True - except: + except (TypeError, ValueError): return data, False if convert_dates: @@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass if data.dtype.kind == 'f': @@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass # don't coerce 0-len data @@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, if (new_data == data).all(): data = new_data result = True - except: + except (TypeError, ValueError): pass # coerce ints to 64 @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('int64') 
result = True - except: + except (TypeError, ValueError): pass return data, result @@ -680,7 +682,7 @@ def _try_convert_to_date(self, data): if new_data.dtype == 'object': try: new_data = data.astype('int64') - except: + except (TypeError, ValueError): pass # ignore numbers that are out of range @@ -697,7 +699,7 @@ def _try_convert_to_date(self, data): unit=date_unit) except ValueError: continue - except: + except Exception: break return new_data, True return data, False diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py new file mode 100644 index 0000000000000..828d5d0ccd3c6 --- /dev/null +++ b/pandas/tests/io/conftest.py @@ -0,0 +1,74 @@ +import os + +import moto +import pytest +from pandas.io.parsers import read_table + +HERE = os.path.dirname(__file__) + + +@pytest.fixture(scope='module') +def tips_file(): + """Path to the tips dataset""" + return os.path.join(HERE, 'parser', 'data', 'tips.csv') + + +@pytest.fixture(scope='module') +def jsonl_file(): + """Path to a JSONL dataset""" + return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + + +@pytest.fixture(scope='module') +def salaries_table(): + """DataFrame with the salaries dataset""" + path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + return read_table(path) + + +@pytest.fixture(scope='module') +def s3_resource(tips_file, jsonl_file): + """Fixture for mocking S3 interaction. + + The primary bucket name is "pandas-test". The following datasets + are loaded. + + - tips.csv + - tips.csv.gz + - tips.csv.bz2 + - items.jsonl + + A private bucket "cant_get_it" is also created. The boto3 s3 resource + is yielded by the fixture. + """ + pytest.importorskip('s3fs') + moto.mock_s3().start() + + test_s3_files = [ + ('tips.csv', tips_file), + ('tips.csv.gz', tips_file + '.gz'), + ('tips.csv.bz2', tips_file + '.bz2'), + ('items.jsonl', jsonl_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, 'rb') as f: + conn.Bucket(bucket_name).put_object( + Key=s3_key, + Body=f) + + boto3 = pytest.importorskip('boto3') + # see gh-16135 + bucket = 'pandas-test' + + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + + yield conn + + moto.mock_s3().stop() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1c895f7e9e89a..fe447534efdc7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,7 +4,6 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os - import numpy as np from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) @@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp + def test_read_inline_jsonl(self): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_s3_jsonl(self, s3_resource): + pytest.importorskip('s3fs') + # GH17200 + + result = read_json('s3n://pandas-test/items.jsonl', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_local_jsonl(self): + # GH17200 + with ensure_clean('tmp_items.json') as path: + with open(path, 'w') as infile: + infile.write('{"a": 
1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(path, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_jsonl_unicode_chars(self): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl new file mode 100644 index 0000000000000..f784d37befa82 --- /dev/null +++ b/pandas/tests/io/parser/data/items.jsonl @@ -0,0 +1,2 @@ +{"a": 1, "b": 2} +{"b":2, "a" :1} diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 27cc708889fa2..d00d3f31ce189 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -4,10 +4,7 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. 
""" -import os - import pytest -import moto import pandas.util.testing as tm from pandas import DataFrame @@ -15,51 +12,6 @@ from pandas.compat import BytesIO -@pytest.fixture(scope='module') -def tips_file(): - return os.path.join(tm.get_data_path(), 'tips.csv') - - -@pytest.fixture(scope='module') -def salaries_table(): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - return read_table(path) - - -@pytest.fixture(scope='module') -def s3_resource(tips_file): - pytest.importorskip('s3fs') - moto.mock_s3().start() - - test_s3_files = [ - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) - - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' - - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') - - yield conn - - moto.mock_s3().stop() - - @pytest.mark.network @pytest.mark.parametrize( "compression,extension", From 262e8ff367c9291c79c4df0c2daf4713de52abc0 Mon Sep 17 00:00:00 2001 From: Yee Mey Date: Mon, 27 Nov 2017 03:36:21 -0800 Subject: [PATCH 52/98] BUG: Ignore division by 0 when merging empty dataframes (#17776) (#17846) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/test_merge.py | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index d1c3a4ba32603..e307e605687bf 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -120,7 +120,7 @@ Reshaping - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) - Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`) -- +- Bug when merging empty DataFrames when ``np.seterr(divide='raise')`` is set (:issue:`17776`) Numeric ^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e4b31939250a7..56ca913dbcddb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1529,7 +1529,8 @@ def _get_join_keys(llab, rlab, shape, sort): rkey = stride * rlab[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - stride //= shape[i] + with np.errstate(divide='ignore'): + stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index ee7c4e5c90bb8..b76951e8c2ac2 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -864,6 +864,12 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + def test_merge_two_empty_df_no_division_error(self): + # GH17776, PR #17846 + a = pd.DataFrame({'a': [], 'b': [], 'c': []}) + with np.errstate(divide='raise'): + merge(a, a, on=('a', 'b')) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: From 34b036c6096cf7fdd9d17e9af4ed8cfed9525dcf Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 27 Nov 2017 13:17:16 -0500 
Subject: [PATCH 53/98] TST: Skipif decorator for matplotlib #18190 (#18427) --- pandas/tests/io/formats/test_style.py | 3 +- pandas/tests/plotting/common.py | 4 +- pandas/tests/plotting/test_boxplot_method.py | 5 +- pandas/tests/plotting/test_datetimelike.py | 4 +- pandas/tests/plotting/test_deprecated.py | 4 +- pandas/tests/plotting/test_frame.py | 4 +- pandas/tests/plotting/test_groupby.py | 4 +- pandas/tests/plotting/test_hist_method.py | 7 +- pandas/tests/plotting/test_misc.py | 5 +- pandas/tests/plotting/test_series.py | 4 +- pandas/tests/test_resample.py | 4 +- pandas/tests/util/test_util.py | 18 +++++ pandas/util/_test_decorators.py | 71 ++++++++++++++++++++ pandas/util/testing.py | 7 -- 14 files changed, 115 insertions(+), 29 deletions(-) create mode 100644 pandas/util/_test_decorators.py diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 0160371dc413d..4b0ca872da326 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import DataFrame import pandas.util.testing as tm +import pandas.util._test_decorators as td jinja2 = pytest.importorskip('jinja2') from pandas.io.formats.style import Styler, _get_level_lengths # noqa @@ -1011,8 +1012,8 @@ def test_hide_columns_mult_levels(self): class TestStylerMatplotlibDep(object): + @td.skip_if_no_mpl def test_background_gradient(self): - tm._skip_if_no_mpl() df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) for c_map in [None, 'YlOrRd']: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index dfab539e9474c..2e62b22b2b69e 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -12,6 +12,7 @@ import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_is_valid_plot_return_object) +import pandas.util._test_decorators as td import numpy as np from numpy import random @@ -23,8 +24,6 @@ This is a common base class used for various plotting tests """ -tm._skip_if_no_mpl() - def _skip_if_no_scipy_gaussian_kde(): try: @@ -43,6 +42,7 @@ def _ok_for_gaussian_kde(kind): return plotting._compat._mpl_ge_1_5_0() +@td.skip_if_no_mpl class TestPlotBase(object): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 4b1cb2ccbd3dd..1bc49e9e5f96a 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -8,6 +8,7 @@ from pandas import Series, DataFrame, MultiIndex from pandas.compat import range, lzip import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy import random @@ -19,8 +20,6 @@ """ Test cases for .boxplot method """ -tm._skip_if_no_mpl() - def _skip_if_mpl_14_or_dev_boxplot(): # GH 8382 @@ -31,6 +30,7 @@ def _skip_if_mpl_14_or_dev_boxplot(): pytest.skip("Matplotlib Regression in 1.4 and current dev.") +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @pytest.mark.slow @@ -174,6 +174,7 @@ def test_fontsize(self): xlabelsize=16, ylabelsize=16) +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index d66012e2a56a0..f1a478581e730 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -16,13 +16,13 @@ from pandas.util.testing import assert_series_equal, 
ensure_clean import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.tests.plotting.common import (TestPlotBase, _skip_if_no_scipy_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestTSPlot(TestPlotBase): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 970de6ff881ab..d2f8e13a2444b 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -4,6 +4,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td import pytest from numpy.random import randn @@ -18,9 +19,8 @@ pandas.tools.plotting """ -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDeprecatedNameSpace(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3887271edb2a3..5c72d778a1220 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -15,6 +15,7 @@ from pandas.compat import range, lrange, lmap, lzip, u, zip, PY3 from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import rand, randn @@ -24,9 +25,8 @@ _skip_if_no_scipy_gaussian_kde, _ok_for_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index de48b58133e9a..a7c99a06c34e9 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -5,14 +5,14 @@ from pandas import Series, DataFrame import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from pandas.tests.plotting.common import TestPlotBase -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): def test_series_groupby_plotting_nominally_works(self): diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 5f7b2dd2d6ca9..864d39eba29c5 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -6,6 +6,7 @@ from pandas import Series, DataFrame import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import randn @@ -14,9 +15,7 @@ from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) -tm._skip_if_no_mpl() - - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): @@ -141,6 +140,7 @@ def test_plot_fails_when_ax_differs_from_figure(self): self.ts.hist(ax=ax1, figure=fig2) +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @pytest.mark.slow @@ -251,6 +251,7 @@ def test_tight_layout(self): tm.close() +@td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6f476553091d9..8b0a981760c72 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -7,6 +7,7 @@ from pandas import DataFrame from pandas.compat import lmap import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy import random @@ -15,9 +16,8 @@ import pandas.plotting as plotting from pandas.tests.plotting.common import TestPlotBase, _check_plot_works 
-tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): @@ -49,6 +49,7 @@ def test_bootstrap_plot(self): _check_plot_works(bootstrap_plot, series=self.ts, size=10) +@td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): def test_scatter_matrix_axis(self): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index fdfd87d1e898c..6dd7e1e9882b2 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -12,6 +12,7 @@ from pandas import Series, DataFrame, date_range from pandas.compat import range, lrange import pandas.util.testing as tm +import pandas.util._test_decorators as td import numpy as np from numpy.random import randn @@ -21,9 +22,8 @@ _skip_if_no_scipy_gaussian_kde, _ok_for_gaussian_kde) -tm._skip_if_no_mpl() - +@td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): def setup_method(self, method): diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index bf1cac3112c46..b0154f6db7022 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -13,6 +13,7 @@ import pandas as pd import pandas.tseries.offsets as offsets import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas import (Series, DataFrame, Panel, Index, isna, notna, Timestamp) @@ -234,9 +235,8 @@ def test_groupby_resample_on_api(self): result = df.groupby('key').resample('D', on='dates').mean() assert_frame_equal(result, expected) + @td.skip_if_no_mpl def test_plot_api(self): - tm._skip_if_no_mpl() - # .resample(....).plot(...) # hitting warnings # GH 12448 diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 659ce36de6bab..be4e60c6493c8 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -16,6 +16,7 @@ validate_bool_kwarg) import pandas.util.testing as tm +from pandas.util._test_decorators import safe_import class TestDecorators(object): @@ -482,3 +483,20 @@ def test_make_signature(): assert sig == (['old_arg_name', 'new_arg_name', 'mapping=None', 'stacklevel=2'], ['old_arg_name', 'new_arg_name', 'mapping', 'stacklevel']) + + +def test_safe_import(monkeypatch): + assert not safe_import("foo") + assert not safe_import("pandas", min_version="99.99.99") + + # Create dummy module to be imported + import types + import sys + mod_name = "hello123" + mod = types.ModuleType(mod_name) + mod.__version__ = "1.5" + + assert not safe_import(mod_name) + monkeypatch.setitem(sys.modules, mod_name, mod) + assert not safe_import(mod_name, min_version="2.0") + assert safe_import(mod_name, min_version="1.0") diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py new file mode 100644 index 0000000000000..b592a73e5d758 --- /dev/null +++ b/pandas/util/_test_decorators.py @@ -0,0 +1,71 @@ +""" +This module provides decorator functions which can be applied to test objects +in order to skip those objects when certain conditions occur. A sample use case +is to detect if the platform is missing ``matplotlib``. If so, any test objects +which require ``matplotlib`` and decorated with ``@td.skip_if_no_mpl`` will be +skipped by ``pytest`` during the execution of the test suite. + +To illustrate, after importing this module: + +import pandas.util._test_decorators as td + +The decorators can be applied to classes: + +@td.skip_if_some_reason +class Foo(): + ... + +Or individual functions: + +@td.skip_if_some_reason +def test_foo(): + ... 
+ +For more information, refer to the ``pytest`` documentation on ``skipif``. +""" + +import pytest + + +def safe_import(mod_name, min_version=None): + """ + Parameters: + ----------- + mod_name : str + Name of the module to be imported + min_version : str, default None + Minimum required version of the specified mod_name + + Returns: + -------- + object + The imported module if successful, or False + """ + try: + mod = __import__(mod_name) + except ImportError: + return False + + if not min_version: + return mod + else: + import sys + version = getattr(sys.modules[mod_name], '__version__') + if version: + from distutils.version import LooseVersion + if LooseVersion(version) >= LooseVersion(min_version): + return mod + + return False + + +def _skip_if_no_mpl(): + mod = safe_import("matplotlib") + if mod: + mod.use("Agg", warn=False) + else: + return True + + +skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), + reason="Missing matplotlib dependency") diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 0da59ba5f958e..ff6fa8ae717d3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -325,13 +325,6 @@ def _skip_if_32bit(): pytest.skip("skipping for 32 bit") -def _skip_if_no_mpl(): - import pytest - - mpl = pytest.importorskip("matplotlib") - mpl.use("Agg", warn=False) - - def _skip_if_mpl_1_5(): import matplotlib as mpl From 88ab6934e78117359719eb09c7d580906155575d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Nov 2017 13:41:02 -0800 Subject: [PATCH 54/98] implement shift_quarters --> apply_index for quarters and years (#18522) --- pandas/_libs/tslibs/offsets.pyx | 156 +++++++++++++++++- .../tests/tseries/offsets/test_yqm_offsets.py | 12 +- pandas/tseries/offsets.py | 78 ++++----- 3 files changed, 193 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 6f5ad2ae45f50..251af50ab12ce 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -17,8 +17,6 @@ np.import_array() from util cimport is_string_object, is_integer_object -from pandas._libs.tslib import monthrange - from conversion cimport tz_convert_single, pydt_to_i8 from frequencies cimport get_freq_code from nattype cimport NPY_NAT @@ -471,6 +469,160 @@ cdef inline int month_add_months(pandas_datetimestruct dts, int months) nogil: return 12 if new_month == 0 else new_month +@cython.wraparound(False) +@cython.boundscheck(False) +def shift_quarters(int64_t[:] dtindex, int quarters, + int q1start_month, object day, int modby=3): + """ + Given an int64 array representing nanosecond timestamps, shift all elements + by the specified number of quarters using DateOffset semantics. 
+ + Parameters + ---------- + dtindex : int64_t[:] timestamps for input dates + quarters : int number of quarters to shift + q1start_month : int month in which Q1 begins by convention + day : {'start', 'end', 'business_start', 'business_end'} + modby : int (3 for quarters, 12 for years) + + Returns + ------- + out : ndarray[int64_t] + """ + cdef: + Py_ssize_t i + pandas_datetimestruct dts + int count = len(dtindex) + int months_to_roll, months_since, n, compare_day + bint roll_check + int64_t[:] out = np.empty(count, dtype='int64') + + if day == 'start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + + # offset semantics - if on the anchor point and going backwards + # shift to next + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > 1)): + n += 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + dts.day = 1 + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + + if n <= 0 and months_since != 0: + # The general case of this condition would be + # `months_since != 0 or (months_since == 0 and + # dts.day > get_days_in_month(dts.year, dts.month))` + # but the get_days_in_month inequality would never hold. + n += 1 + elif n > 0 and (months_since == 0 and + dts.day < get_days_in_month(dts.year, + dts.month)): + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + dts.day = get_days_in_month(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_start': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + compare_month = dts.month - months_since + compare_month = compare_month or 12 + # compare_day is only relevant for comparison in the case + # where months_since == 0. + compare_day = get_firstbday(dts.year, compare_month) + + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate + n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + + dts.day = get_firstbday(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + elif day == 'business_end': + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue + + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters + + months_since = (dts.month - q1start_month) % modby + compare_month = dts.month - months_since + compare_month = compare_month or 12 + # compare_day is only relevant for comparison in the case + # where months_since == 0. 
+ compare_day = get_lastbday(dts.year, compare_month) + + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate + n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 + + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + + dts.day = get_lastbday(dts.year, dts.month) + + out[i] = dtstruct_to_dt64(&dts) + + else: + raise ValueError("day must be None, 'start', 'end', " + "'business_start', or 'business_end'") + + return np.asarray(out) + + @cython.wraparound(False) @cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index 292dd5eba938e..22b8cf6119d18 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -33,9 +33,15 @@ def test_quarterly_dont_normalize(): assert (result.time() == date.time()) -@pytest.mark.parametrize('offset', [MonthBegin(), MonthEnd(), - BMonthBegin(), BMonthEnd()]) -def test_apply_index(offset): +@pytest.mark.parametrize('n', [-2, 1]) +@pytest.mark.parametrize('cls', [MonthBegin, MonthEnd, + BMonthBegin, BMonthEnd, + QuarterBegin, QuarterEnd, + BQuarterBegin, BQuarterEnd, + YearBegin, YearEnd, + BYearBegin, BYearEnd]) +def test_apply_index(cls, n): + offset = cls(n=n) rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') ser = pd.Series(rng) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 8e1ead5dfbe9e..a3cddaa19dc17 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -27,7 +27,7 @@ apply_index_wraps, roll_yearday, shift_month, - BeginMixin, EndMixin, + EndMixin, BaseOffset) @@ -1028,10 +1028,7 @@ def cbday(self): @cache_readonly def m_offset(self): - kwds = self.kwds - kwds = {key: kwds[key] for key in kwds - if key not in ['calendar', 'weekmask', 'holidays', 'offset']} - return MonthEnd(n=1, normalize=self.normalize, **kwds) + return MonthEnd(n=1, normalize=self.normalize) @apply_wraps def apply(self, other): @@ -1106,10 +1103,7 @@ def cbday(self): @cache_readonly def m_offset(self): - kwds = self.kwds - kwds = {key: kwds[key] for key in kwds - if key not in ['calendar', 'weekmask', 'holidays', 'offset']} - return MonthBegin(n=1, normalize=self.normalize, **kwds) + return MonthBegin(n=1, normalize=self.normalize) @apply_wraps def apply(self, other): @@ -1254,12 +1248,9 @@ def onOffset(self, dt): def _apply(self, n, other): # if other.day is not day_of_month move to day_of_month and update n - if other.day < self.day_of_month: - other = other.replace(day=self.day_of_month) - if n > 0: - n -= 1 + if n > 0 and other.day < self.day_of_month: + n -= 1 elif other.day > self.day_of_month: - other = other.replace(day=self.day_of_month) n += 1 months = n // 2 @@ -1309,12 +1300,9 @@ def onOffset(self, dt): def _apply(self, n, other): # if other.day is not day_of_month move to day_of_month and update n if other.day < self.day_of_month: - other = other.replace(day=self.day_of_month) n -= 1 - elif other.day > self.day_of_month: - other = other.replace(day=self.day_of_month) - if n <= 0: - n += 1 + elif n <= 0 and other.day > self.day_of_month: + n += 1 months = n // 2 + n % 2 day = 1 if n % 2 else self.day_of_month @@ -1471,6 +1459,7 @@ def apply(self, other): def 
getOffsetOfMonth(self, dt): w = Week(weekday=self.weekday) d = datetime(dt.year, dt.month, 1, tzinfo=dt.tzinfo) + # TODO: Is this DST-safe? d = w.rollforward(d) return d + timedelta(weeks=self.week) @@ -1550,6 +1539,7 @@ def getOffsetOfMonth(self, dt): d = datetime(dt.year, dt.month, 1, dt.hour, dt.minute, dt.second, dt.microsecond, tzinfo=dt.tzinfo) eom = m.rollforward(d) + # TODO: Is this DST-safe? w = Week(weekday=self.weekday) return w.rollback(eom) @@ -1635,6 +1625,12 @@ def onOffset(self, dt): modMonth = (dt.month - self.startingMonth) % 3 return modMonth == 0 and dt.day == self._get_offset_day(dt) + @apply_index_wraps + def apply_index(self, dtindex): + shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, + self.startingMonth, self._day_opt) + return dtindex._shallow_copy(shifted) + class BQuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates @@ -1659,7 +1655,7 @@ class BQuarterBegin(QuarterOffset): _day_opt = 'business_start' -class QuarterEnd(EndMixin, QuarterOffset): +class QuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -1670,25 +1666,14 @@ class QuarterEnd(EndMixin, QuarterOffset): _prefix = 'Q' _day_opt = 'end' - @apply_index_wraps - def apply_index(self, i): - return self._end_apply_index(i, self.freqstr) - -class QuarterBegin(BeginMixin, QuarterOffset): +class QuarterBegin(QuarterOffset): _outputName = 'QuarterBegin' _default_startingMonth = 3 _from_name_startingMonth = 1 _prefix = 'QS' _day_opt = 'start' - @apply_index_wraps - def apply_index(self, i): - freq_month = 12 if self.startingMonth == 1 else self.startingMonth - 1 - month = liboffsets._int_to_month[freq_month] - freqstr = 'Q-{month}'.format(month=month) - return self._beg_apply_index(i, freqstr) - # --------------------------------------------------------------------- # Year-Based Offset Classes @@ -1709,6 +1694,13 @@ def apply(self, other): months = years * 12 + (self.month - other.month) return shift_month(other, months, self._day_opt) + @apply_index_wraps + def apply_index(self, dtindex): + shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, + self.month, self._day_opt, + modby=12) + return dtindex._shallow_copy(shifted) + def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False @@ -1752,31 +1744,19 @@ class BYearBegin(YearOffset): _day_opt = 'business_start' -class YearEnd(EndMixin, YearOffset): +class YearEnd(YearOffset): """DateOffset increments between calendar year ends""" _default_month = 12 _prefix = 'A' _day_opt = 'end' - @apply_index_wraps - def apply_index(self, i): - # convert month anchor to annual period tuple - return self._end_apply_index(i, self.freqstr) - -class YearBegin(BeginMixin, YearOffset): +class YearBegin(YearOffset): """DateOffset increments between calendar year begin dates""" _default_month = 1 _prefix = 'AS' _day_opt = 'start' - @apply_index_wraps - def apply_index(self, i): - freq_month = 12 if self.month == 1 else self.month - 1 - month = liboffsets._int_to_month[freq_month] - freqstr = 'A-{month}'.format(month=month) - return self._beg_apply_index(i, freqstr) - # --------------------------------------------------------------------- # Special Offset Classes @@ -2245,7 +2225,8 @@ def __eq__(self, other): if isinstance(other, Tick): return self.delta == other.delta else: - return DateOffset.__eq__(self, other) + # TODO: Are there cases where 
this should raise TypeError? + return False # This is identical to DateOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. @@ -2261,7 +2242,8 @@ def __ne__(self, other): if isinstance(other, Tick): return self.delta != other.delta else: - return DateOffset.__ne__(self, other) + # TODO: Are there cases where this should raise TypeError? + return True @property def delta(self): From 7463f86632571547184854faedf5ad8fa13c846e Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 28 Nov 2017 10:35:37 +0100 Subject: [PATCH 55/98] BUG: Index constructor support tupleization for mixed levels (#18514) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/base.py | 5 +++-- pandas/core/indexes/base.py | 19 ++++++------------- pandas/tests/indexes/test_base.py | 9 +++++++++ pandas/tests/series/test_constructors.py | 10 ++++++++++ 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 52ca05d9a76a9..1eb1b548788b9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -147,6 +147,7 @@ Indexing - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) - Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`) +- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`) - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - Bug in ``IntervalIndex.symmetric_difference()`` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) diff --git a/pandas/core/base.py b/pandas/core/base.py index cce0f384cb983..ae92b62ce1d11 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -874,8 +874,9 @@ def _map_values(self, mapper, na_action=None): # convert to a Series for efficiency. 
# we specify the keys here to handle the # possibility that they are tuples - from pandas import Series - mapper = Series(mapper, index=mapper.keys()) + from pandas import Series, Index + index = Index(mapper, tupleize_cols=False) + mapper = Series(mapper, index=index) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f4332ac244af4..10f9022e2666b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -353,22 +353,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif data is None or is_scalar(data): cls._scalar_data_error(data) else: - if (tupleize_cols and isinstance(data, list) and data and - isinstance(data[0], tuple)): - + if tupleize_cols and is_list_like(data) and data: + if is_iterator(data): + data = list(data) # we must be all tuples, otherwise don't construct # 10697 if all(isinstance(e, tuple) for e in data): - try: - # must be orderable in py3 - if compat.PY3: - sorted(data) - from .multi import MultiIndex - return MultiIndex.from_tuples( - data, names=name or kwargs.get('names')) - except (TypeError, KeyError): - # python2 - MultiIndex fails on mixed types - pass + from .multi import MultiIndex + return MultiIndex.from_tuples( + data, names=name or kwargs.get('names')) # other iterable of some kind subarr = _asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 372c11b296d9e..0b71f6bb3fb01 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -106,6 +106,15 @@ def test_construction_list_mixed_tuples(self): assert isinstance(idx2, Index) assert not isinstance(idx2, MultiIndex) + @pytest.mark.parametrize('na_value', [None, np.nan]) + @pytest.mark.parametrize('vtype', [list, tuple, iter]) + def test_construction_list_tuples_nan(self, na_value, vtype): + # GH 18505 : valid tuples containing NaN + values = [(1, 'two'), (3., na_value)] + result = Index(vtype(values)) + expected = MultiIndex.from_tuples(values) + tm.assert_index_equal(result, expected) + def test_constructor_from_index_datetimetz(self): idx = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7ffda3a58ac1c..ccc04da3299fe 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -658,6 +658,16 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data + @pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with ' + 'NaN keys)') + def test_constructor_dict_of_tuples(self): + data = {(1, 2): 3, + (None, 5): 6} + result = Series(data).sort_values() + expected = Series([3, 6], + index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + tm.assert_series_equal(result, expected) + def test_constructor_set(self): values = set([1, 2, 3, 4, 5]) pytest.raises(TypeError, Series, values) From 6148e5853460dc5325468fe3ec8f6e5c2b52b8b6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 28 Nov 2017 06:16:23 -0500 Subject: [PATCH 56/98] BUG: Fix marker for high memory (#18526) --- ci/run_circle.sh | 4 ++-- ci/script_multi.sh | 12 ++++++------ ci/script_single.sh | 8 ++++---- pandas/tests/io/test_pytables.py | 5 +++-- setup.cfg | 2 +- test.bat | 2 +- 6 files changed, 17 insertions(+), 16 
deletions(-) diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 0e46d28ab6fc4..435985bd42148 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -5,5 +5,5 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas -echo "pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" +pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/script_multi.sh b/ci/script_multi.sh index e03d60360c800..58742552628c8 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -38,17 +38,17 @@ elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas elif [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" - echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas else - echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/script_single.sh b/ci/script_single.sh index 375e9879e950f..963ce00b4a094 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -23,12 +23,12 @@ elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5e5fc6e7eac62..3fcbf90d12494 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4269,9 
+4269,10 @@ def test_select_as_multiple(self): ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') - @pytest.mark.skipf( + @pytest.mark.skipif( LooseVersion(tables.__version__) < '3.1.0', - "tables version does not support fix for nan selection bug: GH 4858") + reason=("tables version does not support fix for nan selection " + "bug: GH 4858")) def test_nan_selection_bug_4858(self): with ensure_clean_store(self.path) as store: diff --git a/setup.cfg b/setup.cfg index 7a88ee8557dc7..828ef80971f7b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,4 +27,4 @@ markers = single: mark a test as single cpu only slow: mark a test as slow network: mark a test as network - highmemory: mark a test as a high-memory only + high_memory: mark a test as a high-memory only diff --git a/test.bat b/test.bat index 6c69f83866ffd..2424f62b8dbfe 100644 --- a/test.bat +++ b/test.bat @@ -1,3 +1,3 @@ :: test on windows -pytest --skip-slow --skip-network pandas -n 2 %* +pytest --strict --skip-slow --skip-network pandas -n 2 %* From 94f3923c99ef612a953942d6c76fc605e8e5c6d9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Nov 2017 03:24:27 -0800 Subject: [PATCH 57/98] remove unused (#18533) --- pandas/_libs/algos.pyx | 45 ---------- pandas/_libs/groupby.pyx | 101 ---------------------- pandas/_libs/hashing.pyx | 5 -- pandas/_libs/join.pyx | 24 ------ pandas/_libs/lib.pyx | 106 ------------------------ pandas/_libs/src/datetime/np_datetime.c | 51 ------------ 6 files changed, 332 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a5aae6d6af656..61d543cd7303a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -212,51 +212,6 @@ cpdef numeric median(numeric[:] arr): kth_smallest(arr, n // 2 - 1)) / 2 -# -------------- Min, Max subsequence - -@cython.boundscheck(False) -@cython.wraparound(False) -def max_subseq(ndarray[double_t] arr): - cdef: - Py_ssize_t i=0, s=0, e=0, T, n - double m, S - - n = len(arr) - - if len(arr) == 0: - return (-1, -1, None) - - m = arr[0] - S = m - T = 0 - - with nogil: - for i in range(1, n): - # S = max { S + A[i], A[i] ) - if (S > 0): - S = S + arr[i] - else: - S = arr[i] - T = i - if S > m: - s = T - e = i - m = S - - return (s, e, m) - - -@cython.boundscheck(False) -@cython.wraparound(False) -def min_subseq(ndarray[double_t] arr): - cdef: - Py_ssize_t s, e - double m - - (s, e, m) = max_subseq(-arr) - - return (s, e, -m) - # ---------------------------------------------------------------------- # Pairwise correlation/covariance diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index dc0fdcf123c32..9d9ac2ef2f5b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -75,57 +75,6 @@ def group_nth_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_bin_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, b - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[float64_t, ndim=2] nobs - - nobs = np.zeros((<object> out).shape, dtype=np.float64) - resx = np.empty((<object> out).shape, dtype=object) - - if len(bins) == 0: - return - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = (<object> values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val 
= values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - @cython.boundscheck(False) @cython.wraparound(False) def group_last_object(ndarray[object, ndim=2] out, @@ -169,56 +118,6 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last_bin_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] bins): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, b - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[float64_t, ndim=2] nobs - - nobs = np.zeros((<object> out).shape, dtype=np.float64) - resx = np.empty((<object> out).shape, dtype=object) - - if len(bins) == 0: - return - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = (<object> values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t _median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 53203dd30daee..4c4449fb3e291 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -105,11 +105,6 @@ cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil: p[3] = (v >> 24) -cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil: - u32to8_le(p, <uint32_t>v) - u32to8_le(p + 4, <uint32_t>(v >> 32)) - - cdef inline uint64_t u8to64_le(uint8_t* p) nogil: return (<uint64_t>p[0] | <uint64_t>p[1] << 8 | diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 6befc5e60f5f6..344c5d25d0c3d 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -240,28 +240,4 @@ def ffill_indexer(ndarray[int64_t] indexer): return result -def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, - int64_t max_group): - cdef: - Py_ssize_t i, n = len(indexer) - ndarray[int64_t] result, last_obs - int64_t gid, val - - result = np.empty(n, dtype=np.int64) - - last_obs = np.empty(max_group, dtype=np.int64) - last_obs.fill(-1) - - for i in range(n): - gid = group_ids[i] - val = indexer[i] - if val == -1: - result[i] = last_obs[gid] - else: - result[i] = val - last_obs[gid] = val - - return result - - include "join_helper.pxi" diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2ec4b5cf19b72..02b3839ebf181 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -76,27 +76,6 @@ def values_from_object(object o): return o -cpdef map_indices_list(list index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i from 0 <= i < length: - result[index[i]] = i - - return result - - @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(ndarray[object, ndim=1] arr): @@ -1094,27 +1073,6 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, return out -def group_count(ndarray[int64_t] values, Py_ssize_t size): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] counts - - counts = np.zeros(size, dtype=np.int64) - for i in range(n): - counts[values[i]] += 1 - return counts - - -def lookup_values(ndarray[object] values, dict mapping): - cdef: - Py_ssize_t i, n = len(values) - - result = np.empty(n, dtype='O') - for i in range(n): - result[i] = mapping[values[i]] - return maybe_convert_objects(result) - - @cython.boundscheck(False) @cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, @@ -1145,70 +1103,6 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts -cdef class _PandasNull: - - def __richcmp__(_PandasNull self, object other, int op): - if op == 2: # == - return isinstance(other, _PandasNull) - elif op == 3: # != - return not isinstance(other, _PandasNull) - else: - return False - - def __hash__(self): - return 0 - -pandas_null = _PandasNull() - - -def fast_zip_fillna(list ndarrays, fill_value=pandas_null): - """ - For zipping multiple ndarrays into an ndarray of tuples - """ - cdef: - Py_ssize_t i, j, k, n - ndarray[object] result - flatiter it - object val, tup - - k = len(ndarrays) - n = len(ndarrays[0]) - - result = np.empty(n, dtype=object) - - # initialize tuples on first pass - arr = ndarrays[0] - it = PyArray_IterNew(arr) - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - tup = PyTuple_New(k) - - if val != val: - val = fill_value - - PyTuple_SET_ITEM(tup, 0, val) - Py_INCREF(val) - result[i] = tup - PyArray_ITER_NEXT(it) - - for j in range(1, k): - arr = ndarrays[j] - it = PyArray_IterNew(arr) - if len(arr) != n: - raise ValueError('all arrays must be same length') - - for i in range(n): - val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) - if val != val: - val = fill_value - - PyTuple_SET_ITEM(result[i], j, val) - Py_INCREF(val) - PyArray_ITER_NEXT(it) - - return result - - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index 3c63f42f14b83..b1206bd3f2d7a 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -24,20 +24,7 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt
 
 #include "np_datetime.h"
 
 #if PY_MAJOR_VERSION >= 3
-#define PyIntObject PyLongObject
-#define PyInt_Type PyLong_Type
-#define PyInt_Check(op) PyLong_Check(op)
-#define PyInt_CheckExact(op) PyLong_CheckExact(op)
-#define PyInt_FromString PyLong_FromString
-#define PyInt_FromUnicode PyLong_FromUnicode
-#define PyInt_FromLong PyLong_FromLong
-#define PyInt_FromSize_t PyLong_FromSize_t
-#define PyInt_FromSsize_t PyLong_FromSsize_t
 #define PyInt_AsLong PyLong_AsLong
-#define PyInt_AS_LONG PyLong_AS_LONG
-#define PyInt_AsSsize_t PyLong_AsSsize_t
-#define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask
-#define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
 #endif
 
 const pandas_datetimestruct _NS_MIN_DTS = {
@@ -692,44 +679,6 @@ int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta,
     return 0;
 }
 
-/*
- * This provides the casting rules for the TIMEDELTA data type units.
- *
- * Notably, there is a barrier between the nonlinear years and
- * months units, and all the other units.
- */
-npy_bool can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit,
-                                    PANDAS_DATETIMEUNIT dst_unit,
-                                    NPY_CASTING casting) {
-    switch (casting) {
-        /* Allow anything with unsafe casting */
-        case NPY_UNSAFE_CASTING:
-            return 1;
-
-        /*
-         * Only enforce the 'date units' vs 'time units' barrier with
-         * 'same_kind' casting.
-         */
-        case NPY_SAME_KIND_CASTING:
-            return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) ||
-                   (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M);
-
-        /*
-         * Enforce the 'date units' vs 'time units' barrier and that
-         * casting is only allowed towards more precise units with
-         * 'safe' casting.
-         */
-        case NPY_SAFE_CASTING:
-            return (src_unit <= dst_unit) &&
-                   ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) ||
-                    (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M));
-
-        /* Enforce equality with 'no' or 'equiv' casting */
-        default:
-            return src_unit == dst_unit;
-    }
-}
-
 /*
  * This provides the casting rules for the DATETIME data type units.
  *

From 2a0e54bc841f27164b116135ebda4b74bae2fc4a Mon Sep 17 00:00:00 2001
From: topper-123
Date: Tue, 28 Nov 2017 11:29:40 +0000
Subject: [PATCH 58/98] improved DataFrame/SeriesGroupBy.apply doc string (#18534)

---
 pandas/core/groupby.py | 166 ++++++++++++++++++++++++++++++-----------
 1 file changed, 123 insertions(+), 43 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 0e8368e5a4533..4168e6f920d98 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -77,6 +77,119 @@
     pandas.Panel.%(name)s
 """
 
+_apply_docs = dict(
+    template="""
+    Apply function ``func`` group-wise and combine the results together.
+
+    The function passed to ``apply`` must take a {input} as its first
+    argument and return a dataframe, a series or a scalar. ``apply`` will
+    then take care of combining the results back together into a single
+    dataframe or series. ``apply`` is therefore a highly flexible
+    grouping method.
+
+    While ``apply`` is a very flexible method, its downside is that
+    using it can be quite a bit slower than using more specific methods.
+    Pandas offers a wide range of methods that will be much faster
+    than using ``apply`` for their specific purposes, so try to use them
+    before reaching for ``apply``.
+
+    Parameters
+    ----------
+    func : function
+        A callable that takes a {input} as its first argument, and
+        returns a dataframe, a series or a scalar. In addition, the
+        callable may take positional and keyword arguments.
+    args, kwargs : tuple and dict
+        Optional positional and keyword arguments to pass to ``func``.
+
+    Returns
+    -------
+    applied : Series or DataFrame
+
+    Notes
+    -----
+    In the current implementation ``apply`` calls func twice on the
+    first group to decide whether it can take a fast or slow code
+    path. This can lead to unexpected behavior if func has
+    side-effects, as they will take effect twice for the first
+    group.
+
+    Examples
+    --------
+    {examples}
+
+    See also
+    --------
+    pipe : Apply function to the full GroupBy object instead of to each
+        group.
+    aggregate, transform
+    """,
+    dataframe_examples="""
+    >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]})
+    >>> g = df.groupby('A')
+
+    From ``df`` above we can see that ``g`` has two groups, ``a`` and ``b``.
+    Calling ``apply`` in various ways, we can get different grouping results:
+
+    Example 1: The function passed to ``apply`` takes a dataframe as
+    its argument and returns a dataframe. ``apply`` combines the result for
+    each group together into a new dataframe:
+
+    >>> g.apply(lambda x: x / x.sum())
+              B    C
+    0  0.333333  0.4
+    1  0.666667  0.6
+    2  1.000000  1.0
+
+    Example 2: The function passed to ``apply`` takes a dataframe as
+    its argument and returns a series. ``apply`` combines the result for
+    each group together into a new dataframe:
+
+    >>> g.apply(lambda x: x.max() - x.min())
+       B  C
+    A
+    a  1  2
+    b  0  0
+
+    Example 3: The function passed to ``apply`` takes a dataframe as
+    its argument and returns a scalar. ``apply`` combines the result for
+    each group together into a series, including setting the index as
+    appropriate:
+
+    >>> g.apply(lambda x: x.C.max() - x.B.min())
+    A
+    a    5
+    b    2
+    dtype: int64
+    """,
+    series_examples="""
+    >>> ser = pd.Series([0, 1, 2], index='a a b'.split())
+    >>> g = ser.groupby(ser.index)
+
+    From ``ser`` above we can see that ``g`` has two groups, ``a`` and ``b``.
+    Calling ``apply`` in various ways, we can get different grouping results:
+
+    Example 1: The function passed to ``apply`` takes a series as
+    its argument and returns a series. ``apply`` combines the result for
+    each group together into a new series:
+
+    >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
+    0    0.0
+    1    0.5
+    2    4.0
+    dtype: float64
+
+    Example 2: The function passed to ``apply`` takes a series as
+    its argument and returns a scalar. ``apply`` combines the result for
+    each group together into a series, including setting the index as
+    appropriate:
+
+    >>> g.apply(lambda x: x.max() - x.min())
+    a    1
+    b    0
+    dtype: int64
+    """)
+
 _transform_template = """
 Call function producing a like-indexed %(klass)s on each group and
 return a %(klass)s having the same indexes as the original object
@@ -144,6 +257,7 @@
 """
 
+
 # special case to prevent duplicate plots when catching exceptions when
 # forwarding methods from NDFrames
 _plotting_methods = frozenset(['plot', 'boxplot', 'hist'])
@@ -663,50 +777,10 @@ def __iter__(self):
         """
         return self.grouper.get_iterator(self.obj, axis=self.axis)
 
-    @Substitution(name='groupby')
+    @Appender(_apply_docs['template']
+              .format(input="dataframe",
+                      examples=_apply_docs['dataframe_examples']))
     def apply(self, func, *args, **kwargs):
-        """
-        Apply function and combine results together in an intelligent way.
-
-        The split-apply-combine combination rules attempt to be as common
-        sense based as possible. 
For example: - - case 1: - group DataFrame - apply aggregation function (f(chunk) -> Series) - yield DataFrame, with group axis having group labels - - case 2: - group DataFrame - apply transform function ((f(chunk) -> DataFrame with same indexes) - yield DataFrame with resulting chunks glued together - - case 3: - group Series - apply function with f(chunk) -> DataFrame - yield DataFrame with result of chunks glued together - - Parameters - ---------- - func : function - - Notes - ----- - See online documentation for full exposition on how to use apply. - - In the current implementation apply calls func twice on the - first group to decide whether it can take a fast or slow code - path. This can lead to unexpected behavior if func has - side-effects, as they will take effect twice for the first - group. - - - See also - -------- - pipe : Apply function to the full GroupBy object instead of to each - group. - aggregate, transform - """ func = self._is_builtin_func(func) @@ -3022,6 +3096,12 @@ def _selection_name(self): """) + @Appender(_apply_docs['template'] + .format(input='series', + examples=_apply_docs['series_examples'])) + def apply(self, func, *args, **kwargs): + return super(SeriesGroupBy, self).apply(func, *args, **kwargs) + @Appender(_agg_doc) @Appender(_shared_docs['aggregate'] % dict( klass='Series', From 32f562dbe459ee9e437da3a67631949d34531411 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Nov 2017 16:22:40 -0800 Subject: [PATCH 59/98] Fastpaths for Timestamp properties (#18539) --- asv_bench/benchmarks/timestamp.py | 46 ++++++++++++++------------- pandas/_libs/tslibs/timestamps.pyx | 24 ++++++++++++-- pandas/tests/scalar/test_timestamp.py | 22 +++++++++++++ 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index fc5e6dc8c06d6..9d7d6d2998a8b 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -1,4 +1,3 @@ -from .pandas_vb_common import * from pandas import to_timedelta, Timestamp import pytz import datetime @@ -7,61 +6,64 @@ class TimestampProperties(object): goal_time = 0.2 - params = [None, pytz.timezone('Europe/Amsterdam')] - param_names = ['tz'] + params = [(None, None), + (pytz.timezone('Europe/Amsterdam'), None), + (None, 'B'), + (pytz.timezone('Europe/Amsterdam'), 'B')] + param_names = ['tz', 'freq'] - def setup(self, tz): - self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz) + def setup(self, tz, freq): + self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) - def time_tz(self, tz): + def time_tz(self, tz, freq): self.ts.tz - def time_offset(self, tz): + def time_offset(self, tz, freq): self.ts.offset - def time_dayofweek(self, tz): + def time_dayofweek(self, tz, freq): self.ts.dayofweek - def time_weekday_name(self, tz): + def time_weekday_name(self, tz, freq): self.ts.weekday_name - def time_dayofyear(self, tz): + def time_dayofyear(self, tz, freq): self.ts.dayofyear - def time_week(self, tz): + def time_week(self, tz, freq): self.ts.week - def time_quarter(self, tz): + def time_quarter(self, tz, freq): self.ts.quarter - def time_days_in_month(self, tz): + def time_days_in_month(self, tz, freq): self.ts.days_in_month - def time_freqstr(self, tz): + def time_freqstr(self, tz, freq): self.ts.freqstr - def time_is_month_start(self, tz): + def time_is_month_start(self, tz, freq): self.ts.is_month_start - def time_is_month_end(self, tz): + def time_is_month_end(self, tz, freq): self.ts.is_month_end - def 
time_is_quarter_start(self, tz):
+    def time_is_quarter_start(self, tz, freq):
         self.ts.is_quarter_start
 
-    def time_is_quarter_end(self, tz):
+    def time_is_quarter_end(self, tz, freq):
         self.ts.is_quarter_end
 
-    def time_is_year_start(self, tz):
-        self.ts.is_quarter_end
+    def time_is_year_start(self, tz, freq):
+        self.ts.is_year_start
 
-    def time_is_year_end(self, tz):
-        self.ts.is_quarter_end
+    def time_is_year_end(self, tz, freq):
+        self.ts.is_year_end
 
-    def time_is_leap_year(self, tz):
-        self.ts.is_quarter_end
+    def time_is_leap_year(self, tz, freq):
+        self.ts.is_leap_year
 
-    def time_microsecond(self, tz):
+    def time_microsecond(self, tz, freq):
         self.ts.microsecond
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 8fdded0bcb07a..cf0c0e2c01d60 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -304,10 +304,12 @@ cdef class _Timestamp(datetime):
         out = get_date_field(np.array([val], dtype=np.int64), field)
         return int(out[0])
 
-    cpdef _get_start_end_field(self, field):
+    cpdef bint _get_start_end_field(self, str field):
         cdef:
             int64_t val
             dict kwds
+            ndarray out
+            int month_kw
 
         freq = self.freq
         if freq:
@@ -713,7 +715,7 @@ class Timestamp(_Timestamp):
 
     @property
     def quarter(self):
-        return self._get_field('q')
+        return ((self.month - 1) // 3) + 1
 
     @property
     def days_in_month(self):
@@ -727,26 +729,44 @@ class Timestamp(_Timestamp):
 
     @property
     def is_month_start(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return self.day == 1
         return self._get_start_end_field('is_month_start')
 
     @property
     def is_month_end(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return self.day == self.days_in_month
         return self._get_start_end_field('is_month_end')
 
     @property
    def is_quarter_start(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return self.day == 1 and self.month % 3 == 1
         return self._get_start_end_field('is_quarter_start')
 
     @property
     def is_quarter_end(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return (self.month % 3) == 0 and self.day == self.days_in_month
         return self._get_start_end_field('is_quarter_end')
 
     @property
     def is_year_start(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return self.day == self.month == 1
        return self._get_start_end_field('is_year_start')
 
     @property
     def is_year_end(self):
+        if self.freq is None:
+            # fast-path for non-business frequencies
+            return self.month == 12 and self.day == 31
         return self._get_start_end_field('is_year_end')
 
     @property
diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py
index 545ed7f1ebbf3..992f211229441 100644
--- a/pandas/tests/scalar/test_timestamp.py
+++ b/pandas/tests/scalar/test_timestamp.py
@@ -47,6 +47,28 @@ def test_overflow_offset(self):
             stamp - offset
 
 
+class TestTimestampProperties(object):
+
+    def test_properties_business(self):
+        ts = Timestamp('2017-10-01', freq='B')
+        control = Timestamp('2017-10-01')
+        assert ts.dayofweek == 6
+        assert not ts.is_month_start    # not a weekday
+        assert not ts.is_quarter_start  # not a weekday
+        # Control case: non-business is month/qtr start
+        assert control.is_month_start
+        assert control.is_quarter_start
+
+        ts = Timestamp('2017-09-30', freq='B')
+        control = Timestamp('2017-09-30')
+        assert ts.dayofweek == 5
+        assert not ts.is_month_end    # not a weekday
+        assert not ts.is_quarter_end  # not a weekday
+        # Control case: non-business is month/qtr end
+        assert control.is_month_end
+        assert control.is_quarter_end
+
+
 class 
TestTimestamp(object): def test_constructor(self): From 48c5bfca5d95b08ce01de3b2cdb9250b6515fa5c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 29 Nov 2017 02:50:47 -0800 Subject: [PATCH 60/98] CLN: ASV frame_methods benchmark (#18536) --- asv_bench/benchmarks/frame_ctor.py | 15 + asv_bench/benchmarks/frame_methods.py | 628 +++++++++-------------- asv_bench/benchmarks/indexing.py | 66 +++ asv_bench/benchmarks/pandas_vb_common.py | 6 +- asv_bench/benchmarks/strings.py | 12 + 5 files changed, 330 insertions(+), 397 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 5fad7b682c2ed..d577ebc20a31c 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -124,3 +124,18 @@ def setup(self, offset, n_steps): def time_frame_ctor(self, offset, n_steps): DataFrame(self.d) + + +class FromRecords(object): + + goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] + + def setup(self, nrows): + N = 100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) + + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 53ee4d8019938..7ed341425e561 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,10 +1,13 @@ -from .pandas_vb_common import * import string +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + isnull, NaT) +from .pandas_vb_common import setup -#---------------------------------------------------------------------- -# get_numeric_data -class frame_get_numeric_data(object): +class GetNumericData(object): + goal_time = 0.2 def setup(self): @@ -16,19 +19,21 @@ def setup(self): def time_frame_get_numeric_data(self): self.df._get_numeric_data() -#---------------------------------------------------------------------- -# lookup -class frame_fancy_lookup(object): +class Lookup(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df = DataFrame(np.random.randn(10000, 8), + columns=list('abcdefgh')) self.df['foo'] = 'bar' self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') + self.col_labels = list(self.df.columns) * 100 + self.row_labels_all = np.array( + list(self.df.index) * len(self.df.columns), dtype='object') + self.col_labels_all = np.array( + list(self.df.columns) * len(self.df.index), dtype='object') def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) @@ -37,25 +42,20 @@ def time_frame_fancy_lookup_all(self): self.df.lookup(self.row_labels_all, self.col_labels_all) -#---------------------------------------------------------------------- -# reindex - class Reindex(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.idx = np.arange(4000, 7000) - + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), - 1: randint(0, 1000, 1000).astype( - np.int16), - 2: randint(0, 1000, 1000).astype( - np.int32), - 3: 
randint(0, 1000, 1000).astype( - np.int64),}[randint(0, 4)]) for c in - range(1000)])) + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -67,81 +67,86 @@ def time_reindex_both_axes(self): self.df.reindex(index=self.idx, columns=self.idx) def time_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] + self.df.ix[self.idx, self.idx] def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) -#---------------------------------------------------------------------- -# iteritems (monitor no-copying behaviour) - class Iteration(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(np.random.randn(50000, 10)) - self.df3 = pd.DataFrame(np.random.randn(1000,5000), - columns=['C'+str(c) for c in range(5000)]) + N = 1000 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.df2 = DataFrame(np.random.randn(N * 50, 10)) + self.df3 = DataFrame(np.random.randn(N, 5 * N), + columns=['C' + str(c) for c in range(N * 5)]) - def f(self): + def time_iteritems(self): + # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass - - def g(self): - for (name, col) in self.df.iteritems(): + for name, col in self.df.iteritems(): pass - def time_iteritems(self): - self.f() - def time_iteritems_cached(self): - self.g() + for name, col in self.df.iteritems(): + pass def time_iteritems_indexing(self): - df = self.df3 - for col in df: - df[col] + for col in self.df3: + self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass + def time_iterrows(self): + for row in self.df.iterrows(): + pass -#---------------------------------------------------------------------- -# to_string, to_html, repr -class Formatting(object): +class ToString(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100, 10)) + self.df = DataFrame(np.random.randn(100, 10)) - self.nrows = 500 - self.df2 = DataFrame(randn(self.nrows, 10)) - self.df2[0] = period_range('2000', '2010', self.nrows) - self.df2[1] = range(self.nrows) + def time_to_string_floats(self): + self.df.to_string() - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) - self.df3 = DataFrame(self.data, index=self.idx) - self.idx = randn(self.nrows) - self.df4 = DataFrame(self.data, index=self.idx) - self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) +class ToHTML(object): - self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + goal_time = 0.2 - def time_to_string_floats(self): - self.df.to_string() + def setup(self): + nrows = 500 + self.df2 = DataFrame(np.random.randn(nrows, 10)) + self.df2[0] = period_range('2000', '2010', nrows) + self.df2[1] = range(nrows) def time_to_html_mixed(self): self.df2.to_html() + +class Repr(object): + + goal_time = 0.2 + + def setup(self): + nrows = 10000 + data = np.random.randn(nrows, 10) + idx = MultiIndex.from_arrays(np.tile(np.random.randn(3, nrows / 100), + 100)) + self.df3 = DataFrame(data, index=idx) + self.df4 = DataFrame(data, index=np.random.randn(nrows)) + self.df_tall = DataFrame(np.random.randn(nrows, 10)) + self.df_wide = DataFrame(np.random.randn(10, nrows)) + def 
time_html_repr_trunc_mi(self): self.df3._repr_html_() @@ -155,21 +160,16 @@ def time_frame_repr_wide(self): repr(self.df_wide) -#---------------------------------------------------------------------- -# nulls/masking +class MaskBool(object): - -## masking - -class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + data = np.random.randn(1000, 500) + df = DataFrame(data) + df = df.where(df > 0) + self.bools = df > 0 + self.mask = isnull(df) def time_frame_mask_bools(self): self.bools.mask(self.mask) @@ -178,31 +178,26 @@ def time_frame_mask_floats(self): self.bools.astype(float).mask(self.mask) -## isnull +class Isnull(object): -class FrameIsnull(object): goal_time = 0.2 def setup(self): - self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - - np.random.seed(1234) - self.sample = np.array([np.nan, 1.0]) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array(list(string.ascii_lowercase) + - list(string.ascii_uppercase) + - list(string.whitespace)) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_strings= DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_obj = DataFrame(self.data) + N = 10**3 + self.df_no_null = DataFrame(np.random.randn(N, N)) + + sample = np.array([np.nan, 1.0]) + data = np.random.choice(sample, (N, N)) + self.df = DataFrame(data) + + sample = np.array(list(string.ascii_letters + string.whitespace)) + data = np.random.choice(sample, (N, N)) + self.df_strings = DataFrame(data) + + sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), + np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + data = np.random.choice(sample, (N, N)) + self.df_obj = DataFrame(data) def time_isnull_floats_no_null(self): isnull(self.df_no_null) @@ -217,92 +212,74 @@ def time_isnull_obj(self): isnull(self.df_obj) -# ---------------------------------------------------------------------- -# fillna in place - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - +class Fillna(object): - -class frame_fillna_many_columns_pad(object): goal_time = 0.2 + params = ([True, False], ['pad', 'bfill']) + param_names = ['inplace', 'method'] - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') + def setup(self, inplace, method): + values = np.random.randn(10000, 100) + values[::2] = np.nan + self.df = DataFrame(values) + def time_frame_fillna(self, inplace, method): + self.df.fillna(inplace=inplace, method=method) class Dropna(object): + goal_time = 0.2 + params = (['all', 'any'], [0, 1]) + param_names = ['how', 'axis'] - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + def setup(self, how, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() 
self.df_mixed['foo'] = 'bar' - self.df_mi = self.df.copy() - self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) - self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) - - self.df_mixed_mi = self.df_mixed.copy() - self.df_mixed_mi.index = MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) - self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) + def time_dropna(self, how, axis): + self.df.dropna(how=how, axis=axis) - def time_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) + def time_dropna_axis_mixed_dtypes(self, how, axis): + self.df_mixed.dropna(how=how, axis=axis) - def time_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) - def time_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) +class Count(object): - def time_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) - - def time_dropna_axis0_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=0) - - def time_dropna_axis0_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=0) - - def time_dropna_axis1_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=1) + goal_time = 0.2 - def time_dropna_axis1_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=1) + params = [0, 1] + param_names = ['axis'] - def time_count_level_axis0_multi(self): - self.df_mi.count(axis=0, level=1) + def setup(self, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def time_count_level_axis1_multi(self): - self.df_mi.count(axis=1, level=1) + self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, + self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, + self.df_mixed.index]) + self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, + self.df_mixed.columns]) - def time_count_level_axis0_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=0, level=1) + def time_count_level_multi(self, axis): + self.df.count(axis=axis, level=1) - def time_count_level_axis1_mixed_dtypes_multi(self): - self.df_mixed_mi.count(axis=1, level=1) + def time_count_level_mixed_dtypes_multi(self, axis): + self.df_mixed.count(axis=axis, level=1) class Apply(object): + goal_time = 0.2 def setup(self): @@ -310,32 +287,29 @@ def setup(self): self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) - self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) def time_apply_user_func(self): - self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) + self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) def time_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) + self.df.apply(lambda x: x + 1, axis=1) def time_apply_lambda_mean(self): - self.df.apply((lambda x: x.mean())) + self.df.apply(lambda x: x.mean()) def time_apply_np_mean(self): self.df.apply(np.mean) def time_apply_pass_thru(self): - self.df.apply((lambda x: x)) + self.df.apply(lambda x: x) def time_apply_ref_by_name(self): - self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1) + self.df3.apply(lambda x: x['A'] + x['B'], axis=1) -#---------------------------------------------------------------------- -# dtypes +class Dtypes(object): -class frame_dtypes(object): 
goal_time = 0.2 def setup(self): @@ -344,316 +318,170 @@ def setup(self): def time_frame_dtypes(self): self.df.dtypes -#---------------------------------------------------------------------- -# equals class Equals(object): + goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in ( - ('float_df', self.float_df), ('object_df', self.object_df), - ('nonunique_cols', self.nonunique_cols))]) - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + N = 10**3 + self.float_df = DataFrame(np.random.randn(N, N)) + self.float_df_nan = self.float_df.copy() + self.float_df_nan.iloc[-1, -1] = np.nan - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df_nan = self.object_df.copy() + self.object_df_nan.iloc[-1, -1] = np.nan - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols_nan = self.nonunique_cols.copy() + self.nonunique_cols_nan.iloc[-1, -1] = np.nan def time_frame_float_equal(self): - self.test_equal('float_df') + self.float_df.equals(self.float_df) def time_frame_float_unequal(self): - self.test_unequal('float_df') + self.float_df.equals(self.float_df_nan) def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols) def time_frame_nonunique_unequal(self): - self.test_unequal('nonunique_cols') + self.nonunique_cols.equals(self.nonunique_cols_nan) def time_frame_object_equal(self): - self.test_equal('object_df') + self.object_df.equals(self.object_df) def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.object_df.equals(self.object_df_nan) class Interpolate(object): + goal_time = 0.2 + params = [None, 'infer'] + param_names = ['downcast'] - def setup(self): + def setup(self, downcast): + N = 10000 # this is the worst case, where every column has NaNs. 
- self.df = DataFrame(randn(10000, 100)) + self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame( - {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), - 'C': randn(10000), 'D': randn(10000),}) + self.df2 = DataFrame({'A': np.arange(0, N), + 'B': np.random.randint(0, 100, N), + 'C': np.random.randn(N), + 'D': np.random.randn(N)}) self.df2.loc[1::5, 'A'] = np.nan self.df2.loc[1::5, 'C'] = np.nan - def time_interpolate(self): - self.df.interpolate() + def time_interpolate(self, downcast): + self.df.interpolate(downcast=downcast) - def time_interpolate_some_good(self): - self.df2.interpolate() - - def time_interpolate_some_good_infer(self): - self.df2.interpolate(downcast='infer') + def time_interpolate_some_good(self, downcast): + self.df2.interpolate(downcast=downcast) class Shift(object): # frame shift speedup issue-5609 goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): + def setup(self, axis): self.df = DataFrame(np.random.rand(10000, 500)) - def time_shift_axis0(self): - self.df.shift(1, axis=0) - - def time_shift_axis_1(self): - self.df.shift(1, axis=1) - - -#----------------------------------------------------------------------------- -# from_records issue-6700 - -class frame_from_records_generator(object): - goal_time = 0.2 - - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) + def time_shift(self, axis): + self.df.shift(1, axis=axis) - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) - - -#----------------------------------------------------------------------------- -# nunique - -class frame_nunique(object): +class Nunique(object): def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + self.df = DataFrame(np.random.randn(10000, 1000)) def time_frame_nunique(self): self.df.nunique() +class Duplicated(object): -#----------------------------------------------------------------------------- -# duplicated - -class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + n = (1 << 20) + t = date_range('2015-01-01', freq='S', periods=(n // 64)) + xs = np.random.randn(n // 64).round(2) + self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), + 'b': np.random.choice(t, n), + 'c': np.random.choice(xs, n)}) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): self.df.duplicated() def time_frame_duplicated_wide(self): - self.df2.T.duplicated() - - - - - - - - - - - - - - - - - -class frame_xs_col(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(1, 100000)) - - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) - - -class frame_xs_row(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(100000, 1)) - - def time_frame_xs_row(self): - self.df.xs(50000) - - -class frame_sort_index(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) - - def 
time_frame_sort_index(self): - self.df.sort_index() - - -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) - - -class frame_quantile_axis1(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) - - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) - - -#---------------------------------------------------------------------- -# boolean indexing - -class frame_boolean_row_select(object): - goal_time = 0.2 + self.df2.duplicated() - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] +class XS(object): -class frame_getitem_single_column(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - def time_frame_getitem_single_column(self): - self.h() + def setup(self, axis): + self.N = 10**4 + self.df = DataFrame(np.random.randn(self.N, self.N)) - def time_frame_getitem_single_column2(self): - self.j() + def time_frame_xs(self, axis): + self.df.xs(self.N / 2, axis=axis) -#---------------------------------------------------------------------- -# assignment +class SortValues(object): -class frame_assign_timeseries_index(object): goal_time = 0.2 + params = [True, False] + param_names = ['ascending'] - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='H') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) + def setup(self, ascending): + self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index + def time_frame_sort_values(self, ascending): + self.df.sort_values(by='A', ascending=ascending) +class SortIndexByColumns(object): -# insert many columns - -class frame_insert_100_columns_begin(object): goal_time = 0.2 def setup(self): - self.N = 1000 - - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) + N = 10000 + K = 10 + self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), + 'key2': tm.makeStringIndex(N).values.repeat(K), + 'value': np.random.randn(N * K)}) - def g(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col + def time_frame_sort_values_by_columns(self): + self.df.sort_values(by=['key1', 'key2']) - def time_frame_insert_100_columns_begin(self): - self.f() - def time_frame_insert_500_columns_end(self): - self.g() +class Quantile(object): - - -#---------------------------------------------------------------------- -# strings methods, #2602 - 
-class series_string_vector_slice(object):
+class Quantile(object):
+
     goal_time = 0.2
+    params = [0, 1]
+    param_names = ['axis']
 
-    def setup(self):
-        self.s = Series((['abcdefg', np.nan] * 500000))
+    def setup(self, axis):
+        self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
 
-    def time_series_string_vector_slice(self):
-        self.s.str[:5]
+    def time_frame_quantile(self, axis):
+        self.df.quantile([0.1, 0.5], axis=axis)
 
 
-#----------------------------------------------------------------------
-# df.info() and get_dtype_counts() # 2807
 
-class frame_get_dtype_counts(object):
+class GetDtypeCounts(object):
+    # 2807
     goal_time = 0.2
 
     def setup(self):
@@ -662,13 +490,21 @@ def setup(self):
 
     def time_frame_get_dtype_counts(self):
         self.df.get_dtype_counts()
 
+    def time_info(self):
+        self.df.info()
+
+
+class NSort(object):
 
-class frame_nlargest(object):
     goal_time = 0.2
+    params = ['first', 'last']
+    param_names = ['keep']
 
-    def setup(self):
-        self.df = DataFrame(np.random.randn(1000, 3),
-                            columns=list('ABC'))
+    def setup(self, keep):
+        self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+
+    def time_nlargest(self, keep):
+        self.df.nlargest(100, 'A', keep=keep)
 
-    def time_frame_nlargest(self):
-        self.df.nlargest(100, 'A')
+    def time_nsmallest(self, keep):
+        self.df.nsmallest(100, 'A', keep=keep)
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index f3e7ebbbd33e8..f271b82c758ee 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -303,3 +303,69 @@ def time_lookup_ix(self):
 
     def time_lookup_loc(self):
         self.s.loc
+
+
+class BooleanRowSelect(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10000
+        np.random.seed(1234)
+        self.df = DataFrame(np.random.randn(N, 100))
+        self.bool_arr = np.zeros(N, dtype=bool)
+        self.bool_arr[:1000] = True
+
+    def time_frame_boolean_row_select(self):
+        self.df[self.bool_arr]
+
+
+class GetItemSingleColumn(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.df2 = DataFrame(np.random.randn(3000, 1), columns=['A'])
+        self.df3 = DataFrame(np.random.randn(3000, 1))
+
+    def time_frame_getitem_single_column_label(self):
+        self.df2['A']
+
+    def time_frame_getitem_single_column_int(self):
+        self.df3[0]
+
+
+class AssignTimeseriesIndex(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        np.random.seed(1234)
+        idx = date_range('1/1/2000', periods=N, freq='H')
+        self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
+
+    def time_frame_assign_timeseries_index(self):
+        self.df['date'] = self.df.index
+
+
+class InsertColumns(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10**3
+        self.df = DataFrame(index=range(self.N))
+
+    def time_insert(self):
+        np.random.seed(1234)
+        for i in range(100):
+            self.df.insert(0, i, np.random.randn(self.N))
+
+    def time_assign_with_setitem(self):
+        np.random.seed(1234)
+        for i in range(100):
+            self.df[i] = np.random.randn(self.N)
+
+
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index b1a58e49fe86c..62eb826418030 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -13,7 +13,11 @@
 except ImportError:
     pass
 
-np.random.seed(1234)
+# This function just needs to be imported into each benchmark file in order to
+# set up the random seed before each function.
+# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
+def setup(*args, **kwargs):
+    np.random.seed(1234)
 
 
 # try em until it works! 
for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c1600d4e07f58..948d4b92a5a57 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -105,3 +105,15 @@ def setup(self): def time_encode_decode(self): self.ser.str.encode('utf-8').str.decode('utf-8') + + +class StringSlice(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(['abcdefg', np.nan] * 500000) + + def time_series_string_vector_slice(self): + # GH 2602 + self.s.str[:5] From d3c3c2b092b17aa720b489101d59d60aff8799da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 29 Nov 2017 04:02:05 -0800 Subject: [PATCH 61/98] remove arg that is only ever used as NPY_UNSAFE_CASTING; remove code this renders unreachable (#18546) --- pandas/_libs/src/datetime.pxd | 9 - pandas/_libs/src/datetime/np_datetime.c | 38 --- pandas/_libs/src/datetime/np_datetime.h | 11 - .../_libs/src/datetime/np_datetime_strings.c | 276 +----------------- .../_libs/src/datetime/np_datetime_strings.h | 11 +- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 6 files changed, 4 insertions(+), 343 deletions(-) diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 0624779e50497..6e5d8b82c118f 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -7,13 +7,6 @@ from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString cdef extern from "numpy/ndarrayobject.h": ctypedef int64_t npy_datetime - ctypedef enum NPY_CASTING: - NPY_NO_CASTING - NPY_EQUIV_CASTING - NPY_SAFE_CASTING - NPY_SAME_KIND_CASTING - NPY_UNSAFE_CASTING - cdef extern from "numpy/npy_common.h": ctypedef unsigned char npy_bool @@ -45,7 +38,6 @@ cdef extern from "datetime/np_datetime.h": cdef extern from "datetime/np_datetime_strings.h": int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, int *out_local, int *out_tzoffset, PANDAS_DATETIMEUNIT *out_bestunit, @@ -75,7 +67,6 @@ cdef inline int _cstring_to_dts(char *val, int length, int result result = parse_iso_8601_datetime(val, length, PANDAS_FR_ns, - NPY_UNSAFE_CASTING, dts, out_local, out_tzoffset, &out_bestunit, &special) return result diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index b1206bd3f2d7a..cb4f9d3efdcd0 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -679,44 +679,6 @@ int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, return 0; } -/* - * This provides the casting rules for the DATETIME data type units. - * - * Notably, there is a barrier between 'date units' and 'time units' - * for all but 'unsafe' casting. - */ -npy_bool can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, - PANDAS_DATETIMEUNIT dst_unit, - NPY_CASTING casting) { - switch (casting) { - /* Allow anything with unsafe casting */ - case NPY_UNSAFE_CASTING: - return 1; - - /* - * Only enforce the 'date units' vs 'time units' barrier with - * 'same_kind' casting. - */ - case NPY_SAME_KIND_CASTING: - return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || - (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); - - /* - * Enforce the 'date units' vs 'time units' barrier and that - * casting is only allowed towards more precise units with - * 'safe' casting. 
- */ - case NPY_SAFE_CASTING: - return (src_unit <= dst_unit) && - ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || - (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); - - /* Enforce equality with 'no' or 'equiv' casting */ - default: - return src_unit == dst_unit; - } -} - /* * Converts a datetime based on the given metadata into a datetimestruct */ diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index 7ee7e1e99a704..980c66218f7e6 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -125,17 +125,6 @@ int cmp_pandas_datetimestruct(const pandas_datetimestruct *a, void add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); -/* - * This provides the casting rules for the TIMEDELTA data type units. - * - * Notably, there is a barrier between the nonlinear years and - * months units, and all the other units. - */ -npy_bool -can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, - PANDAS_DATETIMEUNIT dst_unit, - NPY_CASTING casting); - int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, diff --git a/pandas/_libs/src/datetime/np_datetime_strings.c b/pandas/_libs/src/datetime/np_datetime_strings.c index 5307d394423ff..1ff4f08cf3c9d 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/datetime/np_datetime_strings.c @@ -32,22 +32,6 @@ This file implements string parsing and creation for NumPy datetime. #include "np_datetime.h" #include "np_datetime_strings.h" -NPY_NO_EXPORT const char *npy_casting_to_string(NPY_CASTING casting) { - switch (casting) { - case NPY_NO_CASTING: - return "'no'"; - case NPY_EQUIV_CASTING: - return "'equiv'"; - case NPY_SAFE_CASTING: - return "'safe'"; - case NPY_SAME_KIND_CASTING: - return "'same_kind'"; - case NPY_UNSAFE_CASTING: - return "'unsafe'"; - default: - return ""; - } -} /* Platform-specific time_t typedef */ typedef time_t NPY_TIME_T; @@ -115,51 +99,6 @@ static int get_localtime(NPY_TIME_T *ts, struct tm *tms) { return -1; } -#if 0 -/* - * Wraps `gmtime` functionality for multiple platforms. This - * converts a time value to a time structure in UTC. - * - * Returns 0 on success, -1 on failure. - */ -static int -get_gmtime(NPY_TIME_T *ts, struct tm *tms) { - char *func_name = ""; -#if defined(_WIN32) -#if defined(_MSC_VER) && (_MSC_VER >= 1400) - if (gmtime_s(tms, ts) != 0) { - func_name = "gmtime_s"; - goto fail; - } -#elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) - if (_gmtime64_s(tms, ts) != 0) { - func_name = "_gmtime64_s"; - goto fail; - } -#else - struct tm *tms_tmp; - gmtime_r(ts, tms_tmp); - if (tms_tmp == NULL) { - func_name = "gmtime"; - goto fail; - } - memcpy(tms, tms_tmp, sizeof(struct tm)); -#endif -#else - if (gmtime_r(ts, tms) == NULL) { - func_name = "gmtime_r"; - goto fail; - } -#endif - - return 0; - -fail: - PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " - "to a UTC time", func_name); - return -1; -} -#endif /* * Converts a datetimestruct in UTC to a datetimestruct in local time, @@ -226,115 +165,6 @@ static int convert_datetimestruct_utc_to_local( return 0; } -#if 0 -/* - * Converts a datetimestruct in local time to a datetimestruct in UTC. - * - * Returns 0 on success, -1 on failure. 
- */ -static int -convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, - const pandas_datetimestruct *dts_local) { - npy_int64 year_correction = 0; - - /* Make a copy of the input 'dts' to modify */ - *out_dts_utc = *dts_local; - - /* HACK: Use a year < 2038 for later years for small time_t */ - if (sizeof(NPY_TIME_T) == 4 && out_dts_utc->year >= 2038) { - if (is_leapyear(out_dts_utc->year)) { - /* 2036 is a leap year */ - year_correction = out_dts_utc->year - 2036; - out_dts_utc->year -= year_correction; - } else { - /* 2037 is not a leap year */ - year_correction = out_dts_utc->year - 2037; - out_dts_utc->year -= year_correction; - } - } - - /* - * ISO 8601 states to treat date-times without a timezone offset - * or 'Z' for UTC as local time. The C standard libary functions - * mktime and gmtime allow us to do this conversion. - * - * Only do this timezone adjustment for recent and future years. - * In this case, "recent" is defined to be 1970 and later, because - * on MS Windows, mktime raises an error when given an earlier date. - */ - if (out_dts_utc->year >= 1970) { - NPY_TIME_T rawtime = 0; - struct tm tm_; - - tm_.tm_sec = out_dts_utc->sec; - tm_.tm_min = out_dts_utc->min; - tm_.tm_hour = out_dts_utc->hour; - tm_.tm_mday = out_dts_utc->day; - tm_.tm_mon = out_dts_utc->month - 1; - tm_.tm_year = out_dts_utc->year - 1900; - tm_.tm_isdst = -1; - - /* mktime converts a local 'struct tm' into a time_t */ - rawtime = mktime(&tm_); - if (rawtime == -1) { - PyErr_SetString(PyExc_OSError, "Failed to use mktime to " - "convert local time to UTC"); - return -1; - } - - /* gmtime converts a 'time_t' into a UTC 'struct tm' */ - if (get_gmtime(&rawtime, &tm_) < 0) { - return -1; - } - out_dts_utc->sec = tm_.tm_sec; - out_dts_utc->min = tm_.tm_min; - out_dts_utc->hour = tm_.tm_hour; - out_dts_utc->day = tm_.tm_mday; - out_dts_utc->month = tm_.tm_mon + 1; - out_dts_utc->year = tm_.tm_year + 1900; - } - - /* Reapply the year 2038 year correction HACK */ - out_dts_utc->year += year_correction; - - return 0; -} -#endif - -/* int */ -/* parse_python_string(PyObject* obj, pandas_datetimestruct *dts) { */ -/* PyObject *bytes = NULL; */ -/* char *str = NULL; */ -/* Py_ssize_t len = 0; */ -/* PANDAS_DATETIMEUNIT bestunit = -1; */ - -/* /\* Convert to an ASCII string for the date parser *\/ */ -/* if (PyUnicode_Check(obj)) { */ -/* bytes = PyUnicode_AsASCIIString(obj); */ -/* if (bytes == NULL) { */ -/* return -1; */ -/* } */ -/* } */ -/* else { */ -/* bytes = obj; */ -/* Py_INCREF(bytes); */ -/* } */ -/* if (PyBytes_AsStringAndSize(bytes, &str, &len) == -1) { */ -/* Py_DECREF(bytes); */ -/* return -1; */ -/* } */ - -/* /\* Parse the ISO date *\/ */ -/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, - */ -/* dts, NULL, &bestunit, NULL) < 0) { */ -/* Py_DECREF(bytes); */ -/* return -1; */ -/* } */ -/* Py_DECREF(bytes); */ - -/* return 0; */ -/* } */ /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -354,8 +184,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * 'str' must be a NULL-terminated string, and 'len' must be its length. * 'unit' should contain -1 if the unit is unknown, or the unit * which will be used if it is. - * 'casting' controls how the detected unit from the string is allowed - * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. 
* 'out_local' gets set to 1 if the parsed time contains timezone, @@ -375,7 +203,7 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * Returns 0 on success, -1 on failure. */ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, + pandas_datetimestruct *out, int *out_local, int *out_tzoffset, PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special) { @@ -444,16 +272,6 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_special = 1; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - return 0; } @@ -486,16 +304,6 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_special = 1; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - return convert_datetime_to_datetimestruct(&meta, rawtime, out); } @@ -941,16 +749,6 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_bestunit = bestunit; } - /* Check the casting rule */ - if (!can_cast_datetime64_units(bestunit, unit, casting)) { - PyErr_Format(PyExc_TypeError, - "Cannot parse \"%s\" as unit " - "'%s' using casting rule %s", - str, _datetime_strings[unit], - npy_casting_to_string(casting)); - return -1; - } - return 0; parse_error: @@ -1018,38 +816,6 @@ int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { return len; } -/* - * Finds the largest unit whose value is nonzero, and for which - * the remainder for the rest of the units is zero. - */ -static PANDAS_DATETIMEUNIT lossless_unit_from_datetimestruct( - pandas_datetimestruct *dts) { - if (dts->as % 1000 != 0) { - return PANDAS_FR_as; - } else if (dts->as != 0) { - return PANDAS_FR_fs; - } else if (dts->ps % 1000 != 0) { - return PANDAS_FR_ps; - } else if (dts->ps != 0) { - return PANDAS_FR_ns; - } else if (dts->us % 1000 != 0) { - return PANDAS_FR_us; - } else if (dts->us != 0) { - return PANDAS_FR_ms; - } else if (dts->sec != 0) { - return PANDAS_FR_s; - } else if (dts->min != 0) { - return PANDAS_FR_m; - } else if (dts->hour != 0) { - return PANDAS_FR_h; - } else if (dts->day != 1) { - return PANDAS_FR_D; - } else if (dts->month != 1) { - return PANDAS_FR_M; - } else { - return PANDAS_FR_Y; - } -} /* * Converts an pandas_datetimestruct to an (almost) ISO 8601 @@ -1069,17 +835,11 @@ static PANDAS_DATETIMEUNIT lossless_unit_from_datetimestruct( * set to a value other than -1. This is a manual override for * the local time zone to use, as an offset in minutes. * - * 'casting' controls whether data loss is allowed by truncating - * the data to a coarser unit. This interacts with 'local', slightly, - * in order to form a date unit string as a local time, the casting - * must be unsafe. - * * Returns 0 on success, -1 on failure (for example if the output * string was too short). 
*/ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting) { + int local, PANDAS_DATETIMEUNIT base, int tzoffset) { pandas_datetimestruct dts_local; int timezone_offset = 0; @@ -1121,38 +881,6 @@ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, add_minutes_to_datetimestruct(dts, timezone_offset); } - /* - * Now the datetimestruct data is in the final form for - * the string representation, so ensure that the data - * is being cast according to the casting rule. - */ - if (casting != NPY_UNSAFE_CASTING) { - /* Producing a date as a local time is always 'unsafe' */ - if (base <= PANDAS_FR_D && local) { - PyErr_SetString(PyExc_TypeError, - "Cannot create a local " - "timezone-based date string from a NumPy " - "datetime without forcing 'unsafe' casting"); - return -1; - } else { - /* Only 'unsafe' and 'same_kind' allow data loss */ - PANDAS_DATETIMEUNIT unitprec; - - unitprec = lossless_unit_from_datetimestruct(dts); - if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { - PyErr_Format(PyExc_TypeError, - "Cannot create a " - "string with unit precision '%s' " - "from the NumPy datetime, which has data at " - "unit precision '%s', " - "requires 'unsafe' or 'same_kind' casting", - _datetime_strings[base], - _datetime_strings[unitprec]); - return -1; - } - } - } - /* YEAR */ /* * Can't use PyOS_snprintf, because it always produces a '\0' diff --git a/pandas/_libs/src/datetime/np_datetime_strings.h b/pandas/_libs/src/datetime/np_datetime_strings.h index 833c1869c1664..4c248129b68c3 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/src/datetime/np_datetime_strings.h @@ -40,8 +40,6 @@ This file implements string parsing and creation for NumPy datetime. * 'str' must be a NULL-terminated string, and 'len' must be its length. * 'unit' should contain -1 if the unit is unknown, or the unit * which will be used if it is. - * 'casting' controls how the detected unit from the string is allowed - * to be cast to the 'unit' parameter. * * 'out' gets filled with the parsed date-time. * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time. @@ -62,7 +60,6 @@ This file implements string parsing and creation for NumPy datetime. int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, - NPY_CASTING casting, pandas_datetimestruct *out, int *out_local, int *out_tzoffset, @@ -90,17 +87,11 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base); * set to a value other than -1. This is a manual override for * the local time zone to use, as an offset in minutes. * - * 'casting' controls whether data loss is allowed by truncating - * the data to a coarser unit. This interacts with 'local', slightly, - * in order to form a date unit string as a local time, the casting - * must be unsafe. - * * Returns 0 on success, -1 on failure (for example if the output * string was too short). 
*/ int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting); + int local, PANDAS_DATETIMEUNIT base, int tzoffset); #endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f799b7f6b4785..c8a29cd949c3c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -456,7 +456,7 @@ static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, } if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, - -1, NPY_UNSAFE_CASTING)) { + -1)) { PRINTMARK(); *_outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; From e459658b79c228c908c9070fadcd957cf737339d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 29 Nov 2017 04:58:12 -0800 Subject: [PATCH 62/98] Make khash its own extension (#18472) --- pandas/_libs/{src => }/khash.pxd | 2 ++ pandas/_libs/tslibs/resolution.pyx | 2 +- setup.py | 8 +++----- 3 files changed, 6 insertions(+), 6 deletions(-) rename pandas/_libs/{src => }/khash.pxd (98%) diff --git a/pandas/_libs/src/khash.pxd b/pandas/_libs/khash.pxd similarity index 98% rename from pandas/_libs/src/khash.pxd rename to pandas/_libs/khash.pxd index ba9a3c70097b2..b1d965c3618cd 100644 --- a/pandas/_libs/src/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# cython: profile=False from cpython cimport PyObject from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 0692d985b4877..1c20dbe7f8fc9 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -10,7 +10,7 @@ np.import_array() from util cimport is_string_object, get_nat -from khash cimport ( +from pandas._libs.khash cimport ( khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t, diff --git a/setup.py b/setup.py index 37be0b696503d..68e1319458a33 100755 --- a/setup.py +++ b/setup.py @@ -497,7 +497,7 @@ def pxd(name): 'pyxfile': '_libs/hashing'}, '_libs.hashtable': { 'pyxfile': '_libs/hashtable', - 'pxdfiles': ['_libs/hashtable', '_libs/missing'], + 'pxdfiles': ['_libs/hashtable', '_libs/missing', '_libs/khash'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, '_libs.index': { @@ -554,7 +554,6 @@ def pxd(name): '_libs.tslib': { 'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', - '_libs/src/khash', '_libs/tslibs/conversion', '_libs/tslibs/timedeltas', '_libs/tslibs/timestamps', @@ -595,12 +594,11 @@ def pxd(name): 'sources': np_datetime_sources}, '_libs.tslibs.parsing': { 'pyxfile': '_libs/tslibs/parsing', - 'pxdfiles': ['_libs/src/util', - '_libs/src/khash']}, + 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.resolution': { 'pyxfile': '_libs/tslibs/resolution', 'pxdfiles': ['_libs/src/util', - '_libs/src/khash', + '_libs/khash', '_libs/tslibs/frequencies', '_libs/tslibs/timezones'], 'depends': tseries_depends, From 7627ccaf9442f4101afda69b6077e7f035e23543 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 29 Nov 2017 05:24:00 -0800 Subject: [PATCH 63/98] check for datetime+period addition (#18524) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/_libs/period.pyx | 14 ++++++++++++++ pandas/tests/scalar/test_period.py | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/doc/source/whatsnew/v0.22.0.txt 
b/doc/source/whatsnew/v0.22.0.txt index 1eb1b548788b9..4a27bf54de695 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -208,4 +208,5 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 2b09e9376bd3d..b95632b5b0eff 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -17,6 +17,10 @@ from pandas.compat import PY2 cimport cython +from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT +# import datetime C API +PyDateTime_IMPORT + from tslibs.np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct, is_leapyear) @@ -647,9 +651,19 @@ cdef class _Period(object): elif util.is_integer_object(other): ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) + elif (PyDateTime_Check(other) or + is_period_object(other) or util.is_datetime64_object(other)): + # can't add datetime-like + # GH#17983 + sname = type(self).__name__ + oname = type(other).__name__ + raise TypeError("unsupported operand type(s) for +: '{self}' " + "and '{other}'".format(self=sname, + other=oname)) else: # pragma: no cover return NotImplemented elif is_period_object(other): + # this can be reached via __radd__ because of cython rules return other + self else: return NotImplemented diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index 8cfdf7a461879..3bd4a28b7767d 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -1038,6 +1038,29 @@ def test_add_raises(self): with tm.assert_raises_regex(TypeError, msg): dt1 + dt2 + boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] + + @pytest.mark.parametrize('lbox', boxes) + @pytest.mark.parametrize('rbox', boxes) + def test_add_timestamp_raises(self, rbox, lbox): + # GH # 17983 + ts = pd.Timestamp('2017') + per = pd.Period('2017', freq='M') + + # We may get a different message depending on which class raises + # the error. 
+ msg = (r"cannot add|unsupported operand|" + r"can only operate on a|incompatible type|" + r"ufunc add cannot use operands") + with tm.assert_raises_regex(TypeError, msg): + lbox(ts) + rbox(per) + + with tm.assert_raises_regex(TypeError, msg): + lbox(per) + rbox(ts) + + with tm.assert_raises_regex(TypeError, msg): + lbox(per) + rbox(per) + def test_sub(self): dt1 = Period('2011-01-01', freq='D') dt2 = Period('2011-01-15', freq='D') From a47ad560d5ff030bb67b51feaf03d7c4b6d3e55b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Nov 2017 00:13:58 -0800 Subject: [PATCH 64/98] CLN: ASV remove uncessary selfs and add setups (#18575) --- asv_bench/benchmarks/algorithms.py | 6 ++---- asv_bench/benchmarks/binary_ops.py | 17 +++++++---------- asv_bench/benchmarks/categoricals.py | 5 ++--- asv_bench/benchmarks/ctors.py | 3 ++- asv_bench/benchmarks/eval.py | 24 ++++++++++++------------ asv_bench/benchmarks/frame_ctor.py | 25 ++++++++++--------------- asv_bench/benchmarks/frame_methods.py | 3 ++- 7 files changed, 37 insertions(+), 46 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7ffb180b49e09..45d62163ae80b 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -11,6 +11,8 @@ except: pass +from .pandas_vb_common import setup # noqa + class Factorize(object): @@ -21,7 +23,6 @@ class Factorize(object): def setup(self, sort): N = 10**5 - np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) self.string_idx = tm.makeStringIndex(N) @@ -45,7 +46,6 @@ class Duplicated(object): def setup(self, keep): N = 10**5 - np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) self.string_idx = tm.makeStringIndex(N) @@ -79,7 +79,6 @@ class Match(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) @@ -92,7 +91,6 @@ class Hashing(object): goal_time = 0.2 def setup_cache(self): - np.random.seed(1234) N = 10**5 df = pd.DataFrame( diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 14169ced4b71f..cc8766e1fa39c 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -6,6 +6,8 @@ except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Ops(object): @@ -15,7 +17,6 @@ class Ops(object): param_names = ['use_numexpr', 'threads'] def setup(self, use_numexpr, threads): - np.random.seed(1234) self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) @@ -47,7 +48,6 @@ class Ops2(object): def setup(self): N = 10**3 - np.random.seed(1234) self.df = DataFrame(np.random.randn(N, N)) self.df2 = DataFrame(np.random.randn(N, N)) @@ -89,14 +89,12 @@ class Timeseries(object): param_names = ['tz'] def setup(self, tz): - self.N = 10**6 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T', - tz=tz)) - self.ts = self.s[self.halfway] + N = 10**6 + halfway = (N // 2) - 1 + self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.ts = self.s[halfway] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s', - tz=tz)) + self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) def time_series_timestamp_compare(self, tz): self.s <= 
self.ts @@ -131,7 +129,6 @@ class AddOverflowArray(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) N = 10**6 self.arr = np.arange(N) self.arr_rev = np.arange(-N, 0) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index df41a2afad1f8..1613ca1b97f4b 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -9,6 +9,8 @@ except ImportError: pass +from .pandas_vb_common import setup # noqa + class Concat(object): @@ -76,7 +78,6 @@ class ValueCounts(object): def setup(self, dropna): n = 5 * 10**5 - np.random.seed(2718281) arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype('category') @@ -101,7 +102,6 @@ class SetCategories(object): def setup(self): n = 5 * 10**5 - np.random.seed(2718281) arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype('category') @@ -116,7 +116,6 @@ class Rank(object): def setup(self): N = 10**5 ncats = 100 - np.random.seed(1234) self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = self.s_str.astype('category') diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 2c9c382e2db86..6276dc324ca0d 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,8 @@ import numpy as np from pandas import DataFrame, Series, Index, DatetimeIndex, Timestamp +from .pandas_vb_common import setup # noqa + class Constructors(object): @@ -8,7 +10,6 @@ class Constructors(object): def setup(self): N = 10**2 - np.random.seed(1234) self.arr = np.random.randn(N, N) self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index fd18b3f21cf45..8e581dcf22b4c 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -5,6 +5,8 @@ except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Eval(object): @@ -14,7 +16,6 @@ class Eval(object): param_names = ['engine', 'threads'] def setup(self, engine, threads): - np.random.seed(1234) self.df = pd.DataFrame(np.random.randn(20000, 100)) self.df2 = pd.DataFrame(np.random.randn(20000, 100)) self.df3 = pd.DataFrame(np.random.randn(20000, 100)) @@ -45,17 +46,16 @@ class Query(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) - self.N = 10**6 - self.halfway = (self.N // 2) - 1 - self.index = pd.date_range('20010101', periods=self.N, freq='T') - self.s = pd.Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = pd.DataFrame({'a': np.random.randn(self.N), 'dates': self.s}, - index=self.index) - self.data = np.random.randn(self.N) - self.min_val = self.data.min() - self.max_val = self.data.max() + N = 10**6 + halfway = (N // 2) - 1 + index = pd.date_range('20010101', periods=N, freq='T') + s = pd.Series(index) + self.ts = s.iloc[halfway] + self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': s}, + index=index) + data = np.random.randn(N) + self.min_val = data.min() + self.max_val = data.max() def time_query_datetime_index(self): self.df.query('index < @self.ts') diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index d577ebc20a31c..5f465a91d38d3 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -4,27 +4,23 @@ try: from pandas.tseries import offsets except: - from pandas.core.datetools import * + from pandas.core.datetools 
import * # noqa +from .pandas_vb_common import setup # noqa -# ---------------------------------------------------------------------- -# Creation from nested dict class FromDicts(object): goal_time = 0.2 def setup(self): - np.random.seed(1234) N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), - index=self.index, - columns=self.columns) - self.data = self.frame.to_dict() + index = tm.makeStringIndex(N) + columns = tm.makeStringIndex(K) + frame = DataFrame(np.random.randn(N, K), index=index, columns=columns) + self.data = frame.to_dict() self.some_dict = list(self.data.values())[0] - self.dict_list = self.frame.to_dict(orient='records') + self.dict_list = frame.to_dict(orient='records') self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} @@ -42,14 +38,13 @@ def time_frame_ctor_nested_dict_int64(self): DataFrame(self.data2) -# from a mi-series - class FromSeries(object): + goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_product([range(100), range(100)]) - self.s = Series(np.random.randn(10000), index=self.mi) + mi = MultiIndex.from_product([range(100), range(100)]) + self.s = Series(np.random.randn(10000), index=mi) def time_frame_from_mi_series(self): DataFrame(self.s) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 7ed341425e561..2b48168238ee8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -3,7 +3,8 @@ import pandas.util.testing as tm from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, isnull, NaT) -from .pandas_vb_common import setup + +from .pandas_vb_common import setup # noqa class GetNumericData(object): From c40c8f8b3baccbd658d078816698f85e3268a781 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 30 Nov 2017 13:55:59 +0000 Subject: [PATCH 65/98] DOC: clarify default window in rolling method (#18177) --- doc/source/computation.rst | 4 +++- pandas/core/window.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 2a358900e340d..66e16808f6af9 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -344,7 +344,9 @@ The following methods are available: :meth:`~Window.sum`, Sum of values :meth:`~Window.mean`, Mean of values -The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are: +The weights used in the window are specified by the ``win_type`` keyword. +The list of recognized types are the `scipy.signal window functions + `__: - ``boxcar`` - ``triang`` diff --git a/pandas/core/window.py b/pandas/core/window.py index 5143dddc5e866..345f9b035a36b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -503,6 +503,9 @@ class Window(_Window): * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. 
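To make the documented distinction concrete, a small illustrative sketch (not part of the patch; the weighted variant assumes scipy is installed)::

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10, dtype=float))

    # win_type=None (the default): every point in the window is evenly weighted
    s.rolling(window=5).mean()

    # A scipy.signal window type weights the points instead; 'gaussian'
    # additionally requires its 'std' argument at aggregation time
    s.rolling(window=5, win_type='gaussian').mean(std=2)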
""" def validate(self): From 67c4d0f4f9f45b981d3e6cb07521f9c0bbb459d7 Mon Sep 17 00:00:00 2001 From: Chris Mazzullo Date: Thu, 30 Nov 2017 09:26:21 -0500 Subject: [PATCH 66/98] DOC: header='infer' is not working when there is no header, closes #17473 (#18042) --- doc/source/io.rst | 31 ++++++++++++++++++++++--------- pandas/io/parsers.py | 22 +++++++++++++--------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 5390fc3399e23..2aeafd99f6e72 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -103,15 +103,20 @@ Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ header : int or list of ints, default ``'infer'`` - Row number(s) to use as the column names, and the start of the data. Default - behavior is as if ``header=0`` if no ``names`` passed, otherwise as if - ``header=None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of ints that specify row locations for a - multi-index on the columns e.g. ``[0,1,3]``. Intervening rows that are not - specified will be skipped (e.g. 2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first line of the file, if column names are + passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to replace + existing names. + + The header can be a list of ints that specify row locations + for a multi-index on the columns e.g. ``[0,1,3]``. Intervening rows + that are not specified will be skipped (e.g. 2 in this example is + skipped). Note that this parameter ignores commented lines and empty + lines if ``skip_blank_lines=True``, so header=0 denotes the first + line of data rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. Duplicates in this list will cause @@ -553,6 +558,14 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. note:: + + Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first nonblank line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. + .. _io.dupe_names: Duplicate names parsing diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8f6b013558396..fe50b551ea948 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -74,15 +74,19 @@ .. versionadded:: 0.18.1 support for the Python parser. header : int or list of ints, default 'infer' - Row number(s) to use as the column names, and the start of the data. - Default behavior is as if set to 0 if no ``names`` passed, otherwise - ``None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of integers that specify row locations for - a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not - specified will be skipped (e.g. 
2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so header=0 denotes the first line of + data rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you should explicitly pass header=None. Duplicates in this list will cause From 5da3759b30167cd5ef5cb02f5bbfb98ac1be1103 Mon Sep 17 00:00:00 2001 From: Eric Kisslinger <33908309+ekisslinger@users.noreply.github.com> Date: Thu, 30 Nov 2017 07:11:42 -0800 Subject: [PATCH 67/98] BUG: Fix groupby over a CategoricalIndex in axis=1 (#18525) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/groupby.py | 8 +++++--- pandas/tests/groupby/test_grouping.py | 25 ++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e307e605687bf..bebfd0ab50e90 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -137,6 +137,7 @@ Categorical - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`) - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +- Bug in ``DataFrame.groupby(axis=1)`` with a ``CategoricalIndex`` (:issue:`18432`) String ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4168e6f920d98..6052b373ca0ea 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2933,9 +2933,11 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must " - "have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + ("Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length" + .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index cc422f2d1cdeb..8702062e9cd0a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -6,7 +6,7 @@ from warnings import catch_warnings from pandas import (date_range, Timestamp, - Index, MultiIndex, DataFrame, Series) + Index, MultiIndex, DataFrame, Series, CategoricalIndex) from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) from 
pandas.compat import lrange, long @@ -251,6 +251,29 @@ def test_groupby_levels_and_columns(self): by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) + def test_groupby_categorical_index_and_columns(self): + # GH18432 + columns = ['A', 'B', 'A', 'B'] + categories = ['B', 'A'] + data = np.ones((5, 4), int) + cat_columns = CategoricalIndex(columns, + categories=categories, + ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0).sum() + expected_data = 2 * np.ones((5, 2), int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) + expected = DataFrame(data=expected_data, columns=expected_columns) + assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063 From 5cd5e3b81fc3850367bb3e25644cbe3197cdea5a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 30 Nov 2017 17:05:15 -0800 Subject: [PATCH 68/98] Update pandas.read_gbq docs to point to pandas-gbq (#18548) --- doc/source/install.rst | 3 ++- pandas/io/gbq.py | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index b8968e18aecb0..7c1fde119ceaa 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -259,7 +259,8 @@ Optional Dependencies `xsel `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* For Google BigQuery I/O - see `here `__ +* `pandas-gbq `__: for Google BigQuery I/O. + * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 12e52123064e2..b452b0cf5ddd4 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -29,9 +29,8 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, The main method a user calls to execute a Query in Google BigQuery and read results into a pandas DataFrame. - Google BigQuery API Client Library v2 for Python is used. - Documentation is available `here - `__ + This function requires the `pandas-gbq package + `__. Authentication to the Google BigQuery service is via OAuth 2.0. @@ -70,7 +69,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. - 'standard' : Use BigQuery's standard SQL (beta), which is + 'standard' : Use BigQuery's standard SQL, which is compliant with the SQL 2011 standard. 
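As a rough usage sketch of the call being documented (the project id and query below are hypothetical placeholders, not from the patch; running this requires the pandas-gbq package and valid Google credentials)::

    import pandas as pd

    # 'my-project-id' and the query are placeholders for illustration only
    df = pd.read_gbq('SELECT name, state FROM my_dataset.my_table',
                     project_id='my-project-id',
                     dialect='standard')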
For more information see `BigQuery SQL Reference `__ From 1eedcf664cab1ca23a1d10071b2b7fb8095d0160 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 Dec 2017 02:06:58 +0100 Subject: [PATCH 69/98] API: change datetimelike Index to raise IndexError instead ValueError (#18386) --- doc/source/whatsnew/v0.22.0.txt | 2 ++ pandas/core/indexes/datetimelike.py | 4 +++- pandas/tests/indexes/test_base.py | 11 ++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a27bf54de695..d43d5bec7175f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -150,6 +150,8 @@ Indexing - Bug in :class:`Index`` construction from list of mixed type tuples (:issue:`18505`) - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - Bug in ``IntervalIndex.symmetric_difference()`` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) +- Bug in indexing a datetimelike ``Index`` that raised ``ValueError`` instead of ``IndexError`` (:issue:`18386`). + I/O ^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5643d886a4fec..c15727c247e1e 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -263,7 +263,9 @@ def __getitem__(self, key): is_int = is_integer(key) if is_scalar(key) and not is_int: - raise ValueError + raise IndexError("only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean " + "arrays are valid indices") getitem = self._data.__getitem__ if is_int: diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0b71f6bb3fb01..0b782e600822a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -623,12 +623,13 @@ def test_empty_fancy(self): # Index. 
pytest.raises(IndexError, idx.__getitem__, empty_farr) - def test_getitem(self): - arr = np.array(self.dateIndex) - exp = self.dateIndex[5] - exp = _to_m8(exp) + def test_getitem_error(self, indices): - assert exp == arr[5] + with pytest.raises(IndexError): + indices[101] + + with pytest.raises(IndexError): + indices['no_int'] def test_intersection(self): first = self.strIndex[:20] From d74ac70ffd44e9d00e6ffa8ecf7a1a88312e8065 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 1 Dec 2017 04:30:29 -0700 Subject: [PATCH 70/98] CLN/DOC: Interval and IntervalIndex classes (#18585) --- pandas/_libs/interval.pyx | 114 ++++++++++------- pandas/core/indexes/interval.py | 208 ++++++++++++++++++++------------ 2 files changed, 205 insertions(+), 117 deletions(-) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 39b26c61172ed..822df1ce2b968 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -14,30 +14,46 @@ import numbers _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) -cdef class IntervalMixin: - property closed_left: - def __get__(self): - return self.closed == 'left' or self.closed == 'both' - - property closed_right: - def __get__(self): - return self.closed == 'right' or self.closed == 'both' - - property open_left: - def __get__(self): - return not self.closed_left - - property open_right: - def __get__(self): - return not self.closed_right - - property mid: - def __get__(self): - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * (self.right - self.left) +cdef class IntervalMixin(object): + + @property + def closed_left(self): + """ + Return True if the Interval is closed on the left-side, else False + """ + return self.closed in ('left', 'both') + + @property + def closed_right(self): + """ + Return True if the Interval is closed on the right-side, else False + """ + return self.closed in ('right', 'both') + + @property + def open_left(self): + """ + Return True if the Interval is open on the left-side, else False + """ + return not self.closed_left + + @property + def open_right(self): + """ + Return True if the Interval is open on the right-side, else False + """ + return not self.closed_right + + @property + def mid(self): + """ + Return the midpoint of the Interval + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * (self.right - self.left) cdef _interval_like(other): @@ -55,12 +71,12 @@ cdef class Interval(IntervalMixin): Parameters ---------- left : value - Left bound for interval. + Left bound for the interval right : value - Right bound for interval. - closed : {'left', 'right', 'both', 'neither'} + Right bound for the interval + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the interval is closed on the left-side, right-side, both or - neither. Defaults to 'right'. + neither Examples -------- @@ -77,20 +93,30 @@ cdef class Interval(IntervalMixin): See Also -------- - IntervalIndex : an Index of ``interval`` s that are all closed on the same - side. - cut, qcut : convert arrays of continuous data into categoricals/series of - ``Interval``. + IntervalIndex : An Index of Interval objects that are all closed on the + same side. + cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Interval. 
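A brief sketch of the Interval semantics spelled out by these docstrings (illustrative only)::

    import pandas as pd

    iv = pd.Interval(0, 5, closed='right')  # the interval (0, 5]
    iv.closed_right   # True
    iv.open_left      # True: only the right side is closed
    iv.mid            # 2.5, i.e. left + 0.5 * (right - left)
    3 in iv           # True; membership respects the closed sides
    0 in iv           # False, because the left endpoint is open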
""" - cdef readonly object left, right + cdef readonly object left + """Left bound for the interval""" + + cdef readonly object right + """Right bound for the interval""" + cdef readonly str closed + """ + Whether the interval is closed on the left-side, right-side, both or + neither + """ def __init__(self, left, right, str closed='right'): # note: it is faster to just do these checks than to use a special # constructor (__cinit__/__new__) to avoid them if closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': %s" % closed) + msg = "invalid option for 'closed': {closed}".format(closed=closed) + raise ValueError(msg) if not left <= right: raise ValueError('left side of interval must be <= right side') self.left = left @@ -122,10 +148,11 @@ cdef class Interval(IntervalMixin): if op == Py_EQ or op == Py_NE: return NotImplemented else: + name = type(self).__name__ + other = type(other).__name__ op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError( - 'unorderable types: %s() %s %s()' % - (type(self).__name__, op_str, type(other).__name__)) + raise TypeError('unorderable types: {name}() {op} {other}()' + .format(name=name, op=op_str, other=other)) def __reduce__(self): args = (self.left, self.right, self.closed) @@ -145,15 +172,18 @@ cdef class Interval(IntervalMixin): def __repr__(self): left, right = self._repr_base() - return ('%s(%r, %r, closed=%r)' % - (type(self).__name__, left, right, self.closed)) + name = type(self).__name__ + repr_str = '{name}({left!r}, {right!r}, closed={closed!r})'.format( + name=name, left=left, right=right, closed=self.closed) + return repr_str def __str__(self): left, right = self._repr_base() start_symbol = '[' if self.closed_left else '(' end_symbol = ']' if self.closed_right else ')' - return '%s%s, %s%s' % (start_symbol, left, right, end_symbol) + return '{start}{left}, {right}{end}'.format( + start=start_symbol, left=left, right=right, end=end_symbol) def __add__(self, y): if isinstance(y, numbers.Number): @@ -222,8 +252,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals): continue if not isinstance(interval, Interval): - raise TypeError("type {} with value {} is not an interval".format( - type(interval), interval)) + raise TypeError("type {typ} with value {iv} is not an interval" + .format(typ=type(interval), iv=interval)) left[i] = interval.left right[i] = interval.right diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3f74694880533..02ac74e619fa4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -61,8 +61,8 @@ def _get_next_label(label): elif is_float_dtype(dtype): return np.nextafter(label, np.infty) else: - raise TypeError('cannot determine next label for type %r' - % type(label)) + raise TypeError('cannot determine next label for type {typ!r}' + .format(typ=type(label))) def _get_prev_label(label): @@ -76,8 +76,8 @@ def _get_prev_label(label): elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: - raise TypeError('cannot determine next label for type %r' - % type(label)) + raise TypeError('cannot determine next label for type {typ!r}' + .format(typ=type(label))) def _get_interval_closed_bounds(interval): @@ -94,17 +94,18 @@ def _get_interval_closed_bounds(interval): def _new_IntervalIndex(cls, d): - """ This is called upon unpickling, - rather than the default which doesn't - have arguments and breaks __new__ """ - + """ + This is called upon unpickling, rather than the default which doesn't have + 
arguments and breaks __new__ + """ return cls.from_arrays(**d) class IntervalIndex(IntervalMixin, Index): """ Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of intervals that are all closed on the same side. + represents an Index of Interval objects that are all closed on the same + side. .. versionadded:: 0.20.0 @@ -117,9 +118,9 @@ class IntervalIndex(IntervalMixin, Index): ---------- left, right : array-like (1-dimensional) Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or - neither. Defaults to 'right'. + neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -146,7 +147,7 @@ class IntervalIndex(IntervalMixin, Index): closed='right', dtype='interval[int64]') It may also be constructed using one of the constructor - methods :meth:`IntervalIndex.from_arrays`, + methods: :meth:`IntervalIndex.from_arrays`, :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_intervals` and :meth:`IntervalIndex.from_tuples`. @@ -162,12 +163,10 @@ class IntervalIndex(IntervalMixin, Index): See Also -------- Index : The base pandas Index type - Interval : A bounded slice-like interval - interval_range : Function to create a fixed frequency - IntervalIndex, IntervalIndex.from_arrays, IntervalIndex.from_breaks, - IntervalIndex.from_intervals, IntervalIndex.from_tuples - cut, qcut : convert arrays of continuous data into categoricals/series of - ``Interval``. + Interval : A bounded slice-like interval; the elements of an IntervalIndex + interval_range : Function to create a fixed frequency IntervalIndex + cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Intervals """ _typ = 'intervalindex' _comparables = ['name'] @@ -232,9 +231,9 @@ def _simple_new(cls, left, right, closed=None, name=None, left = left.astype(right.dtype) if type(left) != type(right): - raise ValueError("must not have differing left [{}] " - "and right [{}] types".format( - type(left), type(right))) + raise ValueError("must not have differing left [{left}] " + "and right [{right}] types" + .format(left=type(left), right=type(right))) if isinstance(left, ABCPeriodIndex): raise ValueError("Period dtypes are not supported, " @@ -279,7 +278,8 @@ def _validate(self): Verify that the IntervalIndex is valid. """ if self.closed not in _VALID_CLOSED: - raise ValueError("invalid options for 'closed': %s" % self.closed) + raise ValueError("invalid options for 'closed': {closed}" + .format(closed=self.closed)) if len(self.left) != len(self.right): raise ValueError('left and right must have the same length') left_mask = notna(self.left) @@ -293,12 +293,15 @@ def _validate(self): @cache_readonly def hasnans(self): - """ return if I have any nans; enables various perf speedups """ + """ + Return if the IntervalIndex has any nans; enables various performance + speedups + """ return self._isnan.any() @cache_readonly def _isnan(self): - """ return if each value is nan""" + """Return a mask indicating if each value is NA""" if self._mask is None: self._mask = isna(self.left) return self._mask @@ -335,7 +338,7 @@ def __contains__(self, key): def contains(self, key): """ - return a boolean if this key is IN the index + Return a boolean indicating if the key is IN the index We accept / allow keys to be not *just* actual objects. 
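For reference, the constructor methods named in this docstring all produce the same index (a sketch, using the default ``closed='right'``)::

    import pandas as pd

    # Three equivalent ways to build the IntervalIndex [(0, 1], (1, 2], (2, 3]]
    pd.IntervalIndex.from_breaks([0, 1, 2, 3])
    pd.IntervalIndex.from_arrays(left=[0, 1, 2], right=[1, 2, 3])
    pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3)])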
@@ -363,9 +366,9 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): ---------- breaks : array-like (1-dimensional) Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -404,9 +407,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): Left bounds for each interval. right : array-like (1-dimensional) Right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -491,9 +494,9 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): ---------- data : array-like (1-dimensional) Array of tuples - closed : {'left', 'right', 'both', 'neither'}, optional + closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. + or neither. name : object, optional Name to be stored in the index. copy : boolean, default False @@ -521,15 +524,12 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): left = right = data for d in data: - if isna(d): - left.append(np.nan) - right.append(np.nan) - continue - - l, r = d - left.append(l) - right.append(r) + lhs = rhs = np.nan + else: + lhs, rhs = d + left.append(lhs) + right.append(rhs) # TODO # if we have nulls and we previous had *only* @@ -538,6 +538,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): return cls.from_arrays(left, right, closed, name=name, copy=False) def to_tuples(self): + """Return an Index of tuples of the form (left, right)""" return Index(_asarray_tuplesafe(zip(self.left, self.right))) @cache_readonly @@ -547,14 +548,26 @@ def _multiindex(self): @property def left(self): + """ + Return the left endpoints of each Interval in the IntervalIndex as + an Index + """ return self._left @property def right(self): + """ + Return the right endpoints of each Interval in the IntervalIndex as + an Index + """ return self._right @property def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither + """ return self._closed def __len__(self): @@ -563,7 +576,7 @@ def __len__(self): @cache_readonly def values(self): """ - Returns the IntervalIndex's data as a numpy array of Interval + Return the IntervalIndex's data as a numpy array of Interval objects (with dtype='object') """ left = self.left @@ -615,14 +628,17 @@ def astype(self, dtype, copy=True): elif is_categorical_dtype(dtype): from pandas import Categorical return Categorical(self, ordered=True) - raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype) + raise ValueError('Cannot cast IntervalIndex to dtype {dtype}' + .format(dtype=dtype)) @cache_readonly def dtype(self): + """Return the dtype object of the underlying data""" return IntervalDtype.construct_from_string(str(self.left.dtype)) @property def inferred_type(self): + """Return a string of the type inferred from the values""" return 'interval' @Appender(Index.memory_usage.__doc__) @@ -634,7 
+650,8 @@ def memory_usage(self, deep=False): @cache_readonly def mid(self): - """Returns the mid-point of each interval in the index as an array + """ + Return the midpoint of each Interval in the IntervalIndex as an Index """ try: return Index(0.5 * (self.left.values + self.right.values)) @@ -645,22 +662,42 @@ def mid(self): @cache_readonly def is_monotonic(self): + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ return self._multiindex.is_monotonic @cache_readonly def is_monotonic_increasing(self): + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ return self._multiindex.is_monotonic_increasing @cache_readonly def is_monotonic_decreasing(self): + """ + Return True if the IntervalIndex is monotonic decreasing (only equal or + decreasing values), else False + """ return self._multiindex.is_monotonic_decreasing @cache_readonly def is_unique(self): + """ + Return True if the IntervalIndex contains unique elements, else False + """ return self._multiindex.is_unique @cache_readonly def is_non_overlapping_monotonic(self): + """ + Return True if the IntervalIndex is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False + """ # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) # we already require left <= right @@ -725,9 +762,8 @@ def _check_method(self, method): return if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: - raise NotImplementedError( - 'method {} not yet implemented for ' - 'IntervalIndex'.format(method)) + msg = 'method {method} not yet implemented for IntervalIndex' + raise NotImplementedError(msg.format(method=method)) raise ValueError("Invalid fill method") @@ -866,17 +902,14 @@ def get_value(self, series, key): elif isinstance(key, slice): if not (key.step is None or key.step == 1): - raise ValueError("cannot support not-default " - "step in a slice") + raise ValueError("cannot support not-default step in a slice") try: loc = self.get_loc(key) except TypeError: - - # we didn't find exact intervals - # or are non-unique - raise ValueError("unable to slice with " - "this key: {}".format(key)) + # we didn't find exact intervals or are non-unique + msg = "unable to slice with this key: {key}".format(key=key) + raise ValueError(msg) else: loc = self.get_loc(key) @@ -929,31 +962,31 @@ def _get_reindexer(self, target): indexer = [] n = len(self) - for i, (l, r) in enumerate(zip(lindexer, rindexer)): + for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)): target_value = target[i] # matching on the lhs bound - if (l != -1 and + if (lhs != -1 and self.closed == 'right' and - target_value.left == self[l].right): - l += 1 + target_value.left == self[lhs].right): + lhs += 1 # matching on the lhs bound - if (r != -1 and + if (rhs != -1 and self.closed == 'left' and - target_value.right == self[r].left): - r -= 1 + target_value.right == self[rhs].left): + rhs -= 1 # not found - if l == -1 and r == -1: + if lhs == -1 and rhs == -1: indexer.append(np.array([-1])) - elif r == -1: + elif rhs == -1: - indexer.append(np.arange(l, n)) + indexer.append(np.arange(lhs, n)) - elif l == -1: + elif lhs == -1: # care about left/right closed here value = self[i] @@ -976,10 +1009,10 @@ def _get_reindexer(self, target): indexer.append(np.array([-1])) continue - indexer.append(np.arange(0, r + 1)) + indexer.append(np.arange(0, rhs + 
1)) else: - indexer.append(np.arange(l, r + 1)) + indexer.append(np.arange(lhs, rhs + 1)) return np.concatenate(indexer) @@ -996,11 +1029,32 @@ def where(self, cond, other=None): return self._shallow_copy(values) def delete(self, loc): + """ + Return a new IntervalIndex with passed location(-s) deleted + + Returns + ------- + new_index : IntervalIndex + """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) return self._shallow_copy(new_left, new_right) def insert(self, loc, item): + """ + Return a new IntervalIndex inserting new item at location. Follows + Python list.append semantics for negative values. Only Interval + objects and NA can be inserted into an IntervalIndex + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : IntervalIndex + """ if isinstance(item, Interval): if item.closed != self.closed: raise ValueError('inserted item must be closed on the same ' @@ -1108,23 +1162,23 @@ def _format_data(self, name=None): summary = '[]' elif n == 1: first = formatter(self[0]) - summary = '[{}]'.format(first) + summary = '[{first}]'.format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{}, {}]'.format(first, last) + summary = '[{first}, {last}]'.format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{} ... {}]'.format(', '.join(head), - ', '.join(tail)) + summary = '[{head} ... {tail}]'.format( + head=', '.join(head), tail=', '.join(tail)) else: head = [] tail = [formatter(x) for x in self] - summary = '[{}]'.format(', '.join(tail)) + summary = '[{tail}]'.format(tail=', '.join(tail)) return summary + self._format_space() @@ -1132,17 +1186,20 @@ def _format_attrs(self): attrs = [('closed', repr(self.closed))] if self.name is not None: attrs.append(('name', default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype))) return attrs def _format_space(self): - return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + space = ' ' * (len(self.__class__.__name__) + 1) + return "\n{space}".format(space=space) def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) def equals(self, other): - + """ + Determines if two IntervalIndex objects contain the same elements + """ if self.is_(other): return True @@ -1216,8 +1273,9 @@ def interval_range(start=None, end=None, periods=None, freq=None, for numeric and 'D' (calendar daily) for datetime-like. name : string, default None Name of the resulting IntervalIndex - closed : string, default 'right' - options are: 'left', 'right', 'both', 'neither' + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. 
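A quick sketch of how the ``closed`` parameter described above plays out (illustrative only)::

    import pandas as pd

    # Defaults: numeric bounds, freq=1, closed='right'
    pd.interval_range(start=0, end=5)                     # (0, 1], ..., (4, 5]

    # 'closed' controls which side(s) of each interval are closed
    pd.interval_range(start=0, periods=4, closed='both')  # [0, 1], ..., [3, 4]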
Notes ----- From d5ffb1fc9653a47e5426121fefeccdf2be9e8c46 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 1 Dec 2017 11:45:19 -0500 Subject: [PATCH 71/98] Support merging DataFrames on a combo of columns and index levels (GH 14355) (#17484) --- doc/source/merging.rst | 68 ++- doc/source/whatsnew/v0.22.0.txt | 31 ++ pandas/core/frame.py | 37 +- pandas/core/generic.py | 310 ++++++++++++- pandas/core/groupby.py | 11 +- pandas/core/reshape/merge.py | 62 ++- .../generic/test_label_or_level_utils.py | 431 ++++++++++++++++++ pandas/tests/groupby/test_index_as_string.py | 2 +- pandas/tests/reshape/test_merge.py | 9 + .../reshape/test_merge_index_as_string.py | 215 +++++++++ 10 files changed, 1138 insertions(+), 38 deletions(-) create mode 100644 pandas/tests/generic/test_label_or_level_utils.py create mode 100644 pandas/tests/reshape/test_merge_index_as_string.py diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 7d981b815d01b..86d2ec2254057 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -518,14 +518,16 @@ standard database join operations between DataFrame objects: - ``left``: A DataFrame object - ``right``: Another DataFrame object -- ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and +- ``on``: Column or index level names to join on. Must be found in both the left + and right DataFrame objects. If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the DataFrames will be inferred to be the join keys -- ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame -- ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame +- ``left_on``: Columns or index levels from the left DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame +- ``right_on``: Columns or index levels from the right DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame - ``left_index``: If ``True``, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys @@ -563,6 +565,10 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.21.0 +.. note:: + + Support for specifying index levels as the ``on``, ``left_on``, and + ``right_on`` parameters was added in version 0.22.0. The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -1121,6 +1127,56 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using labels=['left', 'right'], vertical=False); plt.close('all'); +.. _merging.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.22 + +Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters +may refer to either column names or index level names. This enables merging +``DataFrame`` instances on a combination of index levels and columns without +resetting indexes. + +.. 
ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + result = left.merge(right, on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merge_on_index_and_column.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. note:: + + When DataFrames are merged on a string that matches an index level in both + frames, the index level is preserved as an index level in the resulting + DataFrame. + +.. note:: + + If a string matches both a column name and an index level name, then a + warning is issued and the column takes precedence. This will result in an + ambiguity error in a future version. + Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index d43d5bec7175f..55e88d2e50919 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -32,6 +32,37 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp pd.get_dummies(df, columns=['c'], dtype=bool).dtypes +.. _whatsnew_0220.enhancements.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` +parameters may now refer to either column names or index level names. +This enables merging ``DataFrame`` instances on a combination of index levels +and columns without resetting indexes. See the :ref:`Merge on columns and +levels ` documentation section. +(:issue:`14355`) + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + left.merge(right, on=['key1', 'key2']) + + .. _whatsnew_0220.enhancements.other: Other Enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d3561f8a0eadf..ff42e39d9dbdd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -148,16 +148,17 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys on : label or list - Field names to join on. Must be found in both DataFrames. If on is - None and not merging on indexes, then it merges on the intersection of - the columns by default. + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. left_on : label or list, or array-like - Field names to join on in left DataFrame. Can be a vector or list of - vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns + Column or index level names to join on in the left DataFrame. 
Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index @@ -196,6 +197,11 @@ .. versionadded:: 0.21.0 +Notes +----- +Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.22.0 + Examples -------- @@ -5214,12 +5220,12 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame - on : column name, tuple/list of column names, or array-like - Column(s) in the caller to join on the index in other, - otherwise joins index-on-index. If multiples - columns given, the passed DataFrame must have a MultiIndex. Can - pass an array as the join key if not already contained in the - calling DataFrame. Like an Excel VLOOKUP operation + on : name, tuple/list of names, or array-like + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects. @@ -5244,6 +5250,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', on, lsuffix, and rsuffix options are not supported when passing a list of DataFrame objects + Support for specifying index levels as the `on` parameter was added + in version 0.22.0 + Examples -------- >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 54b0089335b19..83fd36f0a864f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_re_compilable, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame from pandas.core.common import (_count_not_none, @@ -36,7 +37,7 @@ from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, _ensure_index, - InvalidIndexError) + InvalidIndexError, RangeIndex) import pandas.core.indexing as indexing from pandas.core.indexing import maybe_convert_indices from pandas.core.indexes.datetimes import DatetimeIndex @@ -1038,6 +1039,313 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. 
All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. + + def _is_level_reference(self, key, axis=0): + """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key: str + Potential level name for the given axis + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level: bool + """ + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_is_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + not self._is_label_reference(key, axis=axis)) + + def _is_label_reference(self, key, axis=0): + """ + Test whether a key is a label reference for a given axis. + + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key: str + Potential label name + axis: int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + any(key in self.axes[ax] for ax in other_axes)) + + def _is_label_or_level_reference(self, key, axis=0): + """ + Test whether a key is a label or level reference for a given axis. + + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key: str + Potential label or level name + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_label_or_level: bool + """ + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_or_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (self._is_level_reference(key, axis=axis) or + self._is_label_reference(key, axis=axis)) + + def _check_label_or_level_ambiguity(self, key, axis=0): + """ + Check whether `key` matches both a level of the input `axis` and a + label of the other axis and raise a ``FutureWarning`` if this is the + case. + + Note: This method will be altered to raise an ambiguity exception in + a future version. + + Parameters + ---------- + key: str or object + label or level name + + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + ambiguous: bool + + Raises + ------ + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(self))) + + if (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + any(key in self.axes[ax] for ax in other_axes)): + + # Build an informative and grammatical warning + level_article, level_type = (('an', 'index') + if axis == 0 else + ('a', 'column')) + + label_article, label_type = (('a', 'column') + if axis == 0 else + ('an', 'index')) + + msg = ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type) + + warnings.warn(msg, FutureWarning, stacklevel=2) + return True + else: + return False + + def _get_label_or_level_values(self, key, axis=0): + """ + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key: str + Label or level name. + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_get_label_or_level_values is not implemented for {type}" + .format(type=type(self))) + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + label_axis_name = 'column' if axis == 0 else 'index' + raise ValueError(("The {label_axis_name} label '{key}' " + "is not unique") + .format(key=key, + label_axis_name=label_axis_name)) + + return values + + def _drop_labels_or_levels(self, keys, axis=0): + """ + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. 
+
+        Parameters
+        ----------
+        keys: str or list of str
+            labels or levels to drop
+        axis: int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns)
+
+        Returns
+        -------
+        dropped: DataFrame
+
+        Raises
+        ------
+        ValueError
+            if any `keys` match neither a label nor a level
+        """
+
+        axis = self._get_axis_number(axis)
+
+        if self.ndim > 2:
+            raise NotImplementedError(
+                "_drop_labels_or_levels is not implemented for {type}"
+                .format(type=type(self)))
+
+        # Validate keys
+        keys = com._maybe_make_list(keys)
+        invalid_keys = [k for k in keys if not
+                        self._is_label_or_level_reference(k, axis=axis)]
+
+        if invalid_keys:
+            raise ValueError(("The following keys are not valid labels or "
+                              "levels for axis {axis}: {invalid_keys}")
+                             .format(axis=axis,
+                                     invalid_keys=invalid_keys))
+
+        # Compute levels and labels to drop
+        levels_to_drop = [k for k in keys
+                          if self._is_level_reference(k, axis=axis)]
+
+        labels_to_drop = [k for k in keys
+                          if not self._is_level_reference(k, axis=axis)]
+
+        # Perform copy upfront and then use inplace operations below.
+        # This ensures that we always perform exactly one copy.
+        # ``copy`` and/or ``inplace`` options could be added in the future.
+        dropped = self.copy()
+
+        if axis == 0:
+            # Handle dropping index levels
+            if levels_to_drop:
+                dropped.reset_index(levels_to_drop, drop=True, inplace=True)
+
+            # Handle dropping column labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=1, inplace=True)
+        else:
+            # Handle dropping column levels
+            if levels_to_drop:
+                if isinstance(dropped.columns, MultiIndex):
+                    # Drop the specified levels from the MultiIndex
+                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
+                else:
+                    # Drop the last level of Index by replacing with
+                    # a RangeIndex
+                    dropped.columns = RangeIndex(dropped.columns.size)
+
+            # Handle dropping index labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=0, inplace=True)
+
+        return dropped
+
     # ----------------------------------------------------------------------
     # Iteration
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 6052b373ca0ea..a5d8cc254cd93 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2913,16 +2913,11 @@ def is_in_obj(gpr):
         elif is_in_axis(gpr):  # df.groupby('name')
             if gpr in obj:
-                if validate and gpr in obj.index.names:
-                    warnings.warn(
-                        ("'%s' is both a column name and an index level.\n"
-                         "Defaulting to column but "
-                         "this will raise an ambiguity error in a "
-                         "future version") % gpr,
-                        FutureWarning, stacklevel=5)
+                if validate:
+                    obj._check_label_or_level_ambiguity(gpr)
                 in_axis, name, gpr = True, gpr, obj[gpr]
                 exclusions.append(name)
-            elif gpr in obj.index.names:
+            elif obj._is_level_reference(gpr):
                 in_axis, name, level, gpr = False, None, gpr, None
             else:
                 raise KeyError(gpr)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 56ca913dbcddb..bad7088a126cf 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -587,6 +587,8 @@ def get_result(self):
 
         self._maybe_add_join_keys(result, left_indexer, right_indexer)
 
+        self._maybe_restore_index_levels(result)
+
         return result
 
     def _indicator_pre_merge(self, left, right):
@@ -629,6 +631,39 @@ def _indicator_post_merge(self, result):
                            axis=1)
         return result
 
+    def _maybe_restore_index_levels(self, result):
+        """
+        Restore index levels specified as `on` parameters
+
+        Here we check for cases where `self.left_on` and `self.right_on` pairs
+        each reference an index level in their respective DataFrames.
The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + + Returns + ------- + None + """ + names_to_restore = [] + for name, left_key, right_key in zip(self.join_names, + self.left_on, + self.right_on): + if (self.orig_left._is_level_reference(left_key) and + self.orig_right._is_level_reference(right_key) and + name not in result.index.names): + + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None @@ -698,8 +733,17 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: key_col = Index(lvals).where(~mask, rvals) - if name in result: + if result._is_label_reference(name): result[name] = key_col + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + idx_list = [result.index.get_level_values(level_name) + if level_name != name else key_col + for level_name in result.index.names] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) else: result.insert(i, name or 'key_{i}'.format(i=i), key_col) @@ -796,7 +840,8 @@ def _get_merge_keys(self): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -805,7 +850,8 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -818,7 +864,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left[lk]._values) + left_keys.append(left._get_label_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -830,7 +876,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left[k]._values) + left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -844,7 +890,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right[k]._values) + right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -854,10 +900,10 @@ def _get_merge_keys(self): left_keys = [self.left.index.values] if left_drop: - self.left = self.left.drop(left_drop, axis=1) + self.left = self.left._drop_labels_or_levels(left_drop) if right_drop: - self.right = self.right.drop(right_drop, axis=1) + self.right = self.right._drop_labels_or_levels(right_drop) return left_keys, right_keys, join_names diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py new file mode 100644 index 0000000000000..456cb48020500 --- /dev/null +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -0,0 +1,431 @@ +import pytest +import pandas as pd +import pandas.util.testing as tm +from pandas.core.dtypes.missing import array_equivalent + + +# Fixtures +# ======== +@pytest.fixture +def df(): + 
"""DataFrame with columns 'L1', 'L2', and 'L3' """ + return pd.DataFrame({'L1': [1, 2, 3], + 'L2': [11, 12, 13], + 'L3': ['A', 'B', 'C']}) + + +@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture +def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ + df = df.set_index(['L1', 'L2']) + + df['L1'] = df['L3'] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = df.set_index(['L1']) + df = pd.concat([df, df['L2']], axis=1) + + return df + + +@pytest.fixture +def panel(): + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + return pd.Panel() + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names + if name is not None] + return expected_labels, expected_levels + + +def assert_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def assert_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_or_label_reference_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_level_reference(df_levels, expected_levels, axis=axis) + assert_label_reference(df_levels, expected_labels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_reference_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + assert_label_reference(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + assert_level_reference(df_ambig, ['L2'], axis=axis) + + # df has a column named L3 and it not an level reference + assert_label_reference(df_ambig, ['L3'], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_reference(s, ['L1'], axis=0) + assert not s._is_level_reference('L2') + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_reference(s, ['L1', 'L2'], axis=0) + assert not s._is_level_reference('L3') + + +def test_is_level_reference_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._is_level_reference('L1', axis=1) + + +# Panel +# ----- +def test_is_level_reference_panel_error(panel): + msg = ("_is_level_reference is not implemented for {type}" + .format(type=type(panel))) + + 
with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_level_reference('L1', axis=0) + + +def test_is_label_reference_panel_error(panel): + msg = ("_is_label_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_reference('L1', axis=0) + + +def test_is_label_or_level_reference_panel_error(panel): + msg = ("_is_label_or_level_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_or_level_reference('L1', axis=0) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous + with tm.assert_produces_warning(FutureWarning, + clear=True, + check_stacklevel=False) as w: + + assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) + warning_msg = w[0].message.args[0] + if axis == 0: + assert warning_msg.startswith("'L1' is both an index level " + "and a column label") + else: + assert warning_msg.startswith("'L1' is both a column level " + "and an index label") + + # df_ambig has an on-axis level named L2 and it is not ambiguous + # No warning should be raised + with tm.assert_produces_warning(None): + assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis) + + # df_ambig has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert not df_ambig._is_level_reference('L3', axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index('L1').L2 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + assert not s._check_label_or_level_ambiguity('L3', axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._check_label_or_level_ambiguity('L1', axis=1) + + +# Panel +# ----- +def test_check_label_or_level_ambiguity_panel_error(panel): + msg = ("_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._check_label_or_level_ambiguity('L1', axis=0) + + +# Test _get_label_or_level_values +# =============================== +def assert_label_values(frame, labels, axis): + for label in labels: + if axis == 0: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def assert_level_values(frame, levels, axis): + for level in levels: + if axis == 0: + expected = frame.index.get_level_values(level=level)._values + else: + expected = (frame.columns + 
.get_level_values(level=level) + ._values) + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous but will default to label + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + assert_label_values(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_level_values(df_ambig, ['L2'], axis=axis) + + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_label_values(df_ambig, ['L3'], axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_duplabels = df_duplabels.T + + # df has unambiguous level 'L1' + assert_level_values(df_duplabels, ['L1'], axis=axis) + + # df has unique label 'L3' + assert_label_values(df_duplabels, ['L3'], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with tm.assert_raises_regex(ValueError, expected_msg): + assert_label_values(df_duplabels, ['L2'], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_values(s, ['L1'], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_values(s, ['L1', 'L2'], axis=0) + + +def test_get_label_or_level_values_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._get_label_or_level_values('L1', axis=1) + + +# Panel +# ----- +def test_get_label_or_level_values_panel_error(panel): + msg = ("_get_label_or_level_values is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._get_label_or_level_values('L1', axis=0) + + +# Test _drop_labels_or_levels +# =========================== +def assert_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def assert_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + +# DataFrame 
+# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_drop_labels_or_levels_df(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, expected_levels, axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df_levels._drop_labels_or_levels('L4', axis=axis) + + +# Series +# ------ +def test_drop_labels_or_levels_series(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_levels_dropped(s, ['L1'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_levels_dropped(s, ['L1', 'L2'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + +# Panel +# ----- +def test_drop_labels_or_levels_panel_error(panel): + msg = ("_drop_labels_or_levels is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._drop_labels_or_levels('L1', axis=0) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 3b6e15036cfe2..cee78eab3a636 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -108,7 +108,7 @@ def test_grouper_column_index_level_precedence(frame, assert_frame_equal(result, expected) - # Grouping with level Grouper should produce a difference result but + # Grouping with level Grouper should produce a different result but # still no warning with tm.assert_produces_warning(False): not_expected = frame.groupby(level_groupers).mean() diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index b76951e8c2ac2..cd0701e3864fc 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -72,6 +72,15 @@ def test_merge_common(self): exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) + def test_merge_index_as_on_arg(self): + # GH14355 + + left = self.df.set_index('key1') + right = self.df2.set_index('key1') + result = merge(left, right, on='key1') + expected = merge(self.df, self.df2, on='key1').set_index('key1') + assert_frame_equal(result, expected) + def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py new file mode 100644 index 0000000000000..4c638f8e441fa --- /dev/null +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -0,0 +1,215 @@ +import numpy as np +import pytest + +from pandas import DataFrame +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture +def df1(): + return DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11))) + + +@pytest.fixture +def df2(): + return DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12))) + + +@pytest.fixture(params=[[], 
['outer'], ['outer', 'inner']]) +def left_df(request, df1): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + if levels: + df1 = df1.set_index(levels) + + return df1 + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def right_df(request, df2): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + + if levels: + df2 = df2.set_index(levels) + + return df2 + + +def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + +@pytest.mark.parametrize('on,how', + [(['outer'], 'inner'), + (['inner'], 'left'), + (['outer', 'inner'], 'right'), + (['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_on(left_df, right_df, on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = left_df.merge(right_df, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_on,right_on,how', + [(['outer'], ['outer'], 'inner'), + (['inner'], ['inner'], 'right'), + (['outer', 'inner'], ['outer', 'inner'], 'left'), + (['inner', 'outer'], ['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_lefton_righton( + left_df, right_df, left_on, right_on, how): + + # Construct expected result + 
expected = compute_expected(left_df, right_df, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = left_df.merge(right_df, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_index', + ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize('how', + ['inner', 'left', 'right', 'outer']) +def test_join_indexes_and_columns_on(df1, df2, left_index, how): + + # Construct left_df + left_df = df1.set_index(left_index) + + # Construct right_df + right_df = df2.set_index(['outer', 'inner']) + + # Result + expected = (left_df.reset_index() + .join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + .set_index(left_index)) + + # Perform join + result = left_df.join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + + assert_frame_equal(result, expected, check_like=True) + + +def test_merge_index_column_precedence(df1, df2): + + # Construct left_df with both an index and a column named 'outer'. + # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + left_df = df1.set_index('outer') + left_df['outer'] = left_df['inner'] + + # Construct right_df with an index level named 'outer' + right_df = df2.set_index('outer') + + # Construct expected result. + # The 'outer' column from left_df is chosen and the resulting + # frame has no index levels + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) + + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + + # Perform the same using the left_on and right_on parameters + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) + + assert_frame_equal(result, expected) From f7df0ff3b4a4b391c8cf21aeeb7b11403b5515bf Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 1 Dec 2017 19:59:01 +0100 Subject: [PATCH 72/98] BUG: do not fail when stack()ing unsortable level (#18363) closes #18310 --- doc/source/whatsnew/v0.22.0.txt | 3 ++- pandas/core/indexes/multi.py | 24 ++++++++++++------------ pandas/tests/frame/test_reshape.py | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 55e88d2e50919..6b932cd004904 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -218,7 +218,8 @@ Sparse Reshaping ^^^^^^^^^ -- +- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) + - - diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 456999b94c523..4f9a3a2b1aaa6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1320,19 +1320,19 @@ def _sort_levels_monotonic(self): for lev, lab in zip(self.levels, self.labels): - if lev.is_monotonic: - new_levels.append(lev) - new_labels.append(lab) - continue - - # indexer to reorder the levels - indexer = lev.argsort() - lev = lev.take(indexer) + if not 
lev.is_monotonic: + try: + # indexer to reorder the levels + indexer = lev.argsort() + except TypeError: + pass + else: + lev = lev.take(indexer) - # indexer to reorder the labels - indexer = _ensure_int64(indexer) - ri = lib.get_reverse_indexer(indexer, len(indexer)) - lab = algos.take_1d(ri, lab) + # indexer to reorder the labels + indexer = _ensure_int64(indexer) + ri = lib.get_reverse_indexer(indexer, len(indexer)) + lab = algos.take_1d(ri, lab) new_levels.append(lev) new_labels.append(lab) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 04dcea2b9d533..f34d25142a057 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -133,6 +133,30 @@ def test_stack_unstack(self): assert_frame_equal(unstacked_cols.T, df) assert_frame_equal(unstacked_cols_df['bar'].T, df) + def test_stack_mixed_level(self): + # GH 18310 + levels = [range(3), [3, 'a', 'b'], [1, 2]] + + # flat columns: + df = DataFrame(1, index=levels[0], columns=levels[1]) + result = df.stack() + expected = Series(1, index=MultiIndex.from_product(levels[:2])) + assert_series_equal(result, expected) + + # MultiIndex columns: + df = DataFrame(1, index=levels[0], + columns=MultiIndex.from_product(levels[1:])) + result = df.stack(1) + expected = DataFrame(1, index=MultiIndex.from_product([levels[0], + levels[2]]), + columns=levels[1]) + assert_frame_equal(result, expected) + + # as above, but used labels in level are actually of homogeneous type + result = df[['a', 'b']].stack(1) + expected = expected[['a', 'b']] + assert_frame_equal(result, expected) + def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series From d270bbb1448ecaccbb567721c991350bac715059 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 1 Dec 2017 20:02:41 +0100 Subject: [PATCH 73/98] Construction of Series from dict containing NaN as key (#18496) closes #18480 closes #18515 --- doc/source/whatsnew/v0.22.0.txt | 2 + pandas/core/base.py | 5 +- pandas/core/indexes/base.py | 21 --------- pandas/core/indexes/datetimelike.py | 8 ---- pandas/core/indexes/datetimes.py | 11 ----- pandas/core/series.py | 55 +++++++++++++++++----- pandas/tests/series/test_apply.py | 1 + pandas/tests/series/test_combine_concat.py | 3 +- pandas/tests/series/test_constructors.py | 55 ++++++++++++++++++---- 9 files changed, 96 insertions(+), 65 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6b932cd004904..1a08a1353a605 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -99,6 +99,7 @@ Other API Changes - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) - ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) +- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). 
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) @@ -242,5 +243,6 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - diff --git a/pandas/core/base.py b/pandas/core/base.py index ae92b62ce1d11..72acd0052202b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -874,9 +874,8 @@ def _map_values(self, mapper, na_action=None): # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples - from pandas import Series, Index - index = Index(mapper, tupleize_cols=False) - mapper = Series(mapper, index=index) + from pandas import Series + mapper = Series(mapper) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10f9022e2666b..2bf3afe47d007 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2822,27 +2822,6 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer - _index_shared_docs['_get_values_from_dict'] = """ - Return the values of the input dictionary in the order the keys are - in the index. np.nan is returned for index values not in the - dictionary. 
- - Parameters - ---------- - data : dict - The dictionary from which to extract the values - - Returns - ------- - np.array - - """ - - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - return lib.fast_multiget(data, self.values, - default=np.nan) - def _maybe_promote(self, other): # A hack, but it works from pandas.core.indexes.datetimes import DatetimeIndex diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c15727c247e1e..5f543ab6e510d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -700,14 +700,6 @@ def __rsub__(self, other): def _add_delta(self, other): return NotImplemented - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - if len(data): - return np.array([data.get(i, np.nan) - for i in self.asobject.values]) - - return np.array([np.nan]) - def _add_delta_td(self, other): # add a delta of a timedeltalike # return the i8 result view diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 196c881f97526..ee6263a9f0aad 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1457,17 +1457,6 @@ def get_value_maybe_box(self, series, key): key, tz=self.tz) return _maybe_box(self, values, series, key) - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - if len(data): - # coerce back to datetime objects for lookup - data = com._dict_compat(data) - return lib.fast_multiget(data, - self.asobject.values, - default=np.nan) - - return np.array([np.nan]) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/core/series.py b/pandas/core/series.py index bff7c21ad69b1..5d0e6907a6595 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -42,7 +42,6 @@ _default_index, _asarray_tuplesafe, _values_from_object, - _try_sort, _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, @@ -198,18 +197,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data.reindex(index, copy=copy) data = data._data elif isinstance(data, dict): - if index is None: - if isinstance(data, OrderedDict): - index = Index(data) - else: - index = Index(_try_sort(data)) - - try: - data = index._get_values_from_dict(data) - except TypeError: - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False elif isinstance(data, SingleBlockManager): if index is None: index = data.index @@ -257,6 +247,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None, self.name = name self._set_axis(0, index, fastpath=True) + def _init_dict(self, data, index=None, dtype=None): + """ + Derive the "_data" and "index" attributes of a new Series from a + dictionary input. 
+ + Parameters + ---------- + data : dict or dict-like + Data used to populate the new Series + index : Index or index-like, default None + index for the new Series: if None, use dict keys + dtype : dtype, default None + dtype for the new Series: if None, infer from data + + Returns + ------- + _data : BlockManager for the new Series + index : index for the new Series + """ + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] + # raises KeyError), so we iterate the entire dict, and align + if data: + keys, values = zip(*compat.iteritems(data)) + else: + keys, values = [], [] + + # Input is now list-like, so rely on "standard" construction: + s = Series(values, index=keys, dtype=dtype) + + # Now we just make sure the order is respected, if any + if index is not None: + s = s.reindex(index, copy=False) + elif not isinstance(data, OrderedDict): + try: + s = s.sort_index() + except TypeError: + pass + return s._data, s.index + @classmethod def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False): diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index fe21ba569ae99..cafe6a34720be 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -422,6 +422,7 @@ def test_map_dict_with_tuple_keys(self): converted to a multi-index, preventing tuple values from being mapped properly. """ + # GH 18496 df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 71ac00975af03..6cf60e818c845 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self): # categorical assert pd.concat([Series(dtype='category'), Series(dtype='category')]).dtype == 'category' - assert pd.concat([Series(dtype='category'), + # GH 18515 + assert pd.concat([Series(np.array([]), dtype='category'), Series(dtype='float64')]).dtype == 'float64' assert pd.concat([Series(dtype='category'), Series(dtype='object')]).dtype == 'object' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ccc04da3299fe..a57385a9cf690 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -4,6 +4,7 @@ import pytest from datetime import datetime, timedelta +from collections import OrderedDict from numpy import nan import numpy as np @@ -79,17 +80,42 @@ def test_constructor(self): m = MultiIndex.from_arrays([[1, 2], [3, 4]]) pytest.raises(NotImplementedError, Series, m) - def test_constructor_empty(self): + @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) + def test_constructor_empty(self, input_class): empty = Series() - empty2 = Series([]) + empty2 = Series(input_class()) - # the are Index() and RangeIndex() which don't compare type equal + # these are Index() and RangeIndex() which don't compare type equal # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) - empty = Series(index=lrange(10)) - empty2 = Series(np.nan, index=lrange(10)) - assert_series_equal(empty, empty2) + # With explicit dtype: + empty = Series(dtype='float64') + empty2 = Series(input_class(), dtype='float64') + assert_series_equal(empty, empty2, check_index_type=False) + + # GH 18515 : with dtype=category: + empty = Series(dtype='category') + empty2 = 
Series(input_class(), dtype='category')
+        assert_series_equal(empty, empty2, check_index_type=False)
+
+        if input_class is not list:
+            # With index:
+            empty = Series(index=lrange(10))
+            empty2 = Series(input_class(), index=lrange(10))
+            assert_series_equal(empty, empty2)
+
+            # With index and dtype float64:
+            empty = Series(np.nan, index=lrange(10))
+            empty2 = Series(input_class(), index=lrange(10), dtype='float64')
+            assert_series_equal(empty, empty2)
+
+    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
+    def test_constructor_nan(self, input_arg):
+        empty = Series(dtype='float64', index=lrange(10))
+        empty2 = Series(input_arg, index=lrange(10))
+
+        assert_series_equal(empty, empty2, check_index_type=False)
 
     def test_constructor_series(self):
         index1 = ['d', 'b', 'a', 'c']
@@ -625,6 +651,21 @@ def test_constructor_dict(self):
         expected.iloc[1] = 1
         assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18480
+        d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
+        result = Series(d).sort_values()
+        expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
+        assert_series_equal(result, expected)
+
+        # MultiIndex:
+        d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
+        result = Series(d).sort_values()
+        expected = Series(['a', 'b', 'c'],
+                          index=Index([(1, 1), (2, np.nan), (3, value)]))
+        assert_series_equal(result, expected)
+
     def test_constructor_dict_datetime64_index(self):
         # GH 9456
 
@@ -658,8 +699,6 @@ def test_constructor_tuple_of_tuples(self):
         s = Series(data)
         assert tuple(s) == data
 
-    @pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with '
-                              'NaN keys')
     def test_constructor_dict_of_tuples(self):
         data = {(1, 2): 3, (None, 5): 6}

From d163de70c93547035579870e2ae9008cb3640b45 Mon Sep 17 00:00:00 2001
From: Nicholas Ursa
Date: Fri, 1 Dec 2017 14:15:22 -0500
Subject: [PATCH 74/98] BLD Added --strict and -r sxX to test scripts (#18598)

---
 test.bat      | 2 +-
 test.sh       | 2 +-
 test_fast.bat | 2 +-
 test_fast.sh  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test.bat b/test.bat
index 2424f62b8dbfe..e07c84f257a69 100644
--- a/test.bat
+++ b/test.bat
@@ -1,3 +1,3 @@
 :: test on windows
 
-pytest --strict --skip-slow --skip-network pandas -n 2 %*
+pytest --skip-slow --skip-network pandas -n 2 -r sxX --strict %*
diff --git a/test.sh b/test.sh
index 23c7ff52d2ce9..1255a39816f78 100755
--- a/test.sh
+++ b/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 command -v coverage >/dev/null && coverage erase
 command -v python-coverage >/dev/null && python-coverage erase
-pytest pandas --cov=pandas
+pytest pandas --cov=pandas -r sxX --strict
diff --git a/test_fast.bat b/test_fast.bat
index 17dc54b580137..81f30dd310e28 100644
--- a/test_fast.bat
+++ b/test_fast.bat
@@ -1,3 +1,3 @@
 :: test on windows
 set PYTHONHASHSEED=314159265
-pytest --skip-slow --skip-network -m "not single" -n 4 pandas
+pytest --skip-slow --skip-network -m "not single" -n 4 -r sxX --strict pandas
diff --git a/test_fast.sh b/test_fast.sh
index 9b984156a796c..1fb55e581d292 100755
--- a/test_fast.sh
+++ b/test_fast.sh
@@ -5,4 +5,4 @@
 #  https://github.com/pytest-dev/pytest/issues/1075
 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
 
-pytest pandas --skip-slow --skip-network -m "not single" -n 4 "$@"
+pytest pandas --skip-slow --skip-network -m "not single" -n 4 -r sxX --strict "$@"

From e1ba19a1fea96726f57415669b57316ba060bc1e Mon Sep 17 00:00:00 2001
From: Jeff Reback Date: Sat, 2 Dec 2017 12:34:36 -0500 Subject: [PATCH 75/98] API: empty map should not infer (#18517) closes #18509 --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/base.py | 18 +----------------- pandas/tests/indexes/common.py | 17 ++++++++++++++--- pandas/tests/indexes/datetimelike.py | 3 ++- pandas/tests/indexes/test_base.py | 4 +--- pandas/tests/series/test_apply.py | 8 ++++++++ pandas/tests/test_resample.py | 24 +++++++++++++----------- pandas/util/testing.py | 3 ++- 8 files changed, 42 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1a08a1353a605..09b504cac5ed4 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -107,7 +107,7 @@ Other API Changes - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) - `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) -- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`). +- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). - :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) 
regardless of the type passed in (:issue:`18295`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2bf3afe47d007..94e9947155c41 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2897,25 +2897,9 @@ def map(self, mapper, na_action=None): names=names) attributes['copy'] = False - - # we want to try to return our original dtype - # ints infer to integer, but if we have - # uints, would prefer to return these - if is_unsigned_integer_dtype(self.dtype): - inferred = lib.infer_dtype(new_values) - if inferred == 'integer': - attributes['dtype'] = self.dtype - - elif not new_values.size: + if not new_values.size: # empty attributes['dtype'] = self.dtype - elif isna(new_values).all(): - # all nan - inferred = lib.infer_dtype(self) - if inferred in ['datetime', 'datetime64', - 'timedelta', 'timedelta64', - 'period']: - new_values = [libts.NaT] * len(new_values) return Index(new_values, **attributes) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 99bdaf02e25ff..c1ee18526cc01 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1009,7 +1009,13 @@ def test_searchsorted_monotonic(self, indices): def test_map(self): # callable index = self.create_index() - expected = index + + # we don't infer UInt64 + if isinstance(index, pd.UInt64Index): + expected = index.astype('int64') + else: + expected = index + result = index.map(lambda x: x) tm.assert_index_equal(result, expected) @@ -1024,9 +1030,14 @@ def test_map_dictlike(self, mapper): if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)): pytest.skip("skipping tests for {}".format(type(index))) - expected = index - identity = mapper(index.values, index) + + # we don't infer to UInt64 for a dict + if isinstance(index, pd.UInt64Index) and isinstance(identity, dict): + expected = index.astype('int64') + else: + expected = index + result = index.map(identity) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index a01c60a47c0f9..ad76d17c93c41 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -1,5 +1,6 @@ """ generic datetimelike tests """ import pytest +import numpy as np import pandas as pd from .common import Base import pandas.util.testing as tm @@ -72,6 +73,6 @@ def test_map_dictlike(self, mapper): # empty map; these map to np.nan because we cannot know # to re-infer things - expected = pd.Index([pd.NaT] * len(self.index)) + expected = pd.Index([np.nan] * len(self.index)) result = self.index.map(mapper([], [])) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0b782e600822a..9ef7a43b2193a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -885,9 +885,7 @@ def test_map_dictlike(self, mapper): expected = Index(np.arange(len(index), 0, -1)) # to match proper result coercion for uints - if name == 'uintIndex': - expected = expected.astype('uint64') - elif name == 'empty': + if name == 'empty': expected = Index([]) result = index.map(mapper(expected, index)) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index cafe6a34720be..8899ab585d6cb 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -377,6 +377,14 @@ def test_map(self): exp = Series([np.nan, 'B', 'C', 'D']) tm.assert_series_equal(a.map(c), exp) + 
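# A minimal sketch of the behavior change exercised by test_map_empty below
# (assuming a pandas build that includes this patch, and a hypothetical
# series): mapping through an empty mapper no longer re-infers a
# datetime-like dtype for the all-NaN result, so every element maps to NaN
# and the original index is kept.
import pandas as pd

s = pd.Series(pd.date_range('2017-01-01', periods=3))
result = s.map({})                    # empty dict: every value maps to NaN
assert result.isnull().all()          # all-NaN, not coerced back to NaT
assert result.index.equals(s.index)   # original index is preserved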
@pytest.mark.parametrize("index", tm.all_index_generator(10)) + def test_map_empty(self, index): + s = Series(index) + result = s.map({}) + + expected = pd.Series(np.nan, index=s.index) + tm.assert_series_equal(result, expected) + def test_map_compat(self): # related GH 8024 s = Series([True, True, False], index=[1, 2, 3]) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index b0154f6db7022..29dd99ac9c655 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -816,21 +816,23 @@ def test_resample_empty_dataframe(self): # test size for GH13212 (currently stays as df) - def test_resample_empty_dtypes(self): + @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) + @pytest.mark.parametrize( + "dtype", + [np.float, np.int, np.object, 'datetime64[ns]']) + def test_resample_empty_dtypes(self, index, dtype): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. (GH #10228) - for index in tm.all_timeseries_index_generator(0): - for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): - for how in downsample_methods + upsample_methods: - empty_series = Series([], index, dtype) - try: - getattr(empty_series.resample('d'), how)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass + for how in downsample_methods + upsample_methods: + empty_series = Series([], index, dtype) + try: + getattr(empty_series.resample('d'), how)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass def test_resample_loffset_arg_type(self): # GH 13218, 15002 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index ff6fa8ae717d3..850c42a011958 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1695,7 +1695,8 @@ def all_index_generator(k=10): """ all_make_index_funcs = [makeIntIndex, makeFloatIndex, makeStringIndex, makeUnicodeIndex, makeDateIndex, makePeriodIndex, - makeTimedeltaIndex, makeBoolIndex, + makeTimedeltaIndex, makeBoolIndex, makeRangeIndex, + makeIntervalIndex, makeCategoricalIndex] for make_index_func in all_make_index_funcs: yield make_index_func(k=k) From 0e168188811677f9de72a6a5b97253e551b6b04a Mon Sep 17 00:00:00 2001 From: fjdiod Date: Sat, 2 Dec 2017 20:43:01 +0300 Subject: [PATCH 76/98] BUG: Unwanted conversion from timedelta to float (#18493) (#18586) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/internals.py | 3 ++- pandas/tests/indexing/test_timedelta.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index bebfd0ab50e90..3d4850b334ff9 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -74,6 +74,7 @@ Indexing - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`) - Bug in ``Index.putmask`` when an invalid mask passed (:issue:`18368`) +- Bug in masked assignment of a ``timedelta64[ns]`` dtype ``Series``, incorrectly coerced to float (:issue:`18493`) - I/O diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4f25a19d437ca..1d1d71be16c00 100644 --- 
a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1956,7 +1956,8 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) - return isinstance(element, (timedelta, np.timedelta64)) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) def fillna(self, value, **kwargs): diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 32609362e49af..3ad3b771b2ab2 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,6 +2,7 @@ import pandas as pd from pandas.util import testing as tm +import numpy as np class TestTimedeltaIndexing(object): @@ -47,3 +48,23 @@ def test_string_indexing(self): expected = df.iloc[0] sliced = df.loc['0 days'] tm.assert_series_equal(sliced, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_masked_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series[series == series[0]] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_listlike_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series.iloc[0] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) From 7a3f81a34507a38e4a69fbf8d80f2ca95fc610dc Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sun, 3 Dec 2017 15:26:50 +0000 Subject: [PATCH 77/98] ENH: Better error message if usecols doesn't match columns (#17310) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/io/parsers.py | 42 ++++++++++++++++++++++++++++--- pandas/tests/io/parser/usecols.py | 21 ++++++++++------ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 09b504cac5ed4..af580403aa4b2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -76,6 +76,7 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) +- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fe50b551ea948..83b1d8ec1a070 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1141,6 +1141,38 @@ def _evaluate_usecols(usecols, names): return usecols +def _validate_usecols_names(usecols, names): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. 
Error message will list them. + """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: {missing}".format(missing=missing) + ) + + return usecols + + def _validate_skipfooter_arg(skipfooter): """ Validate the 'skipfooter' parameter. @@ -1753,14 +1785,14 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.orig_names) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - raise ValueError("Usecols do not match names.") + _validate_usecols_names(usecols, self.names) self._set_noconvert_columns() @@ -2532,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key): raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] + for col in self.usecols: if isinstance(col, string_types): - col_indices.append(usecols_key.index(col)) + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + _validate_usecols_names(self.usecols, usecols_key) else: col_indices.append(col) else: diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index f582e5037ca07..0fa53e6288bda 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self): # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - if self.engine == 'c': - msg = 'Usecols do not match names' - else: - msg = 'is not in list' + msg = ( + "Usecols do not match columns, " + "columns expected but not found: {missing}" + ) usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) @@ -492,11 +492,16 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): + self.read_csv(StringIO(data), usecols=usecols) + + usecols = ['a', 'b', 'f', 'g'] + with tm.assert_raises_regex( + ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -520,9 +525,9 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), names=names, usecols=usecols) From 8172565a75d235d0c8af888a746a53741a797244 Mon Sep 17 00:00:00 2001 From: Chris Mazzullo Date: Sun, 3 Dec 2017 11:36:30 -0500 Subject: [PATCH 78/98] BUG: GH17464 MultiIndex now raises an error when levels aren't unique, tests changed (#17971) --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/indexes/multi.py | 9 +++++++-- pandas/tests/groupby/test_functional.py | 13 +++++++------ 
pandas/tests/indexes/test_multi.py | 23 ++++++++++++++++++----- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index af580403aa4b2..9b9b5b63b63fa 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -114,6 +114,7 @@ Other API Changes - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). - :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) +- When created with non-unique levels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4f9a3a2b1aaa6..0cbb87c65ccd7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -177,8 +177,8 @@ def _verify_integrity(self, labels=None, levels=None): Raises ------ ValueError - * if length of levels and labels don't match or any label would - exceed level bounds + If the lengths of levels and labels don't match, if any label would + exceed level bounds, or if any level contains duplicate values. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -198,6 +198,11 @@ def _verify_integrity(self, labels=None, levels=None): " level (%d). NOTE: this index is in an" " inconsistent state" % (i, label.max(), len(level))) + if not level.is_unique: + raise ValueError("Level values must be unique: {values} on " + "level {level}".format( + values=[value for value in level], + level=i)) @property def levels(self): diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py index bc13d51c4f4f6..b9718663570bd 100644 --- a/pandas/tests/groupby/test_functional.py +++ b/pandas/tests/groupby/test_functional.py @@ -52,10 +52,10 @@ def test_frame_describe_multikey(self): desc_groups = [] for col in self.tsframe: group = grouped[col].describe() - group_col = pd.MultiIndex([[col] * len(group.columns), - group.columns], - [[0] * len(group.columns), - range(len(group.columns))]) + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + labels=[[0] * len(group.columns), range(len(group.columns))]) group = pd.DataFrame(group.values, columns=group_col, index=group.index) @@ -67,8 +67,9 @@ def test_frame_describe_multikey(self): 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() expected = self.tsframe.describe().T - expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], - [range(4), range(len(expected.index))]) + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + labels=[[0, 0, 1, 1], range(len(expected.index))]) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5c2a0254b072b..a2c0a75e21f43 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1618,7 +1618,9 @@ def test_is_(self): # shouldn't change assert mi2.is_(mi) mi4 = mi3.view() - 
mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True) + + # GH 17464 - Remove duplicate MultiIndex levels + mi4.set_levels([lrange(10), lrange(10)], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() mi5.set_levels(mi5.levels, inplace=True) @@ -2450,13 +2452,11 @@ def test_isna_behavior(self): pd.isna(self.index) def test_level_setting_resets_attributes(self): - ind = MultiIndex.from_arrays([ + ind = pd.MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) assert ind.is_monotonic - ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], - inplace=True) - + ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. assert not ind.is_monotonic @@ -3083,3 +3083,16 @@ def test_million_record_attribute_error(self): with tm.assert_raises_regex(AttributeError, "'Series' object has no attribute 'foo'"): df['a'].foo() + + def test_duplicate_multiindex_labels(self): + # GH 17464 + # Make sure that a MultiIndex with duplicate levels throws a ValueError + with pytest.raises(ValueError): + ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) + + # And that using set_levels with duplicate levels fails + ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], + [1, 2, 1, 2, 3]]) + with pytest.raises(ValueError): + ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], + inplace=True) From dc5403f2fe75bea0201ad6af347f788a9adeebb8 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sun, 3 Dec 2017 18:00:22 +0000 Subject: [PATCH 79/98] CLN: Move period.pyx to tslibs/period.pyx (#18555) --- pandas/_libs/index.pyx | 3 +- pandas/_libs/{ => tslibs}/period.pyx | 29 +++++++++----------- pandas/compat/pickle_compat.py | 6 +++- pandas/core/indexes/accessors.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 5 ++-- pandas/core/indexes/period.py | 10 +++---- pandas/core/resample.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/scalar/test_period.py | 3 +- pandas/tests/scalar/test_timestamp.py | 3 +- pandas/tests/test_resample.py | 2 +- setup.py | 6 ++-- 14 files changed, 40 insertions(+), 37 deletions(-) rename pandas/_libs/{ => tslibs}/period.pyx (98%) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 03596d7d091e0..fa2e1271f4649 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,7 +17,8 @@ from tslibs.conversion cimport maybe_datetimelike_to_i8 from hashtable cimport HashTable -from pandas._libs import algos, period as periodlib, hashtable as _hash +from pandas._libs import algos, hashtable as _hash +from pandas._libs.tslibs import period as periodlib from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta, date diff --git a/pandas/_libs/period.pyx b/pandas/_libs/tslibs/period.pyx similarity index 98% rename from pandas/_libs/period.pyx rename to pandas/_libs/tslibs/period.pyx index b95632b5b0eff..cf73257caf227 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -21,26 +21,23 @@ from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT # import datetime C API PyDateTime_IMPORT -from tslibs.np_datetime cimport (pandas_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, - is_leapyear) - +from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, + dt64_to_dtstruct, is_leapyear) cimport util from util cimport is_period_object, is_string_object, INT32_MIN -from missing cimport 
is_null_datetimelike -from pandas._libs.tslib import Timestamp -from tslibs.timezones cimport ( - is_utc, is_tzlocal, get_utcoffset, get_dst_info) -from tslibs.timedeltas cimport delta_to_nanoseconds - -from tslibs.parsing import (parse_time_string, NAT_SENTINEL, - _get_rule_month, _MONTH_NUMBERS) -from tslibs.frequencies cimport get_freq_code -from tslibs.resolution import resolution, Resolution -from tslibs.nattype import nat_strings, NaT, iNaT -from tslibs.nattype cimport _nat_scalar_rules, NPY_NAT +from pandas._libs.missing cimport is_null_datetimelike +from timestamps import Timestamp +from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info +from timedeltas cimport delta_to_nanoseconds + +from parsing import (parse_time_string, NAT_SENTINEL, + _get_rule_month, _MONTH_NUMBERS) +from frequencies cimport get_freq_code +from resolution import resolution, Resolution +from nattype import nat_strings, NaT, iNaT +from nattype cimport _nat_scalar_rules, NPY_NAT from pandas.tseries import offsets from pandas.tseries import frequencies diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 8015642919611..07b34961ce25d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -74,7 +74,11 @@ def load_reduce(self): ('pandas._libs.sparse', 'BlockIndex'), ('pandas.tslib', 'Timestamp'): ('pandas._libs.tslib', 'Timestamp'), - ('pandas._period', 'Period'): ('pandas._libs.period', 'Period'), + + # 18543 moving period + ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'), + ('pandas._libs.period', 'Period'): + ('pandas._libs.tslibs.period', 'Period'), # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype ('pandas.tslib', '__nat_unpickle'): diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2176338574304..27e1006c23174 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -14,7 +14,7 @@ from pandas.core.accessor import PandasDelegate from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex -from pandas._libs.period import IncompatibleFrequency # noqa +from pandas._libs.tslibs.period import IncompatibleFrequency # noqa from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.algorithms import take_1d diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5f543ab6e510d..c2fc983c983a6 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -25,7 +25,7 @@ import pandas.io.formats.printing as printing from pandas._libs import lib, iNaT, NaT -from pandas._libs.period import Period +from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds from pandas.core.indexes.base import Index, _index_shared_docs diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ee6263a9f0aad..1578ae924c9bb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -54,8 +54,9 @@ from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, - Timestamp, period as libperiod) -from pandas._libs.tslibs import timezones, conversion, fields + Timestamp) +from pandas._libs.tslibs import (timezones, conversion, fields, + period as libperiod) # -------- some conversion wrapper functions diff --git 
a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 6535eee386e8b..ac9b511606066 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -31,12 +31,12 @@ import pandas.tseries.offsets as offsets from pandas._libs.lib import infer_dtype -from pandas._libs import tslib, period, index as libindex -from pandas._libs.period import (Period, IncompatibleFrequency, - get_period_field_arr, _validate_end_alias, - _quarter_to_myear) +from pandas._libs import tslib, index as libindex +from pandas._libs.tslibs.period import (Period, IncompatibleFrequency, + get_period_field_arr, + _validate_end_alias, _quarter_to_myear) from pandas._libs.tslibs.fields import isleapyear_arr -from pandas._libs.tslibs import resolution +from pandas._libs.tslibs import resolution, period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds from pandas.core.base import _shared_docs diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bd441a8248841..9f5439b68558b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,7 +24,7 @@ from pandas._libs import lib, tslib from pandas._libs.lib import Timestamp -from pandas._libs.period import IncompatibleFrequency +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.util._decorators import Appender from pandas.core.generic import _shared_docs diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index d20ed66c06ce9..6cb4226dffc5a 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -9,7 +9,7 @@ from pandas._libs import tslib, tslibs from pandas import (PeriodIndex, Series, DatetimeIndex, period_range, Period) -from pandas._libs import period as libperiod +from pandas._libs.tslibs import period as libperiod class TestGetItem(object): diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 074678164e6f9..3774111f44fb2 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,7 +6,7 @@ import pandas.core.indexes.period as period from pandas.compat import lrange from pandas.tseries.frequencies import get_freq, MONTHS -from pandas._libs.period import period_ordinal, period_asfreq +from pandas._libs.tslibs.period import period_ordinal, period_asfreq from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, date_range, to_datetime, period_range) diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index 3bd4a28b7767d..eb6363689cca0 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -10,7 +10,8 @@ from pandas.compat import text_type, iteritems from pandas.compat.numpy import np_datetime64_compat -from pandas._libs import tslib, period as libperiod +from pandas._libs import tslib +from pandas._libs.tslibs import period as libperiod from pandas._libs.tslibs.parsing import DateParseError from pandas import Period, Timestamp, offsets from pandas._libs.tslibs.resolution import DAYS, _MONTHS as MONTHS diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 992f211229441..9d97057569580 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -16,9 +16,8 @@ import pandas.util.testing as tm from pandas.tseries import offsets, frequencies -from pandas._libs import period from pandas._libs.tslibs.timezones import 
get_timezone -from pandas._libs.tslibs import conversion +from pandas._libs.tslibs import conversion, period from pandas.compat import long, PY3 from pandas.util.testing import assert_series_equal diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 29dd99ac9c655..10c3c0ea507c1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -33,7 +33,7 @@ from pandas.core.indexes.timedeltas import timedelta_range, TimedeltaIndex from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, assert_index_equal) -from pandas._libs.period import IncompatibleFrequency +from pandas._libs.tslibs.period import IncompatibleFrequency bday = BDay() diff --git a/setup.py b/setup.py index 68e1319458a33..d43f8ec12b18a 100755 --- a/setup.py +++ b/setup.py @@ -331,7 +331,6 @@ class CheckSDist(sdist_class): _pyxfiles = ['pandas/_libs/lib.pyx', 'pandas/_libs/hashtable.pyx', 'pandas/_libs/tslib.pyx', - 'pandas/_libs/period.pyx', 'pandas/_libs/index.pyx', 'pandas/_libs/algos.pyx', 'pandas/_libs/join.pyx', @@ -344,6 +343,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/skiplist.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', + 'pandas/_libs/tslibs/period.pyx', 'pandas/_libs/tslibs/strptime.pyx', 'pandas/_libs/tslibs/np_datetime.pyx', 'pandas/_libs/tslibs/timedeltas.pyx', @@ -530,8 +530,8 @@ def pxd(name): 'pandas/_libs/src/numpy_helper.h'], 'sources': ['pandas/_libs/src/parser/tokenizer.c', 'pandas/_libs/src/parser/io.c']}, - '_libs.period': { - 'pyxfile': '_libs/period', + '_libs.tslibs.period': { + 'pyxfile': '_libs/tslibs/period', 'pxdfiles': ['_libs/src/util', '_libs/lib', '_libs/tslibs/timedeltas', From a9e47312697a933e6af495a8682ce73717cf0ff8 Mon Sep 17 00:00:00 2001 From: Grzegorz Konefał Date: Sun, 3 Dec 2017 10:38:24 -0500 Subject: [PATCH 80/98] BLD: since we already use setuptools, let's remove the optional logic… MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #18113 Author: Grzegorz Konefał Author: Krzysztof Chomski Closes #18448 from gkonefal-reef/GH18113 and squashes the following commits: 21cbe79 [Grzegorz Konefał] Comments applied 290b49c [Krzysztof Chomski] BLD: since we already use setuptools, let's remove the optional logic in setup.py (GH18113). --- doc/source/whatsnew/v0.22.0.txt | 1 + setup.py | 56 +++++++++------------------------ 2 files changed, 15 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 9b9b5b63b63fa..304ccd1f9350b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -115,6 +115,7 @@ Other API Changes - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). - :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) - When created with non-unique levels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) +- Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) .. 
_whatsnew_0220.deprecations: diff --git a/setup.py b/setup.py index d43f8ec12b18a..72f0774ffd44b 100755 --- a/setup.py +++ b/setup.py @@ -9,14 +9,18 @@ import os from os.path import join as pjoin +import pkg_resources import sys import shutil from distutils.version import LooseVersion +from setuptools import setup, Command # versioning import versioneer cmdclass = versioneer.get_cmdclass() +PY3 = sys.version_info[0] >= 3 + def is_platform_windows(): return sys.platform == 'win32' or sys.platform == 'cygwin' @@ -38,46 +42,18 @@ def is_platform_mac(): except ImportError: _CYTHON_INSTALLED = False -try: - import pkg_resources - from setuptools import setup, Command - _have_setuptools = True -except ImportError: - # no setuptools installed - from distutils.core import setup, Command - _have_setuptools = False -setuptools_kwargs = {} min_numpy_ver = '1.9.0' -if sys.version_info[0] >= 3: +setuptools_kwargs = { + 'install_requires': [ + 'python-dateutil >= 2' if PY3 else 'python-dateutil', + 'pytz >= 2011k', + 'numpy >= %s' % min_numpy_ver, + ], + 'setup_requires': ['numpy >= %s' % min_numpy_ver], + 'zip_safe': False, +} - setuptools_kwargs = {'zip_safe': False, - 'install_requires': ['python-dateutil >= 2', - 'pytz >= 2011k', - 'numpy >= %s' % min_numpy_ver], - 'setup_requires': ['numpy >= %s' % min_numpy_ver]} - if not _have_setuptools: - sys.exit("need setuptools/distribute for Py3k" - "\n$ pip install distribute") - -else: - setuptools_kwargs = { - 'install_requires': ['python-dateutil', - 'pytz >= 2011k', - 'numpy >= %s' % min_numpy_ver], - 'setup_requires': ['numpy >= %s' % min_numpy_ver], - 'zip_safe': False, - } - - if not _have_setuptools: - try: - import numpy # noqa:F401 - import dateutil # noqa:F401 - setuptools_kwargs = {} - except ImportError: - sys.exit("install requires: 'python-dateutil < 2','numpy'." - " use pip or easy_install." - "\n $ pip install 'python-dateutil < 2' 'numpy'") from distutils.extension import Extension # noqa:E402 from distutils.command.build import build # noqa:E402 @@ -695,7 +671,7 @@ def pxd(name): # ---------------------------------------------------------------------- # ujson -if suffix == '.pyx' and 'setuptools' in sys.modules: +if suffix == '.pyx': # undo dumb setuptools bug clobbering .pyx sources back to .c for ext in extensions: if ext.sources[0].endswith(('.c', '.cpp')): @@ -729,10 +705,6 @@ def pxd(name): sources=['pandas/util/move.c']) extensions.append(_move_ext) - -if _have_setuptools: - setuptools_kwargs["test_suite"] = "nose.collector" - # The build cache system does string matching below this point. # if you change something, be careful. From bd9a3e06621e0b2bff3a7f4a6bf68c886b0db4c3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 3 Dec 2017 11:18:34 -0500 Subject: [PATCH 81/98] STYLE: conform setup.py to use .format string formatting --- setup.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 72f0774ffd44b..f01689d0cf4ac 100755 --- a/setup.py +++ b/setup.py @@ -48,9 +48,9 @@ def is_platform_mac(): 'install_requires': [ 'python-dateutil >= 2' if PY3 else 'python-dateutil', 'pytz >= 2011k', - 'numpy >= %s' % min_numpy_ver, + 'numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver), ], - 'setup_requires': ['numpy >= %s' % min_numpy_ver], + 'setup_requires': ['numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver)], 'zip_safe': False, } @@ -342,8 +342,9 @@ def run(self): else: for pyxfile in self._pyxfiles: cfile = pyxfile[:-3] + 'c' - msg = "C-source file '%s' not found." 
% (cfile) +\ - " Run 'setup.py cython' before sdist." + msg = ("C-source file '{source}' not found.\n" + "Run 'setup.py cython' before sdist.".format( + source=cfile)) assert os.path.isfile(cfile), msg sdist_class.run(self) @@ -359,10 +360,10 @@ def check_cython_extensions(self, extensions): for src in ext.sources: if not os.path.exists(src): print("{}: -> [{}]".format(ext.name, ext.sources)) - raise Exception("""Cython-generated file '%s' not found. + raise Exception("""Cython-generated file '{src}' not found. Cython is required to compile pandas from a development branch. Please install Cython or download a release package of pandas. - """ % src) + """.format(src=src)) def build_extensions(self): self.check_cython_extensions(self.extensions) @@ -623,7 +624,7 @@ def pxd(name): include = data.get('include', common_include) - obj = Extension('pandas.%s' % name, + obj = Extension('pandas.{name}'.format(name=name), sources=sources, depends=data.get('depends', []), include_dirs=include, From 6e56195fcb021e7c53301722804087ef2a83bc39 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 3 Dec 2017 14:57:18 -0800 Subject: [PATCH 82/98] Cleanup cimports (#18556) --- pandas/_libs/src/inference.pyx | 2 +- pandas/_libs/tslib.pxd | 3 --- pandas/_libs/tslib.pyx | 10 ++++------ setup.py | 4 +++- 4 files changed, 8 insertions(+), 11 deletions(-) delete mode 100644 pandas/_libs/tslib.pxd diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 6e964077dd56e..cb192fcced318 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -3,7 +3,7 @@ from decimal import Decimal cimport util cimport cython from tslibs.nattype import NaT -from tslib cimport convert_to_tsobject +from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone from datetime import datetime, timedelta diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd deleted file mode 100644 index b74cf5b79c4cb..0000000000000 --- a/pandas/_libs/tslib.pxd +++ /dev/null @@ -1,3 +0,0 @@ -from numpy cimport ndarray, int64_t - -from tslibs.conversion cimport convert_to_tsobject diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 6d8cf39114f6f..020ac812e1c20 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -5,8 +5,9 @@ # distutils: define_macros=CYTHON_TRACE_NOGIL=0 cimport numpy as np -from numpy cimport int64_t, import_array, ndarray, float64_t +from numpy cimport int64_t, ndarray, float64_t import numpy as np +np.import_array() from cpython cimport PyTypeObject, PyFloat_Check @@ -35,18 +36,15 @@ from tslibs.np_datetime cimport (check_dts_bounds, dayofweek, is_leapyear) from tslibs.np_datetime import OutOfBoundsDatetime -from .tslibs.parsing import parse_datetime_string +from tslibs.parsing import parse_datetime_string cimport cython +from cython cimport Py_ssize_t -import warnings import pytz UTC = pytz.utc -# initialize numpy -import_array() - from tslibs.timedeltas cimport cast_from_unit from tslibs.timedeltas import Timedelta diff --git a/setup.py b/setup.py index f01689d0cf4ac..ba948abf4302b 100755 --- a/setup.py +++ b/setup.py @@ -494,7 +494,9 @@ def pxd(name): 'depends': _pxi_dep['join']}, '_libs.lib': { 'pyxfile': '_libs/lib', - 'pxdfiles': ['_libs/src/util', '_libs/missing'], + 'pxdfiles': ['_libs/src/util', + '_libs/missing', + '_libs/tslibs/conversion'], 'depends': lib_depends + tseries_depends}, '_libs.missing': { 'pyxfile': '_libs/missing', From 
aa5b6e6b53637ca861d7b6cbae2878e7c4687199 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 4 Dec 2017 10:49:24 +0000 Subject: [PATCH 83/98] DEPR: deprecate .asobject property (#18572) --- asv_bench/benchmarks/index_object.py | 2 +- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/_libs/algos_common_helper.pxi.in | 4 +- pandas/core/accessor.py | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/dtypes/concat.py | 4 +- pandas/core/frame.py | 4 +- pandas/core/indexes/datetimelike.py | 21 ++++++++--- pandas/core/indexes/datetimes.py | 6 +-- pandas/core/indexes/period.py | 14 +++---- pandas/core/indexes/timedeltas.py | 4 +- pandas/core/indexing.py | 3 +- pandas/core/internals.py | 2 +- pandas/core/ops.py | 2 +- pandas/core/series.py | 16 +++++--- pandas/io/formats/format.py | 2 +- pandas/plotting/_converter.py | 3 +- pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/indexes/datetimelike.py | 7 ++++ pandas/tests/indexes/datetimes/test_ops.py | 41 +++++++++++---------- pandas/tests/indexes/period/test_ops.py | 39 ++++++++++---------- pandas/tests/indexes/period/test_period.py | 10 ++--- pandas/tests/indexes/test_base.py | 6 +-- pandas/tests/indexes/timedeltas/test_ops.py | 37 ++++++++++--------- pandas/tests/plotting/test_datetimelike.py | 12 +++--- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/series/test_datetime_values.py | 4 +- pandas/tests/series/test_dtypes.py | 6 +++ pandas/tests/series/test_timeseries.py | 4 +- pandas/tests/test_base.py | 6 +-- pandas/tests/tseries/test_frequencies.py | 6 +-- pandas/tests/tseries/test_timezones.py | 8 ++-- 32 files changed, 158 insertions(+), 127 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 7697c3b9d3840..a607168ea0457 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -12,7 +12,7 @@ def setup(self): if (self.rng.dtype == object): self.idx_rng = self.rng.view(Index) else: - self.idx_rng = self.rng.asobject + self.idx_rng = self.rng.astype(object) self.idx_rng2 = self.idx_rng[:(-1)] # other datetime diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 304ccd1f9350b..77503b4653437 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -124,7 +124,7 @@ Deprecations - ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). - ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). -- +- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimedeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) .. 
_whatsnew_0220.prior_deprecations: diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 336dd77ea9a89..0d3f6664da9e3 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -552,8 +552,8 @@ cpdef ensure_object(object arr): return arr else: return arr.astype(np.object_) - elif hasattr(arr, 'asobject'): - return arr.asobject + elif hasattr(arr, '_box_values_as_index'): + return arr._box_values_as_index() else: return np.array(arr, dtype=np.object_) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 7a2da9655cc4a..53ead5e8f74a3 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -10,7 +10,7 @@ class DirNamesMixin(object): _accessors = frozenset([]) - _deprecations = frozenset([]) + _deprecations = frozenset(['asobject']) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9f712a1cf039b..0ceb8966fd3c8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -369,7 +369,7 @@ def unique(values): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - uniques = uniques.asobject.values + uniques = uniques.astype(object).values return uniques diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index c1ba018adbcec..cd98064dee86e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -401,7 +401,7 @@ def convert_to_pydatetime(x, axis): # if dtype is of datetimetz or timezone if x.dtype.kind == _NS_DTYPE.kind: if getattr(x, 'tz', None) is not None: - x = x.asobject.values + x = x.astype(object).values else: shape = x.shape x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), @@ -479,7 +479,7 @@ def _concat_index_asobject(to_concat, name=None): """ klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex - to_concat = [x.asobject if isinstance(x, klasses) else x + to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] from pandas import Index diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ff42e39d9dbdd..90d1ab8d0e242 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3330,7 +3330,7 @@ class max type def _maybe_casted_values(index, labels=None): if isinstance(index, PeriodIndex): - values = index.asobject.values + values = index.astype(object).values elif isinstance(index, DatetimeIndex) and index.tz is not None: values = index else: @@ -5077,7 +5077,7 @@ def applymap(self, func): def infer(x): if x.empty: return lib.map_infer(x, func) - return lib.map_infer(x.asobject, func) + return lib.map_infer(x.astype(object).values, func) return self.apply(infer) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c2fc983c983a6..5c96e4eeff69d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -242,6 +242,13 @@ def _box_values(self, values): """ return lib.map_infer(values, self._box_func) + def _box_values_as_index(self): + """ + return object Index which contains boxed values + """ + from pandas.core.index import Index + return Index(self._box_values(self.asi8), name=self.name, dtype=object) + def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) @@ -360,7 +367,7 @@ def map(self, f): raise TypeError('The map function must return an Index object') return result except 
Exception: - return self.asobject.map(f) + return self.astype(object).map(f) def sort_values(self, return_indexer=False, ascending=True): """ @@ -424,13 +431,15 @@ def _isnan(self): @property def asobject(self): - """ + """DEPRECATED: Use ``astype(object)`` instead. + return object Index which contains boxed values *this is an internal non-public method* """ - from pandas.core.index import Index - return Index(self._box_values(self.asi8), name=self.name, dtype=object) + warnings.warn("'asobject' is deprecated. Use 'astype(object)'" + " instead", FutureWarning, stacklevel=2) + return self.astype(object) def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance, box=False)) @@ -468,7 +477,7 @@ def tolist(self): """ return a list of the underlying data """ - return list(self.asobject) + return list(self.astype(object)) def min(self, axis=None, *args, **kwargs): """ @@ -746,7 +755,7 @@ def isin(self, values): try: values = type(self)(values) except ValueError: - return self.asobject.isin(values) + return self.astype(object).isin(values) return algorithms.isin(self.asi8, values.asi8) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1578ae924c9bb..55c6063b74286 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -907,7 +907,7 @@ def to_datetime(self, dayfirst=False): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') @@ -1679,7 +1679,7 @@ def time(self): Returns numpy array of datetime.time. The time part of the Timestamps. """ return self._maybe_mask_results(libalgos.arrmap_object( - self.asobject.values, + self.astype(object).values, lambda x: np.nan if x is libts.NaT else x.time())) @property @@ -1789,7 +1789,7 @@ def insert(self, loc, item): # fall back to object index if isinstance(item, compat.string_types): - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) raise TypeError( "cannot insert DatetimeIndex with incompatible label") diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ac9b511606066..cb0c4a9ce2a86 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -418,7 +418,7 @@ def _int64index(self): @property def values(self): - return self.asobject.values + return self.astype(object).values @property def _values(self): @@ -428,7 +428,7 @@ def __array__(self, dtype=None): if is_integer_dtype(dtype): return self.asi8 else: - return self.asobject.values + return self.astype(object).values def __array_wrap__(self, result, context=None): """ @@ -476,7 +476,7 @@ def _to_embed(self, keep_tz=False, dtype=None): if dtype is not None: return self.astype(dtype)._to_embed(keep_tz=keep_tz) - return self.asobject.values + return self.astype(object).values @property def _formatter_func(self): @@ -506,7 +506,7 @@ def asof_locs(self, where, mask): def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_integer_dtype(dtype): if copy: return self._int64index.copy() @@ -656,7 +656,7 @@ def end_time(self): def _mpl_repr(self): # how to represent ourselves to matplotlib - return self.asobject.values + return self.astype(object).values def to_timestamp(self, freq=None, how='start'): """ @@ 
-971,7 +971,7 @@ def _convert_tolerance(self, tolerance, target): def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8)) @@ -1018,7 +1018,7 @@ def _apply_meta(self, rawarr): def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - values = self.asobject.values + values = self.astype(object).values if date_format: formatter = lambda dt: dt.strftime(date_format) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 97f6ca2e5d642..77e05ccf4db22 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -482,7 +482,7 @@ def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): - return self.asobject + return self._box_values_as_index() elif is_timedelta64_ns_dtype(dtype): if copy is True: return self.copy() @@ -883,7 +883,7 @@ def insert(self, loc, item): # fall back to object index if isinstance(item, compat.string_types): - return self.asobject.insert(loc, item) + return self.astype(object).insert(loc, item) raise TypeError( "cannot insert TimedeltaIndex with incompatible label") diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 90733fa6d68d1..c6642657e386e 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -405,7 +405,8 @@ def _setitem_with_indexer(self, indexer, value): new_values = np.concatenate([self.obj._values, new_values]) except TypeError: - new_values = np.concatenate([self.obj.asobject, + as_obj = self.obj.astype(object) + new_values = np.concatenate([as_obj, new_values]) self.obj._data = self.obj._constructor( new_values, index=new_index, name=self.obj.name)._data diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1d1d71be16c00..e5db5679c43f6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2191,7 +2191,7 @@ def _try_coerce_args(self, values, other): if isinstance(other, ABCDatetimeIndex): # to store DatetimeTZBlock as object - other = other.asobject.values + other = other.astype(object).values return values, False, other, False diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 934570602c99d..2fb0cbb14c225 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -850,7 +850,7 @@ def wrapper(self, other, axis=None): # tested in test_nat_comparisons # (pandas.tests.series.test_operators.TestSeriesOperators) return self._constructor(na_op(self.values, - other.asobject.values), + other.astype(object).values), index=self.index) return self._constructor(na_op(self.values, np.asarray(other)), diff --git a/pandas/core/series.py b/pandas/core/series.py index 5d0e6907a6595..15550de16e5d2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -149,7 +149,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _metadata = ['name'] _accessors = frozenset(['dt', 'cat', 'str']) _deprecations = generic.NDFrame._deprecations | frozenset( - ['sortlevel', 'reshape', 'get_value', 'set_value', 'from_csv']) + ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value', + 'from_csv']) _allow_index_ops = True def __init__(self, data=None, index=None, dtype=None, name=None, @@ -449,12 +450,15 @@ def get_values(self): @property def asobject(self): - """ + """DEPRECATED: Use ``astype(object)`` instead. 
+ return object Series which contains boxed values *this is an internal non-public method* """ - return self._data.asobject + warnings.warn("'asobject' is deprecated. Use 'astype(object)'" + " instead", FutureWarning, stacklevel=2) + return self.astype(object).values # ops def ravel(self, order='C'): @@ -1322,7 +1326,7 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.asobject.values + result = result.astype(object).values return result @@ -2549,7 +2553,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): if is_extension_type(self.dtype): mapped = self._values.map(f) else: - values = self.asobject + values = self.astype(object).values mapped = lib.map_infer(values, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): @@ -3125,7 +3129,7 @@ def _sanitize_index(data, index, copy=False): if isinstance(data, ABCIndexClass) and not copy: pass elif isinstance(data, PeriodIndex): - data = data.asobject + data = data.astype(object).values elif isinstance(data, DatetimeIndex): data = data._to_embed(keep_tz=True) elif isinstance(data, np.ndarray): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index e116635c99264..8f25eb3af70cd 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -2231,7 +2231,7 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self): """ we by definition have a TZ """ - values = self.values.asobject + values = self.values.astype(object) is_dates_only = _is_dates_only(values) formatter = (self.formatter or _get_format_datetime64(is_dates_only, diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 9daee918b9f30..2ced5f653825d 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -363,7 +363,8 @@ def __call__(self): tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) - all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).asobject + all_dates = date_range(start=st, end=ed, + freq=freq, tz=tz).astype(object) try: if len(all_dates) > 0: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b6090a13c8d38..876e0ea7ea0b3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -501,8 +501,8 @@ def test_constructor_period(self): assert df['b'].dtype == 'object' # list of periods - df = pd.DataFrame({'a': a.asobject.tolist(), - 'b': b.asobject.tolist()}) + df = pd.DataFrame({'a': a.astype(object).tolist(), + 'b': b.astype(object).tolist()}) assert df['a'].dtype == 'object' assert df['b'].dtype == 'object' diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index ad76d17c93c41..7d01a2a70145d 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -76,3 +76,10 @@ def test_map_dictlike(self, mapper): expected = pd.Index([np.nan] * len(self.index)) result = self.index.map(mapper([], [])) tm.assert_index_equal(result, expected) + + def test_asobject_deprecated(self): + # GH18572 + d = self.create_index() + with tm.assert_produces_warning(FutureWarning): + i = d.asobject + assert isinstance(i, pd.Index) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 0db26652eb191..41d0dd38cd5f6 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ 
b/pandas/tests/indexes/datetimes/test_ops.py @@ -51,7 +51,7 @@ def test_ops_properties_basic(self): assert s.day == 10 pytest.raises(AttributeError, lambda: s.weekday) - def test_asobject_tolist(self): + def test_astype_object(self): idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') expected_list = [Timestamp('2013-01-31'), @@ -59,7 +59,7 @@ def test_asobject_tolist(self): Timestamp('2013-03-31'), Timestamp('2013-04-30')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -74,7 +74,7 @@ def test_asobject_tolist(self): Timestamp('2013-03-31', tz='Asia/Tokyo'), Timestamp('2013-04-30', tz='Asia/Tokyo')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -87,7 +87,7 @@ def test_asobject_tolist(self): Timestamp('2013-01-02'), pd.NaT, Timestamp('2013-01-04')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -389,26 +389,27 @@ def test_comp_nat(self): pd.Timestamp('2011-01-03')]) right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = rhs == lhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = lhs != rhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -636,9 +637,9 @@ def test_equals(self): idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) @@ -646,8 +647,8 @@ def test_equals(self): tz='US/Pacific') assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not idx.asobject.equals(idx2) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) @@ -656,8 +657,8 @@ def test_equals(self): 
tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.asobject) - assert not idx.asobject.equals(idx3) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 1d77de0d2d8f3..a78bc6fc577b8 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -27,7 +27,7 @@ def test_ops_properties(self): self.check_ops_properties(PeriodIndex._object_ops, f) self.check_ops_properties(PeriodIndex._bool_ops, f) - def test_asobject_tolist(self): + def test_astype_object(self): idx = pd.period_range(start='2013-01-01', periods=4, freq='M', name='idx') expected_list = [pd.Period('2013-01-31', freq='M'), @@ -35,7 +35,7 @@ def test_asobject_tolist(self): pd.Period('2013-03-31', freq='M'), pd.Period('2013-04-30', freq='M')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -49,7 +49,7 @@ def test_asobject_tolist(self): pd.Period('NaT', freq='D'), pd.Period('2013-01-04', freq='D')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -290,26 +290,27 @@ def test_comp_nat(self): pd.Period('2011-01-03')]) right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = lhs == rhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = lhs != rhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -614,9 +615,9 @@ def test_equals(self): freq=freq) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) @@ -624,8 +625,8 @@ def test_equals(self): freq='H') assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not 
idx.asobject.equals(idx2) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) @@ -634,8 +635,8 @@ def test_equals(self): tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.asobject) - assert not idx.asobject.equals(idx3) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 13a63de22169e..48378233dd638 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -24,7 +24,7 @@ def setup_method(self, method): def create_index(self): return period_range('20130101', periods=5, freq='D') - def test_astype(self): + def test_astype_conversion(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -380,23 +380,23 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - def test_asobject_like(self): + def test_astype_object(self): idx = pd.PeriodIndex([], freq='M') exp = np.array([], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], dtype=object) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) def test_is_(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9ef7a43b2193a..72b312f29a793 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -122,7 +122,7 @@ def test_constructor_from_index_datetimetz(self): tm.assert_index_equal(result, idx) assert result.tz == idx.tz - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) assert result.tz == idx.tz @@ -131,7 +131,7 @@ def test_constructor_from_index_timedelta(self): result = pd.Index(idx) tm.assert_index_equal(result, idx) - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) def test_constructor_from_index_period(self): @@ -139,7 +139,7 @@ def test_constructor_from_index_period(self): result = pd.Index(idx) tm.assert_index_equal(result, idx) - result = pd.Index(idx.asobject) + result = pd.Index(idx.astype(object)) tm.assert_index_equal(result, idx) def test_constructor_from_series_datetimetz(self): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 67238665a2e8a..fac3745ba4fb4 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -25,12 +25,12 @@ def test_ops_properties(self): self.check_ops_properties(TimedeltaIndex._field_ops, f) 
self.check_ops_properties(TimedeltaIndex._object_ops, f) - def test_asobject_tolist(self): + def test_astype_object(self): idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') expected_list = [Timedelta('1 days'), Timedelta('2 days'), Timedelta('3 days'), Timedelta('4 days')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -43,7 +43,7 @@ def test_asobject_tolist(self): expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT, Timedelta('4 days')] expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject + result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object tm.assert_index_equal(result, expected) @@ -217,26 +217,27 @@ def test_comp_nat(self): pd.Timedelta('3 days')]) right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) - for l, r in [(left, right), (left.asobject, right.asobject)]: - result = l == r + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = rhs == lhs expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) - result = l != r + result = rhs != lhs expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == r, expected) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(l != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != l, expected) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(l < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > l, expected) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_value_counts_unique(self): # GH 7735 @@ -473,18 +474,18 @@ def test_equals(self): idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) assert idx.equals(idx) assert idx.equals(idx.copy()) - assert idx.equals(idx.asobject) - assert idx.asobject.equals(idx) - assert idx.asobject.equals(idx.asobject) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.asobject) - assert not idx.asobject.equals(idx2) - assert not idx.asobject.equals(idx2.asobject) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.astype(object).equals(idx2.astype(object)) assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index f1a478581e730..8f237a7f810c3 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -272,7 +272,7 @@ def test_irreg_hf(self): _, ax = self.plt.subplots() df2 = df.copy() - df2.index = df.index.asobject + df2.index = df.index.astype(object) df2.plot(ax=ax) diffs = 
Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() assert (np.fabs(diffs[1:] - sec) < 1e-8).all() @@ -712,9 +712,9 @@ def test_mixed_freq_irregular_first(self): assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() - tm.assert_numpy_array_equal(x1, s2.index.asobject.values) + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) x2 = lines[1].get_xdata() - tm.assert_numpy_array_equal(x2, s1.index.asobject.values) + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_regular_first_df(self): # GH 9852 @@ -744,9 +744,9 @@ def test_mixed_freq_irregular_first_df(self): assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() - tm.assert_numpy_array_equal(x1, s2.index.asobject.values) + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) x2 = lines[1].get_xdata() - tm.assert_numpy_array_equal(x2, s1.index.asobject.values) + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_hf_first(self): idxh = date_range('1/1/1999', periods=365, freq='D') @@ -1019,7 +1019,7 @@ def test_irreg_dtypes(self): # np.datetime64 idx = date_range('1/1/2000', periods=10) - idx = idx[[0, 2, 5, 9]].asobject + idx = idx[[0, 2, 5, 9]].astype(object) df = DataFrame(np.random.randn(len(idx), 3), idx) _, ax = self.plt.subplots() _check_plot_works(df.plot, ax=ax) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a57385a9cf690..c814cade77e5c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -632,7 +632,7 @@ def test_constructor_periodindex(self): pi = period_range('20130101', periods=5, freq='D') s = Series(pi) - expected = Series(pi.asobject) + expected = Series(pi.astype(object)) assert_series_equal(s, expected) assert s.dtype == 'object' diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index e810eadd2dee9..b79d8def905af 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -228,7 +228,7 @@ def get_dir(s): results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) s = Series(period_range('20130101', periods=5, - freq='D', name='xxx').asobject) + freq='D', name='xxx').astype(object)) results = get_dir(s) tm.assert_almost_equal( results, list(sorted(set(ok_for_period + ok_for_period_methods)))) @@ -387,7 +387,7 @@ def test_sub_of_datetime_from_TimeSeries(self): assert result.dtype == 'timedelta64[ns]' def test_between(self): - s = Series(bdate_range('1/1/2000', periods=20).asobject) + s = Series(bdate_range('1/1/2000', periods=20).astype(object)) s[::2] = np.nan result = s[s.between(s[3], s[17])] diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index ad6d019b5287e..163950b75bc34 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -37,6 +37,12 @@ def test_astype(self, dtype): assert as_typed.dtype == dtype assert as_typed.name == s.name + def test_asobject_deprecated(self): + s = Series(np.random.randn(5), name='foo') + with tm.assert_produces_warning(FutureWarning): + o = s.asobject + assert isinstance(o, np.ndarray) + def test_dtype(self): assert self.ts.dtype == np.dtype('float64') diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index c1ef70bba8634..b0d0e2a51b5f4 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py 
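The deprecation test added above relies on pandas' internal ``tm.assert_produces_warning`` helper. For readers following along without the pandas test machinery, the same check can be written with only the standard library; a rough equivalent (simplified, without the helper's stacklevel validation, and run against the pandas of this series, where ``Series.asobject`` still exists):

    import warnings

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.randn(5), name='foo')

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')   # ensure the warning is not suppressed
        o = s.asobject                    # deprecated accessor under test

    assert any(issubclass(w.category, FutureWarning) for w in caught)
    assert isinstance(o, np.ndarray)
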
@@ -134,13 +134,13 @@ def test_shift_dst(self): assert res.dtype == 'datetime64[ns, US/Eastern]' res = s.shift(1) - exp_vals = [NaT] + dates.asobject.values.tolist()[:9] + exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] exp = Series(exp_vals) tm.assert_series_equal(res, exp) assert res.dtype == 'datetime64[ns, US/Eastern]' res = s.shift(-2) - exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT] + exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] exp = Series(exp_vals) tm.assert_series_equal(res, exp) assert res.dtype == 'datetime64[ns, US/Eastern]' diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 31f4ca146040e..df76390d7ce7a 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -437,7 +437,7 @@ def test_value_counts_unique_nunique(self): for r in result: assert isinstance(r, Timestamp) tm.assert_numpy_array_equal(result, - orig._values.asobject.values) + orig._values.astype(object).values) else: tm.assert_numpy_array_equal(result, orig.values) @@ -525,8 +525,8 @@ def test_value_counts_unique_nunique_null(self): Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan - tm.assert_numpy_array_equal(result[1:], - values[2:].asobject.values) + vals = values[2:].astype(object).values + tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 9666a4c154c63..beea6df086b72 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -720,15 +720,15 @@ def _check_generated_range(self, start, freq): def test_infer_freq(self): rng = period_range('1959Q2', '2009Q3', freq='Q') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-DEC' rng = period_range('1959Q2', '2009Q3', freq='Q-NOV') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-NOV' rng = period_range('1959Q2', '2009Q3', freq='Q-OCT') - rng = Index(rng.to_timestamp('D', how='e').asobject) + rng = Index(rng.to_timestamp('D', how='e').astype(object)) assert rng.inferred_freq == 'Q-OCT' def test_infer_freq_tz(self): diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index a01166daf6be1..5fd2089d234c1 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -688,7 +688,7 @@ def test_index_astype_asobject_tzinfos(self): # dates around a dst transition rng = date_range('2/13/2010', '5/6/2010', tz=self.tzstr('US/Eastern')) - objs = rng.asobject + objs = rng.astype(object) for i, x in enumerate(objs): exval = rng[i] assert x == exval @@ -1552,8 +1552,8 @@ def test_append_aware_naive(self): ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.asobject.append( - ts2.index.asobject)) + assert ts_result.index.equals(ts1.index.astype(object).append( + ts2.index.astype(object))) # mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') @@ -1561,7 +1561,7 @@ def test_append_aware_naive(self): ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.asobject.append( + assert 
ts_result.index.equals(ts1.index.astype(object).append( ts2.index)) def test_equal_join_ensure_utc(self): From 5bf948643aed7b034850bf35dd2da16817d07a02 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 4 Dec 2017 03:00:54 -0800 Subject: [PATCH 84/98] CLN: Remove SparseList from pandas API (#18621) Deprecated in 0.19.0. xref gh-14007. --- doc/source/sparse.rst | 9 -- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/sparse/api.py | 1 - pandas/core/sparse/list.py | 152 ------------------------------- pandas/tests/api/test_api.py | 2 +- pandas/tests/sparse/test_list.py | 111 ---------------------- pandas/util/testing.py | 7 -- 7 files changed, 2 insertions(+), 281 deletions(-) delete mode 100644 pandas/core/sparse/list.py delete mode 100644 pandas/tests/sparse/test_list.py diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 89efa7b4be3ee..2e224f103a95e 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -85,15 +85,6 @@ can be converted back to a regular ndarray by calling ``to_dense``: sparr.to_dense() -.. _sparse.list: - -SparseList ----------- - -The ``SparseList`` class has been deprecated and will be removed in a future version. -See the `docs of a previous version `__ -for documentation on ``SparseList``. - SparseIndex objects ------------------- diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 77503b4653437..e8f2823f32edd 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -136,6 +136,7 @@ Removal of prior version deprecations/changes - ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) - ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) - ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) +- The ``SparseList`` class has been removed (:issue:`14007`) .. _whatsnew_0220.performance: diff --git a/pandas/core/sparse/api.py b/pandas/core/sparse/api.py index f79bb4886da4b..85941e6923338 100644 --- a/pandas/core/sparse/api.py +++ b/pandas/core/sparse/api.py @@ -1,6 +1,5 @@ # pylint: disable=W0611 # flake8: noqa from pandas.core.sparse.array import SparseArray -from pandas.core.sparse.list import SparseList from pandas.core.sparse.series import SparseSeries from pandas.core.sparse.frame import SparseDataFrame diff --git a/pandas/core/sparse/list.py b/pandas/core/sparse/list.py deleted file mode 100644 index f3e64b7efc764..0000000000000 --- a/pandas/core/sparse/list.py +++ /dev/null @@ -1,152 +0,0 @@ -import warnings -import numpy as np -from pandas.core.base import PandasObject -from pandas.io.formats.printing import pprint_thing - -from pandas.core.dtypes.common import is_scalar -from pandas.core.sparse.array import SparseArray -from pandas.util._validators import validate_bool_kwarg -import pandas._libs.sparse as splib - - -class SparseList(PandasObject): - - """ - Data structure for accumulating data to be converted into a - SparseArray. 
Has similar API to the standard Python list - - Parameters - ---------- - data : scalar or array-like - fill_value : scalar, default NaN - """ - - def __init__(self, data=None, fill_value=np.nan): - - # see gh-13784 - warnings.warn("SparseList is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) - - self.fill_value = fill_value - self._chunks = [] - - if data is not None: - self.append(data) - - def __unicode__(self): - contents = '\n'.join(repr(c) for c in self._chunks) - return '{self}\n{contents}'.format(self=object.__repr__(self), - contents=pprint_thing(contents)) - - def __len__(self): - return sum(len(c) for c in self._chunks) - - def __getitem__(self, i): - if i < 0: - if i + len(self) < 0: # pragma: no cover - raise ValueError('{index} out of range'.format(index=i)) - i += len(self) - - passed = 0 - j = 0 - while i >= passed + len(self._chunks[j]): - passed += len(self._chunks[j]) - j += 1 - return self._chunks[j][i - passed] - - def __setitem__(self, i, value): - raise NotImplementedError - - @property - def nchunks(self): - return len(self._chunks) - - @property - def is_consolidated(self): - return self.nchunks == 1 - - def consolidate(self, inplace=True): - """ - Internally consolidate chunks of data - - Parameters - ---------- - inplace : boolean, default True - Modify the calling object instead of constructing a new one - - Returns - ------- - splist : SparseList - If inplace=False, new object, otherwise reference to existing - object - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if not inplace: - result = self.copy() - else: - result = self - - if result.is_consolidated: - return result - - result._consolidate_inplace() - return result - - def _consolidate_inplace(self): - new_values = np.concatenate([c.sp_values for c in self._chunks]) - new_index = _concat_sparse_indexes([c.sp_index for c in self._chunks]) - new_arr = SparseArray(new_values, sparse_index=new_index, - fill_value=self.fill_value) - self._chunks = [new_arr] - - def copy(self): - """ - Return copy of the list - - Returns - ------- - new_list : SparseList - """ - new_splist = SparseList(fill_value=self.fill_value) - new_splist._chunks = list(self._chunks) - return new_splist - - def to_array(self): - """ - Return SparseArray from data stored in the SparseList - - Returns - ------- - sparr : SparseArray - """ - self.consolidate(inplace=True) - return self._chunks[0] - - def append(self, value): - """ - Append element or array-like chunk of data to the SparseList - - Parameters - ---------- - value: scalar or array-like - """ - if is_scalar(value): - value = [value] - - sparr = SparseArray(value, fill_value=self.fill_value) - self._chunks.append(sparr) - self._consolidated = False - - -def _concat_sparse_indexes(indexes): - all_indices = [] - total_length = 0 - - for index in indexes: - # increment by offset - inds = index.to_int_index().indices + total_length - - all_indices.append(inds) - total_length += index.length - - return splib.IntIndex(total_length, np.concatenate(all_indices)) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 0d1ea1c775aeb..e47f1919faaf5 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -52,7 +52,7 @@ class TestPDApi(Base): # these are already deprecated; awaiting removal deprecated_classes = ['WidePanel', 'Panel4D', 'TimeGrouper', - 'SparseList', 'Expr', 'Term'] + 'Expr', 'Term'] # these should be deprecated in the future deprecated_classes_in_future = ['Panel'] diff --git 
a/pandas/tests/sparse/test_list.py b/pandas/tests/sparse/test_list.py deleted file mode 100644 index 6c721ca813a21..0000000000000 --- a/pandas/tests/sparse/test_list.py +++ /dev/null @@ -1,111 +0,0 @@ -from pandas.compat import range - -from numpy import nan -import numpy as np - -from pandas.core.sparse.api import SparseList, SparseArray -import pandas.util.testing as tm - - -class TestSparseList(object): - - def setup_method(self, method): - self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) - self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) - - def test_deprecation(self): - # see gh-13784 - with tm.assert_produces_warning(FutureWarning): - SparseList() - - def test_constructor(self): - with tm.assert_produces_warning(FutureWarning): - lst1 = SparseList(self.na_data[:5]) - with tm.assert_produces_warning(FutureWarning): - exp = SparseList() - - exp.append(self.na_data[:5]) - tm.assert_sp_list_equal(lst1, exp) - - def test_len(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - assert len(splist) == 5 - splist.append(arr[5]) - assert len(splist) == 6 - splist.append(arr[6:]) - assert len(splist) == 10 - - def test_append_na(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr)) - - def test_append_zero(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.zero_data - splist = SparseList(fill_value=0) - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - # list always produces int64, but SA constructor - # is platform dtype aware - sparr = splist.to_array() - exp = SparseArray(arr, fill_value=0) - tm.assert_sp_array_equal(sparr, exp, check_dtype=False) - - def test_consolidate(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = self.na_data - exp_sparr = SparseArray(arr) - - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - consol = splist.consolidate(inplace=False) - assert consol.nchunks == 1 - assert splist.nchunks == 3 - tm.assert_sp_array_equal(consol.to_array(), exp_sparr) - - splist.consolidate() - assert splist.nchunks == 1 - tm.assert_sp_array_equal(splist.to_array(), exp_sparr) - - def test_copy(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - arr = self.na_data - exp_sparr = SparseArray(arr) - - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - - cp = splist.copy() - cp.append(arr[6:]) - assert splist.nchunks == 2 - tm.assert_sp_array_equal(cp.to_array(), exp_sparr) - - def test_getitem(self): - with tm.assert_produces_warning(FutureWarning): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - for i in range(len(arr)): - tm.assert_almost_equal(splist[i], arr[i]) - tm.assert_almost_equal(splist[-i], arr[-i]) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 850c42a011958..9db09f23eb849 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1582,13 +1582,6 @@ def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, for col in right: assert (col in left) - -def assert_sp_list_equal(left, right): - assert isinstance(left, pd.SparseList) - assert isinstance(right, pd.SparseList) - - 
assert_sp_array_equal(left.to_array(), right.to_array()) - # ----------------------------------------------------------------------------- # Others From fe34b32b0014979eba6e8b6a714c5e2b76c1612b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Dec 2017 03:21:49 -0800 Subject: [PATCH 85/98] timestamp/timedelta test cleanup (#18619) --- pandas/tests/scalar/test_timedelta.py | 90 ++++--- pandas/tests/scalar/test_timestamp.py | 348 +++++++++++++------------- 2 files changed, 227 insertions(+), 211 deletions(-) diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 17c818779c76d..001f6c1fdbef4 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -15,6 +15,28 @@ class TestTimedeltaArithmetic(object): _multiprocess_can_split_ = True + def test_arithmetic_overflow(self): + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') + + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) + + def test_ops_error_str(self): + # GH 13624 + td = Timedelta('1 day') + + for left, right in [(td, 'a'), ('a', td)]: + + with pytest.raises(TypeError): + left + right + + with pytest.raises(TypeError): + left > right + + assert not left == right + assert left != right + def test_to_timedelta_on_nanoseconds(self): # GH 9273 result = Timedelta(nanoseconds=100) @@ -93,38 +115,53 @@ def test_ops_offsets(self): assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td - # TODO: Split by op, better name - def test_ops(self): + def test_unary_ops(self): td = Timedelta(10, unit='d') + + # __neg__, __pos__ assert -td == Timedelta(-10, unit='d') + assert -td == Timedelta('-10d') assert +td == Timedelta(10, unit='d') - assert td - td == Timedelta(0, unit='ns') + + # __abs__, __abs__(__neg__) + assert abs(td) == td + assert abs(-td) == td + assert abs(-td) == Timedelta('10d') + + def test_binary_ops_nat(self): + td = Timedelta(10, unit='d') + assert (td - pd.NaT) is pd.NaT - assert td + td == Timedelta(20, unit='d') assert (td + pd.NaT) is pd.NaT - assert td * 2 == Timedelta(20, unit='d') assert (td * pd.NaT) is pd.NaT - assert td / 2 == Timedelta(5, unit='d') - assert td // 2 == Timedelta(5, unit='d') - assert abs(td) == td - assert abs(-td) == td - assert td / td == 1 assert (td / pd.NaT) is np.nan assert (td // pd.NaT) is np.nan + def test_binary_ops_integers(self): + td = Timedelta(10, unit='d') + + assert td * 2 == Timedelta(20, unit='d') + assert td / 2 == Timedelta(5, unit='d') + assert td // 2 == Timedelta(5, unit='d') + # invert - assert -td == Timedelta('-10d') assert td * -1 == Timedelta('-10d') assert -1 * td == Timedelta('-10d') - assert abs(-td) == Timedelta('10d') - - # invalid multiply with another timedelta - pytest.raises(TypeError, lambda: td * td) # can't operate with integers pytest.raises(TypeError, lambda: td + 2) pytest.raises(TypeError, lambda: td - 2) + def test_binary_ops_with_timedelta(self): + td = Timedelta(10, unit='d') + + assert td - td == Timedelta(0, unit='ns') + assert td + td == Timedelta(20, unit='d') + assert td / td == 1 + + # invalid multiply with another timedelta + pytest.raises(TypeError, lambda: td * td) + class TestTimedeltas(object): _multiprocess_can_split_ = True @@ -733,14 +770,6 @@ def test_timedelta_arithmetic(self): tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) - def 
test_arithmetic_overflow(self): - - with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') - - with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) - def test_apply_to_timedelta(self): timedelta_NaT = pd.to_timedelta('NaT') @@ -803,18 +832,3 @@ def test_isoformat(self): result = Timedelta(minutes=1).isoformat() expected = 'P0DT0H1M0S' assert result == expected - - def test_ops_error_str(self): - # GH 13624 - td = Timedelta('1 day') - - for l, r in [(td, 'a'), ('a', td)]: - - with pytest.raises(TypeError): - l + r - - with pytest.raises(TypeError): - l > r - - assert not l == r - assert l != r diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 9d97057569580..dab508de335c4 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -45,6 +45,11 @@ def test_overflow_offset(self): with pytest.raises(OverflowError): stamp - offset + def test_delta_preserve_nanos(self): + val = Timestamp(long(1337299200000000123)) + result = val + timedelta(1) + assert result.nanosecond == val.nanosecond + class TestTimestampProperties(object): @@ -68,7 +73,7 @@ def test_properties_business(self): assert control.is_quarter_end -class TestTimestamp(object): +class TestTimestampConstructors(object): def test_constructor(self): base_str = '2014-07-01 09:00' @@ -290,6 +295,17 @@ def test_constructor_fromordinal(self): assert Timestamp('2000-01-01', tz='US/Eastern') == ts assert base.toordinal() == ts.toordinal() + # GH#3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + assert ts.to_pydatetime() == dt + + # with a tzinfo + stamp = Timestamp('2011-4-16', tz='US/Eastern') + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') + assert ts.to_pydatetime() == dt_tz + def test_constructor_offset_depr(self): # see gh-12160 with tm.assert_produces_warning(FutureWarning, @@ -320,6 +336,9 @@ def test_constructor_offset_depr_fromordinal(self): with tm.assert_raises_regex(TypeError, msg): Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + +class TestTimestamp(object): + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') @@ -335,10 +354,10 @@ def test_conversion(self): assert type(result) == type(expected) assert result.dtype == expected.dtype - def test_repr(self): - dates = ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001'] - + @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) + @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', + '2014-01-01 00:00:00.000000001']) + def test_repr(self, date, freq): # dateutil zone change (only matters for repr) if (dateutil.__version__ >= LooseVersion('2.3') and (dateutil.__version__ <= LooseVersion('2.4.0') or @@ -349,43 +368,40 @@ def test_repr(self): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles'] - freqs = ['D', 'M', 'S', 'N'] - - for date in dates: - for tz in timezones: - for freq in freqs: - - # avoid to match with timezone name - freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') - else: - tz_repr = tz - - date_only = Timestamp(date) - assert date in repr(date_only) - assert tz_repr not in repr(date_only) - assert freq_repr not in repr(date_only) - assert date_only == eval(repr(date_only)) - - date_tz = Timestamp(date, tz=tz) - assert date in repr(date_tz) - assert tz_repr in repr(date_tz) - 
assert freq_repr not in repr(date_tz) - assert date_tz == eval(repr(date_tz)) - - date_freq = Timestamp(date, freq=freq) - assert date in repr(date_freq) - assert tz_repr not in repr(date_freq) - assert freq_repr in repr(date_freq) - assert date_freq == eval(repr(date_freq)) - - date_tz_freq = Timestamp(date, tz=tz, freq=freq) - assert date in repr(date_tz_freq) - assert tz_repr in repr(date_tz_freq) - assert freq_repr in repr(date_tz_freq) - assert date_tz_freq == eval(repr(date_tz_freq)) + for tz in timezones: + # avoid to match with timezone name + freq_repr = "'{0}'".format(freq) + if tz.startswith('dateutil'): + tz_repr = tz.replace('dateutil', '') + else: + tz_repr = tz + + date_only = Timestamp(date) + assert date in repr(date_only) + assert tz_repr not in repr(date_only) + assert freq_repr not in repr(date_only) + assert date_only == eval(repr(date_only)) + + date_tz = Timestamp(date, tz=tz) + assert date in repr(date_tz) + assert tz_repr in repr(date_tz) + assert freq_repr not in repr(date_tz) + assert date_tz == eval(repr(date_tz)) + + date_freq = Timestamp(date, freq=freq) + assert date in repr(date_freq) + assert tz_repr not in repr(date_freq) + assert freq_repr in repr(date_freq) + assert date_freq == eval(repr(date_freq)) + + date_tz_freq = Timestamp(date, tz=tz, freq=freq) + assert date in repr(date_tz_freq) + assert tz_repr in repr(date_tz_freq) + assert freq_repr in repr(date_tz_freq) + assert date_tz_freq == eval(repr(date_tz_freq)) + + def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) @@ -396,6 +412,16 @@ def test_repr(self): 'pytz.FixedOffset(-240)') assert date_with_utc_offset == eval(expr) + def test_timestamp_repr_pre1900(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + assert iso8601 in result + def test_bounds_with_different_units(self): out_of_bounds_dates = ('1677-09-21', '2262-04-12', ) @@ -474,32 +500,34 @@ def test_tz_localize_errors_ambiguous(self): pytest.raises(AmbiguousTimeError, ts.tz_localize, 'US/Pacific', errors='coerce') - def test_tz_localize_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t) - localized = ts.tz_localize(tz) - assert localized == Timestamp(t, tz=tz) - - with pytest.raises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - assert reset == ts - assert reset.tzinfo is None - - def test_tz_convert_roundtrip(self): - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t, tz='UTC') - converted = ts.tz_convert(tz) - - reset = converted.tz_convert(None) - assert reset == Timestamp(t) - assert reset.tzinfo is None - assert reset == converted.tz_convert('UTC').tz_localize(None) + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + def test_tz_localize_roundtrip(self, tz): + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t) + localized = ts.tz_localize(tz) + assert localized == Timestamp(t, tz=tz) + + with 
pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + assert reset == ts + assert reset.tzinfo is None + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + def test_tz_convert_roundtrip(self, tz): + for t in ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']: + ts = Timestamp(t, tz='UTC') + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + assert reset == Timestamp(t) + assert reset.tzinfo is None + assert reset == converted.tz_convert('UTC').tz_localize(None) def test_barely_oob_dts(self): one_us = np.timedelta64(1).astype('timedelta64[us]') @@ -906,6 +934,51 @@ def test_roundtrip(self): assert result == Timestamp(str(base) + ".200005") assert result.microsecond == 5 + 200 * 1000 + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1): 5} + stamp = Timestamp(datetime(2011, 1, 1)) + assert d[stamp] == 5 + + @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']) + def test_is_leap_year(self, tz): + # GH 13727 + dt = Timestamp('2000-01-01 00:00:00', tz=tz) + assert dt.is_leap_year + assert isinstance(dt.is_leap_year, bool) + + dt = Timestamp('1999-01-01 00:00:00', tz=tz) + assert not dt.is_leap_year + + dt = Timestamp('2004-01-01 00:00:00', tz=tz) + assert dt.is_leap_year + + dt = Timestamp('2100-01-01 00:00:00', tz=tz) + assert not dt.is_leap_year + + def test_timestamp(self): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = Timestamp.now() + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') + utsc = tsc.tz_convert('UTC') + + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + if PY3: + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() + + +class TestTimestampComparison(object): def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = long(1337299200000000000) @@ -937,7 +1010,6 @@ def test_comparison(self): assert other >= val def test_compare_invalid(self): - # GH 8058 val = Timestamp('20130101 12:01:02') assert not val == 'foo' @@ -1028,16 +1100,6 @@ def test_cant_compare_tz_naive_w_aware_dateutil(self): assert not a == b.to_pydatetime() assert not a.to_pydatetime() == b - def test_delta_preserve_nanos(self): - val = Timestamp(long(1337299200000000123)) - result = val + timedelta(1) - assert result.nanosecond == val.nanosecond - - def test_hash_equivalent(self): - d = {datetime(2011, 1, 1): 5} - stamp = Timestamp(datetime(2011, 1, 1)) - assert d[stamp] == 5 - def test_timestamp_compare_scalars(self): # case where ndim == 0 lhs = np.datetime64(datetime(2013, 12, 6)) @@ -1098,43 +1160,20 @@ def test_timestamp_compare_series(self): result = right_f(Timestamp('nat'), s_nat) tm.assert_series_equal(result, expected) - def test_is_leap_year(self): - # GH 13727 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - dt = Timestamp('2000-01-01 00:00:00', tz=tz) - assert dt.is_leap_year - assert isinstance(dt.is_leap_year, bool) - - dt = Timestamp('1999-01-01 00:00:00', tz=tz) - assert not dt.is_leap_year - - dt = Timestamp('2004-01-01 00:00:00', tz=tz) - assert dt.is_leap_year - - dt = Timestamp('2100-01-01 00:00:00', tz=tz) - assert not dt.is_leap_year - - def test_timestamp(self): - # GH#17329 - 
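The comment relocated just below states the crux of ``test_timestamp``: a tz-naive ``Timestamp`` computes ``timestamp()`` from its stored value as if the wall time were UTC, whereas ``datetime.datetime.timestamp()`` interprets a naive value in the local timezone. A quick sketch of the pandas side, using ``pytz`` as the surrounding tests do:

    import pandas as pd
    from pytz import utc

    ts = pd.Timestamp('2014-10-11 11:00:01')   # tz-naive
    utc_ts = ts.replace(tzinfo=utc)            # same wall time, pinned to UTC

    # Both derive seconds-since-epoch from the same UTC wall time.
    assert ts.timestamp() == utc_ts.timestamp()
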
# tz-naive --> treat it as if it were UTC for purposes of timestamp() - ts = Timestamp.now() - uts = ts.replace(tzinfo=utc) - assert ts.timestamp() == uts.timestamp() - - tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') - utsc = tsc.tz_convert('UTC') - - # utsc is a different representation of the same time - assert tsc.timestamp() == utsc.timestamp() - - if PY3: - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + def test_timestamp_compare_with_early_datetime(self): + # e.g. datetime.min + stamp = Timestamp('2012-01-01') - # should agree with datetime.timestamp method - dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() + assert not stamp == datetime.min + assert not stamp == datetime(1600, 1, 1) + assert not stamp == datetime(2700, 1, 1) + assert stamp != datetime.min + assert stamp != datetime(1600, 1, 1) + assert stamp != datetime(2700, 1, 1) + assert stamp > datetime(1600, 1, 1) + assert stamp >= datetime(1600, 1, 1) + assert stamp < datetime(2700, 1, 1) + assert stamp <= datetime(2700, 1, 1) class TestTimestampNsOperations(object): @@ -1281,7 +1320,9 @@ def test_addition_subtraction_preserve_frequency(self): assert (timestamp_instance - timedelta64_instance).freq == original_freq - def test_resolution(self): + @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Eastern']) + def test_resolution(self, tz): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], @@ -1290,12 +1331,9 @@ def test_resolution(self): RESO_HR, RESO_MIN, RESO_SEC, RESO_MS, RESO_US]): - for tz in [None, 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Eastern']: - idx = date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) - result = period.resolution(idx.asi8, idx.tz) - assert result == expected + idx = date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) + result = period.resolution(idx.asi8, idx.tz) + assert result == expected class TestTimestampToJulianDate(object): @@ -1321,8 +1359,7 @@ def test_compare_hour13(self): assert r == 2451769.0416666666666666 -class TestTimeSeries(object): - +class TestTimestampConversion(object): def test_timestamp_to_datetime(self): stamp = Timestamp('20090415', tz='US/Eastern', freq='D') dtval = stamp.to_pydatetime() @@ -1350,47 +1387,25 @@ def test_timestamp_to_datetime_explicit_dateutil(self): assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo - def test_timestamp_date_out_of_range(self): - pytest.raises(ValueError, Timestamp, '1676-01-01') - pytest.raises(ValueError, Timestamp, '2263-01-01') - - def test_timestamp_repr(self): - # pre-1900 - stamp = Timestamp('1850-01-01', tz='US/Eastern') - repr(stamp) - - iso8601 = '1850-01-01 01:23:45.012345' - stamp = Timestamp(iso8601, tz='US/Eastern') - result = repr(stamp) - assert iso8601 in result - - def test_timestamp_from_ordinal(self): + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. 
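To make the precision comment above concrete: ``datetime.datetime`` carries microseconds at most, so the round trip through ``to_pydatetime()`` truncates any sub-microsecond part, and that is the only loss the test tolerates. A small illustration, assuming a pandas of this era (``to_pydatetime()`` warns when nonzero nanoseconds are discarded):

    import pandas as pd

    ts = pd.Timestamp('2014-01-01 00:00:00.000000001')   # 1 ns past midnight
    assert ts.nanosecond == 1

    back = pd.Timestamp(ts.to_pydatetime())   # emits a UserWarning, drops the ns
    assert back.nanosecond == 0
    assert ts.value - back.value == 1         # off by exactly the truncated 1 ns
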
+ exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == + Timestamp.max.value / 1000) - # GH 3042 - dt = datetime(2011, 4, 16, 0, 0) - ts = Timestamp.fromordinal(dt.toordinal()) - assert ts.to_pydatetime() == dt + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == + Timestamp.min.value / 1000) - # with a tzinfo - stamp = Timestamp('2011-4-16', tz='US/Eastern') - dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') - assert ts.to_pydatetime() == dt_tz - def test_timestamp_compare_with_early_datetime(self): - # e.g. datetime.min - stamp = Timestamp('2012-01-01') +class TestTimeSeries(object): - assert not stamp == datetime.min - assert not stamp == datetime(1600, 1, 1) - assert not stamp == datetime(2700, 1, 1) - assert stamp != datetime.min - assert stamp != datetime(1600, 1, 1) - assert stamp != datetime(2700, 1, 1) - assert stamp > datetime(1600, 1, 1) - assert stamp >= datetime(1600, 1, 1) - assert stamp < datetime(2700, 1, 1) - assert stamp <= datetime(2700, 1, 1) + def test_timestamp_date_out_of_range(self): + pytest.raises(ValueError, Timestamp, '1676-01-01') + pytest.raises(ValueError, Timestamp, '2263-01-01') def test_timestamp_equality(self): @@ -1483,16 +1498,3 @@ def test_min_valid(self): def test_max_valid(self): # Ensure that Timestamp.max is a valid Timestamp Timestamp(Timestamp.max) - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. - exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == - Timestamp.max.value / 1000) - - exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == - Timestamp.min.value / 1000) From 73ed6de17ca390418d23a5698cf4db78aa8b7b80 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Mon, 4 Dec 2017 11:27:29 +0000 Subject: [PATCH 86/98] DOC: Remove keep=False docs on nlargest/nsmallest (#18617) --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 90d1ab8d0e242..313c9ec872179 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3769,7 +3769,7 @@ def nlargest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -3804,7 +3804,7 @@ def nsmallest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. 
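The docstring fix above (repeated for ``Series`` just below) narrows ``keep`` to the two values ``nlargest``/``nsmallest`` actually accepted at the time; ``keep=False`` had been documented but raised an error if passed. What ``first`` versus ``last`` means when values tie, as a quick sketch:

    import pandas as pd

    s = pd.Series([10, 30, 30, 20], index=['a', 'b', 'c', 'd'])

    # Two rows tie for the top value; 'keep' decides which occurrence
    # wins when only one can be returned.
    assert list(s.nlargest(1, keep='first').index) == ['b']
    assert list(s.nlargest(1, keep='last').index) == ['c']
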
diff --git a/pandas/core/series.py b/pandas/core/series.py index 15550de16e5d2..19c84c34d7d1d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2062,7 +2062,7 @@ def nlargest(self, n=5, keep='first'): ---------- n : int Return this many descending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -2109,7 +2109,7 @@ def nsmallest(self, n=5, keep='first'): ---------- n : int Return this many ascending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. From 3e4e4b3bfc38651d728074df1eb4c42d3b033047 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 4 Dec 2017 06:35:44 -0500 Subject: [PATCH 87/98] DEPS: require updated python-dateutil, openpyxl (#18182) --- ci/environment-dev.yaml | 4 +- ci/requirements-2.7.build | 2 +- ci/requirements-2.7.run | 4 +- ci/requirements-2.7_COMPAT.build | 2 +- ci/requirements-2.7_COMPAT.run | 2 +- ci/requirements-2.7_LOCALE.run | 4 +- ci/requirements-optional-pip.txt | 4 +- ci/requirements_dev.txt | 4 +- conda.recipe/meta.yaml | 4 +- doc/source/install.rst | 8 +- doc/source/io.rst | 6 +- doc/source/whatsnew/v0.22.0.txt | 15 +- pandas/compat/__init__.py | 20 +- pandas/compat/openpyxl_compat.py | 35 --- pandas/io/excel.py | 197 +--------------- pandas/tests/indexes/datetimes/test_tools.py | 4 +- pandas/tests/io/test_excel.py | 231 +------------------ pandas/tests/scalar/test_timestamp.py | 7 +- setup.py | 4 +- 19 files changed, 59 insertions(+), 498 deletions(-) delete mode 100644 pandas/compat/openpyxl_compat.py diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index c3d3d59f895c6..57748fef1a2e5 100644 --- a/ci/environment-dev.yaml +++ b/ci/environment-dev.yaml @@ -6,8 +6,8 @@ dependencies: - Cython - NumPy - moto - - pytest - - python-dateutil + - pytest>=3.1 + - python-dateutil>=2.5.0 - python=3 - pytz - setuptools diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index 415df13179fcf..d1cc61df0a77c 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,5 +1,5 @@ python=2.7* -python-dateutil=2.4.1 +python-dateutil=2.5.0 pytz=2013b nomkl numpy diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index a68e1d256058d..7c10b98fb6e14 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -1,11 +1,11 @@ -python-dateutil=2.4.1 +python-dateutil=2.5.0 pytz=2013b numpy xlwt=0.7.5 numexpr pytables matplotlib -openpyxl=1.6.2 +openpyxl=2.4.0 xlrd=0.9.2 sqlalchemy=0.9.6 lxml diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build index d9c932daa110b..aa767c1001196 100644 --- a/ci/requirements-2.7_COMPAT.build +++ b/ci/requirements-2.7_COMPAT.build @@ -1,5 +1,5 @@ python=2.7* numpy=1.9.2 cython=0.23 -dateutil=1.5 +python-dateutil=2.5.0 pytz=2013b diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run index 39bf720140733..c3daed6e6e1da 100644 --- a/ci/requirements-2.7_COMPAT.run +++ b/ci/requirements-2.7_COMPAT.run @@ -1,5 +1,5 @@ numpy=1.9.2 -dateutil=1.5 +python-dateutil=2.5.0 pytz=2013b scipy=0.14.0 xlwt=0.7.5 diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run index 978bbf6a051c5..0a809a7dd6e5d 100644 --- a/ci/requirements-2.7_LOCALE.run +++ 
b/ci/requirements-2.7_LOCALE.run @@ -1,8 +1,8 @@ python-dateutil -pytz=2013b +pytz numpy=1.9.2 xlwt=0.7.5 -openpyxl=1.6.2 +openpyxl=2.4.0 xlsxwriter=0.5.2 xlrd=0.9.2 bottleneck=1.0.0 diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 06b22bd8f2c63..8d4421ba2b681 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -1,11 +1,13 @@ # This file was autogenerated by scripts/convert_deps.py -# Do not modify directlybeautifulsoup4 +# Do not modify directly +beautifulsoup4 blosc bottleneck fastparquet feather-format html5lib ipython +ipykernel jinja2 lxml matplotlib diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 2fb36b7cd70d8..e9840388203b1 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -3,8 +3,8 @@ Cython NumPy moto -pytest -python-dateutil +pytest>=3.1 +python-dateutil>=2.5.0 pytz setuptools sphinx \ No newline at end of file diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 2aee11772896f..8152af84228b8 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -16,13 +16,11 @@ requirements: - cython - numpy x.x - setuptools - - pytz - - python-dateutil run: - python - numpy x.x - - python-dateutil + - python-dateutil >=2.5.0 - pytz test: diff --git a/doc/source/install.rst b/doc/source/install.rst index 7c1fde119ceaa..ae89c64b6e91e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -200,8 +200,8 @@ Dependencies * `setuptools `__ * `NumPy `__: 1.9.0 or higher -* `python-dateutil `__: 1.5 or higher -* `pytz `__: Needed for time zone support +* `python-dateutil `__: 2.5.0 or higher +* `pytz `__ .. _install.recommended_dependencies: @@ -244,8 +244,8 @@ Optional Dependencies * For Excel I/O: * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) - * `openpyxl `__: openpyxl version 1.6.1 - or higher (but lower than 2.0.0), or version 2.2 or higher, for writing .xlsx files (xlrd >= 0.9.0) + * `openpyxl `__: openpyxl version 2.4.0 + for writing .xlsx files (xlrd >= 0.9.0) * `XlsxWriter `__: Alternative Excel writer * `Jinja2 `__: Template engine for conditional HTML formatting. diff --git a/doc/source/io.rst b/doc/source/io.rst index 2aeafd99f6e72..f96e33dbf9882 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2935,7 +2935,7 @@ Writing Excel Files to Memory +++++++++++++++++++++++++++++ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or -``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. Pandas also supports Openpyxl >= 2.2. +``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -2991,9 +2991,7 @@ files if `Xlsxwriter`_ is not available. To specify which writer you want to use, you can pass an engine keyword argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: -- ``openpyxl``: This includes stable support for Openpyxl from 1.6.1. However, - it is advised to use version 2.2 and higher, especially when working with - styles. 
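Between the requirements bumps above and the ``pandas/compat`` hunk below, the new floors are also enforced at import time; the compat change shows the real check for dateutil. As a standalone sketch of the fail-fast pattern it applies:

    from distutils.version import LooseVersion

    import dateutil

    # Fail at import with a clear message instead of breaking later
    # in harder-to-diagnose ways.
    if LooseVersion(dateutil.__version__) < LooseVersion('2.5'):
        raise ImportError('dateutil 2.5.0 is the minimum required version')
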
+- ``openpyxl``: version 2.4 or higher is required - ``xlsxwriter`` - ``xlwt`` diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index e8f2823f32edd..5e605ecb7d8d5 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -86,9 +86,22 @@ Backwards incompatible API changes - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) -- +.. _whatsnew_0220.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`15184`). +If installed, we now require: + +-----------------+-----------------+----------+ + | Package | Minimum Version | Required | + +=================+=================+==========+ + | python-dateutil | 2.5.0 | X | + +-----------------+-----------------+----------+ + | openpyxl | 2.4.0 | | + +-----------------+-----------------+----------+ diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index a615e098135a9..2deb29dabe764 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -396,25 +396,13 @@ def raise_with_traceback(exc, traceback=Ellipsis): If traceback is not passed, uses sys.exc_info() to get traceback.""" -# http://stackoverflow.com/questions/4126348 -# Thanks to @martineau at SO - +# dateutil minimum version import dateutil -if PY2 and LooseVersion(dateutil.__version__) == '2.0': - # dateutil brokenness - raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' - 'install version 1.5 or 2.1+!') - +if LooseVersion(dateutil.__version__) < '2.5': + raise ImportError('dateutil 2.5.0 is the minimum required version') from dateutil import parser as _date_parser -if LooseVersion(dateutil.__version__) < '2.0': - - @functools.wraps(_date_parser.parse) - def parse_date(timestr, *args, **kwargs): - timestr = bytes(timestr) - return _date_parser.parse(timestr, *args, **kwargs) -else: - parse_date = _date_parser.parse +parse_date = _date_parser.parse # https://github.com/pandas-dev/pandas/pull/9123 diff --git a/pandas/compat/openpyxl_compat.py b/pandas/compat/openpyxl_compat.py deleted file mode 100644 index 87cf52cf00fef..0000000000000 --- a/pandas/compat/openpyxl_compat.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Detect incompatible version of OpenPyXL - -GH7169 -""" - -from distutils.version import LooseVersion - -start_ver = '1.6.1' -stop_ver = '2.0.0' - - -def is_compat(major_ver=1): - """Detect whether the installed version of openpyxl is supported - - Parameters - ---------- - ver : int - 1 requests compatibility status among the 1.x.y series - 2 requests compatibility status of 2.0.0 and later - Returns - ------- - compat : bool - ``True`` if openpyxl is installed and is a compatible version. - ``False`` otherwise. 
- """ - import openpyxl - ver = LooseVersion(openpyxl.__version__) - if major_ver == 1: - return LooseVersion(start_ver) <= ver < LooseVersion(stop_ver) - elif major_ver == 2: - return LooseVersion(stop_ver) <= ver - else: - raise ValueError('cannot test for openpyxl compatibility with ver {0}' - .format(major_ver)) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index fec916dc52d20..882130bedcbf0 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -28,7 +28,6 @@ from pandas.core import config from pandas.io.formats.printing import pprint_thing import pandas.compat as compat -import pandas.compat.openpyxl_compat as openpyxl_compat from warnings import warn from distutils.version import LooseVersion from pandas.util._decorators import Appender, deprecate_kwarg @@ -185,22 +184,6 @@ def _get_default_writer(ext): def get_writer(engine_name): - if engine_name == 'openpyxl': - try: - import openpyxl - - # with version-less openpyxl engine - # make sure we make the intelligent choice for the user - if LooseVersion(openpyxl.__version__) < '2.0.0': - return _writers['openpyxl1'] - elif LooseVersion(openpyxl.__version__) < '2.2.0': - return _writers['openpyxl20'] - else: - return _writers['openpyxl22'] - except ImportError: - # fall through to normal exception handling below - pass - try: return _writers[engine_name] except KeyError: @@ -828,20 +811,15 @@ def close(self): return self.save() -class _Openpyxl1Writer(ExcelWriter): - engine = 'openpyxl1' +class _OpenpyxlWriter(ExcelWriter): + engine = 'openpyxl' supported_extensions = ('.xlsx', '.xlsm') - openpyxl_majorver = 1 def __init__(self, path, engine=None, **engine_kwargs): - if not openpyxl_compat.is_compat(major_ver=self.openpyxl_majorver): - raise ValueError('Installed openpyxl is not supported at this ' - 'time. Use {majorver}.x.y.' - .format(majorver=self.openpyxl_majorver)) # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super(_Openpyxl1Writer, self).__init__(path, **engine_kwargs) + super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs) # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -861,72 +839,6 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): - # Write the frame cells using openpyxl. 
- from openpyxl.cell import get_column_letter - - sheet_name = self._get_sheet_name(sheet_name) - - if sheet_name in self.sheets: - wks = self.sheets[sheet_name] - else: - wks = self.book.create_sheet() - wks.title = sheet_name - self.sheets[sheet_name] = wks - - for cell in cells: - colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks.cell("{col}{row}".format(col=colletter, - row=startrow + cell.row + 1)) - if (isinstance(cell.val, compat.string_types) and - xcell.data_type_for_value(cell.val) != xcell.TYPE_STRING): - xcell.set_value_explicit(cell.val) - else: - xcell.value = _conv_value(cell.val) - style = None - if cell.style: - style = self._convert_to_style(cell.style) - for field in style.__fields__: - xcell.style.__setattr__(field, - style.__getattribute__(field)) - - if isinstance(cell.val, datetime): - xcell.style.number_format.format_code = self.datetime_format - elif isinstance(cell.val, date): - xcell.style.number_format.format_code = self.date_format - - if cell.mergestart is not None and cell.mergeend is not None: - cletterstart = get_column_letter(startcol + cell.col + 1) - cletterend = get_column_letter(startcol + cell.mergeend + 1) - - wks.merge_cells('{start}{row}:{end}{mergestart}' - .format(start=cletterstart, - row=startrow + cell.row + 1, - end=cletterend, - mergestart=startrow + - cell.mergestart + 1)) - - # Excel requires that the format of the first cell in a merged - # range is repeated in the rest of the merged range. - if style: - first_row = startrow + cell.row + 1 - last_row = startrow + cell.mergestart + 1 - first_col = startcol + cell.col + 1 - last_col = startcol + cell.mergeend + 1 - - for row in range(first_row, last_row + 1): - for col in range(first_col, last_col + 1): - if row == first_row and col == first_col: - # Ignore first cell. It is already handled. - continue - colletter = get_column_letter(col) - xcell = wks.cell("{col}{row}" - .format(col=colletter, row=row)) - for field in style.__fields__: - xcell.style.__setattr__( - field, style.__getattribute__(field)) - @classmethod def _convert_to_style(cls, style_dict): """ @@ -948,88 +860,6 @@ def _convert_to_style(cls, style_dict): return xls_style - -register_writer(_Openpyxl1Writer) - - -class _OpenpyxlWriter(_Openpyxl1Writer): - engine = 'openpyxl' - - -register_writer(_OpenpyxlWriter) - - -class _Openpyxl20Writer(_Openpyxl1Writer): - """ - Note: Support for OpenPyxl v2 is currently EXPERIMENTAL (GH7565). - """ - engine = 'openpyxl20' - openpyxl_majorver = 2 - - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): - # Write the frame cells using openpyxl. 
- from openpyxl.cell import get_column_letter - - sheet_name = self._get_sheet_name(sheet_name) - - if sheet_name in self.sheets: - wks = self.sheets[sheet_name] - else: - wks = self.book.create_sheet() - wks.title = sheet_name - self.sheets[sheet_name] = wks - - for cell in cells: - colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks["{col}{row}" - .format(col=colletter, row=startrow + cell.row + 1)] - xcell.value = _conv_value(cell.val) - style_kwargs = {} - - # Apply format codes before cell.style to allow override - if isinstance(cell.val, datetime): - style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format': {'format_code': self.datetime_format}})) - elif isinstance(cell.val, date): - style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format': {'format_code': self.date_format}})) - - if cell.style: - style_kwargs.update(self._convert_to_style_kwargs(cell.style)) - - if style_kwargs: - xcell.style = xcell.style.copy(**style_kwargs) - - if cell.mergestart is not None and cell.mergeend is not None: - cletterstart = get_column_letter(startcol + cell.col + 1) - cletterend = get_column_letter(startcol + cell.mergeend + 1) - - wks.merge_cells('{start}{row}:{end}{mergestart}' - .format(start=cletterstart, - row=startrow + cell.row + 1, - end=cletterend, - mergestart=startrow + - cell.mergestart + 1)) - - # Excel requires that the format of the first cell in a merged - # range is repeated in the rest of the merged range. - if style_kwargs: - first_row = startrow + cell.row + 1 - last_row = startrow + cell.mergestart + 1 - first_col = startcol + cell.col + 1 - last_col = startcol + cell.mergeend + 1 - - for row in range(first_row, last_row + 1): - for col in range(first_col, last_col + 1): - if row == first_row and col == first_col: - # Ignore first cell. It is already handled. - continue - colletter = get_column_letter(col) - xcell = wks["{col}{row}" - .format(col=colletter, row=row)] - xcell.style = xcell.style.copy(**style_kwargs) - @classmethod def _convert_to_style_kwargs(cls, style_dict): """ @@ -1341,13 +1171,7 @@ def _convert_to_number_format(cls, number_format_dict): ------- number_format : str """ - try: - # >= 2.0.0 < 2.1.0 - from openpyxl.styles import NumberFormat - return NumberFormat(**number_format_dict) - except: - # >= 2.1.0 - return number_format_dict['format_code'] + return number_format_dict['format_code'] @classmethod def _convert_to_protection(cls, protection_dict): @@ -1367,17 +1191,6 @@ def _convert_to_protection(cls, protection_dict): return Protection(**protection_dict) - -register_writer(_Openpyxl20Writer) - - -class _Openpyxl22Writer(_Openpyxl20Writer): - """ - Note: Support for OpenPyxl v2.2 is currently EXPERIMENTAL (GH7565). - """ - engine = 'openpyxl22' - openpyxl_majorver = 2 - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None): # Write the frame cells using openpyxl. 
@@ -1443,7 +1256,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, setattr(xcell, k, v) -register_writer(_Openpyxl22Writer) +register_writer(_OpenpyxlWriter) class _XlwtWriter(ExcelWriter): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index a1287c3102b77..6c72e65b1021c 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1160,9 +1160,9 @@ class TestDatetimeParsingWrappers(object): @pytest.mark.parametrize('cache', [True, False]) def test_parsers(self, cache): + # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 - import dateutil - yearfirst = dateutil.__version__ >= LooseVersion('2.5.0') + yearfirst = True cases = {'2011-01-01': datetime(2011, 1, 1), '2Q2005': datetime(2005, 4, 1), diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index d33136a86faad..96117b3c21a9b 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1,6 +1,4 @@ # pylint: disable=E1101 -import functools -import operator import os import sys import warnings @@ -17,12 +15,12 @@ import pandas as pd import pandas.util.testing as tm from pandas import DataFrame, Index, MultiIndex -from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems +from pandas.compat import u, range, map, BytesIO, iteritems from pandas.core.config import set_option, get_option from pandas.io.common import URLError from pandas.io.excel import ( - ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _Openpyxl1Writer, - _Openpyxl20Writer, _Openpyxl22Writer, register_writer, _XlsxWriter + ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter, + register_writer, _XlsxWriter ) from pandas.io.formats.excel import ExcelFormatter from pandas.io.parsers import read_csv @@ -1926,207 +1924,10 @@ def test_path_localpath(self): tm.assert_frame_equal(df, result) -def raise_wrapper(major_ver): - def versioned_raise_wrapper(orig_method): - @functools.wraps(orig_method) - def wrapped(self, *args, **kwargs): - _skip_if_no_openpyxl() - if openpyxl_compat.is_compat(major_ver=major_ver): - orig_method(self, *args, **kwargs) - else: - msg = (r'Installed openpyxl is not supported at this ' - r'time\. 
Use.+') - with tm.assert_raises_regex(ValueError, msg): - orig_method(self, *args, **kwargs) - return wrapped - return versioned_raise_wrapper - - -def raise_on_incompat_version(major_ver): - def versioned_raise_on_incompat_version(cls): - methods = filter(operator.methodcaller( - 'startswith', 'test_'), dir(cls)) - for method in methods: - setattr(cls, method, raise_wrapper( - major_ver)(getattr(cls, method))) - return cls - return versioned_raise_on_incompat_version - - -@raise_on_incompat_version(1) class TestOpenpyxlTests(ExcelWriterBase): + engine_name = 'openpyxl' ext = '.xlsx' - engine_name = 'openpyxl1' - check_skip = staticmethod(lambda *args, **kwargs: None) - - def test_to_excel_styleconverter(self): - _skip_if_no_openpyxl() - if not openpyxl_compat.is_compat(major_ver=1): - pytest.skip('incompatible openpyxl version') - - import openpyxl - - hstyle = {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", "vertical": "top"}} - - xlsx_style = _Openpyxl1Writer._convert_to_style(hstyle) - assert xlsx_style.font.bold - assert (openpyxl.style.Border.BORDER_THIN == - xlsx_style.borders.top.border_style) - assert (openpyxl.style.Border.BORDER_THIN == - xlsx_style.borders.right.border_style) - assert (openpyxl.style.Border.BORDER_THIN == - xlsx_style.borders.bottom.border_style) - assert (openpyxl.style.Border.BORDER_THIN == - xlsx_style.borders.left.border_style) - assert (openpyxl.style.Alignment.HORIZONTAL_CENTER == - xlsx_style.alignment.horizontal) - assert (openpyxl.style.Alignment.VERTICAL_TOP == - xlsx_style.alignment.vertical) - - -def skip_openpyxl_gt21(cls): - """Skip test case if openpyxl >= 2.2""" - - @classmethod - def setup_class(cls): - _skip_if_no_openpyxl() - import openpyxl - ver = openpyxl.__version__ - if (not (LooseVersion(ver) >= LooseVersion('2.0.0') and - LooseVersion(ver) < LooseVersion('2.2.0'))): - pytest.skip("openpyxl %s >= 2.2" % str(ver)) - - cls.setup_class = setup_class - return cls - - -@raise_on_incompat_version(2) -@skip_openpyxl_gt21 -class TestOpenpyxl20Tests(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'openpyxl20' - check_skip = staticmethod(lambda *args, **kwargs: None) - - def test_to_excel_styleconverter(self): - import openpyxl - from openpyxl import styles - - hstyle = { - "font": { - "color": '00FF0000', - "bold": True, - }, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": { - "horizontal": "center", - "vertical": "top", - }, - "fill": { - "patternType": 'solid', - 'fgColor': { - 'rgb': '006666FF', - 'tint': 0.3, - }, - }, - "number_format": { - "format_code": "0.00" - }, - "protection": { - "locked": True, - "hidden": False, - }, - } - - font_color = styles.Color('00FF0000') - font = styles.Font(bold=True, color=font_color) - side = styles.Side(style=styles.borders.BORDER_THIN) - border = styles.Border(top=side, right=side, bottom=side, left=side) - alignment = styles.Alignment(horizontal='center', vertical='top') - fill_color = styles.Color(rgb='006666FF', tint=0.3) - fill = styles.PatternFill(patternType='solid', fgColor=fill_color) - - # ahh openpyxl API changes - ver = openpyxl.__version__ - if ver >= LooseVersion('2.0.0') and ver < LooseVersion('2.1.0'): - number_format = styles.NumberFormat(format_code='0.00') - else: - number_format = '0.00' # XXX: Only works with openpyxl-2.1.0 - - protection = styles.Protection(locked=True, hidden=False) - - kw = 
_Openpyxl20Writer._convert_to_style_kwargs(hstyle) - assert kw['font'] == font - assert kw['border'] == border - assert kw['alignment'] == alignment - assert kw['fill'] == fill - assert kw['number_format'] == number_format - assert kw['protection'] == protection - - def test_write_cells_merge_styled(self): - from pandas.io.formats.excel import ExcelCell - from openpyxl import styles - - sheet_name = 'merge_styled' - - sty_b1 = {'font': {'color': '00FF0000'}} - sty_a2 = {'font': {'color': '0000FF00'}} - - initial_cells = [ - ExcelCell(col=1, row=0, val=42, style=sty_b1), - ExcelCell(col=0, row=1, val=99, style=sty_a2), - ] - - sty_merged = {'font': {'color': '000000FF', 'bold': True}} - sty_kwargs = _Openpyxl20Writer._convert_to_style_kwargs(sty_merged) - openpyxl_sty_merged = styles.Style(**sty_kwargs) - merge_cells = [ - ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), - ] - - with ensure_clean('.xlsx') as path: - writer = _Openpyxl20Writer(path) - writer.write_cells(initial_cells, sheet_name=sheet_name) - writer.write_cells(merge_cells, sheet_name=sheet_name) - - wks = writer.sheets[sheet_name] - xcell_b1 = wks['B1'] - xcell_a2 = wks['A2'] - assert xcell_b1.style == openpyxl_sty_merged - assert xcell_a2.style == openpyxl_sty_merged - - -def skip_openpyxl_lt22(cls): - """Skip test case if openpyxl < 2.2""" - - @classmethod - def setup_class(cls): - _skip_if_no_openpyxl() - import openpyxl - ver = openpyxl.__version__ - if LooseVersion(ver) < LooseVersion('2.2.0'): - pytest.skip("openpyxl %s < 2.2" % str(ver)) - - cls.setup_class = setup_class - return cls - - -@raise_on_incompat_version(2) -@skip_openpyxl_lt22 -class TestOpenpyxl22Tests(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'openpyxl22' - check_skip = staticmethod(lambda *args, **kwargs: None) + check_skip = staticmethod(_skip_if_no_openpyxl) def test_to_excel_styleconverter(self): from openpyxl import styles @@ -2174,7 +1975,7 @@ def test_to_excel_styleconverter(self): protection = styles.Protection(locked=True, hidden=False) - kw = _Openpyxl22Writer._convert_to_style_kwargs(hstyle) + kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) assert kw['font'] == font assert kw['border'] == border assert kw['alignment'] == alignment @@ -2183,9 +1984,6 @@ def test_to_excel_styleconverter(self): assert kw['protection'] == protection def test_write_cells_merge_styled(self): - if not openpyxl_compat.is_compat(major_ver=2): - pytest.skip('incompatible openpyxl version') - from pandas.io.formats.excel import ExcelCell sheet_name = 'merge_styled' @@ -2199,7 +1997,7 @@ def test_write_cells_merge_styled(self): ] sty_merged = {'font': {'color': '000000FF', 'bold': True}} - sty_kwargs = _Openpyxl22Writer._convert_to_style_kwargs(sty_merged) + sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) openpyxl_sty_merged = sty_kwargs['font'] merge_cells = [ ExcelCell(col=0, row=0, val='pandas', @@ -2207,7 +2005,7 @@ def test_write_cells_merge_styled(self): ] with ensure_clean('.xlsx') as path: - writer = _Openpyxl22Writer(path) + writer = _OpenpyxlWriter(path) writer.write_cells(initial_cells, sheet_name=sheet_name) writer.write_cells(merge_cells, sheet_name=sheet_name) @@ -2322,7 +2120,7 @@ def test_column_format(self): try: read_num_format = cell.number_format - except: + except Exception: read_num_format = cell.style.number_format._format_code assert read_num_format == num_format @@ -2366,9 +2164,7 @@ def test_ExcelWriter_dispatch(self): writer_klass = _XlsxWriter except ImportError: 
_skip_if_no_openpyxl() - if not openpyxl_compat.is_compat(major_ver=1): - pytest.skip('incompatible openpyxl version') - writer_klass = _Openpyxl1Writer + writer_klass = _OpenpyxlWriter with ensure_clean('.xlsx') as path: writer = ExcelWriter(path) @@ -2461,10 +2257,6 @@ def custom_converter(css): pytest.importorskip('jinja2') pytest.importorskip(engine) - if engine == 'openpyxl' and openpyxl_compat.is_compat(major_ver=1): - pytest.xfail('openpyxl1 does not support some openpyxl2-compatible ' - 'style dicts') - # Prepare spreadsheets df = DataFrame(np.random.randn(10, 3)) @@ -2482,9 +2274,6 @@ def custom_converter(css): # For other engines, we only smoke test return openpyxl = pytest.importorskip('openpyxl') - if not openpyxl_compat.is_compat(major_ver=2): - pytest.skip('incompatible openpyxl version') - wb = openpyxl.load_workbook(path) # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index dab508de335c4..e23911e8d2003 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -16,7 +16,7 @@ import pandas.util.testing as tm from pandas.tseries import offsets, frequencies -from pandas._libs.tslibs.timezones import get_timezone +from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz from pandas._libs.tslibs import conversion, period from pandas.compat import long, PY3 @@ -359,9 +359,7 @@ def test_conversion(self): '2014-01-01 00:00:00.000000001']) def test_repr(self, date, freq): # dateutil zone change (only matters for repr) - if (dateutil.__version__ >= LooseVersion('2.3') and - (dateutil.__version__ <= LooseVersion('2.4.0') or - dateutil.__version__ >= LooseVersion('2.6.0'))): + if dateutil.__version__ >= LooseVersion('2.6.0'): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] else: @@ -1381,7 +1379,6 @@ def test_timestamp_to_datetime_explicit_pytz(self): def test_timestamp_to_datetime_explicit_dateutil(self): tm._skip_if_windows_python_3() - from pandas._libs.tslibs.timezones import dateutil_gettz as gettz stamp = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') dtval = stamp.to_pydatetime() assert stamp == dtval diff --git a/setup.py b/setup.py index ba948abf4302b..57131255884de 100755 --- a/setup.py +++ b/setup.py @@ -19,8 +19,6 @@ import versioneer cmdclass = versioneer.get_cmdclass() -PY3 = sys.version_info[0] >= 3 - def is_platform_windows(): return sys.platform == 'win32' or sys.platform == 'cygwin' @@ -46,7 +44,7 @@ def is_platform_mac(): min_numpy_ver = '1.9.0' setuptools_kwargs = { 'install_requires': [ - 'python-dateutil >= 2' if PY3 else 'python-dateutil', + 'python-dateutil >= 2.5.0', 'pytz >= 2011k', 'numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver), ], From 02e72ecf1ce75e1fbfc6be0e8fb3568c36fa7fa3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Dec 2017 03:37:26 -0800 Subject: [PATCH 88/98] remove unused args, metadata structs (#18567) --- pandas/_libs/src/datetime/np_datetime.c | 73 ++++--------------- pandas/_libs/src/datetime/np_datetime.h | 34 +-------- .../_libs/src/datetime/np_datetime_strings.c | 7 +- pandas/_libs/src/ujson/python/objToJSON.c | 2 +- 4 files changed, 18 insertions(+), 98 deletions(-) diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index cb4f9d3efdcd0..edc9c0f8f903d 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -318,26 +318,19 @@ int 
cmp_pandas_datetimestruct(const pandas_datetimestruct *a,
 /*
  *
  * Tests for and converts a Python datetime.datetime or datetime.date
- * object into a NumPy pandas_datetimestruct.
+ * object into a NumPy pandas_datetimestruct. Uses tzinfo (if present)
+ * to convert to UTC time.
  *
  * While the C API has PyDate_* and PyDateTime_* functions, the following
  * implementation just asks for attributes, and thus supports
  * datetime duck typing. The tzinfo time zone conversion would require
  * this style of access anyway.
  *
- * 'out_bestunit' gives a suggested unit based on whether the object
- *      was a datetime.date or datetime.datetime object.
- *
- * If 'apply_tzinfo' is 1, this function uses the tzinfo to convert
- * to UTC time, otherwise it returns the struct with the local time.
- *
  * Returns -1 on error, 0 on success, and 1 (with no error set)
  * if obj doesn't have the needed date or datetime attributes.
  */
 int convert_pydatetime_to_datetimestruct(PyObject *obj,
-                                         pandas_datetimestruct *out,
-                                         PANDAS_DATETIMEUNIT *out_bestunit,
-                                         int apply_tzinfo) {
+                                         pandas_datetimestruct *out) {
     PyObject *tmp;
     int isleap;
@@ -404,10 +397,6 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
         !PyObject_HasAttrString(obj, "minute") ||
         !PyObject_HasAttrString(obj, "second") ||
         !PyObject_HasAttrString(obj, "microsecond")) {
-        /* The best unit for date is 'D' */
-        if (out_bestunit != NULL) {
-            *out_bestunit = PANDAS_FR_D;
-        }
         return 0;
     }
@@ -465,7 +454,7 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
     }

     /* Apply the time zone offset if it exists */
-    if (apply_tzinfo && PyObject_HasAttrString(obj, "tzinfo")) {
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
         tmp = PyObject_GetAttrString(obj, "tzinfo");
         if (tmp == NULL) {
             return -1;
@@ -506,11 +495,6 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,
         }
     }

-    /* The resolution of Python's datetime is 'us' */
-    if (out_bestunit != NULL) {
-        *out_bestunit = PANDAS_FR_us;
-    }
-
     return 0;

 invalid_date:
@@ -529,51 +513,34 @@ int convert_pydatetime_to_datetimestruct(PyObject *obj,

 npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr,
                                                pandas_datetimestruct *d) {
-    pandas_datetime_metadata meta;
     npy_datetime result = PANDAS_DATETIME_NAT;

-    meta.base = fr;
-    meta.num = 1;
-
-    convert_datetimestruct_to_datetime(&meta, d, &result);
+    convert_datetimestruct_to_datetime(fr, d, &result);
     return result;
 }

 void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr,
                                        pandas_datetimestruct *result) {
-    pandas_datetime_metadata meta;
-
-    meta.base = fr;
-    meta.num = 1;
-
-    convert_datetime_to_datetimestruct(&meta, val, result);
+    convert_datetime_to_datetimestruct(fr, val, result);
 }

 void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
                                          PANDAS_DATETIMEUNIT fr,
                                          pandas_timedeltastruct *result) {
-    pandas_datetime_metadata meta;
-
-    meta.base = fr;
-    meta.num = 1;
-
-    convert_timedelta_to_timedeltastruct(&meta, val, result);
+    convert_timedelta_to_timedeltastruct(fr, val, result);
 }

 /*
  * Converts a datetime from a datetimestruct to a datetime based
- * on some metadata. The date is assumed to be valid.
- *
- * TODO: If meta->num is really big, there could be overflow
+ * on a metadata unit. The date is assumed to be valid.
  *
  * Returns 0 on success, -1 on failure.
*/ -int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, +int convert_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT base, const pandas_datetimestruct *dts, npy_datetime *out) { npy_datetime ret; - PANDAS_DATETIMEUNIT base = meta->base; if (base == PANDAS_FR_Y) { /* Truncate to the year */ @@ -665,15 +632,6 @@ int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, } } - /* Divide by the multiplier */ - if (meta->num > 1) { - if (ret >= 0) { - ret /= meta->num; - } else { - ret = (ret - meta->num + 1) / meta->num; - } - } - *out = ret; return 0; @@ -682,7 +640,7 @@ int convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, /* * Converts a datetime based on the given metadata into a datetimestruct */ -int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, +int convert_datetime_to_datetimestruct(PANDAS_DATETIMEUNIT base, npy_datetime dt, pandas_datetimestruct *out) { npy_int64 perday; @@ -693,14 +651,11 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, out->month = 1; out->day = 1; - /* TODO: Change to a mechanism that avoids the potential overflow */ - dt *= meta->num; - /* * Note that care must be taken with the / and % operators * for negative values. */ - switch (meta->base) { + switch (base) { case PANDAS_FR_Y: out->year = 1970 + dt; break; @@ -902,11 +857,11 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, /* * Converts a timedelta from a timedeltastruct to a timedelta based - * on some metadata. The timedelta is assumed to be valid. + * on a metadata unit. The timedelta is assumed to be valid. * * Returns 0 on success, -1 on failure. */ -int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, +int convert_timedelta_to_timedeltastruct(PANDAS_DATETIMEUNIT base, npy_timedelta td, pandas_timedeltastruct *out) { npy_int64 frac; @@ -918,7 +873,7 @@ int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, /* Initialize the output to all zeros */ memset(out, 0, sizeof(pandas_timedeltastruct)); - switch (meta->base) { + switch (base) { case PANDAS_FR_ns: // put frac in seconds diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index 980c66218f7e6..b6c0852bfe764 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -40,8 +40,6 @@ typedef enum { #define PANDAS_DATETIME_NUMUNITS 13 -#define PANDAS_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) - #define PANDAS_DATETIME_NAT NPY_MIN_INT64 typedef struct { @@ -54,13 +52,6 @@ typedef struct { npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; } pandas_timedeltastruct; -typedef struct { - PANDAS_DATETIMEUNIT base; - int num; -} pandas_datetime_metadata; - -typedef pandas_datetime_metadata pandas_timedelta_metadata; - extern const pandas_datetimestruct _NS_MIN_DTS; extern const pandas_datetimestruct _NS_MAX_DTS; @@ -68,9 +59,7 @@ extern const pandas_datetimestruct _NS_MAX_DTS; // ---------------------------------------------------------------------------- int convert_pydatetime_to_datetimestruct(PyObject *obj, - pandas_datetimestruct *out, - PANDAS_DATETIMEUNIT *out_bestunit, - int apply_tzinfo); + pandas_datetimestruct *out); npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d); @@ -91,19 +80,6 @@ extern const int days_per_month_table[2][12]; int is_leapyear(npy_int64 year); -/* - * Converts a datetime from a datetimestruct to a datetime based - * on 
some metadata. The date is assumed to be valid. - * - * TODO: If meta->num is really big, there could be overflow - * - * Returns 0 on success, -1 on failure. - */ -int -convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, - const pandas_datetimestruct *dts, - npy_datetime *out); - /* * Calculates the days offset from the 1970 epoch. */ @@ -127,14 +103,8 @@ add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); int -convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, +convert_datetime_to_datetimestruct(PANDAS_DATETIMEUNIT base, npy_datetime dt, pandas_datetimestruct *out); -int -convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, - npy_timedelta td, - pandas_timedeltastruct *out); - - #endif // PANDAS__LIBS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/_libs/src/datetime/np_datetime_strings.c b/pandas/_libs/src/datetime/np_datetime_strings.c index 1ff4f08cf3c9d..92f030b5fea2b 100644 --- a/pandas/_libs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/datetime/np_datetime_strings.c @@ -279,14 +279,9 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, if (len == 3 && tolower(str[0]) == 'n' && tolower(str[1]) == 'o' && tolower(str[2]) == 'w') { NPY_TIME_T rawtime = 0; - pandas_datetime_metadata meta; time(&rawtime); - /* Set up a dummy metadata for the conversion */ - meta.base = PANDAS_FR_s; - meta.num = 1; - bestunit = PANDAS_FR_s; /* @@ -304,7 +299,7 @@ int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, *out_special = 1; } - return convert_datetime_to_datetimestruct(&meta, rawtime, out); + return convert_datetime_to_datetimestruct(PANDAS_FR_s, rawtime, out); } /* Anything else isn't a special value */ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c8a29cd949c3c..7c64db69f0c46 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -493,7 +493,7 @@ static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, PRINTMARK(); - if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) { + if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { PRINTMARK(); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } else { From e99cb9c0448ed2dad3be33c22179da8a1177c65c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Dec 2017 03:38:39 -0800 Subject: [PATCH 89/98] Update imports, use nogil version of sqrt (#18557) --- pandas/_libs/algos.pyx | 2 +- pandas/_libs/lib.pyx | 35 +++++++++++--------- pandas/_libs/src/util.pxd | 1 + pandas/core/indexes/datetimes.py | 17 +++++----- pandas/core/tools/datetimes.py | 2 -- pandas/tests/indexes/datetimes/test_tools.py | 6 ++-- pandas/tseries/offsets.py | 4 +-- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 61d543cd7303a..df8f7bab51dbe 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -27,7 +27,7 @@ from numpy cimport (ndarray, cdef double NaN = np.NaN cdef double nan = NaN -from libc.math cimport sqrt, fabs +from libc.math cimport fabs, sqrt # this is our util.pxd from util cimport numeric, get_nat diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 02b3839ebf181..a39f83d5261c0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,13 +1,18 @@ # cython: profile=False -cimport numpy as np -cimport cython -import numpy as np -import sys +import operator -cdef bint PY3 = (sys.version_info[0] 
>= 3) - -from numpy cimport * +cimport cython +from cython cimport Py_ssize_t +import numpy as np +cimport numpy as np +from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, PyArray_SETITEM, + PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + flatiter, NPY_OBJECT, + int64_t, + float32_t, float64_t, + uint8_t, uint64_t, + complex128_t) # initialize numpy np.import_array() np.import_ufunc() @@ -57,12 +62,12 @@ from tslib import NaT, Timestamp, Timedelta, array_to_datetime from interval import Interval from missing cimport checknull -cdef int64_t NPY_NAT = util.get_nat() cimport util +cdef int64_t NPY_NAT = util.get_nat() from util cimport is_array, _checknull -from libc.math cimport sqrt, fabs +from libc.math cimport fabs, sqrt def values_from_object(object o): @@ -494,7 +499,6 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): @cython.wraparound(False) @cython.boundscheck(False) def scalar_compare(ndarray[object] values, object val, object op): - import operator cdef: Py_ssize_t i, n = len(values) ndarray[uint8_t, cast=True] result @@ -529,7 +533,7 @@ def scalar_compare(ndarray[object] values, object val, object op): result[i] = True else: try: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) except (TypeError): result[i] = True elif flag == cpython.Py_EQ: @@ -541,7 +545,7 @@ def scalar_compare(ndarray[object] values, object val, object op): result[i] = False else: try: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) except (TypeError): result[i] = False @@ -553,7 +557,7 @@ def scalar_compare(ndarray[object] values, object val, object op): elif isnull_val: result[i] = False else: - result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + result[i] = PyObject_RichCompareBool(x, val, flag) return result.view(bool) @@ -582,7 +586,6 @@ cpdef bint array_equivalent_object(object[:] left, object[:] right): @cython.wraparound(False) @cython.boundscheck(False) def vec_compare(ndarray[object] left, ndarray[object] right, object op): - import operator cdef: Py_ssize_t i, n = len(left) ndarray[uint8_t, cast=True] result @@ -617,7 +620,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): if checknull(x) or checknull(y): result[i] = True else: - result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + result[i] = PyObject_RichCompareBool(x, y, flag) else: for i in range(n): x = left[i] @@ -626,7 +629,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): if checknull(x) or checknull(y): result[i] = False else: - result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + result[i] = PyObject_RichCompareBool(x, y, flag) return result.view(bool) diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index 61783ab47cb86..e5fe90aa81f7d 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -2,6 +2,7 @@ from numpy cimport ndarray cimport numpy as cnp cimport cpython + cdef extern from "numpy_helper.h": void set_array_not_contiguous(ndarray ao) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 55c6063b74286..fb86d25625b6a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -43,8 +43,7 @@ DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin) from pandas.tseries.offsets import ( DateOffset, generate_range, Tick, CDay, prefix_mapping) -from pandas.core.tools.datetimes import ( - parse_time_string, normalize_date, 
to_time) + from pandas.core.tools.timedeltas import to_timedelta from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) @@ -55,7 +54,7 @@ from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, Timestamp) -from pandas._libs.tslibs import (timezones, conversion, fields, +from pandas._libs.tslibs import (timezones, conversion, fields, parsing, period as libperiod) # -------- some conversion wrapper functions @@ -524,14 +523,14 @@ def _generate(cls, start, end, periods, name, offset, if start is not None: if normalize: - start = normalize_date(start) + start = libts.normalize_date(start) _normalized = True else: _normalized = _normalized and start.time() == _midnight if end is not None: if normalize: - end = normalize_date(end) + end = libts.normalize_date(end) _normalized = True else: _normalized = _normalized and end.time() == _midnight @@ -1529,7 +1528,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): if isinstance(label, compat.string_types): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - _, parsed, reso = parse_time_string(label, freq) + _, parsed, reso = parsing.parse_time_string(label, freq) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) @@ -1546,7 +1545,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - _, parsed, reso = parse_time_string(key, freq) + _, parsed, reso = parsing.parse_time_string(key, freq) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc @@ -1965,8 +1964,8 @@ def indexer_between_time(self, start_time, end_time, include_start=True, ------- values_between_time : TimeSeries """ - start_time = to_time(start_time) - end_time = to_time(end_time) + start_time = tools.to_time(start_time) + end_time = tools.to_time(end_time) time_micros = self._get_time_micros() start_micros = _time_to_micros(start_time) end_micros = _time_to_micros(end_time) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 219fb3f67db97..4245b9eb641ba 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -629,8 +629,6 @@ def calc_with_mask(carg, mask): return None -normalize_date = tslib.normalize_date - # Fixed time formats for time parsing _time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 6c72e65b1021c..d03951458f12a 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -16,7 +16,7 @@ from pandas._libs import tslib from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools -from pandas.core.tools.datetimes import normalize_date + from pandas.compat import lmap from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype @@ -1576,12 +1576,12 @@ def test_coerce_of_invalid_datetimes(self): def test_normalize_date(): value = date(2012, 9, 7) - result = normalize_date(value) + result = tslib.normalize_date(value) assert (result == datetime(2012, 9, 7)) value = datetime(2012, 9, 7, 12) - result = normalize_date(value) + result = tslib.normalize_date(value) assert 
(result == datetime(2012, 9, 7))

diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index a3cddaa19dc17..857ec9e9881d9 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -8,7 +8,7 @@ import numpy as np

 from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex, ABCPeriod
-from pandas.core.tools.datetimes import to_datetime, normalize_date
+from pandas.core.tools.datetimes import to_datetime
 from pandas.core.common import AbstractMethodError

 # import after tools, dateutil check
@@ -103,7 +103,7 @@ def wrapper(self, other):

             if self.normalize:
                 # normalize_date returns normal datetime
-                result = normalize_date(result)
+                result = tslib.normalize_date(result)

             if tz is not None and result.tzinfo is None:
                 result = tslib._localize_pydatetime(result, tz)

From 2c903d594299b2441d4742e777a10e8c76557386 Mon Sep 17 00:00:00 2001
From: David Fischer
Date: Mon, 4 Dec 2017 13:55:54 +0100
Subject: [PATCH 90/98] json_normalize: Make code more pythonic and avoid
 modification of meta if mutable (#18610)

---
 doc/source/whatsnew/v0.21.1.txt        |  1 +
 pandas/io/json/normalize.py            |  6 ++----
 pandas/tests/io/json/test_normalize.py | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 3d4850b334ff9..a9608594be547 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -90,6 +90,7 @@ I/O
 - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
 - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
 - Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
+- Bug in :func:`pandas.io.json.json_normalize` where the ``meta`` argument was modified in place (:issue:`18610`)

 Plotting
 ^^^^^^^^

diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index d062e4f2830ff..595031b04e367 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -181,7 +181,7 @@ def _pull_field(js, spec):

         return result

-    if isinstance(data, list) and len(data) is 0:
+    if isinstance(data, list) and not data:
         return DataFrame()

     # A bit of a hackjob
@@ -207,9 +207,7 @@ def _pull_field(js, spec):
     elif not isinstance(meta, list):
         meta = [meta]

-    for i, x in enumerate(meta):
-        if not isinstance(x, list):
-            meta[i] = [x]
+    meta = [m if isinstance(m, list) else [m] for m in meta]

     # Disastrously inefficient for now
     records = []
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index 49b765b18d623..1cceae32cd748 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -173,6 +173,21 @@ def test_meta_name_conflict(self):
         for val in ['metafoo', 'metabar', 'foo', 'bar']:
             assert val in result

+    def test_meta_parameter_not_modified(self):
+        # GH 18610
+        data = [{'foo': 'hello',
+                 'bar': 'there',
+                 'data': [{'foo': 'something', 'bar': 'else'},
+                          {'foo': 'something2', 'bar': 'else2'}]}]
+
+        COLUMNS = ['foo', 'bar']
+        result = json_normalize(data, 'data', meta=COLUMNS,
+                                meta_prefix='meta')
+
+        assert COLUMNS == ['foo', 'bar']
+        for val in ['metafoo', 'metabar', 'foo', 'bar']:
+            assert val in result
+
     def test_record_prefix(self, state_data):
         result = json_normalize(state_data[0], 'counties')
         expected = DataFrame(state_data[0]['counties'])
From a7646638d06f1ce98481b88f3505e2b4badf172c Mon Sep 17 00:00:00 2001
From: jschendel
Date: Mon, 4 Dec 2017 13:13:31 
-0700 Subject: [PATCH 91/98] BLD: Bump Cython version from 0.23 to 0.24 (#18623) --- ci/requirements-2.7.build | 2 +- ci/requirements-2.7_COMPAT.build | 2 +- ci/requirements-2.7_LOCALE.build | 2 +- doc/source/enhancingperf.rst | 3 +-- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.22.0.txt | 10 ++++------ setup.py | 2 +- 7 files changed, 10 insertions(+), 13 deletions(-) diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index d1cc61df0a77c..e24baa98d956e 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -3,4 +3,4 @@ python-dateutil=2.5.0 pytz=2013b nomkl numpy -cython=0.23 +cython=0.24 diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build index aa767c1001196..0a83a7346e8b5 100644 --- a/ci/requirements-2.7_COMPAT.build +++ b/ci/requirements-2.7_COMPAT.build @@ -1,5 +1,5 @@ python=2.7* numpy=1.9.2 -cython=0.23 +cython=0.24 python-dateutil=2.5.0 pytz=2013b diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build index 96cb184ec2665..a6f2e25387910 100644 --- a/ci/requirements-2.7_LOCALE.build +++ b/ci/requirements-2.7_LOCALE.build @@ -2,4 +2,4 @@ python=2.7* python-dateutil pytz=2013b numpy=1.9.2 -cython=0.23 +cython=0.24 diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 264bd1de1fc77..cbe945e0cf2cf 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -94,8 +94,7 @@ hence we'll concentrate our efforts cythonizing these two functions. Plain cython ~~~~~~~~~~~~ -First we're going to need to import the cython magic function to ipython (for -cython versions < 0.21 you can use ``%load_ext cythonmagic``): +First we're going to need to import the cython magic function to ipython: .. ipython:: python :okwarning: diff --git a/doc/source/install.rst b/doc/source/install.rst index ae89c64b6e91e..aeb1abbadabb3 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -228,7 +228,7 @@ Optional Dependencies ~~~~~~~~~~~~~~~~~~~~~ * `Cython `__: Only necessary to build development - version. Version 0.23 or higher. + version. Version 0.24 or higher. * `SciPy `__: miscellaneous statistical functions, Version 0.14.0 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 5e605ecb7d8d5..fd37f269c2f83 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -83,10 +83,6 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) -- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - .. _whatsnew_0220.api_breaking.deps: Dependencies have increased minimum versions @@ -104,8 +100,6 @@ If installed, we now require: +-----------------+-----------------+----------+ - - .. 
_whatsnew_0220.api:

 Other API Changes
@@ -129,6 +123,10 @@ Other API Changes
 - :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`)
 - When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`)
 - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`)
+- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
+- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
+- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
+- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`)

 .. _whatsnew_0220.deprecations:

diff --git a/setup.py b/setup.py
index 57131255884de..004f111115079 100755
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def is_platform_mac():
     return sys.platform == 'darwin'


-min_cython_ver = '0.23'
+min_cython_ver = '0.24'
 try:
     import Cython
     ver = Cython.__version__
From 52fefd50f8bffa493018ba8ff8b8c46b95c94ada Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 4 Dec 2017 16:23:54 -0800
Subject: [PATCH 92/98] CLN: Remove io.data and io.wb (#18612)

Deprecated in 0.17.0.

xref gh-13735.
---
 doc/source/index.rst.template   |  1 -
 doc/source/remote_data.rst      | 30 ------------------------------
 doc/source/whatsnew/v0.22.0.txt |  1 +
 pandas/io/data.py               |  6 ------
 pandas/io/wb.py                 |  6 ------
 5 files changed, 1 insertion(+), 43 deletions(-)
 delete mode 100644 doc/source/remote_data.rst
 delete mode 100644 pandas/io/data.py
 delete mode 100644 pandas/io/wb.py

diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index f5c65e175b0db..7c7457df8ea93 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -137,7 +137,6 @@ See the package overview for more detail about what's in the library.
     visualization
     style
     io
-    remote_data
     enhancingperf
     sparse
     gotchas
diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst
deleted file mode 100644
index 5054bb7bcd12e..0000000000000
--- a/doc/source/remote_data.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _remote_data:
-
-.. currentmodule:: pandas
-
-******************
-Remote Data Access
-******************
-
-.. _remote_data.pandas_datareader:
-
-DataReader
-----------
-
-The sub-package ``pandas.io.data`` was removed in
-`v.0.19 `__.
-Instead there has been created a separately installable
-`pandas-datareader package `__.
-This will allow the data modules to be independently updated on your pandas installation.
-
-For code older than < 0.19 you should replace the imports of the following:
-
-.. code-block:: python
-
-    from pandas.io import data, wb
-
-With:
-
-.. code-block:: python
-
-    from pandas_datareader import data, wb
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index fd37f269c2f83..cd727c728eb3d 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -148,6 +148,7 @@ Removal of prior version deprecations/changes
 - ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`)
 - ``pd.ordered_merge`` has been removed (deprecated since v0.19). 
Use ``pd.merge_ordered`` instead (:issue:`18459`) - The ``SparseList`` class has been removed (:issue:`14007`) +- The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`) .. _whatsnew_0220.performance: diff --git a/pandas/io/data.py b/pandas/io/data.py deleted file mode 100644 index e76790a6ab98b..0000000000000 --- a/pandas/io/data.py +++ /dev/null @@ -1,6 +0,0 @@ -raise ImportError( - "The pandas.io.data module is moved to a separate package " - "(pandas-datareader). After installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/wb.py b/pandas/io/wb.py deleted file mode 100644 index 5dc4d9ce1adc4..0000000000000 --- a/pandas/io/wb.py +++ /dev/null @@ -1,6 +0,0 @@ -raise ImportError( - "The pandas.io.wb module is moved to a separate package " - "(pandas-datareader). After installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.") From 52838e609c1b2a495069964dea862a39dd067b2b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 5 Dec 2017 01:18:56 -0800 Subject: [PATCH 93/98] CLN: ASV groupby benchmarks (#18611) --- asv_bench/benchmarks/groupby.py | 702 ++++++++++--------------- asv_bench/benchmarks/reshape.py | 20 + asv_bench/benchmarks/series_methods.py | 22 + 3 files changed, 329 insertions(+), 415 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 13b5cd2b06032..3abf2338e1d94 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,85 +1,108 @@ -from .pandas_vb_common import * from string import ascii_letters, digits from itertools import product +from functools import partial +import numpy as np +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + TimeGrouper, Categorical) +import pandas.util.testing as tm -class groupby_agg_builtins(object): +from .pandas_vb_common import setup # noqa + + +class ApplyDictReturn(object): goal_time = 0.2 def setup(self): - np.random.seed(27182) - self.n = 100000 - self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie']) + self.labels = np.arange(1000).repeat(10) + self.data = Series(np.random.randn(len(self.labels))) + self.f = lambda x: {'first': x.values[0], 'last': x.values[(-1)]} - def time_groupby_agg_builtins1(self): - self.df.groupby('jim').agg([sum, min, max]) + def time_groupby_apply_dict_return(self): + self.data.groupby(self.labels).apply(self.f) - def time_groupby_agg_builtins2(self): - self.df.groupby(['jim', 'joe']).agg([sum, min, max]) -#---------------------------------------------------------------------- -# dict return values +class Apply(object): -class groupby_apply_dict_return(object): goal_time = 0.2 def setup(self): - self.labels = np.arange(1000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], }) + N = 10**4 + labels = np.random.randint(0, 2000, size=N) + labels2 = np.random.randint(0, 3, size=N) + self.df = DataFrame({'key': labels, + 'key2': labels2, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4), + }) + self.scalar_function = lambda x: 1 + + def time_scalar_function_multi_col(self): + 
self.df.groupby(['key', 'key2']).apply(self.scalar_function) - def time_groupby_apply_dict_return(self): - self.data.groupby(self.labels).apply(self.f) + def time_scalar_function_single_col(self): + self.df.groupby('key').apply(self.scalar_function) + @staticmethod + def df_copy_function(g): + # ensure that the group name is available (see GH #15062) + g.name + return g.copy() + + def time_copy_function_multi_col(self): + self.df.groupby(['key', 'key2']).apply(self.df_copy_function) + + def time_copy_overhead_single_col(self): + self.df.groupby('key').apply(self.df_copy_function) -#---------------------------------------------------------------------- -# groups class Groups(object): - goal_time = 0.1 - size = 2 ** 22 - data = { - 'int64_small': Series(np.random.randint(0, 100, size=size)), - 'int64_large' : Series(np.random.randint(0, 10000, size=size)), - 'object_small': Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))), - 'object_large': Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) - } + goal_time = 0.2 - param_names = ['df'] + param_names = ['key'] params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def setup(self, df): - self.df = self.data[df] + def setup_cache(self): + size = 10**6 + data = {'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large': Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series( + tm.makeStringIndex(100).take( + np.random.randint(0, 100, size=size))), + 'object_large': Series( + tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=size)))} + return data - def time_groupby_groups(self, df): - self.df.groupby(self.df).groups + def setup(self, data, key): + self.ser = data[key] + def time_series_groups(self, data, key): + self.ser.groupby(self.ser).groups -#---------------------------------------------------------------------- -# First / last functions class FirstLast(object): + goal_time = 0.2 param_names = ['dtype'] params = ['float32', 'float64', 'datetime', 'object'] - # with datetimes (GH7555) - def setup(self, dtype): - + N = 10**5 + # with datetimes (GH7555) if dtype == 'datetime': - self.df = DataFrame( - {'values': date_range('1/1/2011', periods=100000, freq='s'), - 'key': range(100000),}) + self.df = DataFrame({'values': date_range('1/1/2011', + periods=N, + freq='s'), + 'key': range(N)}) elif dtype == 'object': - self.df = DataFrame( - {'values': (['foo'] * 100000), - 'key': range(100000)}) + self.df = DataFrame({'values': ['foo'] * N, + 'key': range(N)}) else: - labels = np.arange(10000).repeat(10) - data = Series(randn(len(labels)), dtype=dtype) + labels = np.arange(N / 10).repeat(10) + data = Series(np.random.randn(len(labels)), dtype=dtype) data[::3] = np.nan data[1::3] = np.nan labels = labels.take(np.random.permutation(len(labels))) @@ -91,313 +114,249 @@ def time_groupby_first(self, dtype): def time_groupby_last(self, dtype): self.df.groupby('key').last() - def time_groupby_nth_any(self, dtype): + def time_groupby_nth_all(self, dtype): self.df.groupby('key').nth(0, dropna='all') def time_groupby_nth_none(self, dtype): self.df.groupby('key').nth(0) -#---------------------------------------------------------------------- -# DataFrame Apply overhead +class GroupManyLabels(object): -class groupby_frame_apply(object): goal_time = 0.2 + params = [1, 1000] + param_names = ['ncols'] - def setup(self): - self.N = 10000 - self.labels = np.random.randint(0, 2000, size=self.N) - self.labels2 = np.random.randint(0, 3, 
size=self.N) - self.df = DataFrame({ - 'key': self.labels, - 'key2': self.labels2, - 'value1': np.random.randn(self.N), - 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)), - }) - - @staticmethod - def scalar_function(g): - return 1 + def setup(self, ncols): + N = 1000 + data = np.random.randn(N, ncols) + self.labels = np.random.randint(0, 100, size=N) + self.df = DataFrame(data) - def time_groupby_frame_apply_scalar_function(self): - self.df.groupby(['key', 'key2']).apply(self.scalar_function) - - def time_groupby_frame_apply_scalar_function_overhead(self): - self.df.groupby('key').apply(self.scalar_function) + def time_sum(self, ncols): + self.df.groupby(self.labels).sum() - @staticmethod - def df_copy_function(g): - # ensure that the group name is available (see GH #15062) - g.name - return g.copy() - def time_groupby_frame_df_copy_function(self): - self.df.groupby(['key', 'key2']).apply(self.df_copy_function) +class Nth(object): - def time_groupby_frame_apply_df_copy_overhead(self): - self.df.groupby('key').apply(self.df_copy_function) + goal_time = 0.2 + def setup_cache(self): + df = DataFrame(np.random.randint(1, 100, (10000, 2))) + df.iloc[1, 1] = np.nan + return df -#---------------------------------------------------------------------- -# 2d grouping, aggregate many columns + def time_frame_nth_any(self, df): + df.groupby(0).nth(0, dropna='any') -class groupby_frame_cython_many_columns(object): - goal_time = 0.2 + def time_frame_nth(self, df): + df.groupby(0).nth(0) - def setup(self): - self.labels = np.random.randint(0, 100, size=1000) - self.df = DataFrame(randn(1000, 1000)) + def time_series_nth_any(self, df): + df[1].groupby(df[0]).nth(0, dropna='any') - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_series_nth(self, df): + df[1].groupby(df[0]).nth(0) -#---------------------------------------------------------------------- -# single key, long, integer key +class DateAttributes(object): -class groupby_frame_singlekey_integer(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100000, 1) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) + rng = date_range('1/1/2000', '12/31/2005', freq='H') + self.year, self.month, self.day = rng.year, rng.month, rng.day + self.ts = Series(np.random.randn(len(rng)), index=rng) - def time_sum(self): - self.df.groupby(self.labels).sum() + def time_len_groupby_object(self): + len(self.ts.groupby([self.year, self.month, self.day])) -#---------------------------------------------------------------------- -# DataFrame nth +class Int64(object): -class groupby_nth(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) - - def time_groupby_frame_nth_any(self): - self.df.groupby(0).nth(0, dropna='any') - - def time_groupby_frame_nth_none(self): - self.df.groupby(0).nth(0) + arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) + i = np.random.choice(len(arr), len(arr) * 5) + arr = np.vstack((arr, arr[i])) + i = np.random.permutation(len(arr)) + arr = arr[i] + self.cols = list('abcde') + self.df = DataFrame(arr, columns=self.cols) + self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 - def time_groupby_series_nth_any(self): - self.df[1].groupby(self.df[0]).nth(0, dropna='any') + def time_overflow(self): + self.df.groupby(self.cols).max() - def time_groupby_series_nth_none(self): - self.df[1].groupby(self.df[0]).nth(0) +class CountMultiDtype(object): 
-#---------------------------------------------------------------------- -# groupby_indices replacement, chop up Series - -class groupby_indices(object): goal_time = 0.2 - def setup(self): - try: - self.rng = date_range('1/1/2000', '12/31/2005', freq='H') - (self.year, self.month, self.day) = (self.rng.year, self.rng.month, self.rng.day) - except: - self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) - self.year = self.rng.map((lambda x: x.year)) - self.month = self.rng.map((lambda x: x.month)) - self.day = self.rng.map((lambda x: x.day)) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_groupby_indices(self): - len(self.ts.groupby([self.year, self.month, self.day])) - + def setup_cache(self): + n = 10000 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64('nat') + offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + value2 = np.random.randn(n) + value2[np.random.rand(n) > 0.5] = np.nan + obj = np.random.choice(list('ab'), size=n).astype(object) + obj[np.random.randn(n) > 0.5] = np.nan + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'dates': dates, + 'value2': value2, + 'value3': np.random.randn(n), + 'ints': np.random.randint(0, 1000, size=n), + 'obj': obj, + 'offsets': offsets}) + return df + + def time_multi_count(self, df): + df.groupby(['key1', 'key2']).count() + + +class CountInt(object): -class groupby_int64_overflow(object): goal_time = 0.2 - def setup(self): - self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5)) - self.i = np.random.choice(len(self.arr), (len(self.arr) * 5)) - self.arr = np.vstack((self.arr, self.arr[self.i])) - self.i = np.random.permutation(len(self.arr)) - self.arr = self.arr[self.i] - self.df = DataFrame(self.arr, columns=list('abcde')) - (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10) + def setup_cache(self): + n = 10000 + df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'ints': np.random.randint(0, 1000, size=n), + 'ints2': np.random.randint(0, 1000, size=n)}) + return df - def time_groupby_int64_overflow(self): - self.df.groupby(list('abcde')).max() + def time_int_count(self, df): + df.groupby(['key1', 'key2']).count() + def time_int_nunique(self, df): + df.groupby(['key1', 'key2']).nunique() -#---------------------------------------------------------------------- -# count() speed -class groupby_multi_count(object): - goal_time = 0.2 +class AggFunctions(object): - def setup(self): - self.n = 10000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat') - self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat') - self.value2 = np.random.randn(self.n) - self.value2[(np.random.rand(self.n) > 0.5)] = np.nan - self.obj = np.random.choice(list('ab'), size=self.n).astype(object) - self.obj[(np.random.randn(self.n) > 0.5)] = np.nan - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), - 'key2': np.random.randint(0, 100, size=self.n), - 'dates': self.dates, - 'value2': self.value2, - 'value3': np.random.randn(self.n), - 'ints': np.random.randint(0, 1000, size=self.n), - 'obj': self.obj, - 'offsets': self.offsets, }) - - def time_groupby_multi_count(self): - 
self.df.groupby(['key1', 'key2']).count() - - -class groupby_int_count(object): goal_time = 0.2 - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), - 'key2': randint(0, 100, size=self.n), - 'ints': randint(0, 1000, size=self.n), - 'ints2': randint(0, 1000, size=self.n), }) - - def time_groupby_int_count(self): - self.df.groupby(['key1', 'key2']).count() + def setup_cache(self): + N = 10**5 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)), + 'key2': fac2.take(np.random.randint(0, 2, size=N)), + 'value1': np.random.randn(N), + 'value2': np.random.randn(N), + 'value3': np.random.randn(N)}) + return df + def time_different_str_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': 'mean', + 'value2': 'var', + 'value3': 'sum'}) -#---------------------------------------------------------------------- -# nunique() speed + def time_different_numpy_functions(self, df): + df.groupby(['key1', 'key2']).agg({'value1': np.mean, + 'value2': np.var, + 'value3': np.sum}) -class groupby_nunique(object): + def time_different_python_functions_multicol(self, df): + df.groupby(['key1', 'key2']).agg([sum, min, max]) - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), - 'key2': randint(0, 100, size=self.n), - 'ints': randint(0, 1000, size=self.n), - 'ints2': randint(0, 1000, size=self.n), }) + def time_different_python_functions_singlecol(self, df): + df.groupby('key1').agg([sum, min, max]) - def time_groupby_nunique(self): - self.df.groupby(['key1', 'key2']).nunique() +class GroupStrings(object): -#---------------------------------------------------------------------- -# group with different functions per column - -class groupby_agg_multi(object): - goal_time = 0.2 - - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) - - def time_groupby_multi_different_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum'}) - - def time_groupby_multi_different_numpy_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum}) - - -class groupby_multi_index(object): goal_time = 0.2 def setup(self): - self.n = (((5 * 7) * 11) * (1 << 9)) - self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4))) - self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k)) - self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), }) + n = (5 * 7 * 11) * (1 << 9) + alpha = list(map(''.join, product((ascii_letters + digits), repeat=4))) + f = lambda k: np.repeat(np.random.choice(alpha, (n // k)), k) + self.df = DataFrame({'a': f(11), + 'b': f(7), + 'c': f(5), + 'd': f(1)}) self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) - self.i = np.random.permutation(len(self.df)) - self.df = self.df.iloc[self.i].reset_index(drop=True).copy() + i = np.random.permutation(len(self.df)) + self.df = self.df.iloc[i].reset_index(drop=True) - def time_groupby_multi_index(self): + def time_multi_columns(self): self.df.groupby(list('abcd')).max() -class 
+class MultiColumn(object):
+    goal_time = 0.2
 
     def setup(self):
-        self.N = 100000
-        self.ngroups = 100
-        self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
-        self.simple_series = Series(np.random.randn(self.N))
-        self.key1 = self.df['key1']
-
-    def get_test_data(self, ngroups=100, n=100000):
-        self.unique_groups = range(self.ngroups)
-        self.arr = np.asarray(np.tile(self.unique_groups, int(n / self.ngroups)), dtype=object)
-        if (len(self.arr) < n):
-            self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object)
-        random.shuffle(self.arr)
-        return self.arr
-
-    def f(self):
-        self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum()))
-
-    def time_groupby_multi_cython(self):
+        N = 10**5
+        key1 = np.tile(np.arange(100, dtype=object), 1000)
+        key2 = key1.copy()
+        np.random.shuffle(key1)
+        np.random.shuffle(key2)
+        self.df = DataFrame({'key1': key1,
+                             'key2': key2,
+                             'data1': np.random.randn(N),
+                             'data2': np.random.randn(N)})
+        self.f = lambda x: x.values.sum()
+
+    def time_lambda_sum(self):
+        self.df.groupby(['key1', 'key2']).agg(self.f)
+
+    def time_cython_sum(self):
         self.df.groupby(['key1', 'key2']).sum()
 
-    def time_groupby_multi_python(self):
-        self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum()))
-
-    def time_groupby_multi_series_op(self):
-        self.df.groupby(['key1', 'key2'])['data1'].agg(np.std)
-
-    def time_groupby_series_simple_cython(self):
-        self.simple_series.groupby(self.key1).sum()
+    def time_col_select_lambda_sum(self):
+        self.df.groupby(['key1', 'key2'])['data1'].agg(self.f)
 
-    def time_groupby_series_simple_rank(self):
-        self.df.groupby('key1').rank(pct=True)
+    def time_col_select_numpy_sum(self):
+        self.df.groupby(['key1', 'key2'])['data1'].agg(np.sum)
 
 
-#----------------------------------------------------------------------
-# size() speed
+class Size(object):
 
-class groupby_size(object):
     goal_time = 0.2
 
     def setup(self):
-        self.n = 100000
-        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
-        self.dates = (np.datetime64('now') + self.offsets)
-        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
-
-        N = 1000000
-        self.draws = pd.Series(np.random.randn(N))
-        labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
+        n = 10**5
+        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
+        dates = np.datetime64('now') + offsets
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                             'key2': np.random.randint(0, 100, size=n),
+                             'value1': np.random.randn(n),
+                             'value2': np.random.randn(n),
+                             'value3': np.random.randn(n),
+                             'dates': dates})
+        self.draws = Series(np.random.randn(n))
+        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
         self.cats = labels.astype('category')
 
-    def time_groupby_multi_size(self):
+    def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()
 
-    def time_groupby_dt_size(self):
+    def time_dt_size(self):
         self.df.groupby(['dates']).size()
 
-    def time_groupby_dt_timegrouper_size(self):
+    def time_dt_timegrouper_size(self):
         self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
 
-    def time_groupby_size(self):
+    def time_category_size(self):
         self.draws.groupby(self.cats).size()
 
 
+class GroupByMethods(object):
 
-#----------------------------------------------------------------------
-# groupby with a variable value for ngroups
-
-class GroupBySuite(object):
     goal_time = 0.2
 
     param_names = ['dtype', 'ngroups']
     params = [['int', 'float'], [100, 10000]]
 
     def setup(self, dtype, ngroups):
-        np.random.seed(1234)
         size = ngroups * 2
         rng = np.arange(ngroups)
         values = rng.take(np.random.randint(0, ngroups, size=size))
@@ -479,6 +438,9 @@ def time_rank(self, dtype, ngroups):
     def time_sem(self, dtype, ngroups):
         self.df.groupby('key')['values'].sem()
 
+    def time_shift(self, dtype, ngroups):
+        self.df.groupby('key')['values'].shift()
+
     def time_size(self, dtype, ngroups):
         self.df.groupby('key')['values'].size()
 
@@ -504,7 +466,7 @@ def time_var(self, dtype, ngroups):
         self.df.groupby('key')['values'].var()
 
 
-class groupby_float32(object):
+class Float32(object):
     # GH 13335
     goal_time = 0.2
 
@@ -515,27 +477,28 @@ def setup(self):
         arr = np.repeat(tmp, 10)
         self.df = DataFrame(dict(a=arr, b=arr))
 
-    def time_groupby_sum(self):
+    def time_sum(self):
         self.df.groupby(['a'])['b'].sum()
 
 
-class groupby_categorical(object):
+class Categories(object):
+
     goal_time = 0.2
 
     def setup(self):
-        N = 100000
+        N = 10**5
         arr = np.random.random(N)
-
-        self.df = DataFrame(dict(
-            a=Categorical(np.random.randint(10000, size=N)),
-            b=arr))
-        self.df_ordered = DataFrame(dict(
-            a=Categorical(np.random.randint(10000, size=N), ordered=True),
-            b=arr))
-        self.df_extra_cat = DataFrame(dict(
-            a=Categorical(np.random.randint(100, size=N),
-                          categories=np.arange(10000)),
-            b=arr))
+        data = {'a': Categorical(np.random.randint(10000, size=N)),
+                'b': arr}
+        self.df = DataFrame(data)
+        data = {'a': Categorical(np.random.randint(10000, size=N),
+                                 ordered=True),
+                'b': arr}
+        self.df_ordered = DataFrame(data)
+        data = {'a': Categorical(np.random.randint(100, size=N),
+                                 categories=np.arange(10000)),
+                'b': arr}
+        self.df_extra_cat = DataFrame(data)
 
     def time_groupby_sort(self):
         self.df.groupby('a')['b'].count()
 
@@ -556,130 +519,71 @@ def time_groupby_extra_cat_nosort(self):
         self.df_extra_cat.groupby('a', sort=False)['b'].count()
 
 
-class groupby_period(object):
+class Datelike(object):
     # GH 14338
     goal_time = 0.2
-
-    def make_grouper(self, N):
-        return pd.period_range('1900-01-01', freq='D', periods=N)
-
-    def setup(self):
-        N = 10000
-        self.grouper = self.make_grouper(N)
-        self.df = pd.DataFrame(np.random.randn(N, 2))
-
-    def time_groupby_sum(self):
+    params = ['period_range', 'date_range', 'date_range_tz']
+    param_names = ['grouper']
+
+    def setup(self, grouper):
+        N = 10**4
+        rng_map = {'period_range': period_range,
+                   'date_range': date_range,
+                   'date_range_tz': partial(date_range, tz='US/Central')}
+        self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N)
+        self.df = DataFrame(np.random.randn(10**4, 2))
+
+    def time_sum(self, grouper):
         self.df.groupby(self.grouper).sum()
 
 
-class groupby_datetime(groupby_period):
-    def make_grouper(self, N):
-        return pd.date_range('1900-01-01', freq='D', periods=N)
-
-
-class groupby_datetimetz(groupby_period):
-    def make_grouper(self, N):
-        return pd.date_range('1900-01-01', freq='D', periods=N,
-                             tz='US/Central')
-
-#----------------------------------------------------------------------
-# Series.value_counts
-
-class series_value_counts(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.s = Series(np.random.randint(0, 1000, size=100000))
-        self.s2 = self.s.astype(float)
-
-        self.K = 1000
-        self.N = 100000
-        self.uniques = tm.makeStringIndex(self.K).values
-        self.s3 = Series(np.tile(self.uniques, (self.N // self.K)))
-
-    def time_value_counts_int64(self):
-        self.s.value_counts()
-
-    def time_value_counts_float64(self):
-        self.s2.value_counts()
-
-    def time_value_counts_strings(self):
-        self.s.value_counts()
-
-
-#----------------------------------------------------------------------
-# pivot_table
-
-class groupby_pivot_table(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
-        self.fac2 = np.array(['one', 'two'], dtype='O')
-        self.ind1 = np.random.randint(0, 3, size=100000)
-        self.ind2 = np.random.randint(0, 2, size=100000)
-        self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
-
-    def time_groupby_pivot_table(self):
-        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
-
-
-#----------------------------------------------------------------------
-# Sum booleans #2692
-
-class groupby_sum_booleans(object):
+class SumBools(object):
+    # GH 2692
     goal_time = 0.2
 
     def setup(self):
-        self.N = 500
-        self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], })
+        N = 500
+        self.df = DataFrame({'ii': range(N),
+                             'bb': [True] * N})
 
     def time_groupby_sum_booleans(self):
         self.df.groupby('ii').sum()
 
 
-#----------------------------------------------------------------------
-# multi-indexed group sum #9049
-
-class groupby_sum_multiindex(object):
+class SumMultiLevel(object):
+    # GH 9049
     goal_time = 0.2
+    timeout = 120.0
 
     def setup(self):
-        self.N = 50
-        self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
+        N = 50
+        self.df = DataFrame({'A': list(range(N)) * 2,
+                             'B': range(N * 2),
+                             'C': 1}).set_index(['A', 'B'])
 
     def time_groupby_sum_multiindex(self):
         self.df.groupby(level=[0, 1]).sum()
 
 
-#-------------------------------------------------------------------------------
-# Transform testing
-
 class Transform(object):
+    goal_time = 0.2
 
     def setup(self):
         n1 = 400
         n2 = 250
-
-        index = MultiIndex(
-            levels=[np.arange(n1), pd.util.testing.makeStringIndex(n2)],
-            labels=[[i for i in range(n1) for _ in range(n2)],
-                    (list(range(n2)) * n1)],
-            names=['lev1', 'lev2'])
-
-        data = DataFrame(np.random.randn(n1 * n2, 3),
-                         index=index, columns=['col1', 'col20', 'col3'])
-        step = int((n1 * n2 * 0.1))
-        for col in range(len(data.columns)):
-            idx = col
-            while (idx < len(data)):
-                data.set_value(data.index[idx], data.columns[col], np.nan)
-                idx += step
+        index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
+                           labels=[np.repeat(range(n1), n2).tolist(),
                                    list(range(n2)) * n1],
+                           names=['lev1', 'lev2'])
+        arr = np.random.randn(n1 * n2, 3)
+        arr[::10000, 0] = np.nan
+        arr[1::10000, 1] = np.nan
+        arr[2::10000, 2] = np.nan
+        data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
         self.df = data
-        self.f_fillna = (lambda x: x.fillna(method='pad'))
+        self.f_max = lambda x: max(x)
 
-        np.random.seed(2718281)
         n = 20000
         self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
                              columns=['jim', 'joe', 'jolie'])
@@ -691,10 +595,10 @@ def setup(self):
         self.df4 = self.df3.copy()
         self.df4['jim'] = self.df4['joe']
 
-    def time_transform_func(self):
-        self.df.groupby(level='lev2').transform(self.f_fillna)
+    def time_transform_lambda_max(self):
+        self.df.groupby(level='lev1').transform(self.f_max)
 
-    def time_transform_ufunc(self):
+    def time_transform_ufunc_max(self):
         self.df.groupby(level='lev1').transform(np.max)
 
     def time_transform_multi_key1(self):
@@ -710,63 +614,31 @@ def time_transform_multi_key4(self):
         self.df4.groupby(['jim', 'joe'])['jolie'].transform('max')
 
 
+class TransformBools(object):
-
-
-np.random.seed(0)
-N = 120000
-N_TRANSITIONS = 1400
-transition_points = np.random.permutation(np.arange(N))[:N_TRANSITIONS]
-transition_points.sort()
-transitions = np.zeros((N,), dtype=np.bool)
-transitions[transition_points] = True
-g = transitions.cumsum()
-df = DataFrame({'signal': np.random.rand(N), })
-
-
-
-
-
-class groupby_transform_series(object):
     goal_time = 0.2
 
     def setup(self):
-        np.random.seed(0)
         N = 120000
         transition_points = np.sort(np.random.choice(np.arange(N), 1400))
-        transitions = np.zeros((N,), dtype=np.bool)
+        transitions = np.zeros(N, dtype=np.bool)
         transitions[transition_points] = True
         self.g = transitions.cumsum()
         self.df = DataFrame({'signal': np.random.rand(N)})
 
-    def time_groupby_transform_series(self):
+    def time_transform_mean(self):
        self.df['signal'].groupby(self.g).transform(np.mean)
 
 
-class groupby_transform_series2(object):
+class TransformNaN(object):
+    # GH 12737
     goal_time = 0.2
 
     def setup(self):
-        np.random.seed(0)
-        self.df = DataFrame({'key': (np.arange(100000) // 3),
-                             'val': np.random.randn(100000)})
-
-        self.df_nans = pd.DataFrame({'key': np.repeat(np.arange(1000), 10),
-                                     'B': np.nan,
-                                     'C': np.nan})
-        self.df_nans.ix[4::10, 'B':'C'] = 5
-
-    def time_transform_series2(self):
-        self.df.groupby('key')['val'].transform(np.mean)
-
-    def time_cumprod(self):
-        self.df.groupby('key').cumprod()
-
-    def time_cumsum(self):
-        self.df.groupby('key').cumsum()
-
-    def time_shift(self):
-        self.df.groupby('key').shift()
+        self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10),
+                                  'B': np.nan,
+                                  'C': np.nan})
+        self.df_nans.loc[4::10, 'B':'C'] = 5
 
-    def time_transform_dataframe(self):
-        # GH 12737
+    def time_first(self):
         self.df_nans.groupby('key').transform('first')
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 177e3e7cb87fa..951f718257170 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -117,3 +117,23 @@ def setup(self):
     def time_wide_to_long_big(self):
         self.df['id'] = self.df.index
         wide_to_long(self.df, list(self.vars), i='id', j='year')
+
+
+class PivotTable(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        fac2 = np.array(['one', 'two'], dtype='O')
+        ind1 = np.random.randint(0, 3, size=N)
+        ind2 = np.random.randint(0, 2, size=N)
+        self.df = DataFrame({'key1': fac1.take(ind1),
+                             'key2': fac2.take(ind2),
+                             'key3': fac2.take(ind2),
+                             'value1': np.random.randn(N),
+                             'value2': np.random.randn(N),
+                             'value3': np.random.randn(N)})
+
+    def time_pivot_table(self):
+        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 5e8cf3a0350bb..81c43f7bc975f 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -155,3 +155,25 @@ def setup(self):
 
     def time_series_dropna_datetime(self):
         self.s.clip(0, 1)
+
+
+class series_value_counts(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.randint(0, 1000, size=100000))
+        self.s2 = self.s.astype(float)
+
+        self.K = 1000
+        self.N = 100000
+        self.uniques = tm.makeStringIndex(self.K).values
+        self.s3 = Series(np.tile(self.uniques, (self.N // self.K)))
+
+    def time_value_counts_int64(self):
+        self.s.value_counts()
+
+    def time_value_counts_float64(self):
+        self.s2.value_counts()
+
+    def time_value_counts_strings(self):
+        self.s3.value_counts()

From c3c04e266cbc5e176cc6ef4dc385cdd88fda0669 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Tue, 5 Dec 2017 03:15:21 -0800
Subject: [PATCH 94/98] CLN: Remove Categorical.from_array (#18642)

Deprecated in 0.19.0

xref gh-13854.
---
 doc/source/whatsnew/v0.22.0.txt  |  1 +
 pandas/core/categorical.py       | 20 --------------------
 pandas/tests/test_categorical.py |  7 +------
 3 files changed, 2 insertions(+), 26 deletions(-)

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index cd727c728eb3d..495d0beaf3faa 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -149,6 +149,7 @@ Removal of prior version deprecations/changes
 - ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`)
 - The ``SparseList`` class has been removed (:issue:`14007`)
 - The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`)
+- ``Categorical.from_array`` has been removed (:issue:`13854`)
 
 .. _whatsnew_0220.performance:
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index deaec20586005..e34755e665f8d 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -552,26 +552,6 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
 
         return cls(codes, dtype=dtype, fastpath=True)
 
-    @classmethod
-    def from_array(cls, data, **kwargs):
-        """
-        .. deprecated:: 0.19.0
-           Use ``Categorical`` instead.
-
-        Make a Categorical type from a single array-like object.
-
-        For internal compatibility with numpy arrays.
-
-        Parameters
-        ----------
-        data : array-like
-            Can be an Index or array-like. The categories are assumed to be
-            the unique values of `data`.
-        """
-        warn("Categorical.from_array is deprecated, use Categorical instead",
-             FutureWarning, stacklevel=2)
-        return cls(data, **kwargs)
-
     @classmethod
     def from_codes(cls, codes, categories, ordered=False):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index b570672124976..b661bde434814 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -1955,11 +1955,6 @@ def test_deprecated_labels(self):
         res = cat.labels
         tm.assert_numpy_array_equal(res, exp)
 
-    def test_deprecated_from_array(self):
-        # GH13854, `.from_array` is deprecated
-        with tm.assert_produces_warning(FutureWarning):
-            Categorical.from_array([0, 1])
-
     def test_datetime_categorical_comparison(self):
         dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
         tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
@@ -4817,7 +4812,7 @@ def test_constructor(self):
         assert isinstance(sc, tm.SubclassedCategorical)
         tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c']))
 
-    def test_from_array(self):
+    def test_from_codes(self):
         sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
         assert isinstance(sc, tm.SubclassedCategorical)
         exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])

From b288d19d031ae699ecd480fbac70595472aa5295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Grzegorz=20Konefa=C5=82?=
Date: Wed, 22 Nov 2017 20:28:10 +0100
Subject: [PATCH 95/98] BUG: fillna maximum recursion depth exceeded in cmp (GH18159).
---
 doc/source/whatsnew/v0.21.1.txt          | 2 ++
 pandas/core/internals.py                 | 5 +++--
 pandas/tests/internals/test_internals.py | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index a9608594be547..aaddc61f8aeb8 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,6 +56,8 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
+- Bug in :meth:`fillna` where the maximum recursion depth was exceeded in comparison (:issue:`18159`).
+
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index e5db5679c43f6..6b7ebe12b9724 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1847,8 +1847,9 @@ def _can_hold_element(self, element):
         if tipo is not None:
             return (issubclass(tipo.type, (np.floating, np.integer)) and
                     not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
-        return (isinstance(element, (float, int, np.floating, np.int_)) and
-                not isinstance(element, (bool, np.bool_, datetime, timedelta,
+        return (
+            isinstance(element, (float, int, np.floating, np.int_, compat.long))
+            and not isinstance(element, (bool, np.bool_, datetime, timedelta,
                                          np.datetime64, np.timedelta64)))
 
     def to_native_types(self, slicer=None, na_rep='', float_format=None,
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index a22d0174947e1..ac78def824b3e 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -1222,6 +1222,7 @@ class TestCanHoldElement(object):
     @pytest.mark.parametrize('value, dtype', [
         (1, 'i8'),
         (1.0, 'f8'),
+        (2**63, 'f8'),
         (1j, 'complex128'),
         (True, 'bool'),
         (np.timedelta64(20, 'ns'), '<m8[ns]'),

From: =?UTF-8?q?Grzegorz=20Konefa=C5=82?=
Date: Sun, 3 Dec 2017 10:18:06 +0100
Subject: [PATCH 96/98] Applied requested changes

---
 doc/source/whatsnew/v0.21.1.txt          | 2 +-
 pandas/core/internals.py                 | 2 +-
 pandas/tests/internals/test_internals.py | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index aaddc61f8aeb8..149a9931c91a0 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,7 +56,7 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
-- Bug in :meth:`fillna` where the maximum recursion depth was exceeded in comparison (:issue:`18159`).
+- Bug in :meth:`Series.fillna` which was raising RuntimeError when passed a large integer (:issue:`18159`).
 
 Conversion
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 6b7ebe12b9724..a215c5a0b6b13 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1898,7 +1898,7 @@ def _can_hold_element(self, element):
             return issubclass(tipo.type,
                               (np.floating, np.integer, np.complexfloating))
         return (isinstance(element,
-                           (float, int, complex, np.float_, np.int_)) and
+                           (float, int, complex, np.float_, np.int_, compat.long)) and
                 not isinstance(element, (bool, np.bool_)))
 
     def should_store(self, value):
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index ac78def824b3e..08f769e02e267 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -1224,6 +1224,7 @@ class TestCanHoldElement(object):
         (1.0, 'f8'),
         (2**63, 'f8'),
         (1j, 'complex128'),
+        (2**63, 'complex128'),
         (True, 'bool'),
         (np.timedelta64(20, 'ns'), '<m8[ns]'),

From: =?UTF-8?q?Grzegorz=20Konefa=C5=82?=
Date: Sun, 3 Dec 2017 22:21:01 +0100
Subject: [PATCH 97/98] Moved change log to conversions section

---
 doc/source/whatsnew/v0.21.1.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 149a9931c91a0..73962efad61e7 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,7 +56,7 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
-- Bug in :meth:`Series.fillna` which was raising RuntimeError when passed a large integer (:issue:`18159`).
+-
 
 Conversion
 
@@ -67,6 +67,7 @@ Conversion
 - Bug in :meth:`IntervalIndex.copy` when copying and ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
 - Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising``TypeError` (:issue:`18372`)
 - Bug in :class:`DateTimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an err if ``end.tzinfo`` is None (:issue:`18431`)
+- Bug in :meth:`Series.fillna` which raised when passed a long integer on Python 2 (:issue:`18159`).
 -
 
 Indexing

From 03b2f6b1e7d121b7b34bb998914c684357c0f98c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Grzegorz=20Konefa=C5=82?=
Date: Fri, 8 Dec 2017 17:57:43 +0100
Subject: [PATCH 98/98] Lint fixes

---
 pandas/core/internals.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index a215c5a0b6b13..4169a001655cb 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1848,7 +1848,8 @@ def _can_hold_element(self, element):
             return (issubclass(tipo.type, (np.floating, np.integer)) and
                     not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
         return (
-            isinstance(element, (float, int, np.floating, np.int_, compat.long))
+            isinstance(
+                element, (float, int, np.floating, np.int_, compat.long))
             and not isinstance(element, (bool, np.bool_, datetime, timedelta,
                                          np.datetime64, np.timedelta64)))
 
@@ -1897,9 +1898,11 @@ def _can_hold_element(self, element):
         if tipo is not None:
             return issubclass(tipo.type,
                               (np.floating, np.integer, np.complexfloating))
-        return (isinstance(element,
-                           (float, int, complex, np.float_, np.int_, compat.long)) and
-                not isinstance(element, (bool, np.bool_)))
+        return (
+            isinstance(
+                element,
+                (float, int, complex, np.float_, np.int_, compat.long))
+            and not isinstance(element, (bool, np.bool_)))
 
     def should_store(self, value):
         return issubclass(value.dtype.type, np.complexfloating)