Skip to content

CLN/BUG: Better validation of levels/labels/names #5209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 14, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ Improvements to existing features
(:issue:`4039`)
- Add ``rename`` and ``set_names`` methods to ``Index`` as well as
``set_names``, ``set_levels``, ``set_labels`` to ``MultiIndex``.
(:issue:`4039`)
(:issue:`4039`) with improved validation for all (:issue:`4039`,
:issue:`4794`)
- A Series of dtype ``timedelta64[ns]`` can now be divided/multiplied
by an integer series (:issue`4521`)
- A Series of dtype ``timedelta64[ns]`` can now be divided by another
Expand Down
2 changes: 1 addition & 1 deletion doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ API changes
# but setting names is not deprecated.
index = index.set_names(["bob", "cranberry"])

# and all methods take an inplace kwarg
# and all methods take an inplace kwarg - but returns None
index.set_names(["bob", "cranberry"], inplace=True)

- Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`)
Expand Down
72 changes: 39 additions & 33 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1672,8 +1672,8 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
subarr = np.array(data, dtype=np.int64, copy=copy)
if len(data) > 0:
if (subarr != data).any():
raise TypeError('Unsafe NumPy casting, you must '
'explicitly cast')
raise TypeError('Unsafe NumPy casting to integer, you must'
' explicitly cast')

subarr = subarr.view(cls)
subarr.name = name
Expand Down Expand Up @@ -1857,11 +1857,12 @@ class MultiIndex(Index):

def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
copy=False):
if levels is None or labels is None:
raise TypeError("Must pass both levels and labels")
if len(levels) != len(labels):
raise ValueError(
'Length of levels and labels must be the same')
raise ValueError('Length of levels and labels must be the same.')
if len(levels) == 0:
raise TypeError('Must pass non-zero number of levels/labels')
raise ValueError('Must pass non-zero number of levels/labels')
if len(levels) == 1:
if names:
name = names[0]
Expand All @@ -1872,10 +1873,12 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,

# v3, 0.8.0
subarr = np.empty(0, dtype=object).view(cls)
subarr._set_levels(levels, copy=copy)
subarr._set_labels(labels, copy=copy)
# we've already validated levels and labels, so shortcut here
subarr._set_levels(levels, copy=copy, validate=False)
subarr._set_labels(labels, copy=copy, validate=False)

if names is not None:
# handles name validation
subarr._set_names(names)

if sortorder is not None:
Expand All @@ -1888,12 +1891,14 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
def _get_levels(self):
return self._levels

def _set_levels(self, levels, copy=False):
def _set_levels(self, levels, copy=False, validate=True):
# This is NOT part of the levels property because it should be
# externally not allowed to set levels. User beware if you change
# _levels directly
if len(levels) == 0:
raise ValueError("Must set non-zero number of levels.")
if validate and len(levels) == 0:
raise ValueError('Must set non-zero number of levels.')
if validate and len(levels) != len(self._labels):
raise ValueError('Length of levels must match length of labels.')
levels = FrozenList(_ensure_index(lev, copy=copy)._shallow_copy()
for lev in levels)
names = self.names
Expand All @@ -1917,13 +1922,16 @@ def set_levels(self, levels, inplace=False):
-------
new index (of same type and class...etc)
"""
if not com.is_list_like(levels) or not com.is_list_like(levels[0]):
raise TypeError("Levels must be list of lists-like")
if inplace:
idx = self
else:
idx = self._shallow_copy()
idx._reset_identity()
idx._set_levels(levels)
return idx
if not inplace:
return idx

# remove me in 0.14 and change to read only property
__set_levels = deprecate("setting `levels` directly",
Expand All @@ -1934,9 +1942,9 @@ def set_levels(self, levels, inplace=False):
def _get_labels(self):
return self._labels

def _set_labels(self, labels, copy=False):
if len(labels) != self.nlevels:
raise ValueError("Length of levels and labels must be the same.")
def _set_labels(self, labels, copy=False, validate=True):
if validate and len(labels) != self.nlevels:
raise ValueError("Length of labels must match length of levels")
self._labels = FrozenList(_ensure_frozen(labs, copy=copy)._shallow_copy()
for labs in labels)

Expand All @@ -1956,13 +1964,16 @@ def set_labels(self, labels, inplace=False):
-------
new index (of same type and class...etc)
"""
if not com.is_list_like(labels) or not com.is_list_like(labels[0]):
raise TypeError("Labels must be list of lists-like")
if inplace:
idx = self
else:
idx = self._shallow_copy()
idx._reset_identity()
idx._set_labels(labels)
return idx
if not inplace:
return idx

# remove me in 0.14 and change to readonly property
__set_labels = deprecate("setting labels directly",
Expand Down Expand Up @@ -2021,7 +2032,8 @@ def __array_finalize__(self, obj):
# instance.
return

self._set_levels(getattr(obj, 'levels', []))
# skip the validation on first, rest will catch the errors
self._set_levels(getattr(obj, 'levels', []), validate=False)
self._set_labels(getattr(obj, 'labels', []))
self._set_names(getattr(obj, 'names', []))
self.sortorder = getattr(obj, 'sortorder', None)
Expand Down Expand Up @@ -2083,16 +2095,15 @@ def _convert_slice_indexer(self, key, typ=None):
def _get_names(self):
return FrozenList(level.name for level in self.levels)

def _set_names(self, values):
def _set_names(self, values, validate=True):
"""
sets names on levels. WARNING: mutates!

Note that you generally want to set this *after* changing levels, so that it only
acts on copies"""
values = list(values)
if len(values) != self.nlevels:
raise ValueError('Length of names (%d) must be same as level '
'(%d)' % (len(values), self.nlevels))
if validate and len(values) != self.nlevels:
raise ValueError('Length of names must match length of levels')
# set the name
for name, level in zip(values, self.levels):
level.rename(name, inplace=True)
Expand Down Expand Up @@ -2446,7 +2457,7 @@ def __setstate__(self, state):
np.ndarray.__setstate__(self, nd_state)
levels, labels, sortorder, names = own_state

self._set_levels([Index(x) for x in levels])
self._set_levels([Index(x) for x in levels], validate=False)
self._set_labels(labels)
self._set_names(names)
self.sortorder = sortorder
Expand All @@ -2473,7 +2484,7 @@ def __getitem__(self, key):
new_labels = [lab[key] for lab in self.labels]

# an optimization
result._set_levels(self.levels)
result._set_levels(self.levels, validate=False)
result._set_labels(new_labels)
result.sortorder = sortorder
result._set_names(self.names)
Expand Down Expand Up @@ -3351,17 +3362,12 @@ def _ensure_index(index_like, copy=False):
return Index(index_like)


def _ensure_frozen(nd_array_like, copy=False):
if not isinstance(nd_array_like, FrozenNDArray):
arr = np.asarray(nd_array_like, dtype=np.int_)
# have to do this separately so that non-index input gets copied
if copy:
arr = arr.copy()
nd_array_like = arr.view(FrozenNDArray)
else:
if copy:
nd_array_like = nd_array_like.copy()
return nd_array_like
def _ensure_frozen(array_like, copy=False):
array_like = np.asanyarray(array_like, dtype=np.int_)
array_like = array_like.view(FrozenNDArray)
if copy:
array_like = array_like.copy()
return array_like


def _validate_join_method(method):
Expand Down
70 changes: 68 additions & 2 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,12 @@ def test_constructor_corner(self):

# preventing casting
arr = np.array([1, '2', 3, '4'], dtype=object)
self.assertRaises(TypeError, Int64Index, arr)
with tm.assertRaisesRegexp(TypeError, 'casting'):
Int64Index(arr)

arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1]
with tm.assertRaisesRegexp(TypeError, 'casting'):
Int64Index(arr_with_floats)

def test_hash_error(self):
with tm.assertRaisesRegexp(TypeError,
Expand Down Expand Up @@ -1240,6 +1245,62 @@ def test_set_levels_and_set_labels(self):
minor_labels = [(x + 1) % 1 for x in minor_labels]
new_labels = [major_labels, minor_labels]

def assert_matching(actual, expected):
# avoid specifying internal representation
# as much as possible
self.assertEqual(len(actual), len(expected))
for act, exp in zip(actual, expected):
act = np.asarray(act)
exp = np.asarray(exp)
assert_almost_equal(act, exp)

# level changing [w/o mutation]
ind2 = self.index.set_levels(new_levels)
assert_matching(ind2.levels, new_levels)
assert_matching(self.index.levels, levels)

# level changing [w/ mutation]
ind2 = self.index.copy()
inplace_return = ind2.set_levels(new_levels, inplace=True)
self.assert_(inplace_return is None)
assert_matching(ind2.levels, new_levels)

# label changing [w/o mutation]
ind2 = self.index.set_labels(new_labels)
assert_matching(ind2.labels, new_labels)
assert_matching(self.index.labels, labels)

# label changing [w/ mutation]
ind2 = self.index.copy()
inplace_return = ind2.set_labels(new_labels, inplace=True)
self.assert_(inplace_return is None)
assert_matching(ind2.labels, new_labels)

def test_set_levels_labels_names_bad_input(self):
levels, labels = self.index.levels, self.index.labels
names = self.index.names

with tm.assertRaisesRegexp(ValueError, 'Length of levels'):
self.index.set_levels([levels[0]])

with tm.assertRaisesRegexp(ValueError, 'Length of labels'):
self.index.set_labels([labels[0]])

with tm.assertRaisesRegexp(ValueError, 'Length of names'):
self.index.set_names([names[0]])

# shouldn't scalar data error, instead should demand list-like
with tm.assertRaisesRegexp(TypeError, 'list of lists-like'):
self.index.set_levels(levels[0])

# shouldn't scalar data error, instead should demand list-like
with tm.assertRaisesRegexp(TypeError, 'list of lists-like'):
self.index.set_labels(labels[0])

# shouldn't scalar data error, instead should demand list-like
with tm.assertRaisesRegexp(TypeError, 'list-like'):
self.index.set_names(names[0])

def test_metadata_immutable(self):
levels, labels = self.index.levels, self.index.labels
# shouldn't be able to set at either the top level or base level
Expand Down Expand Up @@ -1342,8 +1403,13 @@ def test_constructor_single_level(self):
self.assert_(single_level.name is None)

def test_constructor_no_levels(self):
assertRaisesRegexp(TypeError, "non-zero number of levels/labels",
assertRaisesRegexp(ValueError, "non-zero number of levels/labels",
MultiIndex, levels=[], labels=[])
both_re = re.compile('Must pass both levels and labels')
with tm.assertRaisesRegexp(TypeError, both_re):
MultiIndex(levels=[])
with tm.assertRaisesRegexp(TypeError, both_re):
MultiIndex(labels=[])

def test_constructor_mismatched_label_levels(self):
levels = [np.array([1]), np.array([2]), np.array([3])]
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,8 +347,8 @@ def test_frame_setitem_multi_column(self):
# it broadcasts
df['B', '1'] = [1, 2, 3]
df['A'] = df['B', '1']
assert_almost_equal(df['A', '1'], df['B', '1'])
assert_almost_equal(df['A', '2'], df['B', '1'])
assert_series_equal(df['A', '1'], df['B', '1'])
assert_series_equal(df['A', '2'], df['B', '1'])

def test_getitem_tuple_plus_slice(self):
# GH #671
Expand Down
13 changes: 7 additions & 6 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,11 +332,11 @@ def equalContents(arr1, arr2):
return frozenset(arr1) == frozenset(arr2)


def assert_isinstance(obj, class_type_or_tuple):
def assert_isinstance(obj, class_type_or_tuple, msg=''):
"""asserts that obj is an instance of class_type_or_tuple"""
assert isinstance(obj, class_type_or_tuple), (
"Expected object to be of type %r, found %r instead" % (
type(obj), class_type_or_tuple))
"%sExpected object to be of type %r, found %r instead" % (
msg, class_type_or_tuple, type(obj)))


def assert_equal(a, b, msg=""):
Expand All @@ -355,6 +355,8 @@ def assert_equal(a, b, msg=""):


def assert_index_equal(left, right):
assert_isinstance(left, Index, '[index] ')
assert_isinstance(right, Index, '[index] ')
if not left.equals(right):
raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype,
left,
Expand All @@ -373,6 +375,8 @@ def isiterable(obj):
return hasattr(obj, '__iter__')


# NOTE: don't pass an NDFrame or index to this function - may not handle it
# well.
def assert_almost_equal(a, b, check_less_precise=False):
if isinstance(a, dict) or isinstance(b, dict):
return assert_dict_equal(a, b)
Expand All @@ -385,9 +389,6 @@ def assert_almost_equal(a, b, check_less_precise=False):
np.testing.assert_(isiterable(b))
na, nb = len(a), len(b)
assert na == nb, "%s != %s" % (na, nb)
# TODO: Figure out why I thought this needed instance cheacks...
# if (isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and
# np.array_equal(a, b)):
if np.array_equal(a, b):
return True
else:
Expand Down