From 65c66b0c91f45c2d3432e0604db51443bfc7fff7 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 24 Dec 2018 23:10:48 +0000 Subject: [PATCH 1/6] Improve error messages --- pandas/core/dtypes/dtypes.py | 6 +++++- pandas/tests/dtypes/test_dtypes.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ac69927d4adf1..12063d42f6c61 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs import NaT, Period, Timestamp, timezones from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass +from .inference import is_list_like from pandas import compat @@ -408,7 +409,10 @@ def validate_categories(categories, fastpath=False): """ from pandas import Index - if not isinstance(categories, ABCIndexClass): + if not fastpath and not is_list_like(categories, allow_sets=True): + msg = "Parameter 'categories' must be list-like, was {!r}" + raise TypeError(msg.format(categories)) + elif not isinstance(categories, ABCIndexClass): categories = Index(categories, tupleize_cols=False) if not fastpath: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index ab52a8a81385c..be3c1d1ef7512 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -90,7 +90,7 @@ def test_construction_from_string(self): TypeError, lambda: CategoricalDtype.construct_from_string('foo')) def test_constructor_invalid(self): - msg = "CategoricalIndex.* must be called" + msg = "categories must be list-like" with pytest.raises(TypeError, match=msg): CategoricalDtype("category") @@ -706,7 +706,7 @@ def test_invalid_raises(self): with pytest.raises(TypeError, match='ordered'): CategoricalDtype(['a', 'b'], ordered='foo') - with pytest.raises(TypeError, match='collection'): + with pytest.raises(TypeError, match='categories must be list-like'): CategoricalDtype('category') def test_mixed(self): From 60f7bec87b7186c53cf2f3ce918c7704a8a9ff53 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 25 Dec 2018 00:30:34 +0000 Subject: [PATCH 2/6] REF: clearer construction of Categorical/CategoricalIndex --- pandas/core/arrays/categorical.py | 108 +++++++++++------- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/indexes/category.py | 44 +++---- .../arrays/categorical/test_constructors.py | 2 +- pandas/tests/dtypes/test_dtypes.py | 4 +- pandas/tests/indexes/test_category.py | 2 +- 6 files changed, 85 insertions(+), 77 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a47406cded7b4..b2ea300b262c9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -200,6 +200,62 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) +def create_categorical_dtype(values, categories=None, ordered=None, + dtype=None): + """ + Helper function to Construct/return a :class:`CategoricalDtype`. + + Construct the CategoricalDtype from typical inputs to :class:`Categorical`. + + Parameters + ---------- + values : array-like or Categorical, (1-dimensional), optional + categories : list-like, optional + categories for the CategoricalDtype + ordered : bool, optional + designating if the categories are ordered + dtype : CategoricalDtype, optional + Cannot be used in combination with `categories` or `ordered`. + + Returns + ------- + CategoricalDtype + + Examples + -------- + >>> create_categorical_dtype() + CategoricalDtype(categories=None, ordered=None) + >>> create_categorical_dtype(categories=['a', 'b'], ordered=True) + CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> dtype = CategoricalDtype(['a', 'b'], ordered=True) + >>> c = Categorical([0, 1], dtype=dtype, fastpath=True) + >>> create_categorical_dtype(c, ['x', 'y'], True, dtype=dtype) + CategoricalDtype(['a', 'b'], ordered=True) + """ + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + msg = "Unknown dtype {dtype!r}" + raise ValueError(msg.format(dtype=dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify `categories` or `ordered` " + "together with `dtype`.") + elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + # If dtype=None and values is not categorical, create a new dtype. + # Note: This could potentially have categories=None and ordered=None. + dtype = CategoricalDtype(categories, ordered) + + return dtype + + _codes_doc = """\ The category codes of this categorical. @@ -316,50 +372,18 @@ class Categorical(ExtensionArray, PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): - # Ways of specifying the dtype (prioritized ordered) - # 1. dtype is a CategoricalDtype - # a.) with known categories, use dtype.categories - # b.) else with Categorical values, use values.dtype - # c.) else, infer from values - # d.) specifying dtype=CategoricalDtype and categories is an error - # 2. dtype is a string 'category' - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - # 3. dtype is None - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - if dtype is not None: - # The dtype argument takes precedence over values.dtype (if any) - if isinstance(dtype, compat.string_types): - if dtype == 'category': - dtype = CategoricalDtype(categories, ordered) - else: - msg = "Unknown `dtype` {dtype}" - raise ValueError(msg.format(dtype=dtype)) - elif categories is not None or ordered is not None: - raise ValueError("Cannot specify both `dtype` and `categories`" - " or `ordered`.") - elif is_categorical(values): - # If no "dtype" was passed, use the one from "values", but honor - # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) + dtype = create_categorical_dtype(values, categories, ordered, dtype) + # At this point, dtype is always a CategoricalDtype, but + # we may have dtype.categories be None, and we need to + # infer categories in a factorization step futher below + if is_categorical(values): # GH23814, for perf, if values._values already an instance of # Categorical, set values to codes, and run fastpath if (isinstance(values, (ABCSeries, ABCIndexClass)) and isinstance(values._values, type(self))): values = values._values.codes.copy() fastpath = True - else: - # If dtype=None and values is not categorical, create a new dtype - dtype = CategoricalDtype(categories, ordered) - - # At this point, dtype is always a CategoricalDtype and you should not - # use categories and ordered seperately. - # if dtype.categories is None, we are inferring if fastpath: self._codes = coerce_indexer_dtype(values, dtype.categories) @@ -656,6 +680,8 @@ def from_codes(cls, codes, categories, ordered=False): categorical. If not given, the resulting categorical will be unordered. """ + dtype = create_categorical_dtype(codes, categories, ordered) + codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): msg = "codes need to be array-like integers" @@ -675,14 +701,12 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = CategoricalDtype.validate_categories(categories) - - if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): + if len(codes) and ( + codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " "len(categories)-1") - return cls(codes, categories=categories, ordered=ordered, - fastpath=True) + return cls(codes, dtype=dtype, fastpath=True) _codes = None diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 12063d42f6c61..56376c228ff9e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,11 +9,11 @@ from pandas._libs.tslibs import NaT, Period, Timestamp, timezones from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass -from .inference import is_list_like from pandas import compat from .base import ExtensionDtype, _DtypeOpsMixin +from .inference import is_list_like def register_extension_dtype(cls): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0ef7422555fe6..3f98e9d58d942 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -17,7 +17,8 @@ from pandas.core import accessor from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, contains +from pandas.core.arrays.categorical import ( + Categorical, contains, create_categorical_dtype) import pandas.core.common as com from pandas.core.config import get_option import pandas.core.indexes.base as ibase @@ -107,29 +108,22 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if fastpath: return cls._simple_new(data, name=name, dtype=dtype) + dtype = create_categorical_dtype(data, categories, ordered, dtype) + if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, ABCCategorical): - data = cls._create_categorical(data, categories, ordered, - dtype) - elif isinstance(data, CategoricalIndex): - data = data._data - data = cls._create_categorical(data, categories, ordered, - dtype) - else: - + if not is_categorical_dtype(data): # don't allow scalars # if data is None, then categories must be provided if is_scalar(data): if data is not None or categories is None: cls._scalar_data_error(data) data = [] - data = cls._create_categorical(data, categories, ordered, - dtype) - if copy: - data = data.copy() + data = cls._create_categorical(data, dtype=dtype) + + data = data.copy() if copy else data return cls._simple_new(data, name=name) @@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None): return CategoricalIndex(cat, name=name) @classmethod - def _create_categorical(cls, data, categories=None, ordered=None, - dtype=None): + def _create_categorical(cls, data, dtype=None): """ *this is an internal non-public method* @@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None, Parameters ---------- data : data for new Categorical - categories : optional categories, defaults to existing - ordered : optional ordered attribute, defaults to existing dtype : CategoricalDtype, defaults to existing Returns @@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None, data = data.values if not isinstance(data, ABCCategorical): - if ordered is None and dtype is None: - ordered = False - data = Categorical(data, categories=categories, ordered=ordered, - dtype=dtype) - else: - if categories is not None: - data = data.set_categories(categories, ordered=ordered) - elif ordered is not None and ordered != data.ordered: - data = data.set_ordered(ordered) - if isinstance(dtype, CategoricalDtype) and dtype != data.dtype: - # we want to silently ignore dtype='category' - data = data._set_dtype(dtype) + return Categorical(data, dtype=dtype) + + if isinstance(dtype, CategoricalDtype) and dtype != data.dtype: + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 79e10de2b8aaf..f8e9e393091e5 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered): tm.assert_categorical_equal(result, expected) def test_constructor_str_unknown(self): - with pytest.raises(ValueError, match="Unknown `dtype`"): + with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") def test_constructor_from_categorical_with_dtype(self): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index be3c1d1ef7512..0d990421c6d70 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -90,7 +90,7 @@ def test_construction_from_string(self): TypeError, lambda: CategoricalDtype.construct_from_string('foo')) def test_constructor_invalid(self): - msg = "categories must be list-like" + msg = "Parameter 'categories' must be list-like" with pytest.raises(TypeError, match=msg): CategoricalDtype("category") @@ -706,7 +706,7 @@ def test_invalid_raises(self): with pytest.raises(TypeError, match='ordered'): CategoricalDtype(['a', 'b'], ordered='foo') - with pytest.raises(TypeError, match='categories must be list-like'): + with pytest.raises(TypeError, match="'categories' must be list-like"): CategoricalDtype('category') def test_mixed(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 8552e65a0dd24..8518c1fa369c2 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self): tm.assert_index_equal(result, expected, exact=True) # error when combining categories/ordered and dtype kwargs - msg = 'Cannot specify both `dtype` and `categories` or `ordered`.' + msg = 'Cannot specify `categories` or `ordered` together with `dtype`.' with pytest.raises(ValueError, match=msg): CategoricalIndex(data, categories=cats, dtype=dtype) From 72523227422545e32b17e9201a19b11797ad7daf Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 25 Dec 2018 09:40:46 +0000 Subject: [PATCH 3/6] Change doc string according to comments --- pandas/core/arrays/categorical.py | 33 ++++++++++++------- .../arrays/categorical/test_constructors.py | 30 +++++++++++++++++ 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b2ea300b262c9..b13ce4d1516f8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -200,22 +200,25 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -def create_categorical_dtype(values, categories=None, ordered=None, +def create_categorical_dtype(values=None, categories=None, ordered=None, dtype=None): """ - Helper function to Construct/return a :class:`CategoricalDtype`. + Construct and return a :class:`~pandas.api.types.CategoricalDtype`. - Construct the CategoricalDtype from typical inputs to :class:`Categorical`. + This is a helper function, and specifically does not do the + factorization step, if that is needed. Parameters ---------- - values : array-like or Categorical, (1-dimensional), optional + values : list-like, optional + The list-like must be 1-dimensional. categories : list-like, optional - categories for the CategoricalDtype + Categories for the CategoricalDtype. ordered : bool, optional - designating if the categories are ordered - dtype : CategoricalDtype, optional - Cannot be used in combination with `categories` or `ordered`. + Designating if the categories are ordered. + dtype : CategoricalDtype or the string "category", optional + If ``CategoricalDtype`` cannot be used together with + `categories` or `ordered`. Returns ------- @@ -227,10 +230,16 @@ def create_categorical_dtype(values, categories=None, ordered=None, CategoricalDtype(categories=None, ordered=None) >>> create_categorical_dtype(categories=['a', 'b'], ordered=True) CategoricalDtype(categories=['a', 'b'], ordered=True) - >>> dtype = CategoricalDtype(['a', 'b'], ordered=True) - >>> c = Categorical([0, 1], dtype=dtype, fastpath=True) - >>> create_categorical_dtype(c, ['x', 'y'], True, dtype=dtype) - CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True) + >>> create_categorical_dtype(c, ['x', 'y'], ordered=True, dtype=dtype2) + ValueError: Cannot specify `categories` or `ordered` together with `dtype`. + + The supplied dtype takes precedence over values's dtype: + + >>> create_categorical_dtype(c, dtype=dtype2) + CategoricalDtype(['x', 'y'], ordered=False) """ if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index f8e9e393091e5..61c326da856a4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -13,6 +13,7 @@ Categorical, CategoricalIndex, DatetimeIndex, Index, Interval, IntervalIndex, NaT, Series, Timestamp, date_range, period_range, timedelta_range) +from pandas.core.arrays.categorical import create_categorical_dtype import pandas.util.testing as tm @@ -530,3 +531,32 @@ def test_constructor_imaginary(self): c1 = Categorical(values) tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values)) + + +class TestCreateCategoricalDtype(object): + dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + c = Categorical([0, 1], dtype=dtype1, fastpath=True) + + @pytest.mark.parametrize('values, categories, ordered, dtype, expected', [ + [None, None, None, None, CategoricalDtype()], + [None, ['a', 'b'], True, None, dtype1], + [c, None, None, dtype2, dtype2], + [c, ['x', 'y'], False, None, dtype2], + ]) + def test_create_categorical_dtype( + self, values, categories, ordered, dtype, expected): + result = create_categorical_dtype(values, categories, ordered, dtype) + assert result == expected + + @pytest.mark.parametrize('values, categories, ordered, dtype', [ + [None, ['a', 'b'], True, dtype2], + [None, ['a', 'b'], None, dtype2], + [None, None, True, dtype2], + ]) + def test_create_categorical_dtype_raises(self, values, categories, ordered, + dtype): + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + + with pytest.raises(ValueError, match=msg): + create_categorical_dtype(values, categories, ordered, dtype) From e33dcee3708c3e459c543f3f732e7f51d58a8852 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 26 Dec 2018 08:12:09 +0000 Subject: [PATCH 4/6] move new constructor to dtypes/dtypes.py --- pandas/core/arrays/categorical.py | 71 ++----------------- pandas/core/dtypes/dtypes.py | 71 +++++++++++++++++++ pandas/core/indexes/category.py | 6 +- .../arrays/categorical/test_constructors.py | 30 -------- pandas/tests/dtypes/test_dtypes.py | 32 +++++++++ 5 files changed, 110 insertions(+), 100 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b13ce4d1516f8..969add2d3efef 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -200,71 +200,6 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -def create_categorical_dtype(values=None, categories=None, ordered=None, - dtype=None): - """ - Construct and return a :class:`~pandas.api.types.CategoricalDtype`. - - This is a helper function, and specifically does not do the - factorization step, if that is needed. - - Parameters - ---------- - values : list-like, optional - The list-like must be 1-dimensional. - categories : list-like, optional - Categories for the CategoricalDtype. - ordered : bool, optional - Designating if the categories are ordered. - dtype : CategoricalDtype or the string "category", optional - If ``CategoricalDtype`` cannot be used together with - `categories` or `ordered`. - - Returns - ------- - CategoricalDtype - - Examples - -------- - >>> create_categorical_dtype() - CategoricalDtype(categories=None, ordered=None) - >>> create_categorical_dtype(categories=['a', 'b'], ordered=True) - CategoricalDtype(categories=['a', 'b'], ordered=True) - >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True) - >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False) - >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True) - >>> create_categorical_dtype(c, ['x', 'y'], ordered=True, dtype=dtype2) - ValueError: Cannot specify `categories` or `ordered` together with `dtype`. - - The supplied dtype takes precedence over values's dtype: - - >>> create_categorical_dtype(c, dtype=dtype2) - CategoricalDtype(['x', 'y'], ordered=False) - """ - if dtype is not None: - # The dtype argument takes precedence over values.dtype (if any) - if isinstance(dtype, compat.string_types): - if dtype == 'category': - dtype = CategoricalDtype(categories, ordered) - else: - msg = "Unknown dtype {dtype!r}" - raise ValueError(msg.format(dtype=dtype)) - elif categories is not None or ordered is not None: - raise ValueError("Cannot specify `categories` or `ordered` " - "together with `dtype`.") - elif is_categorical(values): - # If no "dtype" was passed, use the one from "values", but honor - # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) - else: - # If dtype=None and values is not categorical, create a new dtype. - # Note: This could potentially have categories=None and ordered=None. - dtype = CategoricalDtype(categories, ordered) - - return dtype - - _codes_doc = """\ The category codes of this categorical. @@ -381,7 +316,8 @@ class Categorical(ExtensionArray, PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): - dtype = create_categorical_dtype(values, categories, ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) # At this point, dtype is always a CategoricalDtype, but # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below @@ -689,7 +625,8 @@ def from_codes(cls, codes, categories, ordered=False): categorical. If not given, the resulting categorical will be unordered. """ - dtype = create_categorical_dtype(codes, categories, ordered) + dtype = CategoricalDtype._from_values_or_dtype(codes, categories, + ordered) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 56376c228ff9e..169583da03c48 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -241,6 +241,77 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): ordered = dtype.ordered return cls(categories, ordered) + @classmethod + def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, + dtype=None): + """ + Construct from the inputs used in :class:`Categorical` construction. + + This is an internal helper method, and specifically does not do the + factorization step, if that is needed. Additional steps may + therefore have to be taken to create the final dtype. + + Parameters + ---------- + values : list-like, optional + The list-like must be 1-dimensional. + categories : list-like, optional + Categories for the CategoricalDtype. + ordered : bool, optional + Designating if the categories are ordered. + dtype : CategoricalDtype or the string "category", optional + If ``CategoricalDtype`` cannot be used together with + `categories` or `ordered`. + + Returns + ------- + CategoricalDtype + + Examples + -------- + >>> CategoricalDtype._from_values_or_dtype() + CategoricalDtype(categories=None, ordered=None) + >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'], + ... ordered=True) + CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True) + >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True, + ... dtype=dtype2) + ValueError: Cannot specify `categories` or `ordered` together with + `dtype`. + + The supplied dtype takes precedence over values's dtype: + + >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) + CategoricalDtype(['x', 'y'], ordered=False) + """ + from pandas.core.dtypes.common import is_categorical + + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + msg = "Unknown dtype {dtype!r}" + raise ValueError(msg.format(dtype=dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify `categories` or `ordered` " + "together with `dtype`.") + elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + # If dtype=None and values is not categorical, create a new dtype. + # Note: This could potentially have categories=None and ordered=None. + dtype = CategoricalDtype(categories, ordered) + + return dtype + def _finalize(self, categories, ordered, fastpath=False): if ordered is not None: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3f98e9d58d942..f76085f9889dd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -17,8 +17,7 @@ from pandas.core import accessor from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import ( - Categorical, contains, create_categorical_dtype) +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com from pandas.core.config import get_option import pandas.core.indexes.base as ibase @@ -108,7 +107,8 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if fastpath: return cls._simple_new(data, name=name, dtype=dtype) - dtype = create_categorical_dtype(data, categories, ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype(data, categories, + ordered, dtype) if name is None and hasattr(data, 'name'): name = data.name diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 61c326da856a4..f8e9e393091e5 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -13,7 +13,6 @@ Categorical, CategoricalIndex, DatetimeIndex, Index, Interval, IntervalIndex, NaT, Series, Timestamp, date_range, period_range, timedelta_range) -from pandas.core.arrays.categorical import create_categorical_dtype import pandas.util.testing as tm @@ -531,32 +530,3 @@ def test_constructor_imaginary(self): c1 = Categorical(values) tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values)) - - -class TestCreateCategoricalDtype(object): - dtype1 = CategoricalDtype(['a', 'b'], ordered=True) - dtype2 = CategoricalDtype(['x', 'y'], ordered=False) - c = Categorical([0, 1], dtype=dtype1, fastpath=True) - - @pytest.mark.parametrize('values, categories, ordered, dtype, expected', [ - [None, None, None, None, CategoricalDtype()], - [None, ['a', 'b'], True, None, dtype1], - [c, None, None, dtype2, dtype2], - [c, ['x', 'y'], False, None, dtype2], - ]) - def test_create_categorical_dtype( - self, values, categories, ordered, dtype, expected): - result = create_categorical_dtype(values, categories, ordered, dtype) - assert result == expected - - @pytest.mark.parametrize('values, categories, ordered, dtype', [ - [None, ['a', 'b'], True, dtype2], - [None, ['a', 'b'], None, dtype2], - [None, None, True, dtype2], - ]) - def test_create_categorical_dtype_raises(self, values, categories, ordered, - dtype): - msg = "Cannot specify `categories` or `ordered` together with `dtype`." - - with pytest.raises(ValueError, match=msg): - create_categorical_dtype(values, categories, ordered, dtype) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0d990421c6d70..0f936718ae1f0 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -94,6 +94,38 @@ def test_constructor_invalid(self): with pytest.raises(TypeError, match=msg): CategoricalDtype("category") + dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + c = Categorical([0, 1], dtype=dtype1, fastpath=True) + + @pytest.mark.parametrize('values, categories, ordered, dtype, expected', + [ + [None, None, None, None, + CategoricalDtype()], + [None, ['a', 'b'], True, None, dtype1], + [c, None, None, dtype2, dtype2], + [c, ['x', 'y'], False, None, dtype2], + ]) + def test_create_categorical_dtype( + self, values, categories, ordered, dtype, expected): + result = CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) + assert result == expected + + @pytest.mark.parametrize('values, categories, ordered, dtype', [ + [None, ['a', 'b'], True, dtype2], + [None, ['a', 'b'], None, dtype2], + [None, None, True, dtype2], + ]) + def test_create_categorical_dtype_raises(self, values, categories, + ordered, + dtype): + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + + with pytest.raises(ValueError, match=msg): + CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) + def test_is_dtype(self): assert CategoricalDtype.is_dtype(self.dtype) assert CategoricalDtype.is_dtype('category') From 51d363a6fd8d7f9d0360952fc7d6a8c2fbbab991 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 1 Jan 2019 01:57:13 +0000 Subject: [PATCH 5/6] adjust doc string --- pandas/core/dtypes/dtypes.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 169583da03c48..c17f72363618c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -245,11 +245,23 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, dtype=None): """ - Construct from the inputs used in :class:`Categorical` construction. - - This is an internal helper method, and specifically does not do the - factorization step, if that is needed. Additional steps may - therefore have to be taken to create the final dtype. + Construct dtype from the input parameters used in :class:`Categorical`. + + This constructor method specifically does not do the factorization + step, if that is needed to find the categories. This constructor may + therefore return ``CategoricalDtype(categories=None, ordered=None)``, + which may not be useful. Additional steps may therefore have to be + taken to create the final dtype. + + The return dtype is specified from the inputs in this prioritized + order: + 1. if dtype is a CategoricalDtype, return dtype + 2. if dtype is the string 'category', create a CategoricalDtype from + the supplied categories and ordered parameters, and return that. + 3. if values is a categorical, use value.dtype, but override it with + categories and ordered if either/both of those are not None. + 4. if dtype is None and values is not a categorical, construct the + dtype from categories and ordered, even if either of those is None. Parameters ---------- @@ -307,7 +319,8 @@ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, categories, ordered) else: # If dtype=None and values is not categorical, create a new dtype. - # Note: This could potentially have categories=None and ordered=None. + # Note: This could potentially have categories=None and + # ordered=None. dtype = CategoricalDtype(categories, ordered) return dtype From 346510e5973e30bac2412f05eb9d0d3987d78dfa Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 3 Jan 2019 22:09:18 +0000 Subject: [PATCH 6/6] change according to comments --- pandas/core/dtypes/dtypes.py | 6 +++--- pandas/tests/dtypes/test_dtypes.py | 8 +++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c17f72363618c..e35ee32657509 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -272,7 +272,7 @@ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, ordered : bool, optional Designating if the categories are ordered. dtype : CategoricalDtype or the string "category", optional - If ``CategoricalDtype`` cannot be used together with + If ``CategoricalDtype``, cannot be used together with `categories` or `ordered`. Returns @@ -294,7 +294,7 @@ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, ValueError: Cannot specify `categories` or `ordered` together with `dtype`. - The supplied dtype takes precedence over values's dtype: + The supplied dtype takes precedence over values' dtype: >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) CategoricalDtype(['x', 'y'], ordered=False) @@ -493,7 +493,7 @@ def validate_categories(categories, fastpath=False): """ from pandas import Index - if not fastpath and not is_list_like(categories, allow_sets=True): + if not fastpath and not is_list_like(categories): msg = "Parameter 'categories' must be list-like, was {!r}" raise TypeError(msg.format(categories)) elif not isinstance(categories, ABCIndexClass): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0f936718ae1f0..40b8f7afa3598 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -106,7 +106,7 @@ def test_constructor_invalid(self): [c, None, None, dtype2, dtype2], [c, ['x', 'y'], False, None, dtype2], ]) - def test_create_categorical_dtype( + def test_from_values_or_dtype( self, values, categories, ordered, dtype, expected): result = CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype) @@ -117,11 +117,9 @@ def test_create_categorical_dtype( [None, ['a', 'b'], None, dtype2], [None, None, True, dtype2], ]) - def test_create_categorical_dtype_raises(self, values, categories, - ordered, - dtype): + def test_from_values_or_dtype_raises(self, values, categories, + ordered, dtype): msg = "Cannot specify `categories` or `ordered` together with `dtype`." - with pytest.raises(ValueError, match=msg): CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype)